"""Character classifiers, token types and a hand-written lexer.

token:
    keyword
    identifier
    constant
    punctuator

Only identifiers, integer constants and punctuators have their own token
class here; a keyword is returned as an ``Identifier`` like any other name.
"""

from dataclasses import dataclass

# Lookup tables are built once at import time rather than on every call.
# Note that "’" (U+2019) is deliberately part of the identifier alphabet.
_NONDIGITS = frozenset(
    "_"
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "’"
)
_DIGITS = frozenset("0123456789")
_HEXADECIMAL_DIGITS = frozenset("0123456789abcdefABCDEF")
_PUNCTUATORS = frozenset({
    "[", "]", "(", ")", "{", "}", ".",
    "+", "-", "~", "!",
    "<<", ">>", "<", ">", "≤", "≥", "≠", "=",
    "∨", "∧", "⊕", "×", "/", "|",
    ";", ",", "←",
})


def is_nondigit(c):
    """Return True if *c* is a character that may start an identifier."""
    return c in _NONDIGITS


def is_digit(c):
    """Return True if *c* is one of the ASCII decimal digits 0-9."""
    return c in _DIGITS


def is_hexadecimal_digit(c):
    """Return True if *c* is an ASCII hexadecimal digit (0-9, a-f, A-F)."""
    return c in _HEXADECIMAL_DIGITS


def is_punctuator(c):
    """Return True if the string *c* is exactly one punctuator.

    *c* may be longer than one character: "<<" and ">>" are punctuators.
    """
    return c in _PUNCTUATORS


@dataclass
class Identifier:
    """A name token."""

    line: int  # value of the lexer's line counter when the token was produced
    token: str  # source text of the token


@dataclass
class IntegerConstant:
    """A decimal or hexadecimal integer literal."""

    line: int  # value of the lexer's line counter when the token was produced
    token: str  # source text of the token, including any 0x/0X prefix
    value: int  # numeric value of the literal


@dataclass
class Punctuator:
    """An operator or separator token."""

    line: int  # value of the lexer's line counter when the token was produced
    token: str  # source text of the token


class Lexer:
    """Split a source string into tokens, one ``next_token()`` call at a time.

    ``buf[start:end]`` is the text of the token currently being scanned and
    ``line`` is the number of newlines consumed so far (so it starts at 0).
    """

    def __init__(self, buf):
        self.buf = buf
        self.start = 0
        self.end = 0
        self.line = 0

    def peek(self):
        """Return the next unread character, or "" at end of input.

        The empty string belongs to no character class, so every scanning
        loop stops cleanly when the buffer is exhausted.
        """
        if self.end < len(self.buf):
            return self.buf[self.end]
        return ""

    def match(self, c):
        """Consume the next character if it equals *c*; report whether it did.

        Returns False (and consumes nothing) at end of input.
        """
        if self.end < len(self.buf) and self.buf[self.end] == c:
            self.end += 1
            return True
        return False

    def slice(self, offset=0):
        """Return the current token text, extended by *offset* characters."""
        return self.buf[self.start:self.end + offset]

    def advance(self):
        """Consume and return the next character.

        Raises:
            IndexError: if the input is exhausted.
        """
        c = self.buf[self.end]
        self.end += 1
        return c

    def advance_whitespace(self):
        """Consume one whitespace character; return False if there is none.

        Only a newline increments the line counter, so a "\\r\\n" pair is
        counted as a single line break.
        """
        if self.match("\n"):
            self.line += 1
            return True
        return self.match("\r") or self.match("\t") or self.match(" ")

    def identifier(self):
        """Scan the rest of an identifier whose first character is consumed."""
        while True:
            c = self.peek()
            if not (is_digit(c) or is_nondigit(c)):
                return Identifier(self.line, self.slice())
            self.advance()

    def hexadecimal_constant(self):
        """Scan the digits of a hexadecimal literal after its 0x/0X prefix.

        A prefix followed by no hexadecimal digit yields the value 0.
        """
        n = 0
        while True:
            c = self.peek()
            if not is_hexadecimal_digit(c):
                return IntegerConstant(self.line, self.slice(), n)
            self.advance()
            n = n * 16 + int(c, 16)

    def decimal_constant(self):
        """Scan a decimal literal starting at the current position."""
        n = 0
        while True:
            c = self.peek()
            if not is_digit(c):
                return IntegerConstant(self.line, self.slice(), n)
            self.advance()
            n = n * 10 + int(c)

    def punctuator(self):
        """Scan the longest punctuator starting at the current token start.

        The first character is already consumed; the token keeps growing for
        as long as one more character still forms a known punctuator.
        """
        while self.end < len(self.buf) and is_punctuator(self.slice(1)):
            self.advance()
        assert is_punctuator(self.slice())
        return Punctuator(self.line, self.slice())

    def integer_constant(self):
        """Scan an integer literal whose first digit is already consumed."""
        if self.buf[self.start] == "0" and (self.match("x") or self.match("X")):
            return self.hexadecimal_constant()
        # Un-read the first digit so decimal_constant() folds it into the value.
        self.end -= 1
        return self.decimal_constant()

    def next_token(self) -> "Identifier | IntegerConstant | Punctuator":
        """Skip whitespace and return the next token.

        Raises:
            IndexError: if only whitespace (or nothing) is left in the input.
            ValueError: if the next character cannot start any token; the
                offending character is the exception argument.
        """
        while self.advance_whitespace():
            pass
        self.start = self.end
        c = self.advance()
        if is_nondigit(c):
            return self.identifier()
        if is_digit(c):
            return self.integer_constant()
        if is_punctuator(c):
            return self.punctuator()
        raise ValueError(c)