"""Byte-oriented tokenizer for a small line-based script language.

The input is a ``bytes``-like buffer (``bytes`` or ``memoryview``).  Tokens
are produced by :func:`tokenize`; the keyword set recognised is listed in
``keywords``.
"""

import string
import sys
from dataclasses import dataclass
from enum import Enum, auto
from typing import Union

# Byte values skipped between tokens.  Note that '\n' is NOT whitespace (it is
# a token of its own) and neither is tab: a tab byte is reported as an
# unexpected character.
whitespace = {ord(' '), ord('\r')}
string_digits = {ord(ch) for ch in string.digits}
identifier_start = {ord(ch) for ch in string.ascii_letters + "_"}
identifier = identifier_start | string_digits


class LexError(AssertionError):
    """Raised when the input cannot be tokenized.

    Derives from ``AssertionError`` because malformed input used to be
    rejected with ``assert`` statements; existing ``except AssertionError``
    handlers keep working, but the checks are no longer removed by
    ``python -O``.
    """


@dataclass
class Position:
    """A location in the input buffer."""

    offset: int  # byte index into the buffer
    line: int    # tokenize() starts counting at 1
    column: int  # tokenize() starts at 0; reset to 0 after every '\n'

    @staticmethod
    def from_position(p):
        """Return a new ``Position`` holding the same values as *p*."""
        return Position(p.offset, p.line, p.column)


class TT(Enum):
    """Token types.  Member order is significant because of ``auto()``."""

    KEYWORD = auto()
    NEWLINE = auto()
    COLON = auto()
    STRING = auto()
    DOT = auto()
    EQUAL = auto()
    COMMA = auto()
    LPAREN = auto()
    RPAREN = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    WITH = auto()
    # keywords
    PLAY = auto()
    VOICE = auto()
    SCENE = auto()
    SHOW = auto()
    AT = auto()
    DEFINE = auto()
    IMAGE = auto()
    LABEL = auto()
    MENU = auto()
    JUMP = auto()
    RETURN = auto()
    INIT = auto()
    FADEOUT = auto()


# Identifier spellings that are promoted to a dedicated token type.
keywords = {
    b"play": TT.PLAY,
    b"voice": TT.VOICE,
    b"scene": TT.SCENE,
    b"show": TT.SHOW,
    b"at": TT.AT,
    b"define": TT.DEFINE,
    b"image": TT.IMAGE,
    b"label": TT.LABEL,
    b"with": TT.WITH,
    b"menu": TT.MENU,
    b"jump": TT.JUMP,
    b"return": TT.RETURN,
    b"init": TT.INIT,
    b"fadeout": TT.FADEOUT,
}

# Single-byte tokens that do not affect the line counter.
_PUNCTUATION = {
    ord(':'): (b':', TT.COLON),
    ord('.'): (b'.', TT.DOT),
    ord('='): (b'=', TT.EQUAL),
    ord(','): (b',', TT.COMMA),
    ord('('): (b'(', TT.LPAREN),
    ord(')'): (b')', TT.RPAREN),
}


@dataclass
class Token:
    """One lexical token."""

    position: Position  # where the token starts
    # Raw bytes for most tokens (string tokens exclude the quotes); NUMBER
    # tokens carry the parsed int/float value instead.
    lexeme: Union[bytes, int, float]
    type: TT


def parse_string(mem, position):
    """Parse a double-quoted string starting at *position*.

    There is no escape handling: the string ends at the next ``"`` byte.

    Returns:
        ``(next_position, token)`` where the token lexeme is the bytes
        between the quotes.

    Raises:
        LexError: if a newline or the end of input is reached before the
            closing quote.
    """
    # Precondition on the caller, not input validation.
    assert mem[position.offset] == ord('"'), (position, chr(mem[position.offset]))
    size = len(mem)
    start = position.offset + 1
    end = start
    while True:
        if end >= size:
            raise LexError(f"unterminated string starting at {position}")
        c = mem[end]
        if c == ord('"'):
            break
        if c == ord('\n'):
            raise LexError(f"newline inside string starting at {position}")
        end += 1

    s = bytes(mem[start:end])
    token = Token(position=position, lexeme=s, type=TT.STRING)
    next_position = Position(
        offset=end + 1,
        line=position.line,
        column=position.column + 2 + len(s),  # two quotes plus the content
    )
    return next_position, token


def parse_number(mem, position):
    """Parse ``[-]digits[.digits]`` starting at *position*.

    Returns:
        ``(next_position, token)``; the token lexeme is an ``int`` when there
        is no fractional part and a ``float`` otherwise.

    Raises:
        LexError: if there are no digits before the optional ``.`` or no
            digits after it.
    """
    size = len(mem)
    offset = position.offset
    sign = 1
    if mem[offset] == ord('-'):
        sign = -1
        offset += 1

    whole_start = offset
    while offset < size and mem[offset] in string_digits:
        offset += 1
    whole = bytes(mem[whole_start:offset])
    if not whole:
        raise LexError(f"expected digit at {position}")

    if offset < size and mem[offset] == ord('.'):
        offset += 1
        fraction_start = offset
        while offset < size and mem[offset] in string_digits:
            offset += 1
        fraction = bytes(mem[fraction_start:offset])
        if not fraction:
            raise LexError(f"expected digit after '.' in number at {position}")
        number = sign * (int(whole) + int(fraction) / (10 ** len(fraction)))
    else:
        number = sign * int(whole)

    next_position = Position(
        offset=offset,
        line=position.line,
        # Advance by exactly the bytes consumed, like every other token.
        column=position.column + (offset - position.offset),
    )
    return next_position, Token(position, number, TT.NUMBER)


def disambiguate_keyword(lexeme):
    """Return the keyword token type for *lexeme*, or ``TT.IDENTIFIER``."""
    return keywords.get(lexeme, TT.IDENTIFIER)


def parse_identifier(mem, position):
    """Parse a run of identifier bytes (letters, digits, ``_``) at *position*.

    Returns:
        ``(next_position, token)``; the token type is the matching keyword
        type when the lexeme is in ``keywords``, otherwise ``TT.IDENTIFIER``.

    Raises:
        LexError: if no identifier byte is present at *position*.
    """
    size = len(mem)
    offset = position.offset
    while offset < size and mem[offset] in identifier:
        offset += 1
    lexeme = bytes(mem[position.offset:offset])
    if not lexeme:
        raise LexError(f"expected identifier at {position}")

    next_position = Position(
        offset=offset,
        line=position.line,
        column=position.column + len(lexeme),
    )
    return next_position, Token(position, lexeme, disambiguate_keyword(lexeme))


def next_token(mem, position):
    """Read the token that starts at or after *position*.

    Spaces and carriage returns are skipped.  Bytes >= 128 are skipped with a
    warning printed to stdout.  A ``#`` comment is skipped up to and including
    its terminating newline, so no NEWLINE token is produced for that line.

    Returns:
        ``(next_position, token)``; ``token`` is ``None`` at end of input.
        The caller's *position* object is never mutated.

    Raises:
        LexError: on a byte that cannot start any token, or on a malformed
            string or number.
    """
    position = Position.from_position(position)  # private copy, mutated below
    size = len(mem)

    # Skip everything that does not produce a token.  This is a loop rather
    # than recursion so long runs of comment lines cannot exhaust the stack.
    while True:
        if position.offset >= size:
            return position, None
        c = mem[position.offset]
        if c in whitespace:
            position.offset += 1
            position.column += 1
        elif c >= 128:
            print(f"warning: invalid garbage byte {hex(c)} at {position}")
            position.offset += 1  # the column is deliberately left unchanged
        elif c == ord('#'):
            end = position.offset + 1
            while end < size and mem[end] != ord('\n'):
                end += 1
            if end >= size:
                # Comment runs to end of input without a trailing newline.
                position.column += end - position.offset
                position.offset = end
            else:
                position = Position(end + 1, position.line + 1, 0)
        else:
            break

    if c == ord('\n'):
        after = Position(position.offset + 1, position.line + 1, 0)
        return after, Token(position, b'\n', TT.NEWLINE)

    punctuation = _PUNCTUATION.get(c)
    if punctuation is not None:
        lexeme, token_type = punctuation
        after = Position(position.offset + 1, position.line, position.column + 1)
        return after, Token(position, lexeme, token_type)

    if c == ord('"'):
        return parse_string(mem, position)
    if c in string_digits or c == ord('-'):
        return parse_number(mem, position)
    if c in identifier_start:
        return parse_identifier(mem, position)

    raise LexError(f"unexpected character {chr(c)!r} ({hex(c)}) at {position}")


def tokenize(mem):
    """Yield every token in *mem*, starting at line 1, column 0."""
    position = Position(offset=0, line=1, column=0)
    while True:
        position, token = next_token(mem, position)
        if token is None:
            return
        yield token


def main():
    """Tokenize the file named by ``sys.argv[1]`` and print each token."""
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} FILE")
    with open(sys.argv[1], 'rb') as f:
        mem = memoryview(f.read())
    for token in tokenize(mem):
        print(token.lexeme, token.type)


if __name__ == "__main__":
    main()