242 lines
6.4 KiB
Python
242 lines
6.4 KiB
Python
import string
|
|
from dataclasses import dataclass
|
|
from enum import Enum, auto
|
|
import sys
|
|
|
|
# Character classes used by the lexer.  The input is processed as raw bytes,
# so every class is a set of integer byte values rather than of characters.

# Bytes skipped between tokens.  "\n" is deliberately absent: newlines are
# emitted as tokens of their own.
whitespace = {ord(' '), ord('\r')}

# ASCII decimal digits.
string_digits = {ord(ch) for ch in string.digits}

# Bytes that may begin an identifier: ASCII letters and underscore.
identifier_start = {ord(ch) for ch in string.ascii_letters + "_"}

# Bytes that may appear anywhere inside an identifier.
identifier = identifier_start.union(string_digits)
|
|
|
|
@dataclass
class Position:
    """A cursor location inside the byte buffer being tokenized."""

    # Index of the current byte in the buffer.
    offset: int
    # Line number; the tokenizer starts counting at 1.
    line: int
    # Column within the line; the tokenizer starts at 0 and resets to 0
    # after each newline.
    column: int

    @staticmethod
    def from_position(p: "Position") -> "Position":
        """Return an independent copy of ``p``.

        Declared as a static method so that it behaves the same whether it
        is reached through the class (``Position.from_position(p)``) or
        through an instance.
        """
        return Position(p.offset, p.line, p.column)
|
|
|
|
class TT(Enum):
    """Token types produced by the lexer.

    Member order is significant: values are assigned by ``auto()`` in
    declaration order, so members must not be reordered.
    """

    # Structural tokens and literals.
    # NOTE(review): KEYWORD is not produced by any code visible in this
    # file -- confirm whether it is still needed.
    KEYWORD = auto()
    NEWLINE = auto()
    COLON = auto()
    STRING = auto()
    DOT = auto()
    EQUAL = auto()
    COMMA = auto()
    LPAREN = auto()
    RPAREN = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    # WITH is a keyword too; it is declared here rather than with the other
    # keywords, and is kept in place so its numeric value does not change.
    WITH = auto()

    # keywords
    PLAY = auto()
    VOICE = auto()
    SCENE = auto()
    SHOW = auto()
    AT = auto()
    DEFINE = auto()
    IMAGE = auto()
    LABEL = auto()
    MENU = auto()
    JUMP = auto()
    RETURN = auto()
    INIT = auto()
    FADEOUT = auto()
    TRANSFORM = auto()
    STOP = auto()
    NOLOOP = auto()
    PAUSE = auto()
    HIDE = auto()
|
|
|
|
# Maps the source spelling of each reserved word (as bytes) to its token
# type.  Every keyword is spelled exactly like the lower-cased name of its TT
# member, so the table is derived from the members instead of writing each
# name out twice.
keywords = {
    member.name.lower().encode("ascii"): member
    for member in (
        TT.PLAY,
        TT.VOICE,
        TT.SCENE,
        TT.SHOW,
        TT.AT,
        TT.DEFINE,
        TT.IMAGE,
        TT.LABEL,
        TT.WITH,
        TT.MENU,
        TT.JUMP,
        TT.RETURN,
        TT.INIT,
        TT.FADEOUT,
        TT.TRANSFORM,
        TT.STOP,
        TT.NOLOOP,
        TT.PAUSE,
        TT.HIDE,
    )
}
|
|
|
|
@dataclass
class Token:
    """A single lexical token together with the place it was found."""

    # Position of the token's first byte.
    position: Position
    # Token payload.  For most tokens this is the matched bytes; parse_number
    # stores the already-converted int/float value here for NUMBER tokens.
    lexeme: bytes
    # Which kind of token this is.
    type: TT
|
|
|
|
def parse_string(mem, position):
    """Scan a double-quoted string literal starting at ``position``.

    Args:
        mem: Indexable byte buffer (e.g. ``bytes`` or ``memoryview``).
        position: Position of the opening ``"``.

    Returns:
        A ``(next_position, token)`` pair.  ``token`` is a ``TT.STRING``
        token whose lexeme is the raw bytes between the quotes (no escape
        sequences are interpreted).  ``next_position`` points just past the
        closing quote, on the same line.

    Raises:
        AssertionError: if ``position`` is not at a ``"``, if the string
            contains a newline, or if the buffer ends before the closing
            quote.
    """
    quote = ord('"')
    size = len(mem)
    opening = position.offset

    # Raised explicitly (rather than via ``assert``) so the checks survive
    # ``python -O``; the exception type is kept for existing callers.
    if opening >= size or mem[opening] != quote:
        found = chr(mem[opening]) if opening < size else "<eof>"
        raise AssertionError((position, found))

    start = opening + 1
    end = start
    while True:
        if end >= size:
            # Previously this surfaced as a bare IndexError.
            raise AssertionError(("unterminated string", position))
        c = mem[end]
        if c == quote:
            break
        if c == ord("\n"):
            raise AssertionError(("newline in string", position))
        end += 1

    s = bytes(mem[start:end])
    token = Token(position=position, lexeme=s, type=TT.STRING)
    next_position = Position(
        offset=end + 1,
        line=position.line,
        # Both quotes plus the contents were consumed.
        column=position.column + 2 + len(s),
    )
    return next_position, token
|
|
|
|
def parse_number(mem, position):
    """Scan an optionally negative integer or decimal literal.

    Accepted forms are ``DIGITS`` and ``DIGITS.DIGITS``, each optionally
    preceded by ``-``.

    Args:
        mem: Indexable byte buffer (e.g. ``bytes`` or ``memoryview``).
        position: Position of the first byte of the number (a digit or
            ``-``).

    Returns:
        A ``(next_position, token)`` pair.  ``token`` is a ``TT.NUMBER``
        token whose ``lexeme`` holds the converted value (``int`` when there
        is no fractional part, otherwise ``float``).  ``next_position``
        points at the first byte after the number; a number that ends at the
        end of the buffer is accepted.

    Raises:
        AssertionError: if there are no digits before the ``.`` / end of the
            number, or no digits after a ``.``.
    """
    size = len(mem)
    offset = position.offset

    def offending():
        # Description of the byte that stopped the scan, for error messages.
        return chr(mem[offset]) if offset < size else "<eof>"

    sign = 1
    if offset < size and mem[offset] == ord('-'):
        sign = -1
        offset += 1

    # whole
    whole_start = offset
    while offset < size and mem[offset] in string_digits:
        offset += 1
    whole = bytes(mem[whole_start:offset])
    if not whole:
        # Explicit raise so the check is not stripped by ``python -O``.
        raise AssertionError(offending())

    if offset < size and mem[offset] == ord('.'):
        # fraction
        offset += 1
        fraction_start = offset
        while offset < size and mem[offset] in string_digits:
            offset += 1
        fraction = bytes(mem[fraction_start:offset])
        if not fraction:
            raise AssertionError(offending())
        number = sign * (int(whole) + int(fraction) / (10 ** len(fraction)))
    else:
        number = sign * int(whole)

    next_position = Position(
        offset=offset,
        line=position.line,
        # Advance by exactly the number of bytes consumed, as the other
        # token scanners do.
        column=position.column + (offset - position.offset),
    )
    return next_position, Token(position, number, TT.NUMBER)
|
|
|
|
def disambiguate_keyword(lexeme):
    """Return the keyword token type for ``lexeme``, or ``TT.IDENTIFIER``.

    ``lexeme`` is looked up in the ``keywords`` table; anything not listed
    there is an ordinary identifier.
    """
    return keywords.get(lexeme, TT.IDENTIFIER)
|
|
|
|
def parse_identifier(mem, position):
    """Scan an identifier or keyword starting at ``position``.

    Args:
        mem: Indexable byte buffer (e.g. ``bytes`` or ``memoryview``).
        position: Position of the first byte of the identifier.

    Returns:
        A ``(next_position, token)`` pair.  The token's lexeme is the matched
        bytes and its type is the keyword type if the lexeme is a reserved
        word, otherwise ``TT.IDENTIFIER``.  ``next_position`` points at the
        first byte after the identifier; an identifier that ends at the end
        of the buffer is accepted.

    Raises:
        AssertionError: if no identifier byte is present at ``position``.
    """
    size = len(mem)
    start = position.offset
    end = start
    # Bounded by the buffer length so that an identifier at the very end of
    # the input terminates cleanly instead of raising IndexError.
    while end < size and mem[end] in identifier:
        end += 1

    if end == start:
        # Explicit raise so the check is not stripped by ``python -O``.
        raise AssertionError(("expected identifier", position))

    lexeme = bytes(mem[start:end])
    next_position = Position(
        offset=end,
        line=position.line,
        # Advance by exactly the number of bytes consumed, as the other
        # token scanners do.
        column=position.column + len(lexeme),
    )
    token_type = disambiguate_keyword(lexeme)
    return next_position, Token(position, lexeme, token_type)
|
|
|
|
# Tokens that consist of exactly one byte: byte value -> (lexeme, type).
_SINGLE_CHAR_TOKENS = {
    ord(':'): (b':', TT.COLON),
    ord('.'): (b'.', TT.DOT),
    ord('='): (b'=', TT.EQUAL),
    ord(','): (b',', TT.COMMA),
    ord('('): (b'(', TT.LPAREN),
    ord(')'): (b')', TT.RPAREN),
}


def next_token(mem, position):
    """Scan the next token at or after ``position``.

    Whitespace, ``#`` comments and bytes >= 0x80 are skipped first; a warning
    is printed to stderr for each skipped non-ASCII byte.

    Args:
        mem: Indexable byte buffer (e.g. ``bytes`` or ``memoryview``).
        position: Where to start scanning.  It is copied, not modified.

    Returns:
        A ``(next_position, token)`` pair.  ``token`` is ``None`` once the
        end of the buffer has been reached.

    Raises:
        AssertionError: on a byte that cannot start any token, or when one
            of the literal scanners rejects its input.
    """
    position = Position.from_position(position)
    size = len(mem)

    # Skip everything that does not produce a token.  This is a loop rather
    # than recursion so long runs of skipped bytes or comment lines cannot
    # exhaust the recursion limit.
    while True:
        if position.offset >= size:
            return position, None
        c = mem[position.offset]

        if c in whitespace:
            position.offset += 1
            position.column += 1
        elif c >= 128:
            print(f"invalid garbage byte {hex(c)} at {position}", file=sys.stderr)
            # The byte is dropped without advancing the column.
            position.offset += 1
        elif c == ord('#'):
            end = position.offset + 1
            while end < size and mem[end] != ord('\n'):
                end += 1
            if end >= size:
                # Comment runs to the end of the input (no trailing
                # newline): there is nothing left to tokenize.
                position.column += end - position.offset
                position.offset = end
                return position, None
            # NOTE(review): the newline that ends the comment is consumed
            # without emitting a NEWLINE token -- confirm this is intended
            # for comments that trail other tokens on the same line.
            position = Position(end + 1, position.line + 1, 0)
        else:
            break

    if c == ord('\n'):
        after_newline = Position(position.offset + 1, position.line + 1, 0)
        return after_newline, Token(position, b'\n', TT.NEWLINE)

    single = _SINGLE_CHAR_TOKENS.get(c)
    if single is not None:
        lexeme, token_type = single
        next_position = Position(
            position.offset + 1, position.line, position.column + 1
        )
        return next_position, Token(position, lexeme, token_type)

    if c == ord('"'):
        return parse_string(mem, position)
    if c in string_digits or c == ord('-'):
        return parse_number(mem, position)
    if c in identifier_start:
        return parse_identifier(mem, position)

    # Raised explicitly: an ``assert`` would be removed under ``python -O``
    # and the function would silently return None.
    raise AssertionError(("unexpected character", position, chr(c), hex(c)))
|
|
|
|
def tokenize(mem):
    """Yield every token in ``mem`` in order.

    Scanning starts at offset 0, line 1, column 0 and stops when
    ``next_token`` reports the end of the buffer by returning ``None``.
    """
    cursor = Position(offset=0, line=1, column=0)
    cursor, token = next_token(mem, cursor)
    while token is not None:
        yield token
        cursor, token = next_token(mem, cursor)
|
|
|
|
def main():
    """Tokenize the file named by the first command-line argument.

    The file is read as bytes and each token's lexeme and type are printed
    on a line of their own.
    """
    source_path = sys.argv[1]
    with open(source_path, 'rb') as source_file:
        mem = memoryview(source_file.read())

    for token in tokenize(mem):
        print(token.lexeme, token.type)
|
|
|
|
# Run the tokenizer only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|