dreamcast/x/lex.py
2025-02-17 23:09:47 -06:00

119 lines
3.2 KiB
Python

import string
def parse_magic(mem, offset):
    """Verify the file signature at ``mem[offset:]`` and step past it.

    The signature is the fixed 16-byte sequence ``xof 0302txt 0064``.

    Args:
        mem: sliceable byte buffer (``bytes``, ``bytearray``, ``memoryview``).
        offset: index at which the signature is expected to start.

    Returns:
        The offset of the first byte after the signature.

    Raises:
        AssertionError: if the bytes at ``offset`` are not the signature;
            the exception argument is the bytes that were found instead.
    """
    magic = b"xof 0302txt 0064"
    end = offset + len(magic)
    window = bytes(mem[offset:end])
    if window != magic:
        # Raised explicitly instead of via ``assert`` so that the check is
        # still performed when Python runs with -O (which removes asserts).
        raise AssertionError(window)
    return end
# Byte values of the ASCII decimal digits '0'..'9'.
string_digits = {ord(ch) for ch in string.digits}


def parse_number(mem, offset):
    """Parse a decimal literal that starts at ``mem[offset]``.

    Accepted form: an optional ``-``, one or more digits, and optionally a
    ``.`` followed by one or more digits.  The literal ends at the first
    byte that does not fit this form, or at the end of the buffer.

    Args:
        mem: bytes-like object (anything ``memoryview`` accepts).
        offset: index of the first byte of the literal.

    Returns:
        ``(next_offset, value)`` where ``next_offset`` is the index just past
        the literal and ``value`` is an ``int`` when there is no fractional
        part and a ``float`` otherwise.

    Raises:
        AssertionError: if the whole part or the fractional part contains no
            digits.  The exception argument is the offending character, or
            ``"end of input"`` when the buffer ended first.
    """
    mem = memoryview(mem)
    end = len(mem)

    def skip_digits(pos):
        # Bounded by ``end`` so a literal that runs to the end of the buffer
        # terminates cleanly instead of indexing past it.
        while pos < end and mem[pos] in string_digits:
            pos += 1
        return pos

    def found(pos):
        return chr(mem[pos]) if pos < end else "end of input"

    start = offset
    sign = 1
    if offset < end and mem[offset] == ord('-'):
        sign = -1
        offset += 1

    # whole
    whole_start = offset
    offset = skip_digits(offset)
    if offset == whole_start:
        # Explicit raise (not ``assert``) so the check survives ``python -O``.
        raise AssertionError(found(offset))
    if offset >= end or mem[offset] != ord('.'):
        return offset, sign * int(bytes(mem[whole_start:offset]))

    # fraction
    fraction_start = offset + 1
    offset = skip_digits(fraction_start)
    if offset == fraction_start:
        raise AssertionError(found(offset))
    # Let float() convert the complete literal (sign included) so the result
    # is rounded once, rather than summing separately rounded parts.
    literal = bytes(mem[start:offset]).decode("ascii")
    return offset, float(literal)
# Import-time sanity checks for parse_number.  Integer literals must come
# back exactly; fractional literals only need to agree to within 1e-4.
for _literal, _expected in ((b"1234;", 1234), (b"-1234;", -1234)):
    assert parse_number(_literal, 0)[1] == _expected
for _literal, _expected in ((b"1234.5678;", 1234.5678),
                            (b"-1234.5678;", -1234.5678)):
    assert abs(parse_number(_literal, 0)[1] - _expected) < 0.0001
# Drop the loop temporaries so the module namespace is left unchanged.
del _literal, _expected
# Byte values skipped between tokens.  Tab and carriage return are included
# alongside space and line feed so that tab-indented or CRLF-terminated input
# is lexed the same way instead of failing on an "unexpected" character.
whitespace = {ord(' '), ord('\n'), ord('\t'), ord('\r')}

# Punctuation tokens: each is its own empty marker class, so every token is a
# distinct object that can never be confused with a number, string or
# identifier value.
TOKEN_SEMICOLON = type("TOKEN_SEMICOLON", (), {})
TOKEN_COMMA = type("TOKEN_COMMA", (), {})
TOKEN_LBRACKET = type("TOKEN_LBRACKET", (), {})
TOKEN_RBRACKET = type("TOKEN_RBRACKET", (), {})

# Byte values that may start an identifier: ASCII letters and underscore.
identifier_start = set(map(ord, string.ascii_letters + "_"))
# Byte values that may appear anywhere in an identifier: the start set plus
# the ASCII decimal digits.
identifier = identifier_start | set(map(ord, string.digits))
def parse_identifier(mem, offset):
    """Read the run of identifier bytes that starts at ``mem[offset]``.

    The run consists of every consecutive byte that is a member of the
    module-level ``identifier`` set; it ends at the first byte outside that
    set or at the end of the buffer.

    Args:
        mem: indexable, sliceable byte buffer.
        offset: index of the first byte of the identifier.

    Returns:
        ``(next_offset, name)`` where ``name`` is the identifier as ``bytes``
        and ``next_offset`` is the index just past it.

    Raises:
        AssertionError: if there is no identifier byte at ``offset``.
    """
    end = len(mem)
    start = offset
    # Bounded by ``end`` so an identifier that is the last thing in the
    # buffer is returned instead of raising IndexError.
    while offset < end and mem[offset] in identifier:
        offset += 1
    if offset == start:
        # Explicit raise (not ``assert``) so the check survives ``python -O``.
        raise AssertionError()
    return offset, bytes(mem[start:offset])
def parse_string(mem, offset):
    """Parse a double-quoted string that starts at ``mem[offset]``.

    The string runs from the opening ``"`` to the next ``"``.  There is no
    escape handling, and a line feed inside the quotes is rejected.

    Args:
        mem: indexable, sliceable byte buffer.
        offset: index of the opening quote.

    Returns:
        ``(next_offset, text)`` where ``text`` is the UTF-8 decoded content
        between the quotes and ``next_offset`` is the index just past the
        closing quote.

    Raises:
        AssertionError: if ``mem[offset]`` is not ``"``, if a line feed
            occurs before the closing quote, or if the buffer ends before
            the closing quote.
        UnicodeDecodeError: if the content is not valid UTF-8.
    """
    quote = ord('"')
    newline = ord("\n")
    end = len(mem)

    # Checks are raised explicitly (not via ``assert``) so that malformed
    # input is still rejected when Python runs with -O.
    if offset >= end or mem[offset] != quote:
        raise AssertionError()
    start = offset + 1
    offset = start
    while True:
        if offset >= end:
            # Report a missing closing quote as a lexer error rather than
            # letting the scan run off the buffer with an IndexError.
            raise AssertionError("unterminated string")
        c = mem[offset]
        if c == quote:
            break
        if c == newline:
            raise AssertionError()
        offset += 1
    text = bytes(mem[start:offset]).decode("utf-8")
    return offset + 1, text
def next_token(mem, offset):
    """Skip whitespace and lex the single token that follows.

    Args:
        mem: indexable byte buffer.
        offset: index at which to start scanning.

    Returns:
        ``(next_offset, token)``.  ``token`` is:

        * ``None`` when only whitespace (or nothing) remains,
        * an ``int`` or ``float`` for a number (see ``parse_number``),
        * ``TOKEN_SEMICOLON`` / ``TOKEN_COMMA`` / ``TOKEN_LBRACKET`` /
          ``TOKEN_RBRACKET`` for ``;`` ``,`` ``{`` ``}``,
        * a ``str`` for a double-quoted string (see ``parse_string``),
        * ``bytes`` for an identifier (see ``parse_identifier``).

    Raises:
        AssertionError: if the next non-whitespace byte cannot start any
            token (the exception argument is that character), or if one of
            the sub-parsers rejects the input.
    """
    end = len(mem)
    while offset < end and mem[offset] in whitespace:
        offset += 1
    if offset >= end:
        return offset, None

    c = mem[offset]
    if c in string_digits or c == ord('-'):
        return parse_number(mem, offset)
    if c == ord(';'):
        return offset + 1, TOKEN_SEMICOLON
    if c == ord(','):
        return offset + 1, TOKEN_COMMA
    if c == ord('{'):
        return offset + 1, TOKEN_LBRACKET
    if c == ord('}'):
        return offset + 1, TOKEN_RBRACKET
    if c == ord('"'):
        return parse_string(mem, offset)
    if c in identifier_start:
        return parse_identifier(mem, offset)
    # Raised explicitly: an ``assert False`` here would be removed by
    # ``python -O`` and the function would fall through returning None,
    # which callers cannot unpack into (offset, token).
    raise AssertionError(chr(c))
def lex_all(mem, offset):
    """Generate every token in ``mem`` after the file signature.

    The signature is checked at ``offset`` with ``parse_magic``; tokens are
    then produced one at a time by ``next_token`` until it reports ``None``
    (end of input), at which point the generator finishes.

    Args:
        mem: byte buffer holding the file contents.
        offset: index at which the signature starts.

    Yields:
        Each token value in order of appearance, as returned by
        ``next_token``.
    """
    position = parse_magic(mem, offset)
    position, token = next_token(mem, position)
    while token is not None:
        yield token
        position, token = next_token(mem, position)