119 lines
3.2 KiB
Python
119 lines
3.2 KiB
Python
import string
|
|
|
|
def parse_magic(mem, offset):
    """Check for the text-format file header at ``offset`` and skip past it.

    Args:
        mem: Bytes-like buffer (``bytes``, ``bytearray`` or ``memoryview``).
        offset: Index in ``mem`` at which the header is expected to start.

    Returns:
        The offset of the first byte after the header.

    Raises:
        AssertionError: If the bytes at ``offset`` are not exactly the
            expected header.  The bytes that were found are the exception
            argument.
    """
    magic = b"xof 0302txt 0064"
    end = offset + len(magic)
    window = bytes(mem[offset:end])
    # Raised explicitly instead of via ``assert`` so the check is still
    # enforced when Python runs with -O (which strips assert statements).
    if window != magic:
        raise AssertionError(window)
    return end
|
|
|
|
# Byte values of the ASCII decimal digits '0'..'9'.
string_digits = {ord(ch) for ch in string.digits}


def _scan_digits(mem, offset):
    """Return the offset just past the run of ASCII digits starting at ``offset``."""
    end = len(mem)
    while offset < end and mem[offset] in string_digits:
        offset += 1
    return offset


def _unexpected_at(mem, offset):
    """Describe what sits at ``offset`` for use in an error message."""
    return chr(mem[offset]) if offset < len(mem) else "end of input"


def parse_number(mem, offset):
    """Parse a decimal number literal starting at ``offset``.

    Accepted syntax is an optional leading ``-``, one or more digits and,
    optionally, a ``.`` followed by one or more digits.  The literal ends at
    the first byte that cannot continue it, or at the end of the buffer.

    Args:
        mem: Bytes-like buffer containing the literal.
        offset: Index of the first byte of the literal.

    Returns:
        A ``(new_offset, value)`` tuple.  ``new_offset`` is the index of the
        first byte after the literal.  ``value`` is an ``int`` when the
        literal has no ``.`` and a ``float`` otherwise.

    Raises:
        AssertionError: If no digit is found where one is required (before
            the ``.`` or directly after it).  The message is the offending
            character, or ``"end of input"`` when the buffer ran out.
    """
    mem = memoryview(mem)
    end = len(mem)
    start = offset

    if offset < end and mem[offset] == ord('-'):
        offset += 1

    # Whole part: at least one digit is mandatory.
    whole_start = offset
    offset = _scan_digits(mem, offset)
    if offset == whole_start:
        # Explicit raise (not ``assert``) so malformed input is still
        # rejected when Python runs with -O.
        raise AssertionError(_unexpected_at(mem, offset))

    if offset >= end or mem[offset] != ord('.'):
        return offset, int(bytes(mem[start:offset]))

    # Fraction part: at least one digit is mandatory after the '.'.
    fraction_start = offset + 1
    offset = _scan_digits(mem, fraction_start)
    if offset == fraction_start:
        raise AssertionError(_unexpected_at(mem, offset))

    # Converting the literal text in one step yields the correctly rounded
    # float; adding separately converted whole and fraction parts can be
    # off in the last bit.
    literal = bytes(mem[start:offset]).decode("ascii")
    return offset, float(literal)
|
|
|
|
# Import-time sanity checks for parse_number: integer literals must come
# back exactly, fractional literals to within a small tolerance.
assert parse_number(b"1234;", 0)[1] == 1234
assert parse_number(b"-1234;", 0)[1] == -1234
assert abs(parse_number(b"1234.5678;", 0)[1] - 1234.5678) < 0.0001
assert abs(parse_number(b"-1234.5678;", 0)[1] + 1234.5678) < 0.0001
|
|
|
|
# Byte values the lexer skips between tokens.  Tab and carriage return are
# included alongside space and newline so that tab-indented files and files
# with CRLF line endings are not rejected as containing unexpected characters.
whitespace = {ord(' '), ord('\t'), ord('\r'), ord('\n')}
|
|
|
|
# Punctuation tokens.  Each one is an empty marker class; the class object
# itself is what the lexer returns as the token.
class TOKEN_SEMICOLON:  # ';'
    pass


class TOKEN_COMMA:  # ','
    pass


class TOKEN_LBRACKET:  # '{'
    pass


class TOKEN_RBRACKET:  # '}'
    pass
|
|
|
|
# Byte values of the ASCII letters and underscore; the lexer uses this set to
# recognise the first byte of an identifier.
identifier_start = {ord(ch) for ch in string.ascii_letters + "_"}
# Byte values accepted while reading an identifier: the set above plus the
# ASCII digits.
identifier = identifier_start.union(string_digits)
|
|
|
|
def parse_identifier(mem, offset):
    """Read the run of identifier bytes that starts at ``offset``.

    The run ends at the first byte that is not in ``identifier`` or at the
    end of the buffer, whichever comes first.

    Args:
        mem: Bytes-like buffer being lexed.
        offset: Index of the first byte of the identifier.

    Returns:
        A ``(new_offset, name)`` tuple, where ``new_offset`` is the index of
        the first byte after the identifier and ``name`` is the identifier
        as ``bytes``.

    Raises:
        AssertionError: If there is no identifier byte at ``offset``.
    """
    start = offset
    end = len(mem)
    while offset < end and mem[offset] in identifier:
        offset += 1
    # Explicit raise (not ``assert``) so an empty identifier is still
    # rejected when Python runs with -O.
    if offset == start:
        raise AssertionError("expected identifier")
    return offset, bytes(mem[start:offset])
|
|
|
|
def parse_string(mem, offset):
    """Parse a double-quoted string literal starting at ``offset``.

    The literal runs from the opening ``"`` to the next ``"``; no escape
    sequences are interpreted and the literal may not span lines.

    Args:
        mem: Bytes-like buffer being lexed.
        offset: Index of the opening double quote.

    Returns:
        A ``(new_offset, text)`` tuple, where ``new_offset`` is the index of
        the first byte after the closing quote and ``text`` is the content
        between the quotes decoded as UTF-8.

    Raises:
        AssertionError: If ``offset`` does not point at a double quote, if a
            newline occurs before the closing quote, or if the buffer ends
            before the closing quote.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    quote = ord('"')
    newline = ord("\n")
    end = len(mem)

    # Explicit raises (not ``assert``) so malformed input is still rejected
    # when Python runs with -O.
    if offset >= end or mem[offset] != quote:
        raise AssertionError("expected opening double quote")

    start = offset + 1
    offset = start
    while True:
        if offset >= end:
            raise AssertionError("unterminated string")
        c = mem[offset]
        if c == quote:
            break
        if c == newline:
            raise AssertionError("newline inside string")
        offset += 1

    text = bytes(mem[start:offset]).decode("utf-8")
    # Step over the closing quote.
    return offset + 1, text
|
|
|
|
def next_token(mem, offset):
    """Skip whitespace and lex the single token that follows.

    Args:
        mem: Bytes-like buffer being lexed.
        offset: Index at which to start looking for a token.

    Returns:
        A ``(new_offset, token)`` tuple.  ``token`` is ``None`` when only
        whitespace (or nothing) remains in the buffer.  Otherwise it is one
        of: a number from ``parse_number``, a ``str`` from ``parse_string``,
        a ``bytes`` identifier from ``parse_identifier``, or one of the
        ``TOKEN_SEMICOLON`` / ``TOKEN_COMMA`` / ``TOKEN_LBRACKET`` /
        ``TOKEN_RBRACKET`` marker classes.

    Raises:
        AssertionError: If the next non-whitespace byte cannot start any
            token; the message is that character.
    """
    end = len(mem)
    while offset < end and mem[offset] in whitespace:
        offset += 1
    if offset >= end:
        return offset, None

    c = mem[offset]
    if c in string_digits or c == ord('-'):
        return parse_number(mem, offset)
    if c == ord(';'):
        return offset + 1, TOKEN_SEMICOLON
    if c == ord(','):
        return offset + 1, TOKEN_COMMA
    if c == ord('{'):
        return offset + 1, TOKEN_LBRACKET
    if c == ord('}'):
        return offset + 1, TOKEN_RBRACKET
    if c == ord('"'):
        return parse_string(mem, offset)
    if c in identifier_start:
        return parse_identifier(mem, offset)

    # Raised explicitly: an ``assert False`` here would be stripped under
    # -O, letting the function fall through and return a bare None that
    # callers cannot unpack.
    raise AssertionError(chr(c))
|
|
|
|
def lex_all(mem, offset):
    """Yield every token in ``mem`` that follows the file header.

    The header is expected at ``offset`` and is validated with
    ``parse_magic``; tokens are then produced by repeated calls to
    ``next_token`` until it reports the end of input with a ``None`` token.
    Because this is a generator, nothing (including the header check) runs
    until iteration starts.

    Args:
        mem: Bytes-like buffer holding the whole file.
        offset: Index at which the header starts.

    Yields:
        Each token, in the order it appears in the buffer.
    """
    cursor = parse_magic(mem, offset)
    cursor, token = next_token(mem, cursor)
    while token is not None:
        yield token
        cursor, token = next_token(mem, cursor)
|