r500/regs/assembler/lexer.py

157 lines
4.9 KiB
Python

from dataclasses import dataclass
from enum import Enum, auto
from itertools import chain
from typing import Union, Any
# When true, Lexer.lexeme() returns a copied ``bytes`` object for the current
# lexeme; when false it returns a memoryview slice of the input buffer.
# NOTE(review): presumably enabled so printed tokens show readable lexemes.
DEBUG = True
class TT(Enum):
    """Token types produced by :class:`Lexer`.

    Member values come from ``auto()``, so they follow declaration order;
    do not reorder members if the numeric values matter to a caller.
    """

    # Structural markers.
    eof = auto()
    eol = auto()

    # Brackets.
    left_square = auto()
    right_square = auto()
    left_paren = auto()
    right_paren = auto()

    # Operators and words.
    equal = auto()
    dot = auto()
    identifier = auto()
    keyword = auto()

    # Separators and remaining punctuation.
    colon = auto()
    semicolon = auto()
    bar = auto()
    comma = auto()
    minus = auto()
@dataclass
class Token:
    """A single lexeme together with its position and classification."""

    # Index into the lexer's buffer at which the lexeme starts.
    start_ix: int
    # Line number of the lexeme; the lexer starts counting lines at 1.
    line: int
    # Column of the lexeme's first byte; the lexer starts counting columns at 0.
    col: int
    # Token classification.  The lexer passes None here for the token it
    # attaches to a LexerError raised on an unexpected character.
    type: TT
    # The raw bytes of the token.  Lexer.lexeme() returns ``bytes`` when the
    # module-level DEBUG flag is set and a memoryview slice otherwise, so
    # both types are valid here.
    lexeme: Union[bytes, memoryview]
    # Whatever the lexer's find_keyword callable returned for a keyword
    # token; None for every other token type.
    keyword: Any = None
# Byte values (ints) allowed inside an identifier or keyword: ASCII letters,
# decimal digits, underscore and hyphen.  Iterating a bytes object yields
# ints, which is what indexing the lexer's memoryview produces as well.
identifier_characters = set(
    b'abcdefghijklmnopqrstuvwxyz'
    b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    b'0123456789'
    b'_-'
)
class LexerError(Exception):
    """Raised by the lexer when it meets a byte it cannot tokenize.

    The lexer raises it with two arguments: a human-readable message and
    the offending Token.
    """
class Lexer:
    """Byte-oriented tokenizer over a buffer.

    The lexer walks ``buf`` one byte at a time, tracking the start of the
    token in progress (``start_ix``), the read cursor (``current_ix``) and
    a line/column position for diagnostics.
    """

    def __init__(self, buf: memoryview, find_keyword,
                 emit_newlines=False,
                 minus_is_token=False):
        """Set up a lexer positioned at the start of ``buf``.

        buf            -- any object accepted by ``memoryview()``.
        find_keyword   -- callable given a lexeme; returns a keyword object,
                          or None when the lexeme is a plain identifier.
        emit_newlines  -- when true, each newline yields a ``TT.eol`` token
                          instead of being skipped.
        minus_is_token -- when true, a ``-`` that starts a token is lexed as
                          ``TT.minus`` rather than as an identifier character.
        """
        self.buf = memoryview(buf)
        self.find_keyword = find_keyword
        self.emit_newlines = emit_newlines
        self.minus_is_token = minus_is_token
        # Cursor state: token start, read position, 1-based line, 0-based col.
        self.start_ix = 0
        self.current_ix = 0
        self.line = 1
        self.col = 0

    def at_end_p(self):
        """Return True once the read cursor has consumed the whole buffer."""
        return len(self.buf) <= self.current_ix

    def lexeme(self):
        """Return the bytes between the token start and the read cursor.

        A ``bytes`` copy when the module-level DEBUG flag is set, otherwise
        a memoryview.
        """
        span = self.buf[self.start_ix:self.current_ix]
        if DEBUG:
            return bytes(span)
        return memoryview(span)

    def advance(self):
        """Consume one byte and return it, moving cursor and column forward."""
        # Read first so an out-of-range index raises before any state changes.
        byte = self.buf[self.current_ix]
        self.current_ix += 1
        self.col += 1
        return byte

    def peek(self):
        """Return the byte under the read cursor without consuming it."""
        return self.buf[self.current_ix]

    def pos(self):
        """Return ``(start_ix, line, col)`` for the token in progress.

        ``col`` is rewound by the number of bytes consumed so far, so it is
        the column of the token's first byte.
        """
        consumed = self.current_ix - self.start_ix
        return self.start_ix, self.line, self.col - consumed

    def _make_token(self, token_type):
        """Build a keyword-less Token for the current lexeme and position."""
        return Token(*self.pos(), token_type, self.lexeme())

    def identifier(self):
        """Finish lexing an identifier whose first byte is already consumed.

        Returns a ``TT.keyword`` token carrying the keyword object when
        ``find_keyword`` recognises the lexeme, else a ``TT.identifier``.
        """
        while True:
            if self.at_end_p() or self.peek() not in identifier_characters:
                break
            self.advance()

        keyword = self.find_keyword(self.lexeme())
        token_type = TT.identifier if keyword is None else TT.keyword
        return Token(*self.pos(), token_type, self.lexeme(), keyword)

    def lex_token(self):
        """Return the next Token, skipping whitespace and ``#`` comments.

        Returns a ``TT.eof`` token at the end of the buffer.  Raises
        LexerError (message, offending token) on an unrecognised byte.
        """
        while True:
            self.start_ix = self.current_ix
            if self.at_end_p():
                return self._make_token(TT.eof)

            c = self.advance()

            # Single-byte punctuation.
            if c == ord('('):
                return self._make_token(TT.left_paren)
            if c == ord(')'):
                return self._make_token(TT.right_paren)
            if c == ord('['):
                return self._make_token(TT.left_square)
            if c == ord(']'):
                return self._make_token(TT.right_square)
            if c == ord('='):
                return self._make_token(TT.equal)
            if c == ord('.'):
                return self._make_token(TT.dot)
            if c == ord('|'):
                return self._make_token(TT.bar)
            if c == ord(':'):
                return self._make_token(TT.colon)
            if c == ord(';'):
                return self._make_token(TT.semicolon)
            if c == ord(','):
                return self._make_token(TT.comma)
            # Must be tested before the identifier check below, because
            # '-' is also an identifier character.
            if self.minus_is_token and c == ord('-'):
                return self._make_token(TT.minus)

            # Comment: discard up to, but not including, the newline so the
            # newline itself is handled on the next iteration.
            if c == ord('#'):
                while not self.at_end_p() and self.peek() != ord('\n'):
                    self.advance()
                continue

            # Insignificant whitespace.
            if c in (ord(' '), ord('\r'), ord('\t')):
                continue

            if c == ord('\n'):
                # Capture the position before moving to the next line so an
                # emitted eol token reports the line it terminates.
                where = self.pos()
                self.line += 1
                self.col = 0
                if not self.emit_newlines:
                    continue
                return Token(*where, TT.eol, self.lexeme())

            if c in identifier_characters:
                return self.identifier()

            # self.col already counts the offending byte as consumed here.
            token = self._make_token(None)
            raise LexerError(f"unexpected character at line:{self.line} col:{self.col}", token)

    def lex_tokens(self):
        """Yield tokens from ``lex_token`` up to and including ``TT.eof``."""
        token = self.lex_token()
        while token.type is not TT.eof:
            yield token
            token = self.lex_token()
        yield token
if __name__ == "__main__":
    # Manual smoke tests: lex a sample line and print every token.

    def _print_tokens(source, find_keyword):
        """Lex ``source`` with the given keyword lookup and print each token."""
        for token in Lexer(source, find_keyword).lex_tokens():
            print(token)

    def vs_test():
        """Lex a sample line using the keyword table from assembler.vskeywords."""
        from assembler.vskeywords import find_keyword
        _print_tokens(
            b"out[0].xz = VE_MAD input[0].-y-_-0-_ temp[0].x_0_ temp[0].y_0_",
            find_keyword,
        )

    def fs_test():
        """Lex a sample line using the keyword table from assembler.fs.keywords."""
        from assembler.fs.keywords import find_keyword
        _print_tokens(
            b"src0.rgb = temp[0] : temp[0].a = OP_RSQ |src0.r| ;",
            find_keyword,
        )

    # Only the fs sample is run; vs_test is defined but not invoked.
    fs_test()