diff --git a/renpy-parser/language/statement.h b/renpy-parser/language/statement.h
new file mode 100644
--- /dev/null
+++ b/renpy-parser/language/statement.h
@@ -0,0 +1,66 @@
+#pragma once
+
+#include <cstdint>
+
+// Plain-data statement records.
+// NOTE(review): the *Index fields presumably index tables defined
+// elsewhere -- confirm against the code that fills them in.
+namespace language::statement {
+  // Tag selecting the active member of the union in `statement`.  Scoped,
+  // so the enumerators do not hide the structs that share their names.
+  enum class statement_type {
+    show,
+    voice,
+    music,
+    text,  // NOTE(review): presumably tags the `say` payload -- confirm
+    menu,
+    jump,
+  };
+
+  struct show {
+    uint32_t imageIndex;
+    uint32_t transformIndex;
+  };
+
+  struct voice {
+    uint32_t audioIndex;
+  };
+
+  struct music {
+    uint32_t channelIndex;
+    uint32_t audioIndex;
+  };
+
+  struct say {
+    uint32_t characterIndex;
+    uint32_t stringIndex;
+  };
+
+  struct option {
+    uint32_t stringIndex;
+    uint32_t statementIndex;
+  };
+
+  struct menu {
+    uint32_t count;
+    uint32_t optionIndex;
+  };
+
+  struct jump {
+    uint32_t statementIndex;
+  };
+
+  struct statement {
+    statement_type type;
+    // Elaborated type specifiers (`struct show`) keep each member name
+    // from colliding with the type it is declared with.
+    union {
+      struct show show;
+      struct voice voice;
+      struct music music;
+      struct say say;
+      struct menu menu;
+      struct jump jump;
+    };
+  };
+}
diff --git a/renpy-parser/lex.py b/renpy-parser/lex.py
new file mode 100644
--- /dev/null
+++ b/renpy-parser/lex.py
@@ -0,0 +1,285 @@
+"""Tokenizer for the script syntax consumed by parse.py."""
+
+import string
+import sys
+from dataclasses import dataclass
+from enum import Enum, auto
+
+# Bytes skipped between tokens.  '\n' is not included because it is emitted
+# as a NEWLINE token.
+whitespace = {ord(' '), ord('\r')}
+string_digits = {ord(i) for i in string.digits}
+identifier_start = {ord(i) for i in string.ascii_letters + "_"}
+identifier = identifier_start | string_digits
+
+
+class LexException(Exception):
+    """Raised when the input cannot be tokenized.
+
+    ``position`` is the Position at which the problem was detected.
+    """
+
+    def __init__(self, message, position):
+        super().__init__(message)
+        self.position = position
+
+
+@dataclass
+class Position:
+    """A location in the input: byte offset plus line and column."""
+
+    offset: int
+    line: int
+    column: int
+
+    @staticmethod
+    def from_position(p):
+        """Return a copy of *p* that can be modified independently."""
+        return Position(p.offset, p.line, p.column)
+
+
+class TT(Enum):
+    """Token types."""
+
+    KEYWORD = auto()
+    NEWLINE = auto()
+    COLON = auto()
+    STRING = auto()
+    DOT = auto()
+    EQUAL = auto()
+    COMMA = auto()
+    LPAREN = auto()
+    RPAREN = auto()
+    NUMBER = auto()
+    IDENTIFIER = auto()
+    WITH = auto()
+
+    # keywords
+    PLAY = auto()
+    VOICE = auto()
+    SCENE = auto()
+    SHOW = auto()
+    AT = auto()
+    DEFINE = auto()
+    IMAGE = auto()
+    LABEL = auto()
+    MENU = auto()
+    JUMP = auto()
+    RETURN = auto()
+    INIT = auto()
+    FADEOUT = auto()
+
+
+# Reserved words and the token type each one lexes to.
+keywords = {
+    b"play": TT.PLAY,
+    b"voice": TT.VOICE,
+    b"scene": TT.SCENE,
+    b"show": TT.SHOW,
+    b"at": TT.AT,
+    b"define": TT.DEFINE,
+    b"image": TT.IMAGE,
+    b"label": TT.LABEL,
+    b"with": TT.WITH,
+    b"menu": TT.MENU,
+    b"jump": TT.JUMP,
+    b"return": TT.RETURN,
+    b"init": TT.INIT,
+    b"fadeout": TT.FADEOUT,
+}
+
+# Single-byte tokens and the token type each one lexes to.
+punctuation = {
+    ord(':'): TT.COLON,
+    ord('.'): TT.DOT,
+    ord('='): TT.EQUAL,
+    ord(','): TT.COMMA,
+    ord('('): TT.LPAREN,
+    ord(')'): TT.RPAREN,
+}
+
+
+@dataclass
+class Token:
+    """A lexed token and the position of its first byte."""
+
+    position: Position
+    # bytes for every token type except NUMBER, whose lexeme is the parsed
+    # int or float value
+    lexeme: bytes
+    type: TT
+
+
+def parse_string(mem, position):
+    """Lex the double-quoted string that starts at *position*.
+
+    Returns ``(next_position, token)``.  The token's lexeme is the bytes
+    between the quotes; escape sequences are not interpreted, so the string
+    ends at the first '"' after the opening one.
+
+    Raises LexException when *position* is not at a '"' or when the string
+    is not closed before the end of the line or of the input.
+    """
+    end = len(mem)
+    offset = position.offset
+    if offset >= end or mem[offset] != ord('"'):
+        raise LexException("expected '\"'", position)
+    offset += 1
+    start = offset
+    while offset < end and mem[offset] != ord('"'):
+        if mem[offset] == ord('\n'):
+            break
+        offset += 1
+    if offset >= end or mem[offset] != ord('"'):
+        raise LexException("unterminated string", position)
+    s = bytes(mem[start:offset])
+    token = Token(position=position, lexeme=s, type=TT.STRING)
+    next_position = Position(
+        offset=offset + 1,
+        line=position.line,
+        # opening quote + contents + closing quote
+        column=position.column + 2 + len(s),
+    )
+    return next_position, token
+
+
+def parse_number(mem, position):
+    """Lex an optionally negative integer or decimal at *position*.
+
+    Returns ``(next_position, token)``.  The token's lexeme is the numeric
+    value: an int when the literal has no '.', otherwise a float.
+
+    Raises LexException when no digit precedes or follows the '.'.
+    """
+    end = len(mem)
+    offset = position.offset
+    sign = 1
+    if offset < end and mem[offset] == ord('-'):
+        sign = -1
+        offset += 1
+
+    whole_start = offset
+    while offset < end and mem[offset] in string_digits:
+        offset += 1
+    if offset == whole_start:
+        raise LexException("expected digit", position)
+    number = int(bytes(mem[whole_start:offset]))
+
+    if offset < end and mem[offset] == ord('.'):
+        offset += 1
+        fraction_start = offset
+        while offset < end and mem[offset] in string_digits:
+            offset += 1
+        if offset == fraction_start:
+            raise LexException("expected digit after '.'", position)
+        fraction = bytes(mem[fraction_start:offset])
+        number += int(fraction) / (10 ** len(fraction))
+
+    next_position = Position(
+        offset=offset,
+        line=position.line,
+        # advance by exactly the number of bytes consumed
+        column=position.column + (offset - position.offset),
+    )
+    return next_position, Token(position, sign * number, TT.NUMBER)
+
+
+def disambiguate_keyword(lexeme):
+    """Return the keyword token type for *lexeme*, or TT.IDENTIFIER."""
+    return keywords.get(lexeme, TT.IDENTIFIER)
+
+
+def parse_identifier(mem, position):
+    """Lex the identifier or keyword that starts at *position*.
+
+    Returns ``(next_position, token)``; the token type is the keyword's
+    type when the lexeme is a reserved word, TT.IDENTIFIER otherwise.
+    """
+    end = len(mem)
+    offset = position.offset
+    while offset < end and mem[offset] in identifier:
+        offset += 1
+    if offset == position.offset:
+        raise LexException("expected identifier", position)
+    lexeme = bytes(mem[position.offset:offset])
+    next_position = Position(
+        offset=offset,
+        line=position.line,
+        column=position.column + len(lexeme),
+    )
+    return next_position, Token(position, lexeme, disambiguate_keyword(lexeme))
+
+
+def next_token(mem, position):
+    """Return ``(next_position, token)`` for the first token at or after
+    *position*, or ``(position, None)`` once the input is exhausted.
+
+    Spaces, carriage returns and '#' comments (including the newline that
+    ends them) are skipped.  Bytes >= 128 outside strings are skipped with
+    a warning on stdout.
+
+    Raises LexException on a byte that cannot start any token.
+    """
+    position = Position.from_position(position)
+    end = len(mem)
+    # Skipping is done in a loop rather than by recursion so that long runs
+    # of comment lines cannot exhaust the recursion limit.
+    while True:
+        if position.offset >= end:
+            return position, None
+        c = mem[position.offset]
+        if c in whitespace:
+            position.offset += 1
+            position.column += 1
+        elif c >= 128:
+            print(f"warning: invalid garbage byte {hex(c)} at {position}")
+            # the byte is dropped without advancing the column
+            position.offset += 1
+        elif c == ord('#'):
+            offset = position.offset + 1
+            while offset < end and mem[offset] != ord('\n'):
+                offset += 1
+            position = Position(offset + 1, position.line + 1, 0)
+        else:
+            break
+
+    if c == ord('\n'):
+        next_position = Position(position.offset + 1, position.line + 1, 0)
+        return next_position, Token(position, b'\n', TT.NEWLINE)
+    if c in punctuation:
+        next_position = Position(
+            position.offset + 1, position.line, position.column + 1
+        )
+        return next_position, Token(position, bytes([c]), punctuation[c])
+    if c == ord('"'):
+        return parse_string(mem, position)
+    if c in string_digits or c == ord('-'):
+        return parse_number(mem, position)
+    if c in identifier_start:
+        return parse_identifier(mem, position)
+    raise LexException(
+        f"unexpected character {chr(c)!r} ({hex(c)})", position
+    )
+
+
+def tokenize(mem):
+    """Yield the tokens of *mem*, a bytes-like object, in input order."""
+    position = Position(offset=0, line=1, column=0)
+    while True:
+        position, token = next_token(mem, position)
+        if token is None:
+            return
+        yield token
+
+
+def main():
+    """Print the lexeme and type of every token in the file named by argv[1]."""
+    with open(sys.argv[1], 'rb') as f:
+        mem = memoryview(f.read())
+
+    for token in tokenize(mem):
+        print(token.lexeme, token.type)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/renpy-parser/parse.py b/renpy-parser/parse.py
new file mode 100644
--- /dev/null
+++ b/renpy-parser/parse.py
@@ -0,0 +1,451 @@
+"""Parser that turns the tokens produced by lex.py into statement objects."""
+
+import sys
+from dataclasses import dataclass
+from pprint import pprint
+
+import lex
+from lex import TT
+
+
+class ParseException(Exception):
+    """Raised on a syntax error.
+
+    ``token`` is the offending token, or None when the input ended where a
+    token was required.
+    """
+
+    def __init__(self, message, token):
+        super().__init__(message)
+        self.token = token
+
+
+def get_lexeme(v):
+    """Return *v* with every Token replaced by its lexeme.
+
+    Lists and tuples are rebuilt recursively; any other value is returned
+    unchanged.
+    """
+    if isinstance(v, list):
+        return [get_lexeme(i) for i in v]
+    if isinstance(v, tuple):
+        return tuple(get_lexeme(i) for i in v)
+    if isinstance(v, lex.Token):
+        return v.lexeme
+    return v
+
+
+def lexeme_repr(self):
+    """Render an AST node with token fields shown as their lexemes."""
+    kws = [f"{key}={get_lexeme(value)!r}" for key, value in self.__dict__.items()]
+    return "{}({})".format(type(self).__name__, ", ".join(kws))
+
+
+@dataclass
+class FunctionCall:
+    name: lex.Token
+    args: list[lex.Token]
+    kwargs: list[tuple[lex.Token, lex.Token]]
+
+    __repr__ = lexeme_repr
+
+
+@dataclass
+class Image:
+    name: list[lex.Token]
+    path: lex.Token
+
+    __repr__ = lexeme_repr
+
+
+@dataclass
+class Define:
+    name: list[lex.Token]
+    value: 'Expression'
+
+    __repr__ = lexeme_repr
+
+
+@dataclass
+class Label:
+    name: lex.Token
+
+    __repr__ = lexeme_repr
+
+
+@dataclass
+class Play:
+    channel: lex.Token
+    path: lex.Token
+    # None when the statement has no fadeout clause
+    fadeout: lex.Token
+
+    __repr__ = lexeme_repr
+
+
+@dataclass
+class Scene:
+    name: lex.Token
+
+    __repr__ = lexeme_repr
+
+
+@dataclass
+class With:
+    function_call: FunctionCall
+
+    __repr__ = lexeme_repr
+
+
+@dataclass
+class Say:
+    speaker: lex.Token
+    text: lex.Token
+
+    __repr__ = lexeme_repr
+
+
+@dataclass
+class Voice:
+    path: lex.Token
+
+    __repr__ = lexeme_repr
+
+
+@dataclass
+class Show:
+    what: lex.Token
+    transform: lex.Token
+
+    __repr__ = lexeme_repr
+
+
+@dataclass
+class Menu:
+    # one (option string, statements of that option) pair per menu entry
+    entries: list[tuple[lex.Token, list['Statement']]]
+
+    __repr__ = lexeme_repr
+
+
+@dataclass
+class Jump:
+    target: lex.Token
+
+    __repr__ = lexeme_repr
+
+
+@dataclass
+class Return:
+    pass
+
+
+def type_at(tokens, index):
+    """Return the type of ``tokens[index]``, or None past the end of *tokens*."""
+    return tokens[index].type if index < len(tokens) else None
+
+
+def expect(tokens, index, token_type, message):
+    """Return ``tokens[index]`` after checking that it has type *token_type*.
+
+    Raises ParseException(*message*) otherwise.  The exception carries the
+    mismatching token, or None when *index* is past the end of *tokens*.
+    """
+    if index >= len(tokens):
+        raise ParseException(message, None)
+    token = tokens[index]
+    if token.type != token_type:
+        raise ParseException(message, token)
+    return token
+
+
+def skip_newlines(tokens, index):
+    """Return the index of the first non-NEWLINE token at or after *index*."""
+    while type_at(tokens, index) == TT.NEWLINE:
+        index += 1
+    return index
+
+
+def parse_lhs(tokens, index):
+    """Parse a dotted name (``a.b.c``); return ``(index, identifier tokens)``."""
+    lhs = [expect(tokens, index, TT.IDENTIFIER, "expected identifier")]
+    index += 1
+    while type_at(tokens, index) == TT.DOT:
+        lhs.append(expect(tokens, index + 1, TT.IDENTIFIER, "expected identifier"))
+        index += 2
+    return index, lhs
+
+
+def parse_image(tokens, index):
+    """Parse ``<dotted name> = "<path>"`` (the ``image`` keyword is consumed
+    by the caller); return ``(index, Image)``."""
+    index, lhs = parse_lhs(tokens, index)
+    expect(tokens, index, TT.EQUAL, "expected equal")
+    path = expect(tokens, index + 1, TT.STRING, "expected string")
+    return index + 2, Image(name=lhs, path=path)
+
+
+def parse_function_call(tokens, index):
+    """Parse ``name(arg, ..., key="value", ...)``.
+
+    Positional arguments are STRING or NUMBER tokens and come first; keyword
+    arguments take STRING values.  A NEWLINE token is tolerated in place of a
+    positional argument and around keyword arguments.  Returns
+    ``(index, FunctionCall)``.
+    """
+    name = expect(tokens, index, TT.IDENTIFIER, "expected identifier")
+    expect(tokens, index + 1, TT.LPAREN, "expected lparen")
+    index += 2
+
+    # args
+    args = []
+    while type_at(tokens, index) != TT.RPAREN:
+        token_type = type_at(tokens, index)
+        if token_type in (TT.STRING, TT.NUMBER):
+            args.append(tokens[index])
+        elif token_type != TT.NEWLINE:
+            break
+        index += 1
+        if type_at(tokens, index) != TT.COMMA:
+            break
+        index += 1
+
+    # kwargs
+    kwargs = []
+    index = skip_newlines(tokens, index)
+    while type_at(tokens, index) != TT.RPAREN:
+        identifier = expect(
+            tokens, index, TT.IDENTIFIER,
+            "expected function call kwargs identifier",
+        )
+        expect(tokens, index + 1, TT.EQUAL, "expected function call kwargs equal")
+        string = expect(
+            tokens, index + 2, TT.STRING,
+            "expected function call kwargs string",
+        )
+        kwargs.append((identifier, string))
+        index = skip_newlines(tokens, index + 3)
+        if type_at(tokens, index) != TT.COMMA:
+            break
+        # consume the comma so the next iteration starts on an identifier
+        index = skip_newlines(tokens, index + 1)
+
+    expect(tokens, index, TT.RPAREN, "expected rparen")
+    return index + 1, FunctionCall(name=name, args=args, kwargs=kwargs)
+
+
+def parse_rhs(tokens, index):
+    """Parse a NUMBER or a function call; return ``(index, value)``."""
+    token_type = type_at(tokens, index)
+    if token_type == TT.NUMBER:
+        return index + 1, tokens[index]
+    if token_type == TT.IDENTIFIER and type_at(tokens, index + 1) == TT.LPAREN:
+        return parse_function_call(tokens, index)
+    token = tokens[index] if index < len(tokens) else None
+    raise ParseException("expected rhs expression", token)
+
+
+def parse_define(tokens, index):
+    """Parse ``<dotted name> = <rhs>``; return ``(index, Define)``."""
+    index, lhs = parse_lhs(tokens, index)
+    expect(tokens, index, TT.EQUAL, "expected equal")
+    index, rhs = parse_rhs(tokens, index + 1)
+    return index, Define(name=lhs, value=rhs)
+
+
+def parse_label(tokens, index):
+    """Parse ``<name>:``; return ``(index, Label)``."""
+    name = expect(tokens, index, TT.IDENTIFIER, "expected identifier")
+    expect(tokens, index + 1, TT.COLON, "expected colon")
+    return index + 2, Label(name=name)
+
+
+def parse_play(tokens, index):
+    """Parse ``<channel> "<path>" [fadeout <number>]``; return ``(index, Play)``."""
+    channel = expect(tokens, index, TT.IDENTIFIER, "expected identifier")
+    path = expect(tokens, index + 1, TT.STRING, "expected string")
+    index += 2
+    fadeout = None
+    if type_at(tokens, index) == TT.FADEOUT:
+        fadeout = expect(tokens, index + 1, TT.NUMBER, "expected number")
+        index += 2
+    return index, Play(channel=channel, path=path, fadeout=fadeout)
+
+
+def parse_scene(tokens, index):
+    """Parse ``<name>``; return ``(index, Scene)``."""
+    name = expect(tokens, index, TT.IDENTIFIER, "expected identifier")
+    return index + 1, Scene(name=name)
+
+
+def parse_with(tokens, index):
+    """Parse a function call; return ``(index, With)``."""
+    index, function_call = parse_function_call(tokens, index)
+    return index, With(function_call=function_call)
+
+
+def parse_say(tokens, index):
+    """Parse ``<speaker> "<text>"``; return ``(index, Say)``."""
+    speaker = expect(tokens, index, TT.IDENTIFIER, "expected identifier")
+    text = expect(tokens, index + 1, TT.STRING, "expected string")
+    return index + 2, Say(speaker=speaker, text=text)
+
+
+def parse_voice(tokens, index):
+    """Parse ``"<path>"``; return ``(index, Voice)``."""
+    path = expect(tokens, index, TT.STRING, "expected string")
+    return index + 1, Voice(path=path)
+
+
+def parse_show(tokens, index):
+    """Parse ``<what> at <transform>``; return ``(index, Show)``."""
+    what = expect(tokens, index, TT.IDENTIFIER, "expected identifier")
+    expect(tokens, index + 1, TT.AT, "expected at")
+    transform = expect(tokens, index + 2, TT.IDENTIFIER, "expected identifier")
+    return index + 3, Show(what=what, transform=transform)
+
+
+def parse_menu(tokens, index):
+    """Parse a ``menu:`` block starting at the ``menu`` keyword.
+
+    The block is a sequence of ``"<option>":`` headers, each followed by the
+    statements of that option.  It ends at the end of the input or at the
+    first non-NEWLINE token in the same column as ``menu``.  Returns
+    ``(index, Menu)``.
+
+    Raises ParseException on a token left of the ``menu`` column, on a
+    statement before the first option, or on an option string that is not
+    followed by a colon.
+    """
+    menu = expect(tokens, index, TT.MENU, "expected menu")
+    expect(tokens, index + 1, TT.COLON, "expected colon")
+    index += 2
+
+    entries = []
+    option = None  # string token of the option being collected
+    statements = None  # statements parsed so far for `option`
+    while index < len(tokens):
+        token = tokens[index]
+        if token.type == TT.NEWLINE:
+            index += 1
+            continue
+
+        if token.position.column < menu.position.column:
+            raise ParseException("invalid block dedent", token)
+        if token.position.column == menu.position.column:
+            break
+
+        if token.type == TT.STRING:
+            expect(tokens, index + 1, TT.COLON, "expected colon")
+            if option is not None:
+                entries.append((option, statements))
+            option = token
+            statements = []
+            index += 2
+        else:
+            if statements is None:
+                raise ParseException("expected menu option", token)
+            index, ast = parse_one(tokens, index)
+            if ast is not None:
+                statements.append(ast)
+
+    if option is not None:
+        entries.append((option, statements))
+    return index, Menu(entries=entries)
+
+
+def parse_jump(tokens, index):
+    """Parse ``<target>``; return ``(index, Jump)``."""
+    target = expect(tokens, index, TT.IDENTIFIER, "expected identifier")
+    return index + 1, Jump(target=target)
+
+
+def parse_init(tokens, index):
+    """Skip an ``init:`` block starting at the ``init`` keyword.
+
+    Tokens are discarded up to the end of the input or the first non-NEWLINE
+    token in the same column as ``init``.  Returns ``(index, None)``.
+    """
+    init = expect(tokens, index, TT.INIT, "expected init")
+    expect(tokens, index + 1, TT.COLON, "expected colon")
+    index += 2
+
+    # skip all tokens inside block
+    while index < len(tokens):
+        token = tokens[index]
+        if token.type != TT.NEWLINE:
+            if token.position.column < init.position.column:
+                raise ParseException("invalid block dedent", token)
+            if token.position.column == init.position.column:
+                break
+        index += 1
+    return index, None
+
+
+# Statement parsers keyed by the type of the statement's first token.  The
+# second element is added to the index handed to the parser: 1 when the parser
+# expects the token after the keyword, 0 when it expects the keyword itself.
+statement_parsers = {
+    TT.IMAGE: (parse_image, 1),
+    TT.DEFINE: (parse_define, 1),
+    TT.LABEL: (parse_label, 1),
+    TT.PLAY: (parse_play, 1),
+    TT.SCENE: (parse_scene, 1),
+    TT.WITH: (parse_with, 1),
+    TT.IDENTIFIER: (parse_say, 0),
+    TT.VOICE: (parse_voice, 1),
+    TT.SHOW: (parse_show, 1),
+    TT.MENU: (parse_menu, 0),
+    TT.JUMP: (parse_jump, 1),
+    TT.INIT: (parse_init, 0),
+}
+
+
+def parse_one(tokens, index):
+    """Parse the statement at *index*, skipping NEWLINE tokens before it.
+
+    Returns ``(index, ast)``.  *ast* is None when only NEWLINE tokens were
+    left or when the statement was an ``init:`` block.
+    """
+    # iterative, so runs of blank lines cannot exhaust the recursion limit
+    while tokens[index].type == TT.NEWLINE:
+        index += 1
+        if index >= len(tokens):
+            return index, None
+
+    token = tokens[index]
+    if token.type == TT.RETURN:
+        return index + 1, Return()
+    entry = statement_parsers.get(token.type)
+    if entry is None:
+        raise ParseException("unexpected token", token)
+    parse, skip = entry
+    return parse(tokens, index + skip)
+
+
+def parse_all(tokens):
+    """Yield the statement objects parsed from *tokens*, in order."""
+    index = 0
+    while index < len(tokens):
+        index, ast = parse_one(tokens, index)
+        if ast is not None:
+            yield ast
+
+
+def main():
+    """Parse the file named by argv[1] and pretty-print each statement."""
+    with open(sys.argv[1], 'rb') as f:
+        mem = memoryview(f.read())
+
+    tokens = list(lex.tokenize(mem))
+    try:
+        ast = parse_all(tokens)
+        for t in ast:
+            pprint(t)
+    except ParseException as e:
+        print(e, e.token)
+        raise
+
+
+if __name__ == "__main__":
+    main()