initial renpy parser

2026-05-21 21:21:33 -05:00 · 2026-05-21 21:21:33 -05:00 · 610aff4af6
commit 610aff4af6
parent bb6f76cf72
3 changed files with 763 additions and 0 deletions
--- a/renpy-parser/language/statement.h
+++ b/renpy-parser/language/statement.h
@ -0,0 +1,57 @@
 #include <stdint.h>
 namespace language::statement {
  // Tag selecting which member of the union inside `statement` is active.
  // NOTE(review): there is no `say` enumerator; `text` is presumably the tag
  // for the `say` payload -- confirm against the code that fills `statement`.
  enum type {
    show,
    voice,
    music,
    text,
    menu,
    jump,
  };
  // The payload structs below share their names with the enumerators above.
  // The enumerators hide the struct names, so the structs must be referred to
  // with an elaborated specifier (`struct show`, `struct menu`, ...).
  // NOTE(review): the *Index fields presumably index tables defined
  // elsewhere; nothing in this header says which.
  struct show {
    uint32_t imageIndex;
    uint32_t transformIndex;
  };
  struct voice {
    uint32_t audioIndex;
  };
  struct music {
    uint32_t channelIndex;
    uint32_t audioIndex;
  };
  struct say {
    uint32_t characterIndex;
    uint32_t stringIndex;
  };
  // One menu choice; not a union member itself, only reachable by index.
  struct option {
    uint32_t stringIndex;
    uint32_t statementIndex;
  };
  struct menu {
    uint32_t count;
    uint32_t optionIndex;
  };
  struct jump {
    uint32_t statementIndex;
  };
  // Tagged union of all statement payloads.
  struct statement {
    // `enum type`, not the undeclared `enum statement_type`.
    enum type type;
    union {
      // Elaborated specifiers are required: a bare `show` names the enumerator.
      struct show show;
      struct voice voice;
      struct music music;
      struct say say;
      struct menu menu;
      struct jump jump;
    };
  };
 }
--- a/renpy-parser/lex.py
+++ b/renpy-parser/lex.py
@ -0,0 +1,231 @@
 import string
 from dataclasses import dataclass
 from enum import Enum, auto
 import sys
 # Byte values skipped between tokens. '\n' is absent because it is lexed as
 # a NEWLINE token; '\t' is not listed either.
 whitespace = {ord(' '), ord('\r')}
 # Byte values of the ASCII decimal digits.
 string_digits = {ord(ch) for ch in string.digits}
 # Byte values allowed as the first byte of an identifier.
 identifier_start = {ord(ch) for ch in string.ascii_letters + "_"}
 # Byte values allowed anywhere in an identifier.
 identifier = identifier_start.union(string_digits)
@dataclass
 class Position:
    """A location in the source buffer being lexed."""
    # Index into the byte buffer.
    offset: int
    # Line number; tokenize() starts at 1 and next_token increments it at '\n'.
    line: int
    # Column counter; tokenize() starts at 0 and next_token resets it to 0
    # after '\n'.
    column: int
    def from_position(p):
        """Return a new Position holding the same fields as `p`.

        Declared without `self`; the visible callers invoke it on the class
        (`Position.from_position(p)`), never on an instance.
        """
        return Position(p.offset, p.line, p.column)
 class TT(Enum):
    """Token types emitted by the lexer; values are assigned by `auto()`."""
    # Not referenced by any lexer or parser code visible in this file.
    KEYWORD = auto()
    NEWLINE = auto()
    COLON = auto()
    STRING = auto()
    DOT = auto()
    EQUAL = auto()
    COMMA = auto()
    LPAREN = auto()
    RPAREN = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    # Listed apart from the other keywords, but mapped from b"with" in
    # `keywords` like the rest.
    WITH = auto()
    # keywords
    PLAY = auto()
    VOICE = auto()
    SCENE = auto()
    SHOW = auto()
    AT = auto()
    DEFINE = auto()
    IMAGE = auto()
    LABEL = auto()
    MENU = auto()
    JUMP = auto()
    RETURN = auto()
    INIT = auto()
    FADEOUT = auto()
 # Maps a keyword's byte spelling to its token type. disambiguate_keyword
 # consults this table; any lexeme not listed here becomes TT.IDENTIFIER.
 keywords = {
    b"play": TT.PLAY,
    b"voice": TT.VOICE,
    b"scene": TT.SCENE,
    b"show": TT.SHOW,
    b"at": TT.AT,
    b"define": TT.DEFINE,
    b"image": TT.IMAGE,
    b"label": TT.LABEL,
    b"with": TT.WITH,
    b"menu": TT.MENU,
    b"jump": TT.JUMP,
    b"return": TT.RETURN,
    b"init": TT.INIT,
    b"fadeout": TT.FADEOUT,
 }
@dataclass
 class Token:
    """A single lexed token."""
    # Where the token starts in the source buffer.
    position: Position
    # Token text as bytes. parse_string stores the content without the quotes,
    # and parse_number stores an int or float here despite the annotation.
    lexeme: bytes
    # Kind of token.
    type: TT
 def parse_string(mem, position):
    """Lex a double-quoted string literal that starts at `position`.

    Args:
        mem: indexable byte buffer (main() passes a memoryview).
        position: location of the opening quote.

    Returns:
        `(next_position, token)`. The token's lexeme is the bytes between the
        quotes; `next_position` points just past the closing quote.

    Raises:
        AssertionError: if `position` is not at a quote, the string contains
            a newline, or the input ends before the closing quote.
    """
    offset = position.offset
    assert mem[offset] == ord('"'), (position, chr(mem[offset]))
    offset += 1
    start = offset
    end = len(mem)
    # Bounds-check the scan so an unterminated string at end of input is
    # reported as a lexing error instead of a bare IndexError.
    while offset < end and mem[offset] != ord('"'):
        assert mem[offset] != ord("\n"), ("newline in string", position)
        offset += 1
    assert offset < end, ("unterminated string", position)
    s = bytes(mem[start:offset])
    token = Token(
        position = position,
        lexeme = s,
        type = TT.STRING
    )
    next_position = Position(
        offset = offset + 1,
        line = position.line,
        # Two quote characters plus the content.
        column = position.column + 2 + len(s)
    )
    return next_position, token
 def parse_number(mem, position):
    """Lex an optionally negative integer or decimal number at `position`.

    Args:
        mem: indexable byte buffer.
        position: location of the first byte of the number ('-' or a digit).

    Returns:
        `(next_position, token)`. The token is a TT.NUMBER whose lexeme is a
        Python int (no '.') or float (with '.').

    Raises:
        AssertionError: if no digit precedes or follows the decimal point.
    """
    end = len(mem)
    offset = position.offset
    sign = 1
    if mem[offset] == ord('-'):
        sign = -1
        offset += 1

    # Whole part. The scan is bounds-checked so a number that ends the input
    # is lexed instead of raising IndexError.
    whole_start = offset
    while offset < end and mem[offset] in string_digits:
        offset += 1
    whole = bytes(mem[whole_start:offset])
    assert whole, ("expected digit", position)

    if offset < end and mem[offset] == ord('.'):
        offset += 1
        fraction_start = offset
        while offset < end and mem[offset] in string_digits:
            offset += 1
        fraction = bytes(mem[fraction_start:offset])
        assert fraction, ("expected digit after '.'", position)
        number = sign * (int(whole) + int(fraction) / (10 ** len(fraction)))
    else:
        number = sign * int(whole)

    next_position = Position(
        offset = offset,
        line = position.line,
        # Next column = start column + bytes consumed, matching parse_string
        # and the single-character tokens (an extra "+ 1" here drifted the
        # column of every later token on the line).
        column = position.column + (offset - position.offset)
    )
    return next_position, Token(position, number, TT.NUMBER)
 def disambiguate_keyword(lexeme):
    """Return the keyword token type for `lexeme`, or TT.IDENTIFIER if none."""
    return keywords.get(lexeme, TT.IDENTIFIER)
 def parse_identifier(mem, position):
    """Lex an identifier or keyword that starts at `position`.

    Consumes the longest run of bytes found in `identifier`.

    Returns:
        `(next_position, token)`. The token type is TT.IDENTIFIER unless the
        lexeme is listed in `keywords`.

    Raises:
        AssertionError: if no identifier byte is present at `position`.
    """
    start = position.offset
    end = len(mem)
    offset = start
    # Bounds-check the scan so an identifier that ends the input is lexed
    # instead of raising IndexError.
    while offset < end and mem[offset] in identifier:
        offset += 1
    lexeme = bytes(mem[start:offset])
    assert lexeme, position
    next_position = Position(
        offset = offset,
        line = position.line,
        # Next column = start column + token length, matching parse_string and
        # the single-character tokens (an extra "+ 1" here drifted the column
        # of every later token on the line).
        column = position.column + len(lexeme)
    )
    return next_position, Token(position, lexeme, disambiguate_keyword(lexeme))
 def next_token(mem, position):
    """Lex the next token at or after `position`.

    Skips whitespace, bytes >= 128 (with a printed warning) and '#' comments.
    A comment is consumed together with its terminating newline, so no
    NEWLINE token is produced for a comment line.

    Args:
        mem: indexable byte buffer.
        position: where to start; it is copied, never mutated.

    Returns:
        `(next_position, token)`, or `(position, None)` at end of input.

    Raises:
        AssertionError: on a byte that cannot start any token.
    """
    position = Position.from_position(position)
    # Skipping is done in one loop rather than by recursion, so long runs of
    # garbage bytes or comment lines cannot exhaust the recursion limit.
    while True:
        if position.offset >= len(mem):
            return position, None
        c = mem[position.offset]
        if c in whitespace:
            position.offset += 1
            position.column += 1
        elif c >= 128:
            print(f"warning: invalid garbage byte {hex(c)} at {position}")
            # The column is deliberately not advanced for skipped garbage.
            position.offset += 1
        elif c == ord('#'):
            offset = position.offset + 1
            # Bounds-checked: a comment on the last line may lack a newline.
            while offset < len(mem) and mem[offset] != ord('\n'):
                offset += 1
            position = Position(offset + 1, position.line + 1, 0)
        else:
            break
    next_position = Position(
        position.offset + 1, position.line, position.column + 1
    )
    if c == ord('\n'):
        next_position.line += 1
        next_position.column = 0
        return next_position, Token(position, b'\n', TT.NEWLINE)
    if c == ord(':'):
        return next_position, Token(position, b':', TT.COLON)
    if c == ord('.'):
        # The lexeme is the dot itself (it was b':' by copy-paste).
        return next_position, Token(position, b'.', TT.DOT)
    if c == ord('='):
        return next_position, Token(position, b'=', TT.EQUAL)
    if c == ord(','):
        return next_position, Token(position, b',', TT.COMMA)
    if c == ord('('):
        return next_position, Token(position, b'(', TT.LPAREN)
    if c == ord(')'):
        return next_position, Token(position, b')', TT.RPAREN)
    if c == ord('"'):
        return parse_string(mem, position)
    if c in string_digits or c == ord('-'):
        return parse_number(mem, position)
    if c in identifier_start:
        return parse_identifier(mem, position)
    # Raised explicitly so the check survives `python -O`.
    raise AssertionError(("unexpected character", position, chr(c), hex(c)))
 def tokenize(mem):
    """Yield every token in `mem`, in order, until next_token reports the end."""
    cursor = Position(offset = 0, line = 1, column = 0)
    cursor, token = next_token(mem, cursor)
    while token is not None:
        yield token
        cursor, token = next_token(mem, cursor)
 def main():
    """Lex the file named by the first command-line argument and print tokens."""
    source_path = sys.argv[1]
    with open(source_path, 'rb') as handle:
        data = handle.read()
    for tok in tokenize(memoryview(data)):
        print(tok.lexeme, tok.type)
 if __name__ == "__main__":
    main()
--- a/renpy-parser/parse.py
+++ b/renpy-parser/parse.py
@ -0,0 +1,475 @@
 import sys
 from pprint import pprint
 import lex
 from lex import TT
 from dataclasses import dataclass
 class ParseException(Exception):
    """Parse error that carries the token at which parsing failed."""
    def __init__(self, message, token):
        # The offending token is kept so callers can report its position.
        self.token = token
        super().__init__(message)
 def get_lexeme(v):
    """Replace every lex.Token inside `v` by its lexeme.

    Exact-type lists and tuples are rebuilt recursively; a Token yields its
    `lexeme`; anything else is returned unchanged.
    """
    kind = type(v)
    if kind is list:
        return [get_lexeme(item) for item in v]
    if kind is tuple:
        return tuple(map(get_lexeme, v))
    if kind is lex.Token:
        return v.lexeme
    return v
 def lexeme_repr(self):
    """Render `self` as `ClassName(field=value, ...)` with tokens shown as lexemes."""
    fields = ", ".join(
        f"{key}={get_lexeme(value)!r}" for key, value in vars(self).items()
    )
    return f"{type(self).__name__}({fields})"
@dataclass
 class FunctionCall:
    """AST node for `name(args..., key="value"...)`; built by parse_function_call."""
    # Identifier token naming the callee.
    name: lex.Token
    # Positional arguments: STRING or NUMBER tokens.
    args: list[lex.Token]
    # Keyword arguments as (identifier token, STRING token) pairs.
    kwargs: list[tuple[lex.Token, lex.Token]]
    # Defined in the class body, so @dataclass does not generate its own repr.
    __repr__ = lexeme_repr
@dataclass
 class Image:
    """AST node for `image <dotted name> = "<path>"`; built by parse_image."""
    # Identifier tokens of the dotted name, in order.
    name: list[lex.Token]
    # STRING token on the right-hand side.
    path: lex.Token
    __repr__ = lexeme_repr
@dataclass
 class Define:
    """AST node for `define <dotted name> = <rhs>`; built by parse_define."""
    # Identifier tokens of the dotted name, in order.
    name: list[lex.Token]
    # Whatever parse_rhs returns: a NUMBER token or a FunctionCall.
    value: 'Expression'
    __repr__ = lexeme_repr
@dataclass
 class Label:
    """AST node for `label <name>:`; built by parse_label."""
    # Identifier token naming the label.
    name: lex.Token
    __repr__ = lexeme_repr
@dataclass
 class Play:
    """AST node for `play <channel> "<path>" [fadeout <number>]`; built by parse_play."""
    # Identifier token naming the channel.
    channel: lex.Token
    # STRING token following the channel.
    path: lex.Token
    # NUMBER token after `fadeout`, or None when the clause is absent.
    fadeout: lex.Token
    __repr__ = lexeme_repr
@dataclass
 class Scene:
    """AST node for `scene <name>`; built by parse_scene."""
    # Identifier token following `scene`.
    name: lex.Token
    __repr__ = lexeme_repr
@dataclass
 class With:
    """AST node for `with <function call>`; built by parse_with."""
    # The call that follows the `with` keyword.
    function_call: FunctionCall
    __repr__ = lexeme_repr
@dataclass
 class Say:
    """AST node for `<speaker> "<text>"`; built by parse_say."""
    # Identifier token that starts the statement.
    speaker: lex.Token
    # STRING token that follows the speaker.
    text: lex.Token
    __repr__ = lexeme_repr
@dataclass
 class Voice:
    """AST node for `voice "<path>"`; built by parse_voice."""
    # STRING token following `voice`.
    path: lex.Token
    __repr__ = lexeme_repr
@dataclass
 class Show:
    """AST node for `show <what> at <transform>`; built by parse_show."""
    # Identifier token following `show`.
    what: lex.Token
    # Identifier token following `at`.
    transform: lex.Token
    __repr__ = lexeme_repr
@dataclass
 class Menu:
    """AST node for a `menu:` block; built by parse_menu."""
    # One (caption STRING token, statements) pair per option, in source order.
    entries: list[tuple[lex.Token, list['Statement']]]
    __repr__ = lexeme_repr
@dataclass
 class Jump:
    """AST node for `jump <target>`; built by parse_jump."""
    # Identifier token following `jump`.
    target: lex.Token
    __repr__ = lexeme_repr
@dataclass
 class Return:
    """AST node for a bare `return` statement; carries no data."""
    pass
 def parse_lhs(tokens, index):
    """Parse a dotted name such as `a.b.c` starting at `tokens[index]`.

    Returns `(next_index, identifier_tokens)`. Raises ParseException when an
    identifier is missing at the start or after a dot.
    """
    head = tokens[index]
    if head.type != TT.IDENTIFIER:
        raise ParseException("expected identifier", head)
    parts = [head]
    cursor = index + 1
    # Each further component is a DOT followed by an IDENTIFIER.
    while tokens[cursor].type == TT.DOT:
        part = tokens[cursor + 1]
        if part.type != TT.IDENTIFIER:
            raise ParseException("expected identifier", part)
        parts.append(part)
        cursor += 2
    return cursor, parts
 def parse_image(tokens, index):
    """Parse `<dotted name> = "<path>"` (the `image` keyword is already consumed).

    Returns `(next_index, Image)`. Raises ParseException on a missing '=' or
    a missing string.
    """
    after_name, dotted_name = parse_lhs(tokens, index)
    equal = tokens[after_name]
    if equal.type != TT.EQUAL:
        raise ParseException("expected equal", equal)
    path = tokens[after_name + 1]
    if path.type != TT.STRING:
        raise ParseException("expected string", path)
    return after_name + 2, Image(name = dotted_name, path = path)
 def parse_function_call(tokens, index):
    """Parse `name(arg, ..., key="value", ...)` starting at `tokens[index]`.

    Positional arguments are STRING or NUMBER tokens and must precede the
    keyword arguments, whose values must be STRING tokens. NEWLINE tokens are
    skipped between the parentheses, and a trailing comma is accepted.

    Returns:
        `(next_index, FunctionCall)`, with `next_index` just past ')'.

    Raises:
        ParseException: on any token that does not fit the shape above.
    """
    name = tokens[index + 0]
    if name.type != TT.IDENTIFIER:
        raise ParseException("expected identifier", name)
    lparen = tokens[index + 1]
    if lparen.type != TT.LPAREN:
        raise ParseException("expected lparen", lparen)
    index += 2

    def skip_newlines(i):
        # Line breaks are not significant inside the parentheses.
        while tokens[i].type == TT.NEWLINE:
            i += 1
        return i

    # args
    args = []
    index = skip_newlines(index)
    while tokens[index].type in (TT.STRING, TT.NUMBER):
        args.append(tokens[index])
        index = skip_newlines(index + 1)
        if tokens[index].type != TT.COMMA:
            break
        index = skip_newlines(index + 1)
    # kwargs
    kwargs = []
    while tokens[index].type != TT.RPAREN:
        identifier = tokens[index + 0]
        if identifier.type != TT.IDENTIFIER:
            raise ParseException("expected function call kwargs identifier", identifier)
        equal = tokens[index + 1]
        if equal.type != TT.EQUAL:
            raise ParseException("expected function call kwargs equal", equal)
        string = tokens[index + 2]
        if string.type != TT.STRING:
            raise ParseException("expected function call kwargs string", string)
        kwargs.append((identifier, string))
        index = skip_newlines(index + 3)
        if tokens[index].type != TT.COMMA:
            break
        # Consume the separating comma; without this the next iteration saw
        # the comma itself and rejected any call with two or more kwargs.
        index = skip_newlines(index + 1)
    rparen = tokens[index]
    if rparen.type != TT.RPAREN:
        raise ParseException("expected rparen", rparen)
    index += 1
    function_call = FunctionCall(
        name = name,
        args = args,
        kwargs = kwargs
    )
    return index, function_call
 def parse_rhs(tokens, index):
    """Parse the right-hand side of a `define`: a number or a function call.

    Returns:
        `(next_index, value)`, where `value` is the NUMBER token itself or a
        FunctionCall.

    Raises:
        ParseException: if the token starts neither form.
    """
    token = tokens[index]
    if token.type == TT.NUMBER:
        return index + 1, token
    # Peek only when it matters, and only if a next token exists: a number
    # that is the very last token must not raise IndexError.
    if (
        token.type == TT.IDENTIFIER
        and index + 1 < len(tokens)
        and tokens[index + 1].type == TT.LPAREN
    ):
        return parse_function_call(tokens, index)
    raise ParseException("expected rhs expression", token)
 def parse_define(tokens, index):
    """Parse `<dotted name> = <rhs>` (the `define` keyword is already consumed).

    Returns `(next_index, Define)`. Raises ParseException on a missing '='.
    """
    cursor, target = parse_lhs(tokens, index)
    equal = tokens[cursor]
    if equal.type != TT.EQUAL:
        raise ParseException("expected equal", equal)
    cursor, value = parse_rhs(tokens, cursor + 1)
    return cursor, Define(name = target, value = value)
 def parse_label(tokens, index):
    """Parse `<name>:` (the `label` keyword is already consumed).

    Returns `(next_index, Label)`. Raises ParseException on a missing
    identifier or colon.
    """
    label_name = tokens[index]
    if label_name.type != TT.IDENTIFIER:
        raise ParseException("expected identifier", label_name)
    colon = tokens[index + 1]
    if colon.type != TT.COLON:
        raise ParseException("expected colon", colon)
    return index + 2, Label(name = label_name)
 def parse_play(tokens, index):
    """Parse `<channel> "<path>" [fadeout <number>]` (`play` already consumed).

    Returns:
        `(next_index, Play)`; `Play.fadeout` is None without a fadeout clause.

    Raises:
        ParseException: on a missing channel identifier, path string, or
            number after `fadeout`.
    """
    channel = tokens[index + 0]
    if channel.type != TT.IDENTIFIER:
        raise ParseException("expected identifier", channel)
    path = tokens[index + 1]
    if path.type != TT.STRING:
        raise ParseException("expected string", path)
    index += 2
    fadeout = None
    # The fadeout clause is optional, so the statement may legitimately be
    # the last thing in the token list; only look ahead if a token exists.
    if index < len(tokens) and tokens[index].type == TT.FADEOUT:
        keyword = tokens[index]
        if index + 1 >= len(tokens):
            raise ParseException("expected number", keyword)
        fadeout = tokens[index + 1]
        if fadeout.type != TT.NUMBER:
            raise ParseException("expected number", fadeout)
        index += 2
    play = Play(
        channel = channel,
        path = path,
        fadeout = fadeout
    )
    return index, play
 def parse_scene(tokens, index):
    """Parse `<name>` (the `scene` keyword is already consumed).

    Returns `(next_index, Scene)`. Raises ParseException on a non-identifier.
    """
    scene_name = tokens[index]
    if scene_name.type != TT.IDENTIFIER:
        raise ParseException("expected identifier", scene_name)
    return index + 1, Scene(name = scene_name)
 def parse_with(tokens, index):
    """Parse the function call that follows `with` (keyword already consumed).

    Returns `(next_index, With)`.
    """
    next_index, call = parse_function_call(tokens, index)
    return next_index, With(function_call = call)
 def parse_say(tokens, index):
    """Parse `<speaker> "<text>"` starting at the speaker identifier.

    Returns:
        `(next_index, Say)`.

    Raises:
        ParseException: if the speaker is not an identifier or the text is
            not a string.
    """
    speaker = tokens[index + 0]
    if speaker.type != TT.IDENTIFIER:
        # Report the offending token; this used to reference the undefined
        # name `name` and so raised NameError instead.
        raise ParseException("expected identifier", speaker)
    text = tokens[index + 1]
    if text.type != TT.STRING:
        raise ParseException("expected string", text)
    say = Say(
        speaker = speaker,
        text = text
    )
    return index + 2, say
 def parse_voice(tokens, index):
    """Parse `"<path>"` (the `voice` keyword is already consumed).

    Returns `(next_index, Voice)`. Raises ParseException on a non-string.
    """
    audio_path = tokens[index]
    if audio_path.type != TT.STRING:
        raise ParseException("expected string", audio_path)
    return index + 1, Voice(path = audio_path)
 def parse_show(tokens, index):
    """Parse `<what> at <transform>` (the `show` keyword is already consumed).

    The `at` clause is mandatory in this parser.

    Returns:
        `(next_index, Show)`.

    Raises:
        ParseException: on a missing identifier or a missing `at`.
    """
    what = tokens[index + 0]
    if what.type != TT.IDENTIFIER:
        # Report the offending token; this used to reference the undefined
        # name `path` and so raised NameError instead.
        raise ParseException("expected identifier", what)
    at = tokens[index + 1]
    if at.type != TT.AT:
        raise ParseException("expected at", at)
    transform = tokens[index + 2]
    if transform.type != TT.IDENTIFIER:
        raise ParseException("expected identifier", transform)
    show = Show(
        what = what,
        transform = transform
    )
    return index + 3, show
 def parse_menu(tokens, index):
    """Parse a `menu:` block starting at the `menu` keyword.

    The block runs until a token whose column equals the column of `menu`.
    A token at a smaller column is rejected as an invalid dedent. Inside the
    block, `"<caption>":` opens an option and every other statement is
    parsed with parse_one and attached to the current option.

    Returns:
        `(next_index, Menu)`, with `next_index` at the token that ended the
        block (or at the end of the token list).

    Raises:
        ParseException: on a malformed header, an invalid dedent, a caption
            without a colon, or a statement before the first option.
    """
    menu = tokens[index + 0]
    if menu.type != TT.MENU:
        raise ParseException("expected menu", menu)
    colon = tokens[index + 1]
    if colon.type != TT.COLON:
        raise ParseException("expected colon", colon)
    index = index + 2
    menu_column = menu.position.column
    menu_entries = []
    menu_entry_string = None
    menu_entry_statements = None
    while index < len(tokens):
        token = tokens[index]
        if token.type == TT.NEWLINE:
            index += 1
            continue
        if token.position.column < menu_column:
            raise ParseException("invalid block dedent", token)
        if token.position.column == menu_column:
            break
        if token.type == TT.STRING:
            # Look ahead only here, where the colon is needed. Peeking on
            # every iteration raised IndexError when the menu's final
            # statement token was the last token of the input.
            if index + 1 >= len(tokens):
                raise ParseException("expected colon", token)
            peek = tokens[index + 1]
            if peek.type != TT.COLON:
                raise ParseException("expected colon", peek)
            # Starting a new option closes the previous one.
            if menu_entry_string is not None:
                menu_entries.append((menu_entry_string, menu_entry_statements))
            menu_entry_string = token
            menu_entry_statements = []
            index += 2
        else:
            if menu_entry_statements is None:
                raise ParseException("expected menu option", token)
            index, ast = parse_one(tokens, index)
            if ast is not None:
                menu_entry_statements.append(ast)
    if menu_entry_string is not None:
        menu_entries.append((menu_entry_string, menu_entry_statements))
    return index, Menu(entries = menu_entries)
 def parse_jump(tokens, index):
    """Parse `<target>` (the `jump` keyword is already consumed).

    Returns `(next_index, Jump)`. Raises ParseException on a non-identifier.
    """
    destination = tokens[index]
    if destination.type != TT.IDENTIFIER:
        raise ParseException("expected identifier", destination)
    return index + 1, Jump(target = destination)
 def parse_init(tokens, index):
    """Skip an `init:` block starting at the `init` keyword.

    The block's tokens are discarded. It ends at the first non-NEWLINE token
    whose column equals the column of `init`; a token at a smaller column is
    rejected as an invalid dedent.

    Returns:
        `(next_index, None)`; no AST node is produced for the block.

    Raises:
        ParseException: on a malformed header or an invalid dedent.
    """
    init = tokens[index + 0]
    if init.type != TT.INIT:
        raise ParseException("expected init", init)
    colon = tokens[index + 1]
    if colon.type != TT.COLON:
        # The message used to say "expected identifier" for a missing colon.
        raise ParseException("expected colon", colon)
    index += 2
    # skip all tokens inside block
    while index < len(tokens):
        token = tokens[index]
        if token.type == TT.NEWLINE:
            index += 1
            continue
        if token.position.column < init.position.column:
            raise ParseException("invalid block dedent", token)
        if token.position.column == init.position.column:
            break
        index += 1
    return index, None
 def parse_one(tokens, index):
    """Parse one statement starting at `tokens[index]`.

    Leading NEWLINE tokens are skipped. The first remaining token selects
    the statement parser.

    Returns:
        `(next_index, node)`. `node` is None when only newlines remained or
        when the statement produces no AST node (an `init:` block).

    Raises:
        ParseException: if the token cannot start a statement, or from the
            selected statement parser.
    """
    # Skip blank lines in a loop rather than one recursive call per NEWLINE,
    # so a long run of blank lines cannot hit the recursion limit.
    while tokens[index].type == TT.NEWLINE:
        index += 1
        if index >= len(tokens):
            return index, None
    token = tokens[index]
    kind = token.type
    # Parsers called with `index + 1` expect their keyword already consumed;
    # parse_say, parse_menu and parse_init inspect the leading token.
    if kind == TT.IMAGE:
        return parse_image(tokens, index + 1)
    if kind == TT.DEFINE:
        return parse_define(tokens, index + 1)
    if kind == TT.LABEL:
        return parse_label(tokens, index + 1)
    if kind == TT.PLAY:
        return parse_play(tokens, index + 1)
    if kind == TT.SCENE:
        return parse_scene(tokens, index + 1)
    if kind == TT.WITH:
        return parse_with(tokens, index + 1)
    if kind == TT.IDENTIFIER:
        return parse_say(tokens, index)
    if kind == TT.VOICE:
        return parse_voice(tokens, index + 1)
    if kind == TT.SHOW:
        return parse_show(tokens, index + 1)
    if kind == TT.MENU:
        return parse_menu(tokens, index)
    if kind == TT.JUMP:
        return parse_jump(tokens, index + 1)
    if kind == TT.RETURN:
        return index + 1, Return()
    if kind == TT.INIT:
        return parse_init(tokens, index)
    raise ParseException("unexpected token", token)
 def parse_all(tokens):
    """Yield the AST node of every statement in `tokens`, skipping None results."""
    cursor = 0
    while cursor < len(tokens):
        cursor, node = parse_one(tokens, cursor)
        if node is None:
            continue
        yield node
 def main():
    """Lex and parse the file named by the first argument, printing each node.

    On a ParseException the message and offending token are printed and the
    exception is re-raised.
    """
    with open(sys.argv[1], 'rb') as source:
        raw = source.read()
    tokens = list(lex.tokenize(memoryview(raw)))
    try:
        # parse_all is a generator, so parse errors surface while iterating.
        for node in parse_all(tokens):
            pprint(node)
    except ParseException as error:
        print(error, error.token)
        raise
 if __name__ == "__main__":
    main()