initial renpy parser

This commit is contained in:
Zack Buhman 2026-05-21 21:21:33 -05:00
parent bb6f76cf72
commit 610aff4af6
3 changed files with 763 additions and 0 deletions

View File

@ -0,0 +1,57 @@
#include <stdint.h>
// Plain-data statement records for the renpy parser's output.
namespace language::statement {

    // Tag stored in `statement::type`.
    // NOTE(review): presumably selects which union member of `statement` is
    // valid -- confirm against the code that reads these records.
    enum type {
        show,
        voice,
        music,
        text,  // NOTE(review): there is no `text` struct; presumably pairs with `say` -- confirm
        menu,
        jump,
    };

    // The enumerators above share their names with the structs below.  In C++
    // an enumerator hides a class name declared in the same scope, so the
    // struct types must be referred to as `struct show`, `struct voice`, ...

    struct show {
        uint32_t imageIndex;
        uint32_t transformIndex;
    };

    struct voice {
        uint32_t audioIndex;
    };

    struct music {
        uint32_t channelIndex;
        uint32_t audioIndex;
    };

    struct say {
        uint32_t characterIndex;
        uint32_t stringIndex;
    };

    struct option {
        uint32_t stringIndex;
        uint32_t statementIndex;
    };

    struct menu {
        uint32_t count;
        uint32_t optionIndex;
    };

    struct jump {
        uint32_t statementIndex;
    };

    // One statement: a tag plus the payload for that tag.
    struct statement {
        // The enum is declared as `type` (there is no `statement_type`); the
        // elaborated `enum type` spelling is needed because the member is
        // itself named `type`.
        enum type type;
        union {
            // Elaborated `struct` specifiers: the bare names resolve to the
            // enumerators, not the struct types.
            struct show show;
            struct voice voice;
            struct music music;
            struct say say;
            struct menu menu;
            struct jump jump;
        };
    };

}

231
renpy-parser/lex.py Normal file
View File

@ -0,0 +1,231 @@
import string
from dataclasses import dataclass
from enum import Enum, auto
import sys
# Character classes used by the lexer.  The source buffer is indexed byte by
# byte, so every set holds integer byte values rather than 1-char strings.

# Bytes skipped between tokens ('\n' is not here: it is its own token).
whitespace = {ord(' '), ord('\r')}
# ASCII decimal digits.
string_digits = {ord(ch) for ch in string.digits}
# Bytes allowed as the first byte of an identifier.
identifier_start = {ord(ch) for ch in string.ascii_letters + "_"}
# Bytes allowed anywhere in an identifier.
identifier = identifier_start.union(string_digits)
@dataclass
class Position:
    """A location in the source buffer.

    The lexer mutates positions in place while scanning, so use
    ``from_position`` to take an independent copy before modifying one.
    """

    offset: int  # index into the byte buffer
    line: int    # line number (the tokenizer starts counting at 1)
    column: int  # column within the line (the tokenizer starts counting at 0)

    @staticmethod
    def from_position(p):
        """Return a new ``Position`` with the same field values as *p*.

        Declared as a static method so that it works both as
        ``Position.from_position(p)`` and through an instance.
        """
        return Position(p.offset, p.line, p.column)
class TT(Enum):
    """Token types produced by the lexer.

    Member order is significant: values are assigned by ``auto()``, so
    reordering members would change every numeric value.
    """

    # generic / punctuation tokens
    KEYWORD = auto()  # NOTE(review): not produced by the lexer in this file
    NEWLINE = auto()
    COLON = auto()
    STRING = auto()
    DOT = auto()
    EQUAL = auto()
    COMMA = auto()
    LPAREN = auto()
    RPAREN = auto()
    NUMBER = auto()
    IDENTIFIER = auto()

    # reserved words (see the ``keywords`` table)
    WITH = auto()
    PLAY = auto()
    VOICE = auto()
    SCENE = auto()
    SHOW = auto()
    AT = auto()
    DEFINE = auto()
    IMAGE = auto()
    LABEL = auto()
    MENU = auto()
    JUMP = auto()
    RETURN = auto()
    INIT = auto()
    FADEOUT = auto()
# Reserved words, keyed by their raw byte spelling as it appears in the source
# buffer.  Each word maps to the TT member with the same upper-cased name.
keywords = {
    word.encode("ascii"): TT[word.upper()]
    for word in (
        "play",
        "voice",
        "scene",
        "show",
        "at",
        "define",
        "image",
        "label",
        "with",
        "menu",
        "jump",
        "return",
        "init",
        "fadeout",
    )
}
@dataclass
class Token:
    """A single lexical token."""

    # Position of the token's first byte.
    position: Position
    # Token payload.  Declared as bytes; note that parse_number stores the
    # converted int/float value here for NUMBER tokens.
    lexeme: bytes
    # Token classification.
    type: TT
def parse_string(mem, position):
    """Lex a double-quoted string starting at *position*.

    Args:
        mem: byte buffer (indexing yields ints).
        position: position of the opening quote.

    Returns:
        ``(next_position, token)`` where the token's lexeme is the raw bytes
        between the quotes (escape sequences are kept verbatim, backslash
        included) and ``next_position`` is just past the closing quote.

    Raises:
        ValueError: the string contains a newline or is not terminated before
            the end of the buffer.
    """
    # Internal invariant: the caller dispatches here only on a '"' byte.
    assert mem[position.offset] == ord('"'), (position, chr(mem[position.offset]))

    end = len(mem)
    start = position.offset + 1
    offset = start
    while True:
        if offset >= end:
            raise ValueError(f"unterminated string starting at {position}")
        c = mem[offset]
        if c == ord('"'):
            break
        if c == ord("\n"):
            raise ValueError(f"newline inside string starting at {position}")
        # A backslash escapes the following byte, so `\"` does not end the
        # string.  An escaped newline is left for the check above to reject.
        if c == ord("\\") and offset + 1 < end and mem[offset + 1] != ord("\n"):
            offset += 1
        offset += 1

    s = bytes(mem[start:offset])
    token = Token(
        position=position,
        lexeme=s,
        type=TT.STRING,
    )
    next_position = Position(
        offset=offset + 1,
        line=position.line,
        # +2 accounts for the opening and closing quotes.
        column=position.column + 2 + len(s),
    )
    return next_position, token
def parse_number(mem, position):
    """Lex an optionally negative integer or decimal number.

    Args:
        mem: byte buffer (indexing yields ints).
        position: position of the first byte ('-' or a digit).

    Returns:
        ``(next_position, token)``.  The NUMBER token's ``lexeme`` holds the
        converted value: an ``int`` when there is no fractional part,
        otherwise a ``float``.

    Raises:
        ValueError: no digits before the '.', or no digits after it.
    """
    end = len(mem)
    offset = position.offset

    sign = 1
    if mem[offset] == ord('-'):
        sign = -1
        offset += 1

    # whole part; the bounds check lets a number be the last thing in the file
    whole_start = offset
    while offset < end and mem[offset] in string_digits:
        offset += 1
    whole = bytes(mem[whole_start:offset])
    if not whole:
        raise ValueError(f"expected digit at {position}")

    if offset < end and mem[offset] == ord('.'):
        # fraction part
        offset += 1
        fraction_start = offset
        while offset < end and mem[offset] in string_digits:
            offset += 1
        fraction = bytes(mem[fraction_start:offset])
        if not fraction:
            raise ValueError(f"expected digit after '.' at {position}")
        number = sign * (int(whole) + int(fraction) / (10 ** len(fraction)))
    else:
        number = sign * int(whole)

    next_position = Position(
        offset=offset,
        line=position.line,
        # Advance by exactly the number of bytes consumed, matching how
        # whitespace and punctuation advance the column.
        column=position.column + (offset - position.offset),
    )
    return next_position, Token(position, number, TT.NUMBER)
def disambiguate_keyword(lexeme):
    """Return the keyword token type for *lexeme*, or ``TT.IDENTIFIER``.

    *lexeme* is looked up in the ``keywords`` table as-is (bytes).
    """
    return keywords.get(lexeme, TT.IDENTIFIER)
def parse_identifier(mem, position):
    """Lex an identifier or keyword starting at *position*.

    Args:
        mem: byte buffer (indexing yields ints).
        position: position of the first identifier byte.

    Returns:
        ``(next_position, token)``; the token type is the keyword type when
        the lexeme is a reserved word, otherwise ``TT.IDENTIFIER``.
    """
    end = len(mem)
    start = position.offset
    offset = start
    # The bounds check lets an identifier be the last thing in the file.
    while offset < end and mem[offset] in identifier:
        offset += 1
    # Internal invariant: the caller dispatches here only on an
    # identifier-start byte, so at least one byte is consumed.
    assert offset > start, position

    lexeme = bytes(mem[start:offset])
    next_position = Position(
        offset=offset,
        line=position.line,
        # Advance by exactly the number of bytes consumed, matching how
        # whitespace and punctuation advance the column.
        column=position.column + len(lexeme),
    )
    token_type = disambiguate_keyword(lexeme)
    return next_position, Token(position, lexeme, token_type)
# Single-byte tokens: byte value -> (lexeme, token type).
_PUNCTUATION = {
    ord(':'): (b':', TT.COLON),
    ord('.'): (b'.', TT.DOT),
    ord('='): (b'=', TT.EQUAL),
    ord(','): (b',', TT.COMMA),
    ord('('): (b'(', TT.LPAREN),
    ord(')'): (b')', TT.RPAREN),
}


def next_token(mem, position):
    """Lex the next token at or after *position*.

    Whitespace, ``#`` comments (through the end of their line) and bytes
    >= 128 outside strings are skipped; a warning is printed for each such
    byte.  A comment consumes its terminating newline, so no NEWLINE token is
    produced for that line.

    Args:
        mem: byte buffer (indexing yields ints).
        position: where to start scanning; it is copied, not mutated.

    Returns:
        ``(next_position, token)``, or ``(position, None)`` at end of input.

    Raises:
        ValueError: a byte that cannot start any token was found.
    """
    position = Position.from_position(position)
    end = len(mem)

    # Skip everything that does not start a token.  This is a loop rather
    # than recursion so long runs of comments cannot exhaust the stack.
    while True:
        if position.offset >= end:
            return position, None
        c = mem[position.offset]
        if c in whitespace:
            position.offset += 1
            position.column += 1
        elif c >= 128:
            print(f"warning: invalid garbage byte {hex(c)} at {position}")
            # The column is deliberately left unchanged for skipped bytes.
            position.offset += 1
        elif c == ord('#'):
            offset = position.offset + 1
            # Stop at end of input too: the last line may be a comment with
            # no trailing newline.
            while offset < end and mem[offset] != ord('\n'):
                offset += 1
            position = Position(offset + 1, position.line + 1, 0)
        else:
            break

    next_position = Position(position.offset + 1, position.line, position.column + 1)

    if c == ord('\n'):
        next_position.line += 1
        next_position.column = 0
        return next_position, Token(position, b'\n', TT.NEWLINE)

    punctuation = _PUNCTUATION.get(c)
    if punctuation is not None:
        lexeme, token_type = punctuation
        return next_position, Token(position, lexeme, token_type)

    if c == ord('"'):
        return parse_string(mem, position)
    if c in string_digits or c == ord('-'):
        return parse_number(mem, position)
    if c in identifier_start:
        return parse_identifier(mem, position)

    # Raised explicitly (not asserted) so the check survives `python -O`.
    raise ValueError(f"unexpected character {chr(c)!r} ({hex(c)}) at {position}")
def tokenize(mem):
    """Yield every token in *mem*, in order, until the input is exhausted.

    Scanning starts at offset 0, line 1, column 0.
    """
    cursor = Position(offset=0, line=1, column=0)
    cursor, token = next_token(mem, cursor)
    while token is not None:
        yield token
        cursor, token = next_token(mem, cursor)
def main():
    """Tokenize the file named by the first command-line argument and print
    each token's lexeme and type, one token per line."""
    with open(sys.argv[1], 'rb') as source:
        data = memoryview(source.read())
    for tok in tokenize(data):
        print(tok.lexeme, tok.type)


if __name__ == "__main__":
    main()

475
renpy-parser/parse.py Normal file
View File

@ -0,0 +1,475 @@
import sys
from pprint import pprint
import lex
from lex import TT
from dataclasses import dataclass
class ParseException(Exception):
    """Parse error that remembers the token it was raised for.

    Attributes:
        token: the offending token, as passed by the raiser.
    """

    def __init__(self, message, token):
        self.token = token
        super().__init__(message)
def get_lexeme(v):
    """Replace every ``lex.Token`` inside *v* with its lexeme.

    Lists and tuples (exact types only, not subclasses) are rebuilt
    recursively; any other value is returned unchanged.
    """
    kind = type(v)
    if kind is list:
        return [get_lexeme(item) for item in v]
    if kind is tuple:
        return tuple(map(get_lexeme, v))
    if kind is lex.Token:
        return v.lexeme
    return v
def lexeme_repr(self):
    """``__repr__`` implementation for AST nodes.

    Formats the instance as ``ClassName(field=value, ...)`` with every token
    replaced by its lexeme, so printed trees omit position data.
    """
    fields = ", ".join(
        f"{name}={get_lexeme(attr)!r}" for name, attr in vars(self).items()
    )
    return f"{type(self).__name__}({fields})"
@dataclass
class FunctionCall:
    """A call expression ``name(args..., key=value...)``."""

    # Identifier token naming the callee.
    name: lex.Token
    # Positional arguments (STRING or NUMBER tokens), in source order.
    args: list[lex.Token]
    # Keyword arguments as (identifier token, STRING token) pairs.
    kwargs: list[tuple[lex.Token, lex.Token]]

    __repr__ = lexeme_repr
@dataclass
class Image:
    """An ``image <name> = "<path>"`` statement."""

    # Identifier tokens of the dotted name, one per component.
    name: list[lex.Token]
    # STRING token on the right-hand side.
    path: lex.Token

    __repr__ = lexeme_repr
@dataclass
class Define:
    """A ``define <name> = <expression>`` statement."""

    # Identifier tokens of the dotted name, one per component.
    name: list[lex.Token]
    # Right-hand side as returned by parse_rhs: a NUMBER token or a
    # FunctionCall.
    value: 'Expression'

    __repr__ = lexeme_repr
@dataclass
class Label:
    """A ``label <name>:`` statement."""

    # Identifier token naming the label.
    name: lex.Token

    __repr__ = lexeme_repr
@dataclass
class Play:
    """A ``play <channel> "<path>" [fadeout <number>]`` statement."""

    # Identifier token naming the channel.
    channel: lex.Token
    # STRING token with the audio path.
    path: lex.Token
    # NUMBER token of the ``fadeout`` clause; None when the clause is absent
    # (parse_play stores None in that case, hence the optional annotation).
    fadeout: 'lex.Token | None'

    __repr__ = lexeme_repr
@dataclass
class Scene:
    """A ``scene <name>`` statement."""

    # Identifier token following the ``scene`` keyword.
    name: lex.Token

    __repr__ = lexeme_repr
@dataclass
class With:
    """A ``with <function call>`` statement."""

    # The call expression following the ``with`` keyword.
    function_call: FunctionCall

    __repr__ = lexeme_repr
@dataclass
class Say:
    """A dialogue line: ``<speaker> "<text>"``."""

    # Identifier token that starts the statement.
    speaker: lex.Token
    # STRING token with the spoken text.
    text: lex.Token

    __repr__ = lexeme_repr
@dataclass
class Voice:
    """A ``voice "<path>"`` statement."""

    # STRING token following the ``voice`` keyword.
    path: lex.Token

    __repr__ = lexeme_repr
@dataclass
class Show:
    """A ``show <what> at <transform>`` statement."""

    # Identifier token following ``show``.
    what: lex.Token
    # Identifier token following ``at``.
    transform: lex.Token

    __repr__ = lexeme_repr
@dataclass
class Menu:
    """A ``menu:`` block."""

    # One (caption STRING token, statements) pair per menu option, in source
    # order.  parse_menu builds this as a list of tuples, so the annotation is
    # a list of pairs rather than a single pair.
    entries: list[tuple[lex.Token, list['Statement']]]

    __repr__ = lexeme_repr
@dataclass
class Jump:
    """A ``jump <target>`` statement."""

    # Identifier token following the ``jump`` keyword.
    target: lex.Token

    __repr__ = lexeme_repr
@dataclass
class Return:
    """A bare ``return`` statement; it carries no data."""
def parse_lhs(tokens, index):
    """Parse a dotted name: ``identifier ('.' identifier)*``.

    Returns:
        ``(next_index, parts)`` where *parts* is the list of identifier
        tokens, one per name component.

    Raises:
        ParseException: a name component is not an identifier.
    """
    first = tokens[index]
    if first.type != TT.IDENTIFIER:
        raise ParseException("expected identifier", first)

    parts = [first]
    cursor = index + 1
    # Each further component is a DOT followed by an identifier.
    while tokens[cursor].type == TT.DOT:
        part = tokens[cursor + 1]
        if part.type != TT.IDENTIFIER:
            raise ParseException("expected identifier", part)
        parts.append(part)
        cursor += 2
    return cursor, parts
def parse_image(tokens, index):
    """Parse the remainder of an ``image`` statement: ``<name> = "<path>"``.

    *index* points just past the ``image`` keyword.

    Returns:
        ``(next_index, Image)``.

    Raises:
        ParseException: the name, '=' or string is missing.
    """
    after_name, name = parse_lhs(tokens, index)

    equal = tokens[after_name]
    if equal.type != TT.EQUAL:
        raise ParseException("expected equal", equal)

    path = tokens[after_name + 1]
    if path.type != TT.STRING:
        raise ParseException("expected string", path)

    return after_name + 2, Image(name=name, path=path)
def _skip_newlines(tokens, index):
    """Return the index of the first non-NEWLINE token at or after *index*."""
    while tokens[index].type == TT.NEWLINE:
        index += 1
    return index


def parse_function_call(tokens, index):
    """Parse ``name(arg, ..., key="value", ...)``.

    Positional arguments are STRING or NUMBER tokens and must come before
    keyword arguments; keyword values must be STRING tokens.  Newlines between
    elements inside the parentheses are ignored, and a trailing comma is
    accepted.

    Returns:
        ``(next_index, FunctionCall)`` with *next_index* just past ')'.

    Raises:
        ParseException: the token sequence does not form a call.
    """
    name = tokens[index + 0]
    if name.type != TT.IDENTIFIER:
        raise ParseException("expected identifier", name)
    lparen = tokens[index + 1]
    if lparen.type != TT.LPAREN:
        raise ParseException("expected lparen", lparen)
    index = _skip_newlines(tokens, index + 2)

    # args: stop at the first token that is not a literal; it is either ')'
    # or the start of the keyword arguments.
    args = []
    while tokens[index].type in (TT.STRING, TT.NUMBER):
        args.append(tokens[index])
        index = _skip_newlines(tokens, index + 1)
        if tokens[index].type != TT.COMMA:
            break
        index = _skip_newlines(tokens, index + 1)

    # kwargs
    kwargs = []
    while tokens[index].type != TT.RPAREN:
        identifier = tokens[index + 0]
        if identifier.type != TT.IDENTIFIER:
            raise ParseException("expected function call kwargs identifier", identifier)
        equal = tokens[index + 1]
        if equal.type != TT.EQUAL:
            raise ParseException("expected function call kwargs equal", equal)
        string = tokens[index + 2]
        if string.type != TT.STRING:
            raise ParseException("expected function call kwargs string", string)
        kwargs.append((identifier, string))
        index = _skip_newlines(tokens, index + 3)
        if tokens[index].type != TT.COMMA:
            break
        # Consume the separating comma; without this the next iteration would
        # see the comma where it expects an identifier.
        index = _skip_newlines(tokens, index + 1)

    rparen = tokens[index]
    if rparen.type != TT.RPAREN:
        raise ParseException("expected rparen", rparen)

    function_call = FunctionCall(
        name=name,
        args=args,
        kwargs=kwargs,
    )
    return index + 1, function_call
def parse_rhs(tokens, index):
    """Parse a right-hand-side expression: a number or a function call.

    Returns:
        ``(next_index, value)`` where *value* is the NUMBER token or a
        ``FunctionCall``.

    Raises:
        ParseException: the token at *index* starts neither form.
    """
    token = tokens[index]
    if token.type == TT.NUMBER:
        return index + 1, token

    # Look ahead only when needed and only if a next token exists, so a
    # number that is the very last token does not index past the end.
    is_call = (
        token.type == TT.IDENTIFIER
        and index + 1 < len(tokens)
        and tokens[index + 1].type == TT.LPAREN
    )
    if is_call:
        return parse_function_call(tokens, index)

    raise ParseException("expected rhs expression", token)
def parse_define(tokens, index):
    """Parse the remainder of a ``define`` statement: ``<name> = <rhs>``.

    *index* points just past the ``define`` keyword.

    Returns:
        ``(next_index, Define)``.

    Raises:
        ParseException: the name, '=' or right-hand side is malformed.
    """
    after_name, name = parse_lhs(tokens, index)

    equal = tokens[after_name]
    if equal.type != TT.EQUAL:
        raise ParseException("expected equal", equal)

    after_value, value = parse_rhs(tokens, after_name + 1)
    return after_value, Define(name=name, value=value)
def parse_label(tokens, index):
    """Parse the remainder of a ``label`` statement: ``<name>:``.

    *index* points just past the ``label`` keyword.

    Returns:
        ``(next_index, Label)``.

    Raises:
        ParseException: the identifier or ':' is missing.
    """
    label_name = tokens[index]
    if label_name.type != TT.IDENTIFIER:
        raise ParseException("expected identifier", label_name)

    colon = tokens[index + 1]
    if colon.type != TT.COLON:
        raise ParseException("expected colon", colon)

    return index + 2, Label(name=label_name)
def parse_play(tokens, index):
    """Parse the remainder of ``play <channel> "<path>" [fadeout <number>]``.

    *index* points just past the ``play`` keyword.

    Returns:
        ``(next_index, Play)``; ``Play.fadeout`` is None when the optional
        clause is absent.

    Raises:
        ParseException: the channel, path or fadeout value is malformed.
    """
    channel = tokens[index + 0]
    if channel.type != TT.IDENTIFIER:
        raise ParseException("expected identifier", channel)
    path = tokens[index + 1]
    if path.type != TT.STRING:
        raise ParseException("expected string", path)
    index += 2

    fadeout = None
    # The clause is optional, so the statement may be the last thing in the
    # token list; only look ahead when a token is actually there.
    if index < len(tokens) and tokens[index].type == TT.FADEOUT:
        keyword = tokens[index]
        if index + 1 >= len(tokens):
            raise ParseException("expected number", keyword)
        fadeout = tokens[index + 1]
        if fadeout.type != TT.NUMBER:
            raise ParseException("expected number", fadeout)
        index += 2

    play = Play(
        channel=channel,
        path=path,
        fadeout=fadeout,
    )
    return index, play
def parse_scene(tokens, index):
    """Parse the remainder of a ``scene`` statement: a single identifier.

    *index* points just past the ``scene`` keyword.

    Returns:
        ``(next_index, Scene)``.

    Raises:
        ParseException: the token is not an identifier.
    """
    scene_name = tokens[index]
    if scene_name.type != TT.IDENTIFIER:
        raise ParseException("expected identifier", scene_name)
    return index + 1, Scene(name=scene_name)
def parse_with(tokens, index):
    """Parse the remainder of a ``with`` statement: a function call.

    *index* points just past the ``with`` keyword.

    Returns:
        ``(next_index, With)``.
    """
    after_call, call = parse_function_call(tokens, index)
    return after_call, With(function_call=call)
def parse_say(tokens, index):
    """Parse a dialogue line: ``<speaker> "<text>"``.

    *index* points at the speaker identifier.

    Returns:
        ``(next_index, Say)``.

    Raises:
        ParseException: the speaker is not an identifier or the text is not
            a string.
    """
    speaker = tokens[index + 0]
    if speaker.type != TT.IDENTIFIER:
        # Report the offending token itself; the previous code referenced an
        # undefined name here and so raised NameError instead.
        raise ParseException("expected identifier", speaker)
    text = tokens[index + 1]
    if text.type != TT.STRING:
        raise ParseException("expected string", text)

    say = Say(
        speaker=speaker,
        text=text,
    )
    return index + 2, say
def parse_voice(tokens, index):
    """Parse the remainder of a ``voice`` statement: a single string.

    *index* points just past the ``voice`` keyword.

    Returns:
        ``(next_index, Voice)``.

    Raises:
        ParseException: the token is not a string.
    """
    audio_path = tokens[index]
    if audio_path.type != TT.STRING:
        raise ParseException("expected string", audio_path)
    return index + 1, Voice(path=audio_path)
def parse_show(tokens, index):
    """Parse the remainder of ``show <what> at <transform>``.

    *index* points just past the ``show`` keyword.

    Returns:
        ``(next_index, Show)``.

    Raises:
        ParseException: any of the three tokens has the wrong type.
    """
    what = tokens[index + 0]
    if what.type != TT.IDENTIFIER:
        # Report the offending token itself; the previous code referenced an
        # undefined name here and so raised NameError instead.
        raise ParseException("expected identifier", what)
    at = tokens[index + 1]
    if at.type != TT.AT:
        raise ParseException("expected at", at)
    transform = tokens[index + 2]
    if transform.type != TT.IDENTIFIER:
        raise ParseException("expected identifier", transform)

    show = Show(
        what=what,
        transform=transform,
    )
    return index + 3, show
def parse_menu(tokens, index):
    """Parse a ``menu:`` block.

    *index* points at the ``menu`` keyword.  The block body is every following
    token whose column is greater than the ``menu`` keyword's column; a token
    at the same column ends the block, and a token at a smaller column is an
    error.  Inside the block, ``"caption":`` starts a new option and every
    other statement is attached to the most recent option.

    Returns:
        ``(next_index, Menu)``.

    Raises:
        ParseException: malformed header, a statement before the first
            option, a caption without ':', or a dedent below the ``menu``
            column.
    """
    menu = tokens[index + 0]
    if menu.type != TT.MENU:
        raise ParseException("expected menu", menu)
    colon = tokens[index + 1]
    if colon.type != TT.COLON:
        raise ParseException("expected colon", colon)
    index += 2

    entries = []
    caption = None     # STRING token of the option being collected
    statements = None  # statements of the option being collected

    while index < len(tokens):
        token = tokens[index]
        if token.type == TT.NEWLINE:
            index += 1
            continue

        if token.position.column < menu.position.column:
            raise ParseException("invalid block dedent", token)
        if token.position.column == menu.position.column:
            break

        if token.type == TT.STRING:
            # Look ahead only here, and only if a next token exists, so a
            # caption that is the final token is reported as a parse error
            # rather than indexing past the end.
            peek = tokens[index + 1] if index + 1 < len(tokens) else None
            if peek is None or peek.type != TT.COLON:
                raise ParseException("expected colon", token if peek is None else peek)
            if caption is not None:
                entries.append((caption, statements))
            caption = token
            statements = []
            index += 2
        else:
            if statements is None:
                raise ParseException("expected menu option", token)
            index, ast = parse_one(tokens, index)
            if ast is not None:
                statements.append(ast)

    # Flush the option that was still open when the block ended.
    if caption is not None:
        entries.append((caption, statements))

    return index, Menu(entries=entries)
def parse_jump(tokens, index):
    """Parse the remainder of a ``jump`` statement: a single identifier.

    *index* points just past the ``jump`` keyword.

    Returns:
        ``(next_index, Jump)``.

    Raises:
        ParseException: the token is not an identifier.
    """
    destination = tokens[index]
    if destination.type != TT.IDENTIFIER:
        raise ParseException("expected identifier", destination)
    return index + 1, Jump(target=destination)
def parse_init(tokens, index):
    """Skip an ``init:`` block without building any AST for it.

    *index* points at the ``init`` keyword.  Every following token whose
    column is greater than the ``init`` keyword's column is discarded; a token
    at the same column ends the block, and a token at a smaller column is an
    error.

    Returns:
        ``(next_index, None)``.

    Raises:
        ParseException: malformed header, or a dedent below the ``init``
            column.
    """
    init = tokens[index + 0]
    if init.type != TT.INIT:
        raise ParseException("expected init", init)
    colon = tokens[index + 1]
    if colon.type != TT.COLON:
        # The message names the token that is actually required here.
        raise ParseException("expected colon", colon)
    index += 2

    # skip all tokens inside block
    while index < len(tokens):
        token = tokens[index]
        if token.type == TT.NEWLINE:
            index += 1
            continue
        if token.position.column < init.position.column:
            raise ParseException("invalid block dedent", token)
        if token.position.column == init.position.column:
            break
        index += 1

    return index, None
def parse_one(tokens, index):
    """Parse one statement starting at *index*.

    Leading NEWLINE tokens are skipped first.

    Returns:
        ``(next_index, ast)``; *ast* is None when only newlines remained or
        when the statement produces no node (an ``init`` block).

    Raises:
        ParseException: the first significant token cannot start a statement,
            or the statement itself is malformed.
    """
    # Blank lines carry no statement.
    while tokens[index].type == TT.NEWLINE:
        index += 1
        if index >= len(tokens):
            return index, None

    head = tokens[index]
    kind = head.type

    if kind == TT.RETURN:
        return index + 1, Return()

    # Statements whose parser expects to start *after* the leading keyword.
    after_keyword = {
        TT.IMAGE: parse_image,
        TT.DEFINE: parse_define,
        TT.LABEL: parse_label,
        TT.PLAY: parse_play,
        TT.SCENE: parse_scene,
        TT.WITH: parse_with,
        TT.VOICE: parse_voice,
        TT.SHOW: parse_show,
        TT.JUMP: parse_jump,
    }
    if kind in after_keyword:
        return after_keyword[kind](tokens, index + 1)

    # Statements whose parser expects to start *at* the leading token.
    at_head = {
        TT.IDENTIFIER: parse_say,
        TT.MENU: parse_menu,
        TT.INIT: parse_init,
    }
    if kind in at_head:
        return at_head[kind](tokens, index)

    raise ParseException("unexpected token", head)
def parse_all(tokens):
    """Yield the AST node of every statement in *tokens*, in order.

    Statements that produce no node are skipped.
    """
    cursor = 0
    while cursor < len(tokens):
        cursor, node = parse_one(tokens, cursor)
        if node is None:
            continue
        yield node
def main():
    """Parse the script named by the first command-line argument and
    pretty-print each statement.

    On a parse error the message and offending token are printed, then the
    exception is re-raised.
    """
    with open(sys.argv[1], 'rb') as source:
        data = memoryview(source.read())
    tokens = list(lex.tokenize(data))

    try:
        for node in parse_all(tokens):
            pprint(node)
    except ParseException as error:
        print(error, error.token)
        raise


if __name__ == "__main__":
    main()