import sys
from pathlib import Path
from pprint import pprint

from parse.line import next_line, skip_whitespace


def parse_label(lines):
    lines, line = next_line(lines)
    assert line[-2:] == '::', line
    name = line.removesuffix('::')
    return lines, name


# Directives whose single argument is a quoted string.
string_tokens = {"text", "cont", "para", "line"}


def parse_string(line):
    line = line.strip()
    assert line[0] == '"'
    line = line[1:]
    for i, c in enumerate(line):
        # Stop at the first closing quote that is not escaped by a
        # backslash (a literal trailing backslash would defeat this
        # simple check, but such strings do not occur here).
        if c == '"' and (i == 0 or line[i - 1] != '\\'):
            return line[:i], line[i + 1:]
    assert False, f'unterminated string: {line}'


def parse_args(line):
    return [i.strip() for i in line.split(',')]


def parse_body(lines):
    body = []
    while lines:
        lines, line = next_line(lines)
        # Terminator directives close the body.
        if line in {"text_end", "done", "prompt"}:
            body.append((line,))
            break
        kind, *rest = line.split(maxsplit=1)
        if kind in string_tokens:
            value, = rest
            string_value, rest = parse_string(value)
            assert rest == "", rest
            body.append((kind, string_value))
        elif kind == 'text_ram':
            # Argument is a WRAM address label, e.g. wStringBuffer.
            value, = rest
            assert value[0] == "w", value
            body.append((kind, value))
        elif kind == 'text_start':
            body.append((kind,))
        elif kind in {'text_decimal', 'text_bcd'}:
            value, = rest
            body.append((kind, parse_args(value)))
        else:
            assert False, line
    return lines, body


def tokenize_text(lines):
    lines, name = parse_label(lines)
    lines, body = parse_body(lines)
    return lines, (name, body)


def tokenize(lines):
    while lines:
        lines, tokens = tokenize_text(lines)
        lines = skip_whitespace(lines)
        yield tokens


def parse(path):
    with open(path) as f:
        tokens = list(tokenize(f.read().split('\n')))
    d = dict(tokens)
    # Duplicate labels would silently collapse into one dict entry;
    # catch that here.
    assert len(tokens) == len(d)
    return d


def parse_all(prefix):
    base_path = prefix / 'text'
    paths = [p for p in base_path.iterdir() if p.is_file()]
    return [parse(path) for path in paths]


if __name__ == '__main__':
    pprint(parse_all(Path(sys.argv[1])))
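
# A minimal sketch of the input format the tokenizer above expects,
# inferred purely from the grammar it implements. The label and string
# contents below are hypothetical sample data, and the exact line
# handling depends on the external parse.line helpers:
#
#     SampleText::
#         text_start
#         text "Hello, world!"
#         line "How are you?"
#         done
#
# Assuming next_line yields stripped, non-blank lines, parse() would map
# a file containing that block to:
#
#     {'SampleText': [('text_start',),
#                     ('text', 'Hello, world!'),
#                     ('line', 'How are you?'),
#                     ('done',)]}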