"""Generator for keyword.hpp.

Builds a character trie over the assembler keywords and prints, on stdout,
a C++ header whose `dsp::keyword::find` walks that trie with nested
`switch` statements.  Letters are matched case-insensitively.
"""
from pprint import pprint  # kept from the original script; the generator itself does not use it


def build_radix_tree(ops: list[str]) -> dict:
    """Build a character trie from `ops`.

    Each node is a dict mapping one character to a `(terminal, children)`
    tuple: `terminal` is the complete keyword ending at that character (or
    None when no keyword ends there) and `children` is the node for the
    following character.
    """
    root = {}
    for op in ops:
        node = root
        last = len(op) - 1
        for i, ch in enumerate(op):
            if ch not in node:
                node[ch] = (None, {})
            if i == last:
                # Mark the keyword as ending here; keep any existing children.
                node[ch] = (op, node[ch][1])
            else:
                node = node[ch][1]
    return root


def indent(i):
    """Return the whitespace for nesting level `i` (two spaces per level)."""
    return " " * (2 * i)


def print_switch(d, level=0):
    """Print the C++ `switch` that dispatches on the next character of `s`.

    `d` is a trie node as produced by `build_radix_tree`.  The emitted code
    expects `s` (the candidate string) and `ix` (the index of the next
    character to inspect) to be in scope, and recurses one `switch` deeper
    for every trie level.
    """
    p = print
    inden0 = indent(level)
    inden1 = indent(level + 1)
    p(inden0 + "switch (s[ix++]) {")
    for key, (terminal, children) in d.items():
        # Letters get an upper-case label as well; digits have a single label.
        if key.upper() != key.lower():
            p(inden0 + f"case '{key.upper()}': [[fallthrough]];")
        p(inden0 + f"case '{key.lower()}':")
        if terminal is not None:
            p(inden1 + f"if (ix == s.length()) return {{ token::type_t::_{terminal} }};")
            if children:
                # Not at the end of `s`, so reading s[ix] in the nested switch is safe.
                p(inden1 + "else {")
        elif children:
            p(inden1 + "if (ix < s.length()) {")
        if children:
            print_switch(children, level + 2)
            p(inden1 + "}")
        p(inden1 + "break;")
    p(inden0 + "}")


def print_keyword_func(root):
    """Print the complete keyword.hpp header for the trie `root`."""
    p = print
    inden1 = indent(1)
    p("#pragma once")
    p()
    p("#include <optional>")
    p("#include <string_view>")
    p()
    p('#include "token.hpp"')
    p()
    p("namespace dsp {")
    p()
    p("struct keyword {")
    p()
    p("inline static constexpr std::optional<token::type_t>")
    p("find(const std::string_view s)")
    p("{")
    p(inden1 + "if (s.length() == 0) { return {}; }")
    p()
    p(inden1 + "std::string_view::size_type ix = 0;")
    p()
    print_switch(root, level=1)
    p(inden1 + "return {};")
    p("}")
    p()
    p("};")
    p()
    p("}")


# Every keyword needs a matching `_<keyword>` enumerator in token_t::type_t.
KEYWORDS = [
    "alh",
    "all",
    "alu",
    "m0", "m1", "m2", "m3",
    "mc0", "mc1", "mc2", "mc3",
    "mul",
    "nop",
    "and",
    "or",
    "xor",
    "add",
    "sub",
    "ad2",
    "sr",
    "rr",
    "sl",
    "rl",
    "rl8",
    "clr",
    "mov",
    "mvi",
    "dma",
    "dmah",
    "jmp",
    "btm",
    "lps",
    "end",
    "endi",
    "equ",
    "org",
    "ends",
]

if __name__ == "__main__":
    # Only generate when run as a script, so importing the module prints nothing.
    print_keyword_func(build_radix_tree(KEYWORDS))
'L': [[fallthrough]]; + case 'l': + if (ix < s.length()) { + switch (s[ix++]) { + case 'H': [[fallthrough]]; + case 'h': + if (ix == s.length()) return { token::type_t::_alh }; + break; + case 'L': [[fallthrough]]; + case 'l': + if (ix == s.length()) return { token::type_t::_all }; + break; + case 'U': [[fallthrough]]; + case 'u': + if (ix == s.length()) return { token::type_t::_alu }; + break; + } + } + break; + case 'N': [[fallthrough]]; + case 'n': + if (ix < s.length()) { + switch (s[ix++]) { + case 'D': [[fallthrough]]; + case 'd': + if (ix == s.length()) return { token::type_t::_and }; + break; + } + } + break; + case 'D': [[fallthrough]]; + case 'd': + if (ix < s.length()) { + switch (s[ix++]) { + case 'D': [[fallthrough]]; + case 'd': + if (ix == s.length()) return { token::type_t::_add }; + break; + case '2': + if (ix == s.length()) return { token::type_t::_ad2 }; + break; + } + } + break; + } + } + break; + case 'M': [[fallthrough]]; + case 'm': + if (ix < s.length()) { + switch (s[ix++]) { + case '0': + if (ix == s.length()) return { token::type_t::_m0 }; + break; + case '1': + if (ix == s.length()) return { token::type_t::_m1 }; + break; + case '2': + if (ix == s.length()) return { token::type_t::_m2 }; + break; + case '3': + if (ix == s.length()) return { token::type_t::_m3 }; + break; + case 'C': [[fallthrough]]; + case 'c': + if (ix < s.length()) { + switch (s[ix++]) { + case '0': + if (ix == s.length()) return { token::type_t::_mc0 }; + break; + case '1': + if (ix == s.length()) return { token::type_t::_mc1 }; + break; + case '2': + if (ix == s.length()) return { token::type_t::_mc2 }; + break; + case '3': + if (ix == s.length()) return { token::type_t::_mc3 }; + break; + } + } + break; + case 'U': [[fallthrough]]; + case 'u': + if (ix < s.length()) { + switch (s[ix++]) { + case 'L': [[fallthrough]]; + case 'l': + if (ix == s.length()) return { token::type_t::_mul }; + break; + } + } + break; + case 'O': [[fallthrough]]; + case 'o': + if (ix < 
s.length()) { + switch (s[ix++]) { + case 'V': [[fallthrough]]; + case 'v': + if (ix == s.length()) return { token::type_t::_mov }; + break; + } + } + break; + case 'V': [[fallthrough]]; + case 'v': + if (ix < s.length()) { + switch (s[ix++]) { + case 'I': [[fallthrough]]; + case 'i': + if (ix == s.length()) return { token::type_t::_mvi }; + break; + } + } + break; + } + } + break; + case 'N': [[fallthrough]]; + case 'n': + if (ix < s.length()) { + switch (s[ix++]) { + case 'O': [[fallthrough]]; + case 'o': + if (ix < s.length()) { + switch (s[ix++]) { + case 'P': [[fallthrough]]; + case 'p': + if (ix == s.length()) return { token::type_t::_nop }; + break; + } + } + break; + } + } + break; + case 'O': [[fallthrough]]; + case 'o': + if (ix < s.length()) { + switch (s[ix++]) { + case 'R': [[fallthrough]]; + case 'r': + if (ix == s.length()) return { token::type_t::_or }; + else { + switch (s[ix++]) { + case 'G': [[fallthrough]]; + case 'g': + if (ix == s.length()) return { token::type_t::_org }; + break; + } + } + break; + } + } + break; + case 'X': [[fallthrough]]; + case 'x': + if (ix < s.length()) { + switch (s[ix++]) { + case 'O': [[fallthrough]]; + case 'o': + if (ix < s.length()) { + switch (s[ix++]) { + case 'R': [[fallthrough]]; + case 'r': + if (ix == s.length()) return { token::type_t::_xor }; + break; + } + } + break; + } + } + break; + case 'S': [[fallthrough]]; + case 's': + if (ix < s.length()) { + switch (s[ix++]) { + case 'U': [[fallthrough]]; + case 'u': + if (ix < s.length()) { + switch (s[ix++]) { + case 'B': [[fallthrough]]; + case 'b': + if (ix == s.length()) return { token::type_t::_sub }; + break; + } + } + break; + case 'R': [[fallthrough]]; + case 'r': + if (ix == s.length()) return { token::type_t::_sr }; + break; + case 'L': [[fallthrough]]; + case 'l': + if (ix == s.length()) return { token::type_t::_sl }; + break; + } + } + break; + case 'R': [[fallthrough]]; + case 'r': + if (ix < s.length()) { + switch (s[ix++]) { + case 'R': 
[[fallthrough]]; + case 'r': + if (ix == s.length()) return { token::type_t::_rr }; + break; + case 'L': [[fallthrough]]; + case 'l': + if (ix == s.length()) return { token::type_t::_rl }; + else { + switch (s[ix++]) { + case '8': + if (ix == s.length()) return { token::type_t::_rl8 }; + break; + } + } + break; + } + } + break; + case 'C': [[fallthrough]]; + case 'c': + if (ix < s.length()) { + switch (s[ix++]) { + case 'L': [[fallthrough]]; + case 'l': + if (ix < s.length()) { + switch (s[ix++]) { + case 'R': [[fallthrough]]; + case 'r': + if (ix == s.length()) return { token::type_t::_clr }; + break; + } + } + break; + } + } + break; + case 'D': [[fallthrough]]; + case 'd': + if (ix < s.length()) { + switch (s[ix++]) { + case 'M': [[fallthrough]]; + case 'm': + if (ix < s.length()) { + switch (s[ix++]) { + case 'A': [[fallthrough]]; + case 'a': + if (ix == s.length()) return { token::type_t::_dma }; + else { + switch (s[ix++]) { + case 'H': [[fallthrough]]; + case 'h': + if (ix == s.length()) return { token::type_t::_dmah }; + break; + } + } + break; + } + } + break; + } + } + break; + case 'J': [[fallthrough]]; + case 'j': + if (ix < s.length()) { + switch (s[ix++]) { + case 'M': [[fallthrough]]; + case 'm': + if (ix < s.length()) { + switch (s[ix++]) { + case 'P': [[fallthrough]]; + case 'p': + if (ix == s.length()) return { token::type_t::_jmp }; + break; + } + } + break; + } + } + break; + case 'B': [[fallthrough]]; + case 'b': + if (ix < s.length()) { + switch (s[ix++]) { + case 'T': [[fallthrough]]; + case 't': + if (ix < s.length()) { + switch (s[ix++]) { + case 'M': [[fallthrough]]; + case 'm': + if (ix == s.length()) return { token::type_t::_btm }; + break; + } + } + break; + } + } + break; + case 'L': [[fallthrough]]; + case 'l': + if (ix < s.length()) { + switch (s[ix++]) { + case 'P': [[fallthrough]]; + case 'p': + if (ix < s.length()) { + switch (s[ix++]) { + case 'S': [[fallthrough]]; + case 's': + if (ix == s.length()) return { token::type_t::_lps 
}; + break; + } + } + break; + } + } + break; + case 'E': [[fallthrough]]; + case 'e': + if (ix < s.length()) { + switch (s[ix++]) { + case 'N': [[fallthrough]]; + case 'n': + if (ix < s.length()) { + switch (s[ix++]) { + case 'D': [[fallthrough]]; + case 'd': + if (ix == s.length()) return { token::type_t::_end }; + else { + switch (s[ix++]) { + case 'I': [[fallthrough]]; + case 'i': + if (ix == s.length()) return { token::type_t::_endi }; + break; + case 'S': [[fallthrough]]; + case 's': + if (ix == s.length()) return { token::type_t::_ends }; + break; + } + } + break; + } + } + break; + case 'Q': [[fallthrough]]; + case 'q': + if (ix < s.length()) { + switch (s[ix++]) { + case 'U': [[fallthrough]]; + case 'u': + if (ix == s.length()) return { token::type_t::_equ }; + break; + } + } + break; + } + } + break; + } + return {}; +} + +}; + +} diff --git a/lexer.cpp b/lexer.cpp new file mode 100644 index 0000000..a9fe88c --- /dev/null +++ b/lexer.cpp @@ -0,0 +1,218 @@ +#include +#include +#include +#include + +#include "token.hpp" +#include "num.hpp" +#include "lexer.hpp" +#include "keyword.hpp" + +namespace dsp { + +template +constexpr static N parse_digit(const char c) +{ + switch (c) { + default: [[fallthrough]]; + case '0': return 0; + case '1': return 1; + case '2': return 2; + case '3': return 3; + case '4': return 4; + case '5': return 5; + case '6': return 6; + case '7': return 7; + case '8': return 8; + case '9': return 9; + case 'a': return 10; + case 'b': return 11; + case 'c': return 12; + case 'd': return 13; + case 'e': return 14; + case 'f': return 15; + case 'A': return 10; + case 'B': return 11; + case 'C': return 12; + case 'D': return 13; + case 'E': return 14; + case 'F': return 15; + } +} + +template +constexpr static N parse_number(const std::string_view s) +{ + N n = 0; + for (std::string_view::size_type ix = 0; ix < s.length(); ix++) { + n *= base; + n += parse_digit(s[ix]); + } + + return n; +} + +struct dec_t { + constexpr static bool 
pred(const char c) + { + return c >= '0' && c <= '9'; + } + + template + constexpr static token_t parse(const std::string_view s) + { + return parse_number(s); + } +}; + +struct hex_t { + constexpr static bool pred(const char c) + { + return dec_t::pred(c) + || (c >= 'a' && c <= 'f') + || (c >= 'A' && c <= 'F'); + } + + template + constexpr static token_t parse(const std::string_view s) + { + return parse_number(s); + } +}; + +constexpr bool alpha_p(const char c) +{ + return (c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z'); +} + +constexpr bool alpha_numeric_p(const char c) +{ + return alpha_p(c) || dec_t::pred(c) || (c == '_'); +} + +struct lexer_t { + const std::string_view source; + std::string_view::size_type start_ix; + std::string_view::size_type current_ix; + token_pos_t pos; + + lexer_t() = delete; + + constexpr lexer_t(const std::string_view source) + : source(source), start_ix(0), pos{ .line = 1, .col = 0} + { } + + bool at_end_p() + { + return current_ix >= source.length(); + } + + char peek() + { + if (at_end_p()) return '\0'; + return source[current_ix]; + } + + bool match(const char expected) + { + if (at_end_p()) return false; + else if (source[current_ix] != expected) return false; + pos.col++; + current_ix++; + return true; + } + + char advance() + { + pos.col++; + return source[current_ix++]; + } + + const std::string_view lexeme() + { + return source.substr(start_ix, current_ix); + } + + template + token _number() + { + while (T::pred(peek())) advance(); + + return {pos, token::number, lexeme(), T::parse(lexeme())}; + } + + token _identifier() + { + while (alpha_numeric_p(peek())) advance(); + std::optional keyword = keyword::find(lexeme()); + if (keyword) return {pos, *keyword, lexeme()}; + else return {pos, token::identifier, lexeme()}; + } + + token scan_token() + { + using enum token::type_t; + + start_ix = current_ix; + + const char c = advance(); + switch (c) { + case '(': return {pos, left_paren, lexeme()}; + case ')': return {pos, 
right_paren, lexeme()}; + case ',': return {pos, comma, lexeme()}; + case '.': return {pos, dot, lexeme()}; + case '+': return {pos, plus, lexeme()}; + case '-': return {pos, minus, lexeme()}; + case '*': return {pos, star, lexeme()}; + case '/': return {pos, slash, lexeme()}; + case '%': return {pos, percent, lexeme()}; + case '~': return {pos, tilde, lexeme()}; + case '&': return {pos, ampersand, lexeme()}; + case '|': return {pos, bar, lexeme()}; + case '^': return {pos, carot, lexeme()}; + case '<': + if (match('<')) return {pos, left_shift, lexeme()}; + break; + case '>': + if (match('>')) return {pos, right_shift, lexeme()}; + break; + case ';': + while (!at_end_p() && peek() != '\n') advance(); + break; + case ' ': + case '\r': + case '\t': + break; + case '\n': + pos.line++; + pos.col = 0; + break; + case '$': + if (hex_t::pred(peek())) { + start_ix += 1; + return _number(); + } + [[fallthrough]]; + case '0': + if (match('x')) { + if (hex_t::pred(peek())) { + start_ix += 2; + return _number(); + } + } + [[fallthrough]]; + default: + if (dec_t::pred(c)) { + return _number(); + } else if (alpha_p(c)) { + return _identifier(); + } else { + //error(pos.line, "Unexpected character."); + } + break; + } + } +}; + +} diff --git a/lexer.hpp b/lexer.hpp new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/lexer.hpp @@ -0,0 +1 @@ + diff --git a/main.cpp b/main.cpp new file mode 100644 index 0000000..475a1f3 --- /dev/null +++ b/main.cpp @@ -0,0 +1,64 @@ +#include +#include +#include + +#include "token.hpp" + +static bool had_error = false; + +static void report(int line, std::string where, std::string message) +{ + std::cerr << "[line " << line << "] Error" << where << ": " << message; + had_error = true; +} + +void error(int line, std::string message) +{ + report(line, "", message); +} + +static void run(std::string source) +{ + std::string_view buf {source}; + (void)buf; +} + +static void run_prompt() +{ + constexpr auto prompt = "> "; + std::string line; 
namespace dsp {

struct object_t {
};

// Source position: `line` and `col` as maintained by the lexer.
struct token_pos_t {
  int line;
  int col;
};

// One lexical token.
//
// N is the numeric type carried by `number` tokens.  `lexeme` is a
// non-owning view, so the buffer it points into must outlive the token.
template <typename N>
struct token_t {
  enum type_t {
    left_paren,
    right_paren,

    comma,
    dot,

    // operators
    plus,
    minus,
    star,
    slash,
    percent,
    tilde,
    ampersand,
    bar,
    carot,
    left_shift,
    right_shift,
    equal,

    // literals
    identifier,
    string,
    number,

    // keywords
    _alh,
    _all,
    _alu,
    _m0,
    _m1,
    _m2,
    _m3,
    _mc0,
    _mc1,
    _mc2,
    _mc3,
    _mul,
    _nop,
    _and,
    _or,
    _xor,
    _add,
    _sub,
    _ad2,
    _sr,
    _rr,
    _sl,
    _rl,
    _rl8,
    _clr,
    _mov,
    _mvi,
    _dma,
    _dmah,
    _jmp,
    _btm,
    _lps,
    _end,
    _endi,
    _equ,
    _org,
    _ends,

    eof,
  };

  // std::monostate when the token carries no value, otherwise the number.
  using literal_t = std::variant<std::monostate, N>;

  const token_pos_t pos;
  const type_t type;
  const std::string_view lexeme;
  const literal_t literal;

  token_t() = delete;

  // Token carrying a numeric literal.
  constexpr token_t(token_pos_t pos, type_t type, const std::string_view lexeme, N number)
    : pos(pos), type(type), lexeme(lexeme), literal(number)
  { }

  // Token without a literal value.
  constexpr token_t(token_pos_t pos, type_t type, const std::string_view lexeme)
    : pos(pos), type(type), lexeme(lexeme), literal()
  { }

  // Upper-case display name of `type`; "UNKNOWN" for a value that is not
  // one of the enumerators.
  static constexpr std::string_view type_name(const type_t type) noexcept
  {
    switch (type) {
    case left_paren  : return "LEFT_PAREN";
    case right_paren : return "RIGHT_PAREN";

    case comma       : return "COMMA";
    case dot         : return "DOT";

    // operators
    case plus        : return "PLUS";
    case minus       : return "MINUS";
    case star        : return "STAR";
    case slash       : return "SLASH";
    case percent     : return "PERCENT";
    case tilde       : return "TILDE";
    case ampersand   : return "AMPERSAND";
    case bar         : return "BAR";
    case carot       : return "CAROT";
    case left_shift  : return "LEFT_SHIFT";
    case right_shift : return "RIGHT_SHIFT";
    case equal       : return "EQUAL";

    // literals
    case identifier  : return "IDENTIFIER";
    case string      : return "STRING";
    case number      : return "NUMBER";

    // keywords
    case _alh        : return "ALH";
    case _all        : return "ALL";
    case _alu        : return "ALU";
    case _m0         : return "M0";
    case _m1         : return "M1";
    case _m2         : return "M2";
    case _m3         : return "M3";
    case _mc0        : return "MC0";
    case _mc1        : return "MC1";
    case _mc2        : return "MC2";
    case _mc3        : return "MC3";
    case _mul        : return "MUL";
    case _nop        : return "NOP";
    case _and        : return "AND";
    case _or         : return "OR";
    case _xor        : return "XOR";
    case _add        : return "ADD";
    case _sub        : return "SUB";
    case _ad2        : return "AD2";
    case _sr         : return "SR";
    case _rr         : return "RR";
    case _sl         : return "SL";
    case _rl         : return "RL";
    case _rl8        : return "RL8";
    case _clr        : return "CLR";
    case _mov        : return "MOV";
    case _mvi        : return "MVI";
    case _dma        : return "DMA";
    case _dmah       : return "DMAH";
    case _jmp        : return "JMP";
    case _btm        : return "BTM";
    case _lps        : return "LPS";
    case _end        : return "END";
    case _endi       : return "ENDI";
    case _equ        : return "EQU";
    case _org        : return "ORG";
    case _ends       : return "ENDS";

    case eof         : return "EOF";
    }
    // No `default:` above so the compiler can still warn when a new
    // enumerator is added without a name.
    return "UNKNOWN";
  }

  // Writes the display name of `type`.
  friend std::ostream& operator<<(std::ostream& os, const type_t type)
  {
    return os << type_name(type);
  }

  // Writes "<TYPE> <lexeme>", followed by "/<value>" when the token carries
  // a numeric literal.
  friend std::ostream& operator<<(std::ostream& os, const token_t& tok)
  {
    os << tok.type << ' ' << tok.lexeme;

    if (const auto* value = std::get_if<N>(&tok.literal)) {
      os << '/' << *value;
    }

    return os;
  }

};

}