From 9ee311b464264a62699b01fa5145d9bc5c23bdc7 Mon Sep 17 00:00:00 2001
From: Zack Buhman
Date: Sun, 23 Feb 2025 15:55:42 -0600
Subject: [PATCH] initial

---
 lexer.c       | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++
 lexer.h       |  33 +++++++++++++
 main_hosted.c |  99 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 259 insertions(+)
 create mode 100644 lexer.c
 create mode 100644 lexer.h
 create mode 100644 main_hosted.c

diff --git a/lexer.c b/lexer.c
new file mode 100644
index 0000000..2fe7774
--- /dev/null
+++ b/lexer.c
@@ -0,0 +1,127 @@
+#include "lexer.h"
+
+static inline bool is_whitespace(uint8_t c)
+{
+  return c == ' ' || c == '\t' || c == '\r' || c == '\n';
+}
+
+static inline bool is_identifier_start(uint8_t c)
+{
+  return
+    (c >= 'a' && c <= 'z')
+    || (c >= 'A' && c <= 'Z')
+    || (c == '_');
+}
+
+static inline bool is_decimal_digit(uint8_t c)
+{
+  return (c >= '0' && c <= '9');
+}
+
+struct keyword_desc {
+  const uint8_t * buf;
+  int length;
+  enum token_type token_type;
+};
+
+static const struct keyword_desc keywords[] = {
+  {
+    .buf = (const uint8_t *)"int",
+    .length = 3,
+    .token_type = TOKEN_INT,
+  },
+  {
+    .buf = (const uint8_t *)"void",
+    .length = 4,
+    .token_type = TOKEN_VOID,
+  },
+  {
+    .buf = (const uint8_t *)"return",
+    .length = 6,
+    .token_type = TOKEN_RETURN,
+  },
+};
+
+static inline bool keyword_equal(const uint8_t * buf, int start, int end, const struct keyword_desc * keyword)
+{
+  int length = end - start;
+  if (length != keyword->length)
+    return false;
+
+  int i = start;
+  int j = 0;
+  while (i < end) {
+    if (buf[i++] != keyword->buf[j++])
+      return false;
+  }
+  return true;
+}
+
+static inline enum token_type find_keyword(const uint8_t * buf, int start, int end)
+{
+  for (unsigned int i = 0; i < (sizeof (keywords)) / (sizeof (keywords[0])); i++) {
+    if (keyword_equal(buf, start, end, &keywords[i])) {
+      return keywords[i].token_type;
+    }
+  }
+  return TOKEN_IDENTIFIER;
+}
+
+struct token lexer_next_token(struct lexer_state * state)
+{
+  struct token token;
+
+  while (state->offset < state->size && is_whitespace(state->buf[state->offset])) {
+    state->offset += 1;
+  }
+
+  token.start = state->offset;
+
+  uint8_t c = state->buf[state->offset++];
+
+  switch (c) {
+  case '(':
+    token.type = TOKEN_LPAREN; break;
+  case ')':
+    token.type = TOKEN_RPAREN; break;
+  case '{':
+    token.type = TOKEN_LBRACE; break;
+  case '}':
+    token.type = TOKEN_RBRACE; break;
+  case ';':
+    token.type = TOKEN_SEMICOLON; break;
+  default:
+    if (is_identifier_start(c)) {
+      while (state->offset < state->size) {
+        uint8_t c = state->buf[state->offset];
+        if (!(is_identifier_start(c) || is_decimal_digit(c)))
+          break;
+        state->offset += 1;
+      }
+      token.type = find_keyword(state->buf, token.start, state->offset);
+    } else if (is_decimal_digit(c)) {
+      while (state->offset < state->size) {
+        uint8_t c = state->buf[state->offset];
+        if (!(is_decimal_digit(c))) {
+          if (is_identifier_start(c))
+            token.type = TOKEN_INVALID;
+          else {
+            token.type = TOKEN_CONSTANT;
+          }
+          break;
+        }
+        state->offset += 1;
+      }
+    } else {
+      token.type = TOKEN_INVALID;
+    }
+    break;
+  }
+
+  token.end = state->offset;
+
+  if (token.start >= state->size)
+    token.type = TOKEN_EOF;
+
+  return token;
+}
diff --git a/lexer.h b/lexer.h
new file mode 100644
index 0000000..d7cd6b3
--- /dev/null
+++ b/lexer.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <stdint.h>
+
+enum token_type {
+  TOKEN_INVALID,
+  TOKEN_EOF,
+  TOKEN_IDENTIFIER,
+  TOKEN_CONSTANT,
+  TOKEN_INT,
+  TOKEN_VOID,
+  TOKEN_RETURN,
+  TOKEN_LPAREN,
+  TOKEN_RPAREN,
+  TOKEN_LBRACE,
+  TOKEN_RBRACE,
+  TOKEN_SEMICOLON,
+};
+
+struct token {
+  enum token_type type;
+  int start;
+  int end;
+  int value;
+};
+
+struct lexer_state {
+  const uint8_t * buf;
+  int offset;
+  int size;
+};
+
+struct token lexer_next_token(struct lexer_state * state);
diff --git a/main_hosted.c b/main_hosted.c
new file mode 100644
index 0000000..47c9b95
--- /dev/null
+++ b/main_hosted.c
@@ -0,0 +1,99 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "lexer.h"
+
+int read_file(const char * filename, uint8_t ** buf)
+{
+  FILE * file = fopen(filename, "rb");
+  if (file == NULL) {
+    fprintf(stderr, "fopen(\"%s\", \"rb\"): %s\n", filename, strerror(errno));
+    return -1;
+  }
+
+  int ret;
+  ret = fseek(file, 0L, SEEK_END);
+  if (ret < 0) {
+    fprintf(stderr, "fseek(SEEK_END)");
+    return -1;
+  }
+
+  int offset = ftell(file);
+  if (offset < 0) {
+    fprintf(stderr, "ftell");
+    return -1;
+  }
+  int size = offset;
+
+  ret = fseek(file, 0L, SEEK_SET);
+  if (ret < 0) {
+    fprintf(stderr, "fseek(SEEK_SET)");
+    return -1;
+  }
+
+  fprintf(stderr, "read_file: %s size %d\n", filename, size);
+  *buf = (uint8_t *)malloc(size);
+  int fread_size = fread(*buf, 1, size, file);
+  if (fread_size != size) {
+    fprintf(stderr, "fread `%s` short read: %d ; expected: %d\n", filename, fread_size, size);
+    return -1;
+  }
+
+  ret = fclose(file);
+  if (ret < 0) {
+    fprintf(stderr, "fclose");
+    return -1;
+  }
+
+  return size;
+}
+
+const char * token_str[] = {
+  [TOKEN_INVALID] = "TOKEN_INVALID",
+  [TOKEN_EOF] = "TOKEN_EOF",
+  [TOKEN_IDENTIFIER] = "TOKEN_IDENTIFIER",
+  [TOKEN_CONSTANT] = "TOKEN_CONSTANT",
+  [TOKEN_INT] = "TOKEN_INT",
+  [TOKEN_VOID] = "TOKEN_VOID",
+  [TOKEN_RETURN] = "TOKEN_RETURN",
+  [TOKEN_LPAREN] = "TOKEN_LPAREN",
+  [TOKEN_RPAREN] = "TOKEN_RPAREN",
+  [TOKEN_LBRACE] = "TOKEN_LBRACE",
+  [TOKEN_RBRACE] = "TOKEN_RBRACE",
+  [TOKEN_SEMICOLON] = "TOKEN_SEMICOLON",
+};
+
+int main(int argc, char * argv[])
+{
+  if (argc != 3) {
+    fprintf(stderr, "argc != 3 %d %s\n", argc, argv[1]);
+    return EXIT_FAILURE;
+  }
+
+  // --lex
+  // --parse
+  // --codegen
+
+  uint8_t * buf;
+  int size = read_file(argv[2], &buf);
+  if (size < 0) {
+    return EXIT_FAILURE;
+  }
+
+  struct lexer_state lexer_state;
+  lexer_state.buf = buf;
+  lexer_state.offset = 0;
+  lexer_state.size = size;
+
+  while (true) {
+    struct token token = lexer_next_token(&lexer_state);
+    printf("%s\n", token_str[token.type]);
+    if (token.type == TOKEN_INVALID)
+      return EXIT_FAILURE;
+    if (token.type == TOKEN_EOF)
+      break;
+  }
+  return EXIT_SUCCESS;
+}
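
For reference, a minimal standalone driver sketch, not part of the patch above: it feeds lexer_next_token() an in-memory buffer instead of a file loaded through read_file() in main_hosted.c, and prints each token's lexeme. The sample input string is hypothetical; the commented token sequence follows from the rules in lexer.c.

/* Sketch only: drive the lexer over an in-memory buffer.
 * The input string below is an illustrative example, not from the patch. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#include "lexer.h"

int main(void)
{
  const char * src = "int main(void) { return 2; }";

  struct lexer_state state;
  state.buf = (const uint8_t *)src;
  state.offset = 0;
  state.size = (int)strlen(src);

  /* Per lexer.c, this input yields:
   * TOKEN_INT TOKEN_IDENTIFIER TOKEN_LPAREN TOKEN_VOID TOKEN_RPAREN
   * TOKEN_LBRACE TOKEN_RETURN TOKEN_CONSTANT TOKEN_SEMICOLON TOKEN_RBRACE
   * and finally TOKEN_EOF, where the loop stops. */
  for (;;) {
    struct token token = lexer_next_token(&state);
    if (token.type == TOKEN_EOF || token.type == TOKEN_INVALID)
      break;
    /* token.start and token.end index into the original buffer. */
    printf("%d \"%.*s\"\n", token.type,
           token.end - token.start, src + token.start);
  }
  return 0;
}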