From 9ee311b464264a62699b01fa5145d9bc5c23bdc7 Mon Sep 17 00:00:00 2001
From: Zack Buhman
Date: Sun, 23 Feb 2025 15:55:42 -0600
Subject: [PATCH] initial

---
 lexer.c       | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++
 lexer.h       |  33 +++++++++++++
 main_hosted.c |  99 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 259 insertions(+)
 create mode 100644 lexer.c
 create mode 100644 lexer.h
 create mode 100644 main_hosted.c

diff --git a/lexer.c b/lexer.c
new file mode 100644
index 0000000..2fe7774
--- /dev/null
+++ b/lexer.c
@@ -0,0 +1,127 @@
+#include "lexer.h"
+
+static inline bool is_whitespace(uint8_t c)
+{
+  return c == ' ' || c == '\t' || c == '\r' || c == '\n';
+}
+
+static inline bool is_identifier_start(uint8_t c)
+{
+  return
+    (c >= 'a' && c <= 'z')
+    || (c >= 'A' && c <= 'Z')
+    || (c == '_');
+}
+
+static inline bool is_decimal_digit(uint8_t c)
+{
+  return (c >= '0' && c <= '9');
+}
+
+struct keyword_desc {
+  const uint8_t * buf;
+  int length;
+  enum token_type token_type;
+};
+
+static const struct keyword_desc keywords[] = {
+  {
+    .buf = (const uint8_t *)"int",
+    .length = 3,
+    .token_type = TOKEN_INT,
+  },
+  {
+    .buf = (const uint8_t *)"void",
+    .length = 4,
+    .token_type = TOKEN_VOID,
+  },
+  {
+    .buf = (const uint8_t *)"return",
+    .length = 6,
+    .token_type = TOKEN_RETURN,
+  },
+};
+
+static inline bool keyword_equal(const uint8_t * buf, int start, int end, const struct keyword_desc * keyword)
+{
+  int length = end - start;
+  if (length != keyword->length)
+    return false;
+
+  int i = start;
+  int j = 0;
+  while (i < end) {
+    if (buf[i++] != keyword->buf[j++])
+      return false;
+  }
+  return true;
+}
+
+static inline enum token_type find_keyword(const uint8_t * buf, int start, int end)
+{
+  for (unsigned int i = 0; i < (sizeof (keywords)) / (sizeof (keywords[0])); i++) {
+    if (keyword_equal(buf, start, end, &keywords[i])) {
+      return keywords[i].token_type;
+    }
+  }
+  return TOKEN_IDENTIFIER;
+}
+
+struct token lexer_next_token(struct lexer_state * state)
+{
+  struct token token;
+
+  while (state->offset < state->size && is_whitespace(state->buf[state->offset])) {
+    state->offset += 1;
+  }
+
+  token.start = state->offset;
+
+  uint8_t c = state->buf[state->offset++];
+
+  switch (c) {
+  case '(':
+    token.type = TOKEN_LPAREN; break;
+  case ')':
+    token.type = TOKEN_RPAREN; break;
+  case '{':
+    token.type = TOKEN_LBRACE; break;
+  case '}':
+    token.type = TOKEN_RBRACE; break;
+  case ';':
+    token.type = TOKEN_SEMICOLON; break;
+  default:
+    if (is_identifier_start(c)) {
+      while (state->offset < state->size) {
+        uint8_t c = state->buf[state->offset];
+        if (!(is_identifier_start(c) || is_decimal_digit(c)))
+          break;
+        state->offset += 1;
+      }
+      token.type = find_keyword(state->buf, token.start, state->offset);
+    } else if (is_decimal_digit(c)) {
+      while (state->offset < state->size) {
+        uint8_t c = state->buf[state->offset];
+        if (!(is_decimal_digit(c))) {
+          if (is_identifier_start(c))
+            token.type = TOKEN_INVALID;
+          else {
+            token.type = TOKEN_CONSTANT;
+          }
+          break;
+        }
+        state->offset += 1;
+      }
+    } else {
+      token.type = TOKEN_INVALID;
+    }
+    break;
+  }
+
+  token.end = state->offset;
+
+  if (token.start >= state->size)
+    token.type = TOKEN_EOF;
+
+  return token;
+}
diff --git a/lexer.h b/lexer.h
new file mode 100644
index 0000000..d7cd6b3
--- /dev/null
+++ b/lexer.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <stdint.h>
+
+enum token_type {
+  TOKEN_INVALID,
+  TOKEN_EOF,
+  TOKEN_IDENTIFIER,
+  TOKEN_CONSTANT,
+  TOKEN_INT,
+  TOKEN_VOID,
+  TOKEN_RETURN,
+  TOKEN_LPAREN,
+  TOKEN_RPAREN,
+  TOKEN_LBRACE,
+  TOKEN_RBRACE,
+  TOKEN_SEMICOLON,
+};
+
+struct token {
+  enum token_type type;
+  int start;
+  int end;
+  int value;
+};
+
+struct lexer_state {
+  const uint8_t * buf;
+  int offset;
+  int size;
+};
+
+struct token lexer_next_token(struct lexer_state * state);
diff --git a/main_hosted.c b/main_hosted.c
new file mode 100644
index 0000000..47c9b95
--- /dev/null
+++ b/main_hosted.c
@@ -0,0 +1,99 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "lexer.h"
+
+int read_file(const char * filename, uint8_t ** buf)
+{
+  FILE * file = fopen(filename, "rb");
+  if (file == NULL) {
+    fprintf(stderr, "fopen(\"%s\", \"rb\"): %s\n", filename, strerror(errno));
+    return -1;
+  }
+
+  int ret;
+  ret = fseek(file, 0L, SEEK_END);
+  if (ret < 0) {
+    fprintf(stderr, "fseek(SEEK_END)");
+    return -1;
+  }
+
+  int offset = ftell(file);
+  if (offset < 0) {
+    fprintf(stderr, "ftell");
+    return -1;
+  }
+  int size = offset;
+
+  ret = fseek(file, 0L, SEEK_SET);
+  if (ret < 0) {
+    fprintf(stderr, "fseek(SEEK_SET)");
+    return -1;
+  }
+
+  fprintf(stderr, "read_file: %s size %d\n", filename, size);
+  *buf = (uint8_t *)malloc(size);
+  int fread_size = fread(*buf, 1, size, file);
+  if (fread_size != size) {
+    fprintf(stderr, "fread `%s` short read: %d ; expected: %d\n", filename, fread_size, size);
+    return -1;
+  }
+
+  ret = fclose(file);
+  if (ret < 0) {
+    fprintf(stderr, "fclose");
+    return -1;
+  }
+
+  return size;
+}
+
+const char * token_str[] = {
+  [TOKEN_INVALID] = "TOKEN_INVALID",
+  [TOKEN_EOF] = "TOKEN_EOF",
+  [TOKEN_IDENTIFIER] = "TOKEN_IDENTIFIER",
+  [TOKEN_CONSTANT] = "TOKEN_CONSTANT",
+  [TOKEN_INT] = "TOKEN_INT",
+  [TOKEN_VOID] = "TOKEN_VOID",
+  [TOKEN_RETURN] = "TOKEN_RETURN",
+  [TOKEN_LPAREN] = "TOKEN_LPAREN",
+  [TOKEN_RPAREN] = "TOKEN_RPAREN",
+  [TOKEN_LBRACE] = "TOKEN_LBRACE",
+  [TOKEN_RBRACE] = "TOKEN_RBRACE",
+  [TOKEN_SEMICOLON] = "TOKEN_SEMICOLON",
+};
+
+int main(int argc, char * argv[])
+{
+  if (argc != 3) {
+    fprintf(stderr, "argc != 3 %d %s\n", argc, argv[1]);
+    return EXIT_FAILURE;
+  }
+
+  // --lex
+  // --parse
+  // --codegen
+
+  uint8_t * buf;
+  int size = read_file(argv[2], &buf);
+  if (size < 0) {
+    return EXIT_FAILURE;
+  }
+
+  struct lexer_state lexer_state;
+  lexer_state.buf = buf;
+  lexer_state.offset = 0;
+  lexer_state.size = size;
+
+  while (true) {
+    struct token token = lexer_next_token(&lexer_state);
+    printf("%s\n", token_str[token.type]);
+    if (token.type == TOKEN_INVALID)
+      return EXIT_FAILURE;
+    if (token.type == TOKEN_EOF)
+      break;
+  }
+  return EXIT_SUCCESS;
+}
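
For reference, a minimal standalone driver sketch, not part of the patch above: it feeds lexer_next_token() an in-memory buffer instead of a file loaded through read_file() in main_hosted.c, and prints each token's lexeme. The sample input string is hypothetical; the commented token sequence follows from the rules in lexer.c.

/* Sketch only: drive the lexer over an in-memory buffer.
 * The input string below is an illustrative example, not from the patch. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#include "lexer.h"

int main(void)
{
  const char * src = "int main(void) { return 2; }";

  struct lexer_state state;
  state.buf = (const uint8_t *)src;
  state.offset = 0;
  state.size = (int)strlen(src);

  /* Per lexer.c, this input yields:
   * TOKEN_INT TOKEN_IDENTIFIER TOKEN_LPAREN TOKEN_VOID TOKEN_RPAREN
   * TOKEN_LBRACE TOKEN_RETURN TOKEN_CONSTANT TOKEN_SEMICOLON TOKEN_RBRACE
   * and finally TOKEN_EOF, where the loop stops. */
  for (;;) {
    struct token token = lexer_next_token(&state);
    if (token.type == TOKEN_EOF || token.type == TOKEN_INVALID)
      break;
    /* token.start and token.end index into the original buffer. */
    printf("%d \"%.*s\"\n", token.type,
           token.end - token.start, src + token.start);
  }
  return 0;
}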