initial

2025-02-23 15:55:42 -06:00 · 2025-02-23 15:55:42 -06:00 · 9ee311b464
commit 9ee311b464
3 changed files with 259 additions and 0 deletions
--- a/lexer.c
+++ b/lexer.c
@ -0,0 +1,127 @@
 #include "lexer.h"
 // Reports whether c is one of the four blank characters (space, tab, CR, LF)
 // that the lexer skips between tokens.
 static inline bool is_whitespace(uint8_t c)
 {
  switch (c) {
  case ' ':
  case '\t':
  case '\r':
  case '\n':
    return true;
  default:
    return false;
  }
 }
 // Reports whether c may begin an identifier: an ASCII letter of either case
 // or an underscore.
 static inline bool is_identifier_start(uint8_t c)
 {
  if (c >= 'a' && c <= 'z')
    return true;
  if (c >= 'A' && c <= 'Z')
    return true;
  return c == '_';
 }
 // Reports whether c is an ASCII decimal digit, '0' through '9'.
 static inline bool is_decimal_digit(uint8_t c)
 {
  if (c < '0')
    return false;
  return c <= '9';
 }
 // One row of the reserved-word table: a keyword's spelling and the token
 // type the lexer reports when an identifier-shaped lexeme matches it.
 struct keyword_desc {
  // Keyword spelling; keyword_equal compares it byte-by-byte.
  const uint8_t * buf;
  // Number of bytes of buf that make up the keyword.
  int length;
  // Token type returned by find_keyword on a match.
  enum token_type token_type;
 };
 // Builds one table row. The length is taken from the literal itself (minus
 // the terminating NUL) so spelling and length cannot drift apart.
 #define KEYWORD_ENTRY(text, type) \
  { .buf = (const uint8_t *)(text), .length = (int)(sizeof (text) - 1), .token_type = (type) }
 // Reserved words recognised by find_keyword; any other identifier-shaped
 // lexeme is reported as TOKEN_IDENTIFIER.
 static const struct keyword_desc keywords[] = {
  KEYWORD_ENTRY("int", TOKEN_INT),
  KEYWORD_ENTRY("void", TOKEN_VOID),
  KEYWORD_ENTRY("return", TOKEN_RETURN),
 };
 #undef KEYWORD_ENTRY
 // Reports whether the lexeme buf[start, end) is spelled exactly like keyword:
 // same length and identical bytes.
 static inline bool keyword_equal(const uint8_t * buf, int start, int end, const struct keyword_desc * keyword)
 {
  // A length mismatch rules the keyword out before any byte is inspected.
  if (end - start != keyword->length)
    return false;
  for (int k = 0; k < keyword->length; k++) {
    if (buf[start + k] != keyword->buf[k])
      return false;
  }
  return true;
 }
 static inline enum token_type find_keyword(const uint8_t * buf, int start, int end)
 {
  for (unsigned int i = 0; i < (sizeof (keywords)) / (sizeof (keywords[0])); i++) {
    if (keyword_equal(buf, start, end, &keywords[i])) {
      return keywords[i].token_type;
    }
  }
  return TOKEN_IDENTIFIER;
 }
 struct token lexer_next_token(struct lexer_state * state)
 {
  struct token token;
  while (state->offset < state->size && is_whitespace(state->buf[state->offset])) {
    state->offset += 1;
  }
  token.start = state->offset;
  uint8_t c = state->buf[state->offset++];
  switch (c) {
  case '(':
    token.type = TOKEN_LPAREN; break;
  case ')':
    token.type = TOKEN_RPAREN; break;
  case '{':
    token.type = TOKEN_LBRACE; break;
  case '}':
    token.type = TOKEN_RBRACE; break;
  case ';':
    token.type = TOKEN_SEMICOLON; break;
  default:
    if (is_identifier_start(c)) {
      while (state->offset < state->size) {
        uint8_t c = state->buf[state->offset];
        if (!(is_identifier_start(c) || is_decimal_digit(c)))
          break;
        state->offset += 1;
      }
      token.type = find_keyword(state->buf, token.start, state->offset);
    } else if (is_decimal_digit(c)) {
      while (state->offset < state->size) {
        uint8_t c = state->buf[state->offset];
        if (!(is_decimal_digit(c))) {
          if (is_identifier_start(c))
            token.type = TOKEN_INVALID;
          else {
            token.type = TOKEN_CONSTANT;
          }
          break;
        }
        state->offset += 1;
      }
    } else {
      token.type = TOKEN_INVALID;
    }
    break;
  }
  token.end = state->offset;
  if (token.start >= state->size)
    token.type = TOKEN_EOF;
  return token;
 }
--- a/lexer.h
+++ b/lexer.h
@ -0,0 +1,33 @@
 #pragma once
 #include <stdbool.h>
 #include <stdint.h>
 // Kinds of token produced by lexer_next_token. The enumerator values are also
 // used as array indices by the token_str name table in main_hosted.c.
 enum token_type {
  // Unrecognised byte, or a digit run directly followed by an identifier character.
  TOKEN_INVALID,
  // No more input after skipping whitespace.
  TOKEN_EOF,
  // Identifier-shaped lexeme that is not a reserved word.
  TOKEN_IDENTIFIER,
  // Run of decimal digits.
  TOKEN_CONSTANT,
  // Reserved words "int", "void" and "return".
  TOKEN_INT,
  TOKEN_VOID,
  TOKEN_RETURN,
  // Punctuation: ( ) { } ;
  TOKEN_LPAREN,
  TOKEN_RPAREN,
  TOKEN_LBRACE,
  TOKEN_RBRACE,
  TOKEN_SEMICOLON,
 };
 // A single lexed token, identified by its type and its position in the
 // lexer's input buffer.
 struct token {
  enum token_type type;
  // Offset in lexer_state.buf of the token's first byte (after whitespace is skipped).
  int start;
  // Value of lexer_state.offset once the token has been consumed, i.e. one
  // past the token's last byte.
  int end;
  // The lexer does not compute anything into this field yet; presumably
  // meant for the numeric value of a TOKEN_CONSTANT — TODO confirm.
  int value;
 };
 // Cursor over an input buffer; lexer_next_token reads from it and advances it.
 struct lexer_state {
  // Input bytes being lexed. The lexer only reads through this pointer.
  const uint8_t * buf;
  // Index into buf of the next byte to examine.
  int offset;
  // Number of bytes in buf.
  int size;
 };
 // Skips whitespace at state->offset, scans one token from state->buf and
 // advances state->offset past it. Returns TOKEN_EOF once the input is
 // exhausted and TOKEN_INVALID for input it cannot classify.
 struct token lexer_next_token(struct lexer_state * state);
--- a/main_hosted.c
+++ b/main_hosted.c
@ -0,0 +1,99 @@
 #include <stdio.h>
 #include <string.h>
 #include <errno.h>
 #include <stdlib.h>
 #include "lexer.h"
 int read_file(const char * filename, uint8_t ** buf)
 {
  FILE * file = fopen(filename, "rb");
  if (file == NULL) {
    fprintf(stderr, "fopen(\"%s\", \"rb\"): %s\n", filename, strerror(errno));
    return -1;
  }
  int ret;
  ret = fseek(file, 0L, SEEK_END);
  if (ret < 0) {
    fprintf(stderr, "fseek(SEEK_END)");
    return -1;
  }
  int offset = ftell(file);
  if (offset < 0) {
    fprintf(stderr, "ftell");
    return -1;
  }
  int size = offset;
  ret = fseek(file, 0L, SEEK_SET);
  if (ret < 0) {
    fprintf(stderr, "fseek(SEEK_SET)");
    return -1;
  }
  fprintf(stderr, "read_file: %s size %d\n", filename, size);
  *buf = (uint8_t *)malloc(size);
  int fread_size = fread(*buf, 1, size, file);
  if (fread_size != size) {
    fprintf(stderr, "fread `%s` short read: %d ; expected: %d\n", filename, fread_size, size);
    return -1;
  }
  ret = fclose(file);
  if (ret < 0) {
    fprintf(stderr, "fclose");
    return -1;
  }
  return size;
 }
 // Printable name of each token type, indexed by the enum token_type value.
 // main uses it to print one line per token.
 const char * token_str[] = {
  [TOKEN_INVALID] = "TOKEN_INVALID",
  [TOKEN_EOF] = "TOKEN_EOF",
  [TOKEN_IDENTIFIER] = "TOKEN_IDENTIFIER",
  [TOKEN_CONSTANT] = "TOKEN_CONSTANT",
  [TOKEN_INT] = "TOKEN_INT",
  [TOKEN_VOID] = "TOKEN_VOID",
  [TOKEN_RETURN] = "TOKEN_RETURN",
  [TOKEN_LPAREN] = "TOKEN_LPAREN",
  [TOKEN_RPAREN] = "TOKEN_RPAREN",
  [TOKEN_LBRACE] = "TOKEN_LBRACE",
  [TOKEN_RBRACE] = "TOKEN_RBRACE",
  [TOKEN_SEMICOLON] = "TOKEN_SEMICOLON",
 };
 int main(int argc, char * argv[])
 {
  if (argc != 3) {
    fprintf(stderr, "argc != 3 %d %s\n", argc, argv[1]);
    return EXIT_FAILURE;
  }
  // --lex
  // --parse
  // --codegen
  uint8_t * buf;
  int size = read_file(argv[2], &buf);
  if (size < 0) {
    return EXIT_FAILURE;
  }
  struct lexer_state lexer_state;
  lexer_state.buf = buf;
  lexer_state.offset = 0;
  lexer_state.size = size;
  while (true) {
    struct token token = lexer_next_token(&lexer_state);
    printf("%s\n", token_str[token.type]);
    if (token.type == TOKEN_INVALID)
      return EXIT_FAILURE;
    if (token.type == TOKEN_EOF)
      break;
  }
  return EXIT_SUCCESS;
 }