/* compiler/lexer.c */

#include "lexer.h"

/* Returns true when c is a space, tab, carriage return or line feed. */
static inline bool is_whitespace(uint8_t c)
{
  switch (c) {
  case ' ':
  case '\t':
  case '\r':
  case '\n':
    return true;
  default:
    return false;
  }
}

/*
 * Returns true when c may begin an identifier: an ASCII letter
 * ('a'-'z', 'A'-'Z') or an underscore.
 */
static inline bool is_identifier_start(uint8_t c)
{
  if (c == '_') {
    return true;
  }
  if (c >= 'A' && c <= 'Z') {
    return true;
  }
  return c >= 'a' && c <= 'z';
}

/* Returns true when c is one of the ASCII digits '0' through '9'. */
static inline bool is_decimal_digit(uint8_t c)
{
  if (c < '0') {
    return false;
  }
  return c <= '9';
}

/* One entry of the keyword table: the keyword's spelling and its token type. */
struct keyword_desc {
  const uint8_t * buf;          /* keyword spelling as bytes */
  int length;                   /* number of bytes of buf compared by keyword_equal */
  enum token_type token_type;   /* value returned by find_keyword on a match */
};

/*
 * Table of reserved words. find_keyword scans it in order and returns the
 * token_type of the first entry whose spelling matches exactly.
 */
static const struct keyword_desc keywords[] = {
  {
    .buf = (const uint8_t *)"int",
    .length = 3,
    .token_type = TOKEN_INT,
  },
  {
    .buf = (const uint8_t *)"void",
    .length = 4,
    .token_type = TOKEN_VOID,
  },
  {
    .buf = (const uint8_t *)"return",
    .length = 6,
    .token_type = TOKEN_RETURN,
  },
  {
    .buf = (const uint8_t *)"if",
    .length = 2,
    /* NOTE(review): "if" is reported as TOKEN_RETURN, which looks like a
       copy-paste error; presumably a dedicated token type is intended.
       Confirm against enum token_type in lexer.h. */
    .token_type = TOKEN_RETURN,
  },
  {
    .buf = (const uint8_t *)"else",
    .length = 4,
    /* NOTE(review): "else" is also reported as TOKEN_RETURN; same concern
       as the "if" entry. */
    .token_type = TOKEN_RETURN,
  },
};

/*
 * Compares the bytes buf[start] .. buf[end - 1] against a keyword's spelling.
 *
 * Returns true only when the span has exactly keyword->length bytes and
 * every byte equals the corresponding byte of keyword->buf.
 */
static inline bool keyword_equal(const uint8_t * buf, int start, int end, const struct keyword_desc * keyword)
{
  if (end - start != keyword->length) {
    return false;
  }

  /* The lengths agree, so one index walks both sequences in lockstep. */
  for (int k = 0; start + k < end; k++) {
    if (buf[start + k] != keyword->buf[k]) {
      return false;
    }
  }
  return true;
}

static inline enum token_type find_keyword(const uint8_t * buf, int start, int end)
{
  for (unsigned int i = 0; i < (sizeof (keywords)) / (sizeof (keywords[0])); i++) {
    if (keyword_equal(buf, start, end, &keywords[i])) {
      return keywords[i].token_type;
    }
  }
  return TOKEN_IDENTIFIER;
}

/*
 * Scans the next token from state->buf, beginning at state->offset, and
 * advances state->offset to the first byte after the token.
 *
 * Leading whitespace is skipped first. The returned token's start/end
 * pointers delimit the token's bytes inside state->buf (end is exclusive).
 *
 * Token types produced:
 *   - TOKEN_EOF        no bytes remain after skipping whitespace; start and
 *                      end both point at the current offset and the offset
 *                      is not advanced, so repeated calls keep returning EOF.
 *   - TOKEN_LPAREN, TOKEN_RPAREN, TOKEN_LBRACE, TOKEN_RBRACE, TOKEN_SEMICOLON
 *                      the corresponding single punctuation byte.
 *   - keyword type or TOKEN_IDENTIFIER
 *                      an identifier-start byte followed by identifier-start
 *                      bytes or decimal digits, classified by find_keyword.
 *   - TOKEN_CONSTANT   a run of decimal digits.
 *   - TOKEN_INVALID    a digit run immediately followed by an identifier-start
 *                      byte, or any other unrecognised byte.
 */
struct token lexer_next_token(struct lexer_state * state)
{
  struct token token;
  int token_start;
  int token_end;

  while (state->offset < state->size && is_whitespace(state->buf[state->offset])) {
    state->offset += 1;
  }

  token_start = state->offset;

  /* Check for end of input BEFORE reading a byte: reading buf[offset] here
     would be out of bounds, and advancing offset would push it past size. */
  if (token_start >= state->size) {
    token.type = TOKEN_EOF;
    token.start = &state->buf[token_start];
    token.end = &state->buf[token_start];
    return token;
  }

  uint8_t c = state->buf[state->offset++];

  switch (c) {
  case '(':
    token.type = TOKEN_LPAREN; break;
  case ')':
    token.type = TOKEN_RPAREN; break;
  case '{':
    token.type = TOKEN_LBRACE; break;
  case '}':
    token.type = TOKEN_RBRACE; break;
  case ';':
    token.type = TOKEN_SEMICOLON; break;
  default:
    if (is_identifier_start(c)) {
      /* Consume the remaining identifier bytes (letters, '_', digits). */
      while (state->offset < state->size) {
        uint8_t next = state->buf[state->offset];
        if (!(is_identifier_start(next) || is_decimal_digit(next))) {
          break;
        }
        state->offset += 1;
      }
      token.type = find_keyword(state->buf, token_start, state->offset);
    } else if (is_decimal_digit(c)) {
      /* Assume a valid constant up front so the type is also set when the
         digits run all the way to the end of the buffer. */
      token.type = TOKEN_CONSTANT;
      while (state->offset < state->size) {
        uint8_t next = state->buf[state->offset];
        if (!is_decimal_digit(next)) {
          /* A letter or '_' glued to the digits (e.g. "12ab") makes the
             token invalid; the offending bytes are not consumed here. */
          if (is_identifier_start(next)) {
            token.type = TOKEN_INVALID;
          }
          break;
        }
        state->offset += 1;
      }
    } else {
      token.type = TOKEN_INVALID;
    }
    break;
  }

  token_end = state->offset;

  token.start = &state->buf[token_start];
  token.end = &state->buf[token_end];

  return token;
}