compiler/lexer.c

138 lines
2.9 KiB
C

#include "lexer.h"
/* Report whether byte `c` is one of the blank characters skipped between
 * tokens: space, horizontal tab, carriage return or line feed. */
static inline bool is_whitespace(uint8_t c)
{
    switch (c) {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
        return true;
    default:
        return false;
    }
}
/* Report whether byte `c` may begin an identifier: an ASCII letter of either
 * case, or an underscore. */
static inline bool is_identifier_start(uint8_t c)
{
    if (c == '_')
        return true;
    if (c >= 'A' && c <= 'Z')
        return true;
    return c >= 'a' && c <= 'z';
}
/* Report whether byte `c` is an ASCII decimal digit, '0' through '9'. */
static inline bool is_decimal_digit(uint8_t c)
{
    /* A digit is anything that does not fall outside the '0'..'9' range. */
    const bool outside_range = (c < '0') || (c > '9');
    return !outside_range;
}
/* One entry of the reserved-word table: the spelling of a keyword and the
 * token type reported when an identifier-shaped lexeme matches it. */
struct keyword_desc {
/* Bytes of the keyword's spelling; only the first `length` bytes are compared. */
const uint8_t * buf;
/* Number of bytes in the spelling. */
int length;
/* Token type returned by find_keyword() for a matching lexeme. */
enum token_type token_type;
};
/* Build one keyword_desc from a string literal; the length is derived from
 * the literal itself (excluding its terminating NUL) so it cannot drift out
 * of sync with the spelling. */
#define KEYWORD_ENTRY(text, type) \
    { \
        .buf = (const uint8_t *)(text), \
        .length = (int)(sizeof (text) - 1), \
        .token_type = (type), \
    }

/* Reserved words recognised by find_keyword(), searched in declaration order.
 *
 * NOTE(review): "if" and "else" are mapped to TOKEN_RETURN, which looks like
 * a copy-paste placeholder -- confirm whether dedicated token types are
 * intended for them. */
static const struct keyword_desc keywords[] = {
    KEYWORD_ENTRY("int",    TOKEN_INT),
    KEYWORD_ENTRY("void",   TOKEN_VOID),
    KEYWORD_ENTRY("return", TOKEN_RETURN),
    KEYWORD_ENTRY("if",     TOKEN_RETURN),
    KEYWORD_ENTRY("else",   TOKEN_RETURN),
};

#undef KEYWORD_ENTRY
/* Compare the lexeme buf[start, end) against one keyword table entry.
 *
 * Returns true only when the lexeme has exactly the keyword's length and
 * every byte matches; `end` is exclusive. */
static inline bool keyword_equal(const uint8_t *buf, int start, int end, const struct keyword_desc *keyword)
{
    /* Lexemes of a different length can never match. */
    if (end - start != keyword->length)
        return false;

    const uint8_t *lexeme = buf + start;
    for (int k = 0; k < keyword->length; k++) {
        if (lexeme[k] != keyword->buf[k])
            return false;
    }
    return true;
}
static inline enum token_type find_keyword(const uint8_t * buf, int start, int end)
{
for (unsigned int i = 0; i < (sizeof (keywords)) / (sizeof (keywords[0])); i++) {
if (keyword_equal(buf, start, end, &keywords[i])) {
return keywords[i].token_type;
}
}
return TOKEN_IDENTIFIER;
}
/* Scan and return the next token from the lexer's input.
 *
 * Skips leading whitespace, then recognises single-character punctuation,
 * identifiers/keywords, and decimal constants. The returned token's `start`
 * and `end` are offsets into state->buf, with `end` exclusive; state->offset
 * is advanced to `end`.
 *
 * At end of input a TOKEN_EOF token with start == end is returned and
 * state->offset is left unchanged, so the call may be repeated safely.
 *
 * Any other byte yields a one-byte TOKEN_INVALID token. A run of digits that
 * is immediately followed by an identifier-start byte is reported as
 * TOKEN_INVALID covering only the digits; the following bytes are left for
 * the next call. */
struct token lexer_next_token(struct lexer_state * state)
{
    struct token token;

    while (state->offset < state->size && is_whitespace(state->buf[state->offset])) {
        state->offset += 1;
    }

    token.start = state->offset;

    /* Check for end of input before touching the buffer: reading
     * buf[offset] here would be out of bounds. */
    if (state->offset >= state->size) {
        token.type = TOKEN_EOF;
        token.end = state->offset;
        return token;
    }

    uint8_t c = state->buf[state->offset++];
    switch (c) {
    case '(':
        token.type = TOKEN_LPAREN;
        break;
    case ')':
        token.type = TOKEN_RPAREN;
        break;
    case '{':
        token.type = TOKEN_LBRACE;
        break;
    case '}':
        token.type = TOKEN_RBRACE;
        break;
    case ';':
        token.type = TOKEN_SEMICOLON;
        break;
    default:
        if (is_identifier_start(c)) {
            /* Consume the remaining letters, underscores and digits. */
            while (state->offset < state->size) {
                uint8_t next = state->buf[state->offset];
                if (!(is_identifier_start(next) || is_decimal_digit(next)))
                    break;
                state->offset += 1;
            }
            token.type = find_keyword(state->buf, token.start, state->offset);
        } else if (is_decimal_digit(c)) {
            /* Default to a constant so that digits running up to the end of
             * input still produce a defined token type. */
            token.type = TOKEN_CONSTANT;
            while (state->offset < state->size) {
                uint8_t next = state->buf[state->offset];
                if (!is_decimal_digit(next)) {
                    /* Digits directly followed by a letter or underscore
                     * (e.g. "12ab") do not form a valid constant. */
                    if (is_identifier_start(next))
                        token.type = TOKEN_INVALID;
                    break;
                }
                state->offset += 1;
            }
        } else {
            token.type = TOKEN_INVALID;
        }
        break;
    }

    token.end = state->offset;
    return token;
}