143 lines
3.1 KiB
C
143 lines
3.1 KiB
C
#include "lexer.h"
|
|
|
|
/**
 * Report whether a byte is one of the whitespace characters the lexer skips.
 *
 * @param c  Byte to classify.
 * @return   true for space, horizontal tab, carriage return, or line feed;
 *           false for every other byte (including form feed and vertical tab).
 */
static inline bool is_whitespace(uint8_t c)
{
  return c == ' ' || c == '\t' || c == '\r' || c == '\n';
}
|
|
|
|
/**
 * Report whether a byte may begin an identifier.
 *
 * @param c  Byte to classify.
 * @return   true for ASCII letters 'a'-'z', 'A'-'Z', and underscore;
 *           false otherwise.
 */
static inline bool is_identifier_start(uint8_t c)
{
  return (c >= 'a' && c <= 'z')
      || (c >= 'A' && c <= 'Z')
      || (c == '_');
}
|
|
|
|
/**
 * Report whether a byte is an ASCII decimal digit.
 *
 * @param c  Byte to classify.
 * @return   true for '0' through '9'; false otherwise.
 */
static inline bool is_decimal_digit(uint8_t c)
{
  return (c >= '0' && c <= '9');
}
|
|
|
|
struct keyword_desc {
|
|
const uint8_t * buf;
|
|
int length;
|
|
enum token_type token_type;
|
|
};
|
|
|
|
static const struct keyword_desc keywords[] = {
|
|
{
|
|
.buf = (const uint8_t *)"int",
|
|
.length = 3,
|
|
.token_type = TOKEN_INT,
|
|
},
|
|
{
|
|
.buf = (const uint8_t *)"void",
|
|
.length = 4,
|
|
.token_type = TOKEN_VOID,
|
|
},
|
|
{
|
|
.buf = (const uint8_t *)"return",
|
|
.length = 6,
|
|
.token_type = TOKEN_RETURN,
|
|
},
|
|
{
|
|
.buf = (const uint8_t *)"if",
|
|
.length = 2,
|
|
.token_type = TOKEN_RETURN,
|
|
},
|
|
{
|
|
.buf = (const uint8_t *)"else",
|
|
.length = 4,
|
|
.token_type = TOKEN_RETURN,
|
|
},
|
|
};
|
|
|
|
static inline bool keyword_equal(const uint8_t * buf, int start, int end, const struct keyword_desc * keyword)
|
|
{
|
|
int length = end - start;
|
|
if (length != keyword->length)
|
|
return false;
|
|
|
|
int i = start;
|
|
int j = 0;
|
|
while (i < end) {
|
|
if (buf[i++] != keyword->buf[j++])
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
static inline enum token_type find_keyword(const uint8_t * buf, int start, int end)
|
|
{
|
|
for (unsigned int i = 0; i < (sizeof (keywords)) / (sizeof (keywords[0])); i++) {
|
|
if (keyword_equal(buf, start, end, &keywords[i])) {
|
|
return keywords[i].token_type;
|
|
}
|
|
}
|
|
return TOKEN_IDENTIFIER;
|
|
}
|
|
|
|
struct token lexer_next_token(struct lexer_state * state)
|
|
{
|
|
struct token token;
|
|
int token_start;
|
|
int token_end;
|
|
|
|
while (state->offset < state->size && is_whitespace(state->buf[state->offset])) {
|
|
state->offset += 1;
|
|
}
|
|
|
|
token_start = state->offset;
|
|
|
|
uint8_t c = state->buf[state->offset++];
|
|
|
|
switch (c) {
|
|
case '(':
|
|
token.type = TOKEN_LPAREN; break;
|
|
case ')':
|
|
token.type = TOKEN_RPAREN; break;
|
|
case '{':
|
|
token.type = TOKEN_LBRACE; break;
|
|
case '}':
|
|
token.type = TOKEN_RBRACE; break;
|
|
case ';':
|
|
token.type = TOKEN_SEMICOLON; break;
|
|
default:
|
|
if (is_identifier_start(c)) {
|
|
while (state->offset < state->size) {
|
|
uint8_t c = state->buf[state->offset];
|
|
if (!(is_identifier_start(c) || is_decimal_digit(c)))
|
|
break;
|
|
state->offset += 1;
|
|
}
|
|
token.type = find_keyword(state->buf, token_start, state->offset);
|
|
} else if (is_decimal_digit(c)) {
|
|
while (state->offset < state->size) {
|
|
uint8_t c = state->buf[state->offset];
|
|
if (!(is_decimal_digit(c))) {
|
|
if (is_identifier_start(c))
|
|
token.type = TOKEN_INVALID;
|
|
else {
|
|
token.type = TOKEN_CONSTANT;
|
|
}
|
|
break;
|
|
}
|
|
state->offset += 1;
|
|
}
|
|
} else {
|
|
token.type = TOKEN_INVALID;
|
|
}
|
|
break;
|
|
}
|
|
|
|
token_end = state->offset;
|
|
|
|
if (token_start >= state->size)
|
|
token.type = TOKEN_EOF;
|
|
|
|
token.start = &state->buf[token_start];
|
|
token.end = &state->buf[token_end];
|
|
|
|
return token;
|
|
}
|