This commit is contained in:
Zack Buhman 2025-02-23 15:55:42 -06:00
commit 9ee311b464
3 changed files with 259 additions and 0 deletions

127
lexer.c Normal file
View File

@ -0,0 +1,127 @@
#include "lexer.h"
/* Reports whether `c` is one of the four bytes the lexer skips between
 * tokens: space, horizontal tab, carriage return, or line feed. */
static inline bool is_whitespace(uint8_t c)
{
    switch (c) {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
        return true;
    default:
        return false;
    }
}
/* Reports whether `c` may begin an identifier: an ASCII letter of either
 * case, or an underscore. Digits are deliberately excluded here. */
static inline bool is_identifier_start(uint8_t c)
{
    if (c == '_')
        return true;
    if (c >= 'A' && c <= 'Z')
        return true;
    return c >= 'a' && c <= 'z';
}
/* Reports whether `c` is an ASCII decimal digit, '0' through '9'. */
static inline bool is_decimal_digit(uint8_t c)
{
    /* De Morgan form of the range test: inside the range means neither
     * below its lower bound nor above its upper bound. */
    return !(c < '0' || c > '9');
}
// Describes one reserved word: its spelling, the length of that spelling in
// bytes, and the token type reported when the lexer matches it.
struct keyword_desc {
// Spelling of the keyword as raw bytes.
const uint8_t * buf;
// Number of bytes of `buf` that keyword_equal compares against the input.
int length;
// Token type that find_keyword returns for a match.
enum token_type token_type;
};
// Table of reserved words, scanned front to back by find_keyword.
// Each `.length` must equal the number of characters in the matching `.buf`
// literal, because keyword_equal compares exactly `.length` bytes.
static const struct keyword_desc keywords[] = {
{
.buf = (const uint8_t *)"int",
.length = 3,
.token_type = TOKEN_INT,
},
{
.buf = (const uint8_t *)"void",
.length = 4,
.token_type = TOKEN_VOID,
},
{
.buf = (const uint8_t *)"return",
.length = 6,
.token_type = TOKEN_RETURN,
},
};
/*
 * Reports whether the lexeme occupying buf[start, end) spells `keyword`.
 *
 * buf:     input bytes being lexed.
 * start:   offset of the first byte of the lexeme.
 * end:     offset one past the last byte of the lexeme.
 * keyword: table entry to compare against.
 *
 * Returns true only when the lexeme has exactly keyword->length bytes and
 * every byte matches keyword->buf.
 */
static inline bool keyword_equal(const uint8_t * buf, int start, int end, const struct keyword_desc * keyword)
{
    const int span = end - start;

    /* A different length can never match; this also guarantees the loop
     * below stays within keyword->buf. */
    if (span != keyword->length)
        return false;

    for (int k = 0; k < span; k++) {
        if (buf[start + k] != keyword->buf[k])
            return false;
    }
    return true;
}
static inline enum token_type find_keyword(const uint8_t * buf, int start, int end)
{
for (unsigned int i = 0; i < (sizeof (keywords)) / (sizeof (keywords[0])); i++) {
if (keyword_equal(buf, start, end, &keywords[i])) {
return keywords[i].token_type;
}
}
return TOKEN_IDENTIFIER;
}
/*
 * Scans and returns the next token from the input described by `state`.
 *
 * Leading whitespace (see is_whitespace) is skipped first. The returned
 * token covers the bytes state->buf[token.start, token.end), and
 * state->offset is advanced to token.end so the next call continues from
 * there.
 *
 * Token classification:
 *   - end of input                      -> TOKEN_EOF (start == end, and the
 *                                          offset is not advanced, so further
 *                                          calls keep returning TOKEN_EOF)
 *   - one of ( ) { } ;                  -> the matching punctuation token
 *   - letter/underscore, then letters,
 *     underscores, or digits            -> keyword token or TOKEN_IDENTIFIER
 *   - run of decimal digits             -> TOKEN_CONSTANT, unless the run is
 *                                          immediately followed by a letter or
 *                                          underscore, in which case
 *                                          TOKEN_INVALID is returned with the
 *                                          offset left on that byte
 *   - any other byte                    -> TOKEN_INVALID (one byte consumed)
 *
 * token.value is not computed here; it is always set to 0.
 */
struct token lexer_next_token(struct lexer_state * state)
{
    struct token token;

    /* The lexer does not compute a value; give the field a defined content
     * so that returning the struct never copies an indeterminate int. */
    token.value = 0;

    while (state->offset < state->size && is_whitespace(state->buf[state->offset])) {
        state->offset += 1;
    }

    token.start = state->offset;

    /* Detect end of input BEFORE fetching a byte: reading
     * buf[offset] with offset == size would be an out-of-bounds access. */
    if (state->offset >= state->size) {
        token.type = TOKEN_EOF;
        token.end = state->offset;
        return token;
    }

    const uint8_t first = state->buf[state->offset++];

    switch (first) {
    case '(':
        token.type = TOKEN_LPAREN;
        break;
    case ')':
        token.type = TOKEN_RPAREN;
        break;
    case '{':
        token.type = TOKEN_LBRACE;
        break;
    case '}':
        token.type = TOKEN_RBRACE;
        break;
    case ';':
        token.type = TOKEN_SEMICOLON;
        break;
    default:
        if (is_identifier_start(first)) {
            /* Consume the remaining identifier bytes. */
            while (state->offset < state->size) {
                const uint8_t next = state->buf[state->offset];
                if (!(is_identifier_start(next) || is_decimal_digit(next)))
                    break;
                state->offset += 1;
            }
            token.type = find_keyword(state->buf, token.start, state->offset);
        } else if (is_decimal_digit(first)) {
            /* Assume a well-formed constant up front, so that a digit run
             * ending exactly at the end of the buffer still gets a type. */
            token.type = TOKEN_CONSTANT;
            while (state->offset < state->size) {
                const uint8_t next = state->buf[state->offset];
                if (!is_decimal_digit(next)) {
                    /* Something like "123abc" is not a valid constant. */
                    if (is_identifier_start(next))
                        token.type = TOKEN_INVALID;
                    break;
                }
                state->offset += 1;
            }
        } else {
            token.type = TOKEN_INVALID;
        }
        break;
    }

    token.end = state->offset;
    return token;
}

33
lexer.h Normal file
View File

@ -0,0 +1,33 @@
#pragma once
#include <stdint.h>
// Kinds of token produced by lexer_next_token. The enumerators carry no
// explicit values, so they number consecutively from 0 (TOKEN_INVALID).
enum token_type {
// Input that does not form any recognized token.
TOKEN_INVALID,
// No input remains after skipping whitespace.
TOKEN_EOF,
// Letter or underscore followed by letters, underscores, or digits, and not
// equal to any keyword.
TOKEN_IDENTIFIER,
// Run of decimal digits.
TOKEN_CONSTANT,
// Keyword `int`.
TOKEN_INT,
// Keyword `void`.
TOKEN_VOID,
// Keyword `return`.
TOKEN_RETURN,
// `(`
TOKEN_LPAREN,
// `)`
TOKEN_RPAREN,
// `{`
TOKEN_LBRACE,
// `}`
TOKEN_RBRACE,
// `;`
TOKEN_SEMICOLON,
};
// One token returned by lexer_next_token. `start` and `end` are byte offsets
// into the lexer_state buffer the token was scanned from.
struct token {
// Classification of the token.
enum token_type type;
// Offset of the token's first byte (the lexer offset after whitespace is skipped).
int start;
// Lexer offset once the token has been scanned.
int end;
// Not given a meaningful value by lexer_next_token; presumably reserved for
// the numeric value of a TOKEN_CONSTANT -- TODO confirm intended use.
int value;
};
// Cursor over an in-memory input buffer; lexer_next_token reads from it and
// advances `offset`.
struct lexer_state {
// Input bytes to tokenize; the lexer only reads through this pointer.
const uint8_t * buf;
// Offset of the next byte to examine; callers start it at 0.
int offset;
// Number of valid bytes in `buf`.
int size;
};
// Skips whitespace at state->offset, scans one token from state->buf, and
// advances state->offset. Returns the scanned token; its type is TOKEN_EOF
// when no input remains and TOKEN_INVALID when the input does not form a
// recognized token.
struct token lexer_next_token(struct lexer_state * state);

99
main_hosted.c Normal file
View File

@ -0,0 +1,99 @@
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "lexer.h"
int read_file(const char * filename, uint8_t ** buf)
{
FILE * file = fopen(filename, "rb");
if (file == NULL) {
fprintf(stderr, "fopen(\"%s\", \"rb\"): %s\n", filename, strerror(errno));
return -1;
}
int ret;
ret = fseek(file, 0L, SEEK_END);
if (ret < 0) {
fprintf(stderr, "fseek(SEEK_END)");
return -1;
}
int offset = ftell(file);
if (offset < 0) {
fprintf(stderr, "ftell");
return -1;
}
int size = offset;
ret = fseek(file, 0L, SEEK_SET);
if (ret < 0) {
fprintf(stderr, "fseek(SEEK_SET)");
return -1;
}
fprintf(stderr, "read_file: %s size %d\n", filename, size);
*buf = (uint8_t *)malloc(size);
int fread_size = fread(*buf, 1, size, file);
if (fread_size != size) {
fprintf(stderr, "fread `%s` short read: %d ; expected: %d\n", filename, fread_size, size);
return -1;
}
ret = fclose(file);
if (ret < 0) {
fprintf(stderr, "fclose");
return -1;
}
return size;
}
// Printable name of each token type, indexed by the enum token_type value.
// main prints one of these per token. Designated initializers keep each
// string paired with its enumerator regardless of the enum's ordering.
const char * token_str[] = {
[TOKEN_INVALID] = "TOKEN_INVALID",
[TOKEN_EOF] = "TOKEN_EOF",
[TOKEN_IDENTIFIER] = "TOKEN_IDENTIFIER",
[TOKEN_CONSTANT] = "TOKEN_CONSTANT",
[TOKEN_INT] = "TOKEN_INT",
[TOKEN_VOID] = "TOKEN_VOID",
[TOKEN_RETURN] = "TOKEN_RETURN",
[TOKEN_LPAREN] = "TOKEN_LPAREN",
[TOKEN_RPAREN] = "TOKEN_RPAREN",
[TOKEN_LBRACE] = "TOKEN_LBRACE",
[TOKEN_RBRACE] = "TOKEN_RBRACE",
[TOKEN_SEMICOLON] = "TOKEN_SEMICOLON",
};
int main(int argc, char * argv[])
{
if (argc != 3) {
fprintf(stderr, "argc != 3 %d %s\n", argc, argv[1]);
return EXIT_FAILURE;
}
// --lex
// --parse
// --codegen
uint8_t * buf;
int size = read_file(argv[2], &buf);
if (size < 0) {
return EXIT_FAILURE;
}
struct lexer_state lexer_state;
lexer_state.buf = buf;
lexer_state.offset = 0;
lexer_state.size = size;
while (true) {
struct token token = lexer_next_token(&lexer_state);
printf("%s\n", token_str[token.type]);
if (token.type == TOKEN_INVALID)
return EXIT_FAILURE;
if (token.type == TOKEN_EOF)
break;
}
return EXIT_SUCCESS;
}