lexer: loop if the current character does not produce a token

2023-08-16 19:52:02 +00:00 · 2023-08-16 19:52:02 +00:00 · cc7345ec33
commit cc7345ec33
parent b6d4ae5e8e
3 changed files with 77 additions and 58 deletions
--- a/lexer.cpp
+++ b/lexer.cpp
@@ -143,68 +143,74 @@ std::optional<token_t> lexer_t::scan_token()
 {
  using enum token_t::type_t;
-  if (at_end_p())
+  while (true) {
-    return {{pos, eof, ""}};
+    if (at_end_p())
      return {{pos, eof, ""}};
-  start_ix = current_ix;
+    start_ix = current_ix;
-  const char c = advance();
+    const char c = advance();
-  switch (c) {
+    switch (c) {
-  case '(': return {{pos, left_paren, lexeme()}};
+    case '(': return {{pos, left_paren, lexeme()}};
-  case ')': return {{pos, right_paren, lexeme()}};
+    case ')': return {{pos, right_paren, lexeme()}};
-  case ',': return {{pos, comma, lexeme()}};
+    case ',': return {{pos, comma, lexeme()}};
-  case '.': return {{pos, dot, lexeme()}};
+    case '.': return {{pos, dot, lexeme()}};
-  case '+': return {{pos, plus, lexeme()}};
+    case '+': return {{pos, plus, lexeme()}};
-  case '-': return {{pos, minus, lexeme()}};
+    case '-': return {{pos, minus, lexeme()}};
-  case '*': return {{pos, star, lexeme()}};
+    case '*': return {{pos, star, lexeme()}};
-  case '/': return {{pos, slash, lexeme()}};
+    case '/': return {{pos, slash, lexeme()}};
-  case '%': return {{pos, percent, lexeme()}};
+    case '%': return {{pos, percent, lexeme()}};
-  case '~': return {{pos, tilde, lexeme()}};
+    case '~': return {{pos, tilde, lexeme()}};
-  case '&': return {{pos, ampersand, lexeme()}};
+    case '&': return {{pos, ampersand, lexeme()}};
-  case '|': return {{pos, bar, lexeme()}};
+    case '|': return {{pos, bar, lexeme()}};
-  case '^': return {{pos, carot, lexeme()}};
+    case '^': return {{pos, carot, lexeme()}};
-  case '=': return {{pos, equal, lexeme()}};
+    case '=': return {{pos, equal, lexeme()}};
-  case '<':
+    case '<':
-    if (match('<')) return {{pos, left_shift, lexeme()}};
+      if (match('<')) return {{pos, left_shift, lexeme()}};
-    break;
+      break;
-  case '>':
+    case '>':
-    if (match('>')) return {{pos, right_shift, lexeme()}};
+      if (match('>')) return {{pos, right_shift, lexeme()}};
-    break;
+      break;
-  case ';':
+    case ';':
-    while (!at_end_p() && peek() != '\n') advance();
+      while (!at_end_p() && peek() != '\n') advance();
-    break;
+      break;
-  case ' ':
+    case ' ':
-  case '\r':
+    case '\r':
-  case '\t':
+    case '\t':
-    break;
+      break;
-  case '\n':
+    case '\n':
-    pos.line++;
+      {
-    pos.col = 0;
+	token_pos_t tmp = pos;
-    break;
+	pos.line++;
-  case '$':
+	pos.col = 0;
-    if (hex_t::pred(peek())) {
+	return {{tmp, eol, lexeme()}};
      start_ix += 1;
      return {_number<hex_t>()};
    }
    [[fallthrough]];
  case '0':
    if (match('x')) {
      if (hex_t::pred(peek())) {
        start_ix += 2;
        return {_number<hex_t>()};
      }
      break;
    case '$':
      if (hex_t::pred(peek())) {
 	start_ix += 1;
 	return {_number<hex_t>()};
      }
      [[fallthrough]];
    case '0':
      if (match('x')) {
 	if (hex_t::pred(peek())) {
 	  start_ix += 2;
 	  return {_number<hex_t>()};
 	}
      }
      [[fallthrough]];
    default:
      if (dec_t::pred(c)) {
 	return {_number<dec_t>()};
      } else if (alpha_p(c)) {
 	return {_identifier()};
      } else {
 	error(pos.line, pos.col - 1, "Unexpected character.");
 	return {};
      }
      break;
    }
    [[fallthrough]];
  default:
    if (dec_t::pred(c)) {
      return {_number<dec_t>()};
    } else if (alpha_p(c)) {
      return {_identifier()};
    } else {
      error(pos.line, pos.col - 1, "Unexpected character.");
      return {};
    }
    break;
  }
  __builtin_unreachable();
 }
--- a/parser.cpp
+++ b/parser.cpp
@@ -159,4 +159,15 @@ expr_t * parser_t::primary()
  throw error(peek(), "expected expression");
 }
 /*
 void parser_t::synchronize()
 {
  advance();
  while (!at_end_p()) {
    if (previous().type == eol) return;
    advance();
  }
 }
 */
 }
--- a/token.hpp
+++ b/token.hpp
@@ -85,6 +85,7 @@ struct token_t {
    _ends,
    eof,
    eol,
  };
  using literal_t = std::variant<std::monostate, num_type>;
@@ -172,6 +173,7 @@ struct token_t {
    case _ends        : return os << "ENDS";
    case eof          : return os << "EOF";
    case eol          : return os << "EOL";
    }
    __builtin_unreachable();
  }