From f6049fc64467acd6102594fda5358a2b016b5e1a Mon Sep 17 00:00:00 2001
From: Kyler
Date: Thu, 12 Jun 2025 19:19:29 -0600
Subject: [PATCH] Refactor lexer to include filename and line/column tracking;
 update related functions and tests

---
 include/sync/lexer.h | 18 ++++++++++++++---
 include/sync/types.h | 39 +++++++++++++++++++++++++++++++++++
 src/lexer.c          | 48 ++++++++++++++++++++++++++++++++------------
 src/main.c           | 15 +++++++++-----
 test/test_lexer.c    | 31 ++++++++++++++--------------
 5 files changed, 115 insertions(+), 36 deletions(-)
 create mode 100644 include/sync/types.h

diff --git a/include/sync/lexer.h b/include/sync/lexer.h
index c16e115..2477499 100644
--- a/include/sync/lexer.h
+++ b/include/sync/lexer.h
@@ -2,6 +2,7 @@
 #define SYNC_LEXER_H
 
 #include <stddef.h>
+#include "types.h"
 
 typedef enum {
     TOKEN_EOF,
@@ -11,21 +12,32 @@
     TOKEN_LPAREN,
     TOKEN_RPAREN,
     TOKEN_SEMICOLON,
-    TOKEN_UNKNOWN,
 } TokenType;
 
 typedef struct {
     TokenType type;
     const char *start;
     size_t length;
+    FileInfo file_info;
 } Token;
 
 typedef struct {
+    const char *filename;
     const char *source;
     size_t pos;
+    size_t column;
+    size_t line;
 } Lexer;
 
-void lexer_init(Lexer *lexer, const char *source);
-Token lexer_next(Lexer *lexer);
+typedef struct {
+    SyncResultType type;
+    union {
+        Token result;
+        SyncError error;
+    };
+} TokenResult;
+
+void lexer_init(Lexer *lexer, const char *filename, const char *source);
+TokenResult lexer_next(Lexer *lexer);
 
 #endif // SYNC_LEXER_H
diff --git a/include/sync/types.h b/include/sync/types.h
new file mode 100644
index 0000000..74547e2
--- /dev/null
+++ b/include/sync/types.h
@@ -0,0 +1,39 @@
+#ifndef SYNC_TYPES_H
+#define SYNC_TYPES_H
+
+#include <stddef.h>
+
+typedef struct {
+    const char *filename;
+    size_t line;
+    size_t column;
+    size_t length;
+    size_t lines;
+} FileInfo;
+
+typedef enum {
+    SYNC_LEXER_ERROR,
+    SYNC_PARSER_ERROR,
+    SYNC_RUNTIME_ERROR
+} SyncErrorType;
+
+typedef struct {
+    SyncErrorType type;
+    const char *message;
+    FileInfo file_info;
+} SyncError;
+
+typedef enum {
+    SYNC_RESULT,
+    SYNC_ERROR,
+} SyncResultType;
+
+// typedef struct {
+//     SyncResultType type;
+//     union {
+//         void *result;
+//         SyncError error;
+//     };
+// } SyncResult;
+
+#endif // SYNC_TYPES_H
diff --git a/src/lexer.c b/src/lexer.c
index d9c95d0..25c9dee 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -1,10 +1,24 @@
 #include <ctype.h>
 #include <stddef.h>
+#include "sync/types.h"
 #include "sync/lexer.h"
 
-void lexer_init(Lexer *lexer, const char *source) {
+void lexer_init(Lexer *lexer, const char *filename, const char *source) {
+    lexer->filename = filename;
     lexer->source = source;
     lexer->pos = 0;
+    lexer->column = 1;
+    lexer->line = 1;
+}
+
+static FileInfo get_file_info(Lexer *lexer, size_t start, size_t start_line) {
+    return (FileInfo){
+        .filename = lexer->filename,
+        .line = lexer->line,
+        .column = lexer->column,
+        .length = lexer->pos - start,
+        .lines = lexer->line - start_line
+    };
 }
 
 static char peek(Lexer *lexer) {
@@ -16,18 +30,24 @@ static char double_peek(Lexer *lexer) {
 }
 
 static char advance(Lexer *lexer) {
+    if (lexer->source[lexer->pos] == '\n') {
+        lexer->line++;
+        lexer->column = 1;
+    } else {
+        lexer->column++;
+    }
     return lexer->source[lexer->pos++];
 }
 
-static int is_identifier_start(char c) {
+static char is_identifier_start(char c) {
     return isalpha(c) || c == '_';
 }
 
-static int is_identifier_char(char c) {
+static char is_identifier_char(char c) {
     return isalnum(c) || c == '_';
 }
 
-Token lexer_next(Lexer *lexer) {
+TokenResult lexer_next(Lexer *lexer) {
     // Gets the next token from the source
     while (isspace(peek(lexer)) || peek(lexer) == '/') {
@@ -40,36 +60,38 @@ Token lexer_next(Lexer *lexer) {
 
     char c = peek(lexer);
     size_t start = lexer->pos;
+    size_t start_line = lexer->line;
 
     // End of file tokens
     if (c == '\0') {
-        return (Token){TOKEN_EOF, &lexer->source[start], 0};
+        return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_EOF, &lexer->source[start], 0, get_file_info(lexer, start, start_line)}};
     }
 
     // Identifiers
     if (is_identifier_start(c)) {
         while (is_identifier_char(peek(lexer))) advance(lexer);
-        return (Token){TOKEN_IDENTIFIER, &lexer->source[start], lexer->pos - start};
+        return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_IDENTIFIER, &lexer->source[start], lexer->pos - start, get_file_info(lexer, start, start_line)}};
     }
 
     // Digits
     if (isdigit(c)) {
         while (isdigit(peek(lexer))) advance(lexer);
-        return (Token){TOKEN_NUMBER, &lexer->source[start], lexer->pos - start};
+        return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_NUMBER, &lexer->source[start], lexer->pos - start, get_file_info(lexer, start, start_line)}};
     }
 
     advance(lexer);
     switch (c) {
         case '=':
-            if (peek(lexer) == '=') return (Token){TOKEN_OPERATOR, &lexer->source[start], 2};
+            if (peek(lexer) == '=') return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_OPERATOR, &lexer->source[start], 2, get_file_info(lexer, start, start_line)}};
         case '+':
         case '-':
         case '*':
         case '/':
-            return (Token){TOKEN_OPERATOR, &lexer->source[start], 1};
-        case '(': return (Token){TOKEN_LPAREN, &lexer->source[start], 1};
-        case ')': return (Token){TOKEN_RPAREN, &lexer->source[start], 1};
-        case ';': return (Token){TOKEN_SEMICOLON, &lexer->source[start], 1};
-        default: return (Token){TOKEN_UNKNOWN, &lexer->source[start], 1};
+            return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_OPERATOR, &lexer->source[start], 1, get_file_info(lexer, start, start_line)}};
+        case '(': return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_LPAREN, &lexer->source[start], 1, get_file_info(lexer, start, start_line)}};
+        case ')': return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_RPAREN, &lexer->source[start], 1, get_file_info(lexer, start, start_line)}};
+        case ';': return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_SEMICOLON, &lexer->source[start], 1, get_file_info(lexer, start, start_line)}};
+        default:
+            return (TokenResult){SYNC_ERROR, .error = (SyncError){SYNC_LEXER_ERROR, "Unknown token", get_file_info(lexer, start, start_line)}};
     }
 }
diff --git a/src/main.c b/src/main.c
index cad33ca..9c4a6eb 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1,4 +1,5 @@
 #include <stdio.h>
+#include "sync/types.h"
 #include "sync/lexer.h"
 
 static void print_token(Token token) {
@@ -15,13 +16,17 @@ int main(void) {
     const char *source = "sum = a + b123;\nprint(sum);";
 
     Lexer lexer;
-    lexer_init(&lexer, source);
+    lexer_init(&lexer, "", source);
 
-    Token token;
+    TokenResult result;
     do {
-        token = lexer_next(&lexer);
-        print_token(token);
-    } while (token.type != TOKEN_EOF);
+        result = lexer_next(&lexer);
+        if (result.type == SYNC_RESULT) {
+            print_token(result.result);
+        } else {
+            fprintf(stderr, "Error: %s\n", result.error.message);
+        }
+    } while (result.type != SYNC_ERROR && result.result.type != TOKEN_EOF);
 
     return 0;
 }
diff --git a/test/test_lexer.c b/test/test_lexer.c
index fbc64e3..3a6e4c2 100644
--- a/test/test_lexer.c
+++ b/test/test_lexer.c
@@ -1,51 +1,52 @@
 #include <assert.h>
 #include <stdio.h>
 #include <string.h>
+#include "../include/sync/types.h"
 #include "../include/sync/lexer.h"
 
 void test_tokenize_simple_assignment(void) {
     const char *src = "x = 42;";
     Lexer lexer;
-    lexer_init(&lexer, src);
+    lexer_init(&lexer, "", src);
 
-    Token t = lexer_next(&lexer);
-    assert(t.type == TOKEN_IDENTIFIER && strncmp(t.start, "x", t.length) == 0);
+    TokenResult t = lexer_next(&lexer);
+    assert(t.type == SYNC_RESULT && t.result.type == TOKEN_IDENTIFIER && strncmp(t.result.start, "x", t.result.length) == 0);
 
     t = lexer_next(&lexer);
-    assert(t.type == TOKEN_OPERATOR && strncmp(t.start, "=", t.length) == 0);
+    assert(t.type == SYNC_RESULT && t.result.type == TOKEN_OPERATOR && strncmp(t.result.start, "=", t.result.length) == 0);
 
     t = lexer_next(&lexer);
-    assert(t.type == TOKEN_NUMBER && strncmp(t.start, "42", t.length) == 0);
+    assert(t.type == SYNC_RESULT && t.result.type == TOKEN_NUMBER && strncmp(t.result.start, "42", t.result.length) == 0);
 
     t = lexer_next(&lexer);
-    assert(t.type == TOKEN_SEMICOLON);
+    assert(t.type == SYNC_RESULT && t.result.type == TOKEN_SEMICOLON);
 
     t = lexer_next(&lexer);
-    assert(t.type == TOKEN_EOF);
+    assert(t.type == SYNC_RESULT && t.result.type == TOKEN_EOF);
 }
 
 void test_tokenize_function_call(void) {
     const char *src = "print(x);";
     Lexer lexer;
-    lexer_init(&lexer, src);
+    lexer_init(&lexer, "", src);
 
-    Token t = lexer_next(&lexer);
-    assert(t.type == TOKEN_IDENTIFIER && strncmp(t.start, "print", t.length) == 0);
+    TokenResult t = lexer_next(&lexer);
+    assert(t.type == SYNC_RESULT && t.result.type == TOKEN_IDENTIFIER && strncmp(t.result.start, "print", t.result.length) == 0);
 
     t = lexer_next(&lexer);
-    assert(t.type == TOKEN_LPAREN);
+    assert(t.type == SYNC_RESULT && t.result.type == TOKEN_LPAREN);
 
     t = lexer_next(&lexer);
-    assert(t.type == TOKEN_IDENTIFIER && strncmp(t.start, "x", t.length) == 0);
+    assert(t.type == SYNC_RESULT && t.result.type == TOKEN_IDENTIFIER && strncmp(t.result.start, "x", t.result.length) == 0);
 
     t = lexer_next(&lexer);
-    assert(t.type == TOKEN_RPAREN);
+    assert(t.type == SYNC_RESULT && t.result.type == TOKEN_RPAREN);
 
     t = lexer_next(&lexer);
-    assert(t.type == TOKEN_SEMICOLON);
+    assert(t.type == SYNC_RESULT && t.result.type == TOKEN_SEMICOLON);
 
     t = lexer_next(&lexer);
-    assert(t.type == TOKEN_EOF);
+    assert(t.type == SYNC_RESULT && t.result.type == TOKEN_EOF);
 }
 
 int main(void) {