Refactor lexer to include filename and line/column tracking; update related functions and tests

Kyler Olsen 2025-06-12 19:19:29 -06:00
parent e67041cd0e
commit f6049fc644
5 changed files with 115 additions and 36 deletions

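The net effect of the change: lexer_next now returns a TokenResult that carries either a Token (with an attached FileInfo) or a SyncError, so callers can report exact source locations. A minimal consumer sketch, not part of this commit; the file name "example.sync" and the source string are made up for illustration, everything else follows the headers in the diff below:

#include <stdio.h>
#include "sync/types.h"
#include "sync/lexer.h"

int main(void) {
    Lexer lexer;
    lexer_init(&lexer, "example.sync", "sum = a + b;\nprint(sum)$");
    for (;;) {
        TokenResult r = lexer_next(&lexer);
        if (r.type == SYNC_ERROR) {
            /* The stray '$' above falls through to the lexer's default case. */
            fprintf(stderr, "%s:%zu:%zu: %s\n",
                    r.error.file_info.filename, r.error.file_info.line,
                    r.error.file_info.column, r.error.message);
            break;
        }
        if (r.result.type == TOKEN_EOF) break;
        printf("token of length %zu on line %zu\n",
               r.result.file_info.length, r.result.file_info.line);
    }
    return 0;
}

This mirrors the loop in the updated main.c below, with the FileInfo fields added to the error path.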

@@ -2,6 +2,7 @@
#define SYNC_LEXER_H
#include <stddef.h>
#include "types.h"
typedef enum {
TOKEN_EOF,
@@ -11,21 +12,32 @@ typedef enum {
TOKEN_LPAREN,
TOKEN_RPAREN,
TOKEN_SEMICOLON,
TOKEN_UNKNOWN,
} TokenType;
typedef struct {
TokenType type;
const char *start;
size_t length;
FileInfo file_info;
} Token;
typedef struct {
const char *filename;
const char *source;
size_t pos;
size_t column;
size_t line;
} Lexer;
void lexer_init(Lexer *lexer, const char *source);
Token lexer_next(Lexer *lexer);
typedef struct {
SyncResultType type;
union {
Token result;
SyncError error;
};
} TokenResult;
void lexer_init(Lexer *lexer, const char *filename, const char *source);
TokenResult lexer_next(Lexer *lexer);
#endif // SYNC_LEXER_H

include/sync/types.h (new file, 39 additions)

@@ -0,0 +1,39 @@
#ifndef SYNC_TYPES_H
#define SYNC_TYPES_H
#include <stddef.h>
typedef struct {
const char *filename;
size_t line;
size_t column;
size_t length;
size_t lines;
} FileInfo;
typedef enum {
SYNC_LEXER_ERROR,
SYNC_PARSER_ERROR,
SYNC_RUNTIME_ERROR
} SyncErrorType;
typedef struct {
SyncErrorType type;
const char *message;
FileInfo file_info;
} SyncError;
typedef enum {
SYNC_RESULT,
SYNC_ERROR,
} SyncResultType;
// typedef struct {
// SyncResultType type;
// union {
// void *result;
// SyncError error;
// };
// } SyncResult;
#endif // SYNC_TYPES_H

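Since FileInfo and SyncError now live in a shared header, any stage (lexer, parser, runtime) can produce uniformly formatted diagnostics. A sketch of what such a formatter might look like; error_category and print_diagnostic are hypothetical helpers, not part of the commit:

#include <stdio.h>
#include "sync/types.h"

/* Hypothetical: printable name for each SyncErrorType. */
static const char *error_category(SyncErrorType type) {
    switch (type) {
        case SYNC_LEXER_ERROR:   return "lexer";
        case SYNC_PARSER_ERROR:  return "parser";
        case SYNC_RUNTIME_ERROR: return "runtime";
        default:                 return "unknown";
    }
}

/* Hypothetical: compiler-style "file:line:column" diagnostic. */
static void print_diagnostic(const SyncError *err) {
    const FileInfo *fi = &err->file_info;
    fprintf(stderr, "%s:%zu:%zu: %s error: %s (length %zu, spans %zu line(s))\n",
            fi->filename, fi->line, fi->column,
            error_category(err->type), err->message,
            fi->length, fi->lines + 1);
}

int main(void) {
    /* Field order follows FileInfo: filename, line, column, length, lines. */
    SyncError err = {
        .type = SYNC_LEXER_ERROR,
        .message = "Unknown token",
        .file_info = { "<stdin>", 2, 7, 1, 0 },
    };
    print_diagnostic(&err);
    return 0;
}

Note that lines in the diff counts how many newlines a span crosses (get_file_info computes line - start_line), so a single-line token reports 0; the formatter adds 1 to show a human-friendly count.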

@@ -1,10 +1,24 @@
#include <ctype.h>
#include <string.h>
#include "sync/types.h"
#include "sync/lexer.h"
void lexer_init(Lexer *lexer, const char *source) {
void lexer_init(Lexer *lexer, const char *filename, const char *source) {
lexer->filename = filename;
lexer->source = source;
lexer->pos = 0;
lexer->column = 1;
lexer->line = 1;
}
static FileInfo get_file_info(Lexer *lexer, size_t start, size_t start_line) {
return (FileInfo){
.filename = lexer->filename,
.line = lexer->line,
.column = lexer->column,
.length = lexer->pos - start,
.lines = lexer->line - start_line
};
}
static char peek(Lexer *lexer) {
@@ -16,18 +30,24 @@ static char double_peek(Lexer *lexer) {
}
static char advance(Lexer *lexer) {
if (lexer->source[lexer->pos] == '\n') {
lexer->line++;
lexer->column = 1;
} else {
lexer->column++;
}
return lexer->source[lexer->pos++];
}
static int is_identifier_start(char c) {
static char is_identifier_start(char c) {
return isalpha(c) || c == '_';
}
static int is_identifier_char(char c) {
static char is_identifier_char(char c) {
return isalnum(c) || c == '_';
}
Token lexer_next(Lexer *lexer) {
TokenResult lexer_next(Lexer *lexer) {
// Gets the next token from the source
while (isspace(peek(lexer)) || peek(lexer) == '/') {
@@ -40,36 +60,38 @@ Token lexer_next(Lexer *lexer) {
char c = peek(lexer);
size_t start = lexer->pos;
size_t start_line = lexer->line;
// End of file tokens
if (c == '\0') {
return (Token){TOKEN_EOF, &lexer->source[start], 0};
return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_EOF, &lexer->source[start], 0, get_file_info(lexer, start, start_line)}};
}
// Identifiers
if (is_identifier_start(c)) {
while (is_identifier_char(peek(lexer))) advance(lexer);
return (Token){TOKEN_IDENTIFIER, &lexer->source[start], lexer->pos - start};
return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_IDENTIFIER, &lexer->source[start], lexer->pos - start, get_file_info(lexer, start, start_line)}};
}
// Digits
if (isdigit(c)) {
while (isdigit(peek(lexer))) advance(lexer);
return (Token){TOKEN_NUMBER, &lexer->source[start], lexer->pos - start};
return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_NUMBER, &lexer->source[start], lexer->pos - start, get_file_info(lexer, start, start_line)}};
}
advance(lexer);
switch (c) {
case '=':
if (peek(lexer) == '=') return (Token){TOKEN_OPERATOR, &lexer->source[start], 2};
if (peek(lexer) == '=') return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_OPERATOR, &lexer->source[start], 2, get_file_info(lexer, start, start_line)}};
case '+':
case '-':
case '*':
case '/':
return (Token){TOKEN_OPERATOR, &lexer->source[start], 1};
case '(': return (Token){TOKEN_LPAREN, &lexer->source[start], 1};
case ')': return (Token){TOKEN_RPAREN, &lexer->source[start], 1};
case ';': return (Token){TOKEN_SEMICOLON, &lexer->source[start], 1};
default: return (Token){TOKEN_UNKNOWN, &lexer->source[start], 1};
return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_OPERATOR, &lexer->source[start], 1, get_file_info(lexer, start, start_line)}};
case '(': return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_LPAREN, &lexer->source[start], 1, get_file_info(lexer, start, start_line)}};
case ')': return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_RPAREN, &lexer->source[start], 1, get_file_info(lexer, start, start_line)}};
case ';': return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_SEMICOLON, &lexer->source[start], 1, get_file_info(lexer, start, start_line)}};
default:
return (TokenResult){SYNC_ERROR, .error = (SyncError){SYNC_LEXER_ERROR, "Unknown token", get_file_info(lexer, start, start_line)}};
}
}

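A quick sanity check of the new bookkeeping in advance(): line is bumped and column reset on '\n', otherwise column grows, and get_file_info snapshots those counters when the token is returned (so column points just past the token's last character). A hypothetical test, assuming the elided whitespace loop advances over the newline, as it must to make progress:

#include <assert.h>
#include "sync/types.h"
#include "sync/lexer.h"

int main(void) {
    Lexer lexer;
    lexer_init(&lexer, "<test>", "a\nbb");

    TokenResult first = lexer_next(&lexer);     /* identifier "a" on line 1 */
    assert(first.type == SYNC_RESULT);
    assert(first.result.file_info.line == 1);
    assert(first.result.file_info.length == 1);

    TokenResult second = lexer_next(&lexer);    /* identifier "bb" on line 2 */
    assert(second.type == SYNC_RESULT);
    assert(second.result.file_info.line == 2);
    assert(second.result.file_info.length == 2);
    assert(second.result.file_info.lines == 0); /* token does not cross a newline */

    return 0;
}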

@@ -1,4 +1,5 @@
#include <stdio.h>
#include "sync/types.h"
#include "sync/lexer.h"
static void print_token(Token token) {
@@ -15,13 +16,17 @@ int main(void) {
const char *source = "sum = a + b123;\nprint(sum);";
Lexer lexer;
lexer_init(&lexer, source);
lexer_init(&lexer, "<stdin>", source);
Token token;
TokenResult result;
do {
token = lexer_next(&lexer);
print_token(token);
} while (token.type != TOKEN_EOF);
result = lexer_next(&lexer);
if (result.type == SYNC_RESULT) {
print_token(result.result);
} else {
fprintf(stderr, "Error: %s\n", result.error.message);
}
} while (result.type != SYNC_ERROR && result.result.type != TOKEN_EOF);
return 0;
}


@@ -1,51 +1,52 @@
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include "../include/sync/types.h"
#include "../include/sync/lexer.h"
void test_tokenize_simple_assignment(void) {
const char *src = "x = 42;";
Lexer lexer;
lexer_init(&lexer, src);
lexer_init(&lexer, "<stdin>", src);
Token t = lexer_next(&lexer);
assert(t.type == TOKEN_IDENTIFIER && strncmp(t.start, "x", t.length) == 0);
TokenResult t = lexer_next(&lexer);
assert(t.type == SYNC_RESULT && t.result.type == TOKEN_IDENTIFIER && strncmp(t.result.start, "x", t.result.length) == 0);
t = lexer_next(&lexer);
assert(t.type == TOKEN_OPERATOR && strncmp(t.start, "=", t.length) == 0);
assert(t.type == SYNC_RESULT && t.result.type == TOKEN_OPERATOR && strncmp(t.result.start, "=", t.result.length) == 0);
t = lexer_next(&lexer);
assert(t.type == TOKEN_NUMBER && strncmp(t.start, "42", t.length) == 0);
assert(t.type == SYNC_RESULT && t.result.type == TOKEN_NUMBER && strncmp(t.result.start, "42", t.result.length) == 0);
t = lexer_next(&lexer);
assert(t.type == TOKEN_SEMICOLON);
assert(t.type == SYNC_RESULT && t.result.type == TOKEN_SEMICOLON);
t = lexer_next(&lexer);
assert(t.type == TOKEN_EOF);
assert(t.type == SYNC_RESULT && t.result.type == TOKEN_EOF);
}
void test_tokenize_function_call(void) {
const char *src = "print(x);";
Lexer lexer;
lexer_init(&lexer, src);
lexer_init(&lexer, "<stdin>", src);
Token t = lexer_next(&lexer);
assert(t.type == TOKEN_IDENTIFIER && strncmp(t.start, "print", t.length) == 0);
TokenResult t = lexer_next(&lexer);
assert(t.type == SYNC_RESULT && t.result.type == TOKEN_IDENTIFIER && strncmp(t.result.start, "print", t.result.length) == 0);
t = lexer_next(&lexer);
assert(t.type == TOKEN_LPAREN);
assert(t.type == SYNC_RESULT && t.result.type == TOKEN_LPAREN);
t = lexer_next(&lexer);
assert(t.type == TOKEN_IDENTIFIER && strncmp(t.start, "x", t.length) == 0);
assert(t.type == SYNC_RESULT && t.result.type == TOKEN_IDENTIFIER && strncmp(t.result.start, "x", t.result.length) == 0);
t = lexer_next(&lexer);
assert(t.type == TOKEN_RPAREN);
assert(t.type == SYNC_RESULT && t.result.type == TOKEN_RPAREN);
t = lexer_next(&lexer);
assert(t.type == TOKEN_SEMICOLON);
assert(t.type == SYNC_RESULT && t.result.type == TOKEN_SEMICOLON);
t = lexer_next(&lexer);
assert(t.type == TOKEN_EOF);
assert(t.type == SYNC_RESULT && t.result.type == TOKEN_EOF);
}
int main(void) {