Refactor lexer to include filename and line/column tracking; update related functions and tests
This commit is contained in:
parent e67041cd0e
commit f6049fc644
include/sync/lexer.h

@@ -2,6 +2,7 @@
 #define SYNC_LEXER_H
 
 #include <stddef.h>
+#include "types.h"
 
 typedef enum {
 	TOKEN_EOF,
@@ -11,21 +12,32 @@ typedef enum {
 	TOKEN_LPAREN,
 	TOKEN_RPAREN,
 	TOKEN_SEMICOLON,
 	TOKEN_UNKNOWN,
 } TokenType;
 
 typedef struct {
 	TokenType type;
 	const char *start;
 	size_t length;
+	FileInfo file_info;
 } Token;
 
 typedef struct {
+	const char *filename;
 	const char *source;
 	size_t pos;
+	size_t column;
+	size_t line;
 } Lexer;
 
-void lexer_init(Lexer *lexer, const char *source);
-Token lexer_next(Lexer *lexer);
+typedef struct {
+	SyncResultType type;
+	union {
+		Token result;
+		SyncError error;
+	};
+} TokenResult;
+
+void lexer_init(Lexer *lexer, const char *filename, const char *source);
+TokenResult lexer_next(Lexer *lexer);
 
 #endif // SYNC_LEXER_H
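
Note: TokenResult is a tagged union, so callers must check type before touching the anonymous union, which holds either a Token or a SyncError. A minimal caller-side sketch, not part of this commit (drain_tokens is a hypothetical consumer):

    #include "sync/lexer.h"

    /* Drains the lexer, stopping at EOF or on the first error. */
    static int drain_tokens(Lexer *lexer) {
        for (;;) {
            TokenResult r = lexer_next(lexer);
            if (r.type == SYNC_ERROR) return 1;      /* union holds r.error */
            if (r.result.type == TOKEN_EOF) return 0;
            /* ... use r.result.start / r.result.length here ... */
        }
    }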
include/sync/types.h

@@ -0,0 +1,39 @@
+#ifndef SYNC_TYPES_H
+#define SYNC_TYPES_H
+
+#include <stddef.h>
+
+typedef struct {
+	const char *filename;
+	size_t line;
+	size_t column;
+	size_t length;
+	size_t lines;
+} FileInfo;
+
+typedef enum {
+	SYNC_LEXER_ERROR,
+	SYNC_PARSER_ERROR,
+	SYNC_RUNTIME_ERROR
+} SyncErrorType;
+
+typedef struct {
+	SyncErrorType type;
+	const char *message;
+	FileInfo file_info;
+} SyncError;
+
+typedef enum {
+	SYNC_RESULT,
+	SYNC_ERROR,
+} SyncResultType;
+
+// typedef struct {
+// 	SyncResultType type;
+// 	union {
+// 		void *result;
+// 		SyncError error;
+// 	};
+// } SyncResult;
+
+#endif // SYNC_TYPES_H
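
Note: a sketch of how a SyncError might be rendered from its FileInfo, assuming the conventional file:line:column format (print_sync_error is illustrative, not part of this commit):

    #include <stdio.h>
    #include "sync/types.h"

    /* Formats an error as "<file>:<line>:<column>: error: <message>". */
    static void print_sync_error(const SyncError *err) {
        fprintf(stderr, "%s:%zu:%zu: error: %s\n",
                err->file_info.filename, err->file_info.line,
                err->file_info.column, err->message);
    }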
src/lexer.c (48 changed lines)
@@ -1,10 +1,24 @@
 #include <ctype.h>
 #include <string.h>
+#include "sync/types.h"
 #include "sync/lexer.h"
 
-void lexer_init(Lexer *lexer, const char *source) {
+void lexer_init(Lexer *lexer, const char *filename, const char *source) {
+	lexer->filename = filename;
 	lexer->source = source;
 	lexer->pos = 0;
+	lexer->column = 1;
+	lexer->line = 1;
 }
 
+static FileInfo get_file_info(Lexer *lexer, size_t start, size_t start_line) {
+	return (FileInfo){
+		.filename = lexer->filename,
+		.line = lexer->line,
+		.column = lexer->column,
+		.length = lexer->pos - start,
+		.lines = lexer->line - start_line
+	};
+}
+
 static char peek(Lexer *lexer) {
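
Note: get_file_info runs after a token has been consumed, so line and column describe the position just past the token, while length and lines record its extent. For a token that stays on one line (lines == 0), the start column can be recovered; a sketch (the helper name is illustrative, not part of this commit):

    /* Start column of a token that does not span lines. */
    static size_t token_start_column(FileInfo info) {
        return info.column - info.length;
    }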
@@ -16,18 +30,24 @@ static char double_peek(Lexer *lexer) {
 }
 
 static char advance(Lexer *lexer) {
+	if (lexer->source[lexer->pos] == '\n') {
+		lexer->line++;
+		lexer->column = 1;
+	} else {
+		lexer->column++;
+	}
 	return lexer->source[lexer->pos++];
 }
 
-static int is_identifier_start(char c) {
+static char is_identifier_start(char c) {
 	return isalpha(c) || c == '_';
 }
 
-static int is_identifier_char(char c) {
+static char is_identifier_char(char c) {
 	return isalnum(c) || c == '_';
 }
 
-Token lexer_next(Lexer *lexer) {
+TokenResult lexer_next(Lexer *lexer) {
 	// Gets the next token from the source
 
 	while (isspace(peek(lexer)) || peek(lexer) == '/') {
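
Note: advance treats '\n' as the only line terminator: it bumps line and resets column to 1, and any other character advances column. An illustrative trace:

    /* Consuming "a\nb" starting at line 1, column 1:
     *   advance -> 'a'  : line 1, column 2
     *   advance -> '\n' : line 2, column 1
     *   advance -> 'b'  : line 2, column 2
     */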
@@ -40,36 +60,38 @@ Token lexer_next(Lexer *lexer) {
 
 	char c = peek(lexer);
 	size_t start = lexer->pos;
+	size_t start_line = lexer->line;
 
 	// End of file tokens
 	if (c == '\0') {
-		return (Token){TOKEN_EOF, &lexer->source[start], 0};
+		return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_EOF, &lexer->source[start], 0, get_file_info(lexer, start, start_line)}};
 	}
 
 	// Identifiers
 	if (is_identifier_start(c)) {
 		while (is_identifier_char(peek(lexer))) advance(lexer);
-		return (Token){TOKEN_IDENTIFIER, &lexer->source[start], lexer->pos - start};
+		return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_IDENTIFIER, &lexer->source[start], lexer->pos - start, get_file_info(lexer, start, start_line)}};
 	}
 
 	// Digits
 	if (isdigit(c)) {
 		while (isdigit(peek(lexer))) advance(lexer);
-		return (Token){TOKEN_NUMBER, &lexer->source[start], lexer->pos - start};
+		return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_NUMBER, &lexer->source[start], lexer->pos - start, get_file_info(lexer, start, start_line)}};
 	}
 
 	advance(lexer);
 	switch (c) {
 		case '=':
-			if (peek(lexer) == '=') return (Token){TOKEN_OPERATOR, &lexer->source[start], 2};
+			if (peek(lexer) == '=') return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_OPERATOR, &lexer->source[start], 2, get_file_info(lexer, start, start_line)}};
 		case '+':
 		case '-':
 		case '*':
 		case '/':
-			return (Token){TOKEN_OPERATOR, &lexer->source[start], 1};
-		case '(': return (Token){TOKEN_LPAREN, &lexer->source[start], 1};
-		case ')': return (Token){TOKEN_RPAREN, &lexer->source[start], 1};
-		case ';': return (Token){TOKEN_SEMICOLON, &lexer->source[start], 1};
-		default: return (Token){TOKEN_UNKNOWN, &lexer->source[start], 1};
+			return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_OPERATOR, &lexer->source[start], 1, get_file_info(lexer, start, start_line)}};
+		case '(': return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_LPAREN, &lexer->source[start], 1, get_file_info(lexer, start, start_line)}};
+		case ')': return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_RPAREN, &lexer->source[start], 1, get_file_info(lexer, start, start_line)}};
+		case ';': return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_SEMICOLON, &lexer->source[start], 1, get_file_info(lexer, start, start_line)}};
+		default:
+			return (TokenResult){SYNC_ERROR, .error = (SyncError){SYNC_LEXER_ERROR, "Unknown token", get_file_info(lexer, start, start_line)}};
 	}
 }
src/main.c (15 changed lines)

@@ -1,4 +1,5 @@
 #include <stdio.h>
+#include "sync/types.h"
 #include "sync/lexer.h"
 
 static void print_token(Token token) {
@@ -15,13 +16,17 @@ int main(void) {
 	const char *source = "sum = a + b123;\nprint(sum);";
 
 	Lexer lexer;
-	lexer_init(&lexer, source);
+	lexer_init(&lexer, "<stdin>", source);
 
-	Token token;
+	TokenResult result;
 	do {
-		token = lexer_next(&lexer);
-		print_token(token);
-	} while (token.type != TOKEN_EOF);
+		result = lexer_next(&lexer);
+		if (result.type == SYNC_RESULT) {
+			print_token(result.result);
+		} else {
+			fprintf(stderr, "Error: %s\n", result.error.message);
+		}
+	} while (result.type != SYNC_ERROR && result.result.type != TOKEN_EOF);
 
 	return 0;
 }
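
Note: assuming the layout implied by the include paths (headers under include/sync/, sources under src/), the demo might be built and run like this (compiler invocation and output name are assumptions, not taken from the commit):

    cc -Iinclude src/lexer.c src/main.c -o sync
    ./sync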
@@ -1,51 +1,52 @@
 #include <assert.h>
 #include <string.h>
 #include <stdio.h>
+#include "../include/sync/types.h"
 #include "../include/sync/lexer.h"
 
 void test_tokenize_simple_assignment(void) {
 	const char *src = "x = 42;";
 	Lexer lexer;
-	lexer_init(&lexer, src);
+	lexer_init(&lexer, "<stdin>", src);
 
-	Token t = lexer_next(&lexer);
-	assert(t.type == TOKEN_IDENTIFIER && strncmp(t.start, "x", t.length) == 0);
+	TokenResult t = lexer_next(&lexer);
+	assert(t.type == SYNC_RESULT && t.result.type == TOKEN_IDENTIFIER && strncmp(t.result.start, "x", t.result.length) == 0);
 
 	t = lexer_next(&lexer);
-	assert(t.type == TOKEN_OPERATOR && strncmp(t.start, "=", t.length) == 0);
+	assert(t.type == SYNC_RESULT && t.result.type == TOKEN_OPERATOR && strncmp(t.result.start, "=", t.result.length) == 0);
 
 	t = lexer_next(&lexer);
-	assert(t.type == TOKEN_NUMBER && strncmp(t.start, "42", t.length) == 0);
+	assert(t.type == SYNC_RESULT && t.result.type == TOKEN_NUMBER && strncmp(t.result.start, "42", t.result.length) == 0);
 
 	t = lexer_next(&lexer);
-	assert(t.type == TOKEN_SEMICOLON);
+	assert(t.type == SYNC_RESULT && t.result.type == TOKEN_SEMICOLON);
 
 	t = lexer_next(&lexer);
-	assert(t.type == TOKEN_EOF);
+	assert(t.type == SYNC_RESULT && t.result.type == TOKEN_EOF);
 }
 
 void test_tokenize_function_call(void) {
 	const char *src = "print(x);";
 	Lexer lexer;
-	lexer_init(&lexer, src);
+	lexer_init(&lexer, "<stdin>", src);
 
-	Token t = lexer_next(&lexer);
-	assert(t.type == TOKEN_IDENTIFIER && strncmp(t.start, "print", t.length) == 0);
+	TokenResult t = lexer_next(&lexer);
+	assert(t.type == SYNC_RESULT && t.result.type == TOKEN_IDENTIFIER && strncmp(t.result.start, "print", t.result.length) == 0);
 
 	t = lexer_next(&lexer);
-	assert(t.type == TOKEN_LPAREN);
+	assert(t.type == SYNC_RESULT && t.result.type == TOKEN_LPAREN);
 
 	t = lexer_next(&lexer);
-	assert(t.type == TOKEN_IDENTIFIER && strncmp(t.start, "x", t.length) == 0);
+	assert(t.type == SYNC_RESULT && t.result.type == TOKEN_IDENTIFIER && strncmp(t.result.start, "x", t.result.length) == 0);
 
 	t = lexer_next(&lexer);
-	assert(t.type == TOKEN_RPAREN);
+	assert(t.type == SYNC_RESULT && t.result.type == TOKEN_RPAREN);
 
 	t = lexer_next(&lexer);
-	assert(t.type == TOKEN_SEMICOLON);
+	assert(t.type == SYNC_RESULT && t.result.type == TOKEN_SEMICOLON);
 
 	t = lexer_next(&lexer);
-	assert(t.type == TOKEN_EOF);
+	assert(t.type == SYNC_RESULT && t.result.type == TOKEN_EOF);
 }
 
 int main(void) {
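
Note: the updated tests exercise only the success variant. A sketch of a test for the new error path, using only the API introduced in this commit (the "@" input and the test name are illustrative):

    void test_tokenize_unknown_character(void) {
    	const char *src = "@";
    	Lexer lexer;
    	lexer_init(&lexer, "<stdin>", src);

    	TokenResult t = lexer_next(&lexer);
    	assert(t.type == SYNC_ERROR && t.error.type == SYNC_LEXER_ERROR);
    	assert(t.error.file_info.line == 1);
    }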