Refactor lexer to include filename and line/column tracking; update related functions and tests
This commit is contained in:
parent
e67041cd0e
commit
f6049fc644
|
@ -2,6 +2,7 @@
|
||||||
#define SYNC_LEXER_H
|
#define SYNC_LEXER_H
|
||||||
|
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
|
#include "types.h"
|
||||||
|
|
||||||
typedef enum {
|
typedef enum {
|
||||||
TOKEN_EOF,
|
TOKEN_EOF,
|
||||||
|
@ -11,21 +12,32 @@ typedef enum {
|
||||||
TOKEN_LPAREN,
|
TOKEN_LPAREN,
|
||||||
TOKEN_RPAREN,
|
TOKEN_RPAREN,
|
||||||
TOKEN_SEMICOLON,
|
TOKEN_SEMICOLON,
|
||||||
TOKEN_UNKNOWN,
|
|
||||||
} TokenType;
|
} TokenType;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
TokenType type;
|
TokenType type;
|
||||||
const char *start;
|
const char *start;
|
||||||
size_t length;
|
size_t length;
|
||||||
|
FileInfo file_info;
|
||||||
} Token;
|
} Token;
|
||||||
|
|
||||||
/**
 * Lexer state for one input buffer.
 *
 * The lexer does not own `filename` or `source`; it only stores the pointers
 * handed to lexer_init(). `source` is scanned until a '\0' character.
 */
typedef struct {
    const char *filename;  // Name copied into every FileInfo the lexer produces
    const char *source;    // Text being tokenized
    size_t pos;            // Index in `source` of the next unread character
    size_t column;         // Current column; starts at 1 and resets to 1 after '\n'
    size_t line;           // Current line; starts at 1 and grows by one per '\n'
} Lexer;
|
||||||
|
|
||||||
void lexer_init(Lexer *lexer, const char *source);
|
typedef struct {
|
||||||
Token lexer_next(Lexer *lexer);
|
SyncResultType type;
|
||||||
|
union {
|
||||||
|
Token result;
|
||||||
|
SyncError error;
|
||||||
|
};
|
||||||
|
} TokenResult;
|
||||||
|
|
||||||
|
void lexer_init(Lexer *lexer, const char *filename, const char *source);
|
||||||
|
TokenResult lexer_next(Lexer *lexer);
|
||||||
|
|
||||||
#endif // SYNC_LEXER_H
|
#endif // SYNC_LEXER_H
|
||||||
|
|
|
@ -0,0 +1,39 @@
|
||||||
|
#ifndef SYNC_TYPES_H
|
||||||
|
#define SYNC_TYPES_H
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
|
||||||
|
/**
 * Source-location record attached to tokens and errors.
 *
 * As filled in by the lexer, `line` and `column` are the lexer's position at
 * the moment the record is created, `length` is the number of characters
 * covered, and `lines` is how many line breaks were crossed.
 */
typedef struct {
    const char *filename;  // Name of the input the location refers to
    size_t line;           // Line number (the lexer counts from 1)
    size_t column;         // Column number (the lexer counts from 1)
    size_t length;         // Number of characters covered
    size_t lines;          // Number of line breaks covered; 0 for a single-line span
} FileInfo;
|
||||||
|
|
||||||
|
/** Category of a SyncError, named after the stage that reports it. */
typedef enum {
    SYNC_LEXER_ERROR,   // Reported by the lexer (e.g. an unrecognized character)
    SYNC_PARSER_ERROR,  // Parser-stage error
    SYNC_RUNTIME_ERROR  // Runtime-stage error
} SyncErrorType;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
SyncErrorType type;
|
||||||
|
const char *message;
|
||||||
|
FileInfo file_info;
|
||||||
|
} SyncError;
|
||||||
|
|
||||||
|
/** Discriminator for result-or-error values such as TokenResult. */
typedef enum {
    SYNC_RESULT,  // The operation succeeded; the result member is valid
    SYNC_ERROR,   // The operation failed; the error member is valid
} SyncResultType;
|
||||||
|
|
||||||
|
// typedef struct {
|
||||||
|
// SyncResultType type;
|
||||||
|
// union {
|
||||||
|
// void *result;
|
||||||
|
// SyncError error;
|
||||||
|
// };
|
||||||
|
// } SyncResult;
|
||||||
|
|
||||||
|
#endif // SYNC_TYPES_H
|
48
src/lexer.c
48
src/lexer.c
|
@ -1,10 +1,24 @@
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include "sync/types.h"
|
||||||
#include "sync/lexer.h"
|
#include "sync/lexer.h"
|
||||||
|
|
||||||
/**
 * Point `lexer` at a new input and rewind it to the first character.
 *
 * @param lexer    Lexer state to initialize; every field is overwritten.
 * @param filename Name to report in token and error locations.
 * @param source   Text to tokenize.
 *
 * The pointers are stored, not copied.
 */
void lexer_init(Lexer *lexer, const char *filename, const char *source) {
    lexer->filename = filename;
    lexer->source = source;
    lexer->pos = 0;
    // Positions are reported 1-based, so the first character is line 1, column 1.
    lexer->column = 1;
    lexer->line = 1;
}
|
||||||
|
|
||||||
|
/**
 * Build the FileInfo for a span that began at offset `start` on line
 * `start_line` and ends at the lexer's current position.
 *
 * @param lexer      Lexer whose current state supplies filename, line, column.
 * @param start      Value of lexer->pos when the span began.
 * @param start_line Value of lexer->line when the span began.
 * @return Location record with `length` = characters consumed since `start`
 *         and `lines` = line breaks crossed since `start_line`.
 *
 * NOTE(review): `line` and `column` are read from the lexer at call time, so
 * when this is called after the token has been consumed they describe the
 * position just past the token, not its first character. Confirm this is
 * what consumers of FileInfo expect.
 */
static FileInfo get_file_info(Lexer *lexer, size_t start, size_t start_line) {
    return (FileInfo){
        .filename = lexer->filename,
        .line = lexer->line,
        .column = lexer->column,
        .length = lexer->pos - start,
        .lines = lexer->line - start_line
    };
}
|
||||||
|
|
||||||
static char peek(Lexer *lexer) {
|
static char peek(Lexer *lexer) {
|
||||||
|
@ -16,18 +30,24 @@ static char double_peek(Lexer *lexer) {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Consume one character from the source and keep line/column in sync.
 *
 * A '\n' moves the lexer to column 1 of the next line; any other character
 * moves it one column to the right.
 *
 * @return The character that was consumed.
 */
static char advance(Lexer *lexer) {
    if (lexer->source[lexer->pos] == '\n') {
        lexer->line++;
        lexer->column = 1;
    } else {
        lexer->column++;
    }
    return lexer->source[lexer->pos++];
}
|
||||||
|
|
||||||
/**
 * Report whether `c` may begin an identifier: a letter or an underscore.
 *
 * @return Non-zero if `c` can start an identifier, 0 otherwise.
 */
static int is_identifier_start(char c) {
    // <ctype.h> functions require a value representable as unsigned char (or
    // EOF); a negative plain char would be undefined behavior.
    return isalpha((unsigned char)c) || c == '_';
}
|
||||||
|
|
||||||
/**
 * Report whether `c` may appear inside an identifier: a letter, a digit or
 * an underscore.
 *
 * @return Non-zero if `c` can continue an identifier, 0 otherwise.
 */
static int is_identifier_char(char c) {
    // <ctype.h> functions require a value representable as unsigned char (or
    // EOF); a negative plain char would be undefined behavior.
    return isalnum((unsigned char)c) || c == '_';
}
|
||||||
|
|
||||||
Token lexer_next(Lexer *lexer) {
|
TokenResult lexer_next(Lexer *lexer) {
|
||||||
// Gets the next token from the source
|
// Gets the next token from the source
|
||||||
|
|
||||||
while (isspace(peek(lexer)) || peek(lexer) == '/') {
|
while (isspace(peek(lexer)) || peek(lexer) == '/') {
|
||||||
|
@ -40,36 +60,38 @@ Token lexer_next(Lexer *lexer) {
|
||||||
|
|
||||||
char c = peek(lexer);
|
char c = peek(lexer);
|
||||||
size_t start = lexer->pos;
|
size_t start = lexer->pos;
|
||||||
|
size_t start_line = lexer->line;
|
||||||
|
|
||||||
// End of file tokens
|
// End of file tokens
|
||||||
if (c == '\0') {
|
if (c == '\0') {
|
||||||
return (Token){TOKEN_EOF, &lexer->source[start], 0};
|
return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_EOF, &lexer->source[start], 0, get_file_info(lexer, start, start_line)}};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Identifiers
|
// Identifiers
|
||||||
if (is_identifier_start(c)) {
|
if (is_identifier_start(c)) {
|
||||||
while (is_identifier_char(peek(lexer))) advance(lexer);
|
while (is_identifier_char(peek(lexer))) advance(lexer);
|
||||||
return (Token){TOKEN_IDENTIFIER, &lexer->source[start], lexer->pos - start};
|
return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_IDENTIFIER, &lexer->source[start], lexer->pos - start, get_file_info(lexer, start, start_line)}};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Digits
|
// Digits
|
||||||
if (isdigit(c)) {
|
if (isdigit(c)) {
|
||||||
while (isdigit(peek(lexer))) advance(lexer);
|
while (isdigit(peek(lexer))) advance(lexer);
|
||||||
return (Token){TOKEN_NUMBER, &lexer->source[start], lexer->pos - start};
|
return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_NUMBER, &lexer->source[start], lexer->pos - start, get_file_info(lexer, start, start_line)}};
|
||||||
}
|
}
|
||||||
|
|
||||||
advance(lexer);
|
advance(lexer);
|
||||||
switch (c) {
|
switch (c) {
|
||||||
case '=':
|
case '=':
|
||||||
if (peek(lexer) == '=') return (Token){TOKEN_OPERATOR, &lexer->source[start], 2};
|
if (peek(lexer) == '=') return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_OPERATOR, &lexer->source[start], 2, get_file_info(lexer, start, start_line)}};
|
||||||
case '+':
|
case '+':
|
||||||
case '-':
|
case '-':
|
||||||
case '*':
|
case '*':
|
||||||
case '/':
|
case '/':
|
||||||
return (Token){TOKEN_OPERATOR, &lexer->source[start], 1};
|
return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_OPERATOR, &lexer->source[start], 1, get_file_info(lexer, start, start_line)}};
|
||||||
case '(': return (Token){TOKEN_LPAREN, &lexer->source[start], 1};
|
case '(': return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_LPAREN, &lexer->source[start], 1, get_file_info(lexer, start, start_line)}};
|
||||||
case ')': return (Token){TOKEN_RPAREN, &lexer->source[start], 1};
|
case ')': return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_RPAREN, &lexer->source[start], 1, get_file_info(lexer, start, start_line)}};
|
||||||
case ';': return (Token){TOKEN_SEMICOLON, &lexer->source[start], 1};
|
case ';': return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_SEMICOLON, &lexer->source[start], 1, get_file_info(lexer, start, start_line)}};
|
||||||
default: return (Token){TOKEN_UNKNOWN, &lexer->source[start], 1};
|
default:
|
||||||
|
return (TokenResult){SYNC_ERROR, .error = (SyncError){SYNC_LEXER_ERROR, "Unknown token", get_file_info(lexer, start, start_line)}};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
15
src/main.c
15
src/main.c
|
@ -1,4 +1,5 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
#include "sync/types.h"
|
||||||
#include "sync/lexer.h"
|
#include "sync/lexer.h"
|
||||||
|
|
||||||
static void print_token(Token token) {
|
static void print_token(Token token) {
|
||||||
|
@ -15,13 +16,17 @@ int main(void) {
|
||||||
const char *source = "sum = a + b123;\nprint(sum);";
|
const char *source = "sum = a + b123;\nprint(sum);";
|
||||||
|
|
||||||
Lexer lexer;
|
Lexer lexer;
|
||||||
lexer_init(&lexer, source);
|
lexer_init(&lexer, "<stdin>", source);
|
||||||
|
|
||||||
Token token;
|
TokenResult result;
|
||||||
do {
|
do {
|
||||||
token = lexer_next(&lexer);
|
result = lexer_next(&lexer);
|
||||||
print_token(token);
|
if (result.type == SYNC_RESULT) {
|
||||||
} while (token.type != TOKEN_EOF);
|
print_token(result.result);
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "Error: %s\n", result.error.message);
|
||||||
|
}
|
||||||
|
} while (result.type != SYNC_ERROR && result.result.type != TOKEN_EOF);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,51 +1,52 @@
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
#include "../include/sync/types.h"
|
||||||
#include "../include/sync/lexer.h"
|
#include "../include/sync/lexer.h"
|
||||||
|
|
||||||
void test_tokenize_simple_assignment(void) {
|
void test_tokenize_simple_assignment(void) {
|
||||||
const char *src = "x = 42;";
|
const char *src = "x = 42;";
|
||||||
Lexer lexer;
|
Lexer lexer;
|
||||||
lexer_init(&lexer, src);
|
lexer_init(&lexer, "<stdin>", src);
|
||||||
|
|
||||||
Token t = lexer_next(&lexer);
|
TokenResult t = lexer_next(&lexer);
|
||||||
assert(t.type == TOKEN_IDENTIFIER && strncmp(t.start, "x", t.length) == 0);
|
assert(t.type == SYNC_RESULT && t.result.type == TOKEN_IDENTIFIER && strncmp(t.result.start, "x", t.result.length) == 0);
|
||||||
|
|
||||||
t = lexer_next(&lexer);
|
t = lexer_next(&lexer);
|
||||||
assert(t.type == TOKEN_OPERATOR && strncmp(t.start, "=", t.length) == 0);
|
assert(t.type == SYNC_RESULT && t.result.type == TOKEN_OPERATOR && strncmp(t.result.start, "=", t.result.length) == 0);
|
||||||
|
|
||||||
t = lexer_next(&lexer);
|
t = lexer_next(&lexer);
|
||||||
assert(t.type == TOKEN_NUMBER && strncmp(t.start, "42", t.length) == 0);
|
assert(t.type == SYNC_RESULT && t.result.type == TOKEN_NUMBER && strncmp(t.result.start, "42", t.result.length) == 0);
|
||||||
|
|
||||||
t = lexer_next(&lexer);
|
t = lexer_next(&lexer);
|
||||||
assert(t.type == TOKEN_SEMICOLON);
|
assert(t.type == SYNC_RESULT && t.result.type == TOKEN_SEMICOLON);
|
||||||
|
|
||||||
t = lexer_next(&lexer);
|
t = lexer_next(&lexer);
|
||||||
assert(t.type == TOKEN_EOF);
|
assert(t.type == SYNC_RESULT && t.result.type == TOKEN_EOF);
|
||||||
}
|
}
|
||||||
|
|
||||||
void test_tokenize_function_call(void) {
|
void test_tokenize_function_call(void) {
|
||||||
const char *src = "print(x);";
|
const char *src = "print(x);";
|
||||||
Lexer lexer;
|
Lexer lexer;
|
||||||
lexer_init(&lexer, src);
|
lexer_init(&lexer, "<stdin>", src);
|
||||||
|
|
||||||
Token t = lexer_next(&lexer);
|
TokenResult t = lexer_next(&lexer);
|
||||||
assert(t.type == TOKEN_IDENTIFIER && strncmp(t.start, "print", t.length) == 0);
|
assert(t.type == SYNC_RESULT && t.result.type == TOKEN_IDENTIFIER && strncmp(t.result.start, "print", t.result.length) == 0);
|
||||||
|
|
||||||
t = lexer_next(&lexer);
|
t = lexer_next(&lexer);
|
||||||
assert(t.type == TOKEN_LPAREN);
|
assert(t.type == SYNC_RESULT && t.result.type == TOKEN_LPAREN);
|
||||||
|
|
||||||
t = lexer_next(&lexer);
|
t = lexer_next(&lexer);
|
||||||
assert(t.type == TOKEN_IDENTIFIER && strncmp(t.start, "x", t.length) == 0);
|
assert(t.type == SYNC_RESULT && t.result.type == TOKEN_IDENTIFIER && strncmp(t.result.start, "x", t.result.length) == 0);
|
||||||
|
|
||||||
t = lexer_next(&lexer);
|
t = lexer_next(&lexer);
|
||||||
assert(t.type == TOKEN_RPAREN);
|
assert(t.type == SYNC_RESULT && t.result.type == TOKEN_RPAREN);
|
||||||
|
|
||||||
t = lexer_next(&lexer);
|
t = lexer_next(&lexer);
|
||||||
assert(t.type == TOKEN_SEMICOLON);
|
assert(t.type == SYNC_RESULT && t.result.type == TOKEN_SEMICOLON);
|
||||||
|
|
||||||
t = lexer_next(&lexer);
|
t = lexer_next(&lexer);
|
||||||
assert(t.type == TOKEN_EOF);
|
assert(t.type == SYNC_RESULT && t.result.type == TOKEN_EOF);
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(void) {
|
int main(void) {
|
||||||
|
|
Loading…
Reference in New Issue