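/*
 * Hand-written lexer for the sync front end. lexer_next() scans the
 * NUL-terminated source buffer one token per call, tracking line and column
 * for diagnostics; tokens and errors are returned through TokenResult.
 */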
#include <ctype.h>
#include <string.h>
#include "sync/types.h"
#include "sync/lexer.h"

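// The lexer borrows `filename` and `source` (no copies are made), so both
// pointers must remain valid for as long as the lexer and its tokens are used.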
void lexer_init(Lexer* lexer, const char* filename, const char* source) {
    lexer->filename = filename;
    lexer->source = source;
    lexer->pos = 0;
    lexer->column = 1;
    lexer->line = 1;
}

static FileInfo get_file_info(Lexer* lexer, size_t start, size_t start_line) {
    // Builds source-location info for the token or error that ends at the
    // lexer's current position; line/column are therefore the end position,
    // while length/lines describe the span back to `start`/`start_line`.
    return (FileInfo){
        .filename = lexer->filename,
        .line = lexer->line,
        .column = lexer->column,
        .length = lexer->pos - start,
        .lines = lexer->line - start_line
    };
}

static char peek(Lexer* lexer) {
    // Current character without consuming it
    return lexer->source[lexer->pos];
}

static char far_peek(Lexer* lexer, size_t index) {
    // Character `index` positions past the current one
    return lexer->source[lexer->pos + index];
}

static void advance(Lexer* lexer) {
    // Consume one character, keeping the line/column counters in sync
    if (lexer->source[lexer->pos] == '\n') {
        lexer->line++;
        lexer->column = 1;
    } else {
        lexer->column++;
    }
    lexer->pos++;
}

typedef enum {
    DIGIT_IS_BINARY = 1 << 0,
    DIGIT_IS_OCTAL = 1 << 1,
    DIGIT_IS_HEXADECIMAL = 1 << 2,
} DigitFlags;

static char is_digit_start(char c) {
    return isdigit(c);
}

static char is_digit_char(char c, DigitFlags flags) {
    if (flags & DIGIT_IS_BINARY) return c == '0' || c == '1' || c == '_';
    if (flags & DIGIT_IS_OCTAL) return (isdigit(c) && c < '8') || c == '_';
    if (flags & DIGIT_IS_HEXADECIMAL) return isxdigit(c) || c == '_';
    return isdigit(c) || c == '_';
}

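// Together with the prefix handling in lexer_next(), these flags accept
// literals such as 0b1010_1010, 0o755, 0xFF and 1_000_000; the '_' digit
// separator is kept in the token text rather than stripped here.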
// Characters that may legally follow a number literal
static char is_digit_after(char c) {
    return
        isspace(c) ||
        c == '\0' ||
        c == ',' ||
        c == ')' ||
        c == '(' ||
        c == '}' ||
        c == '{' ||
        c == ']' ||
        c == '[' ||
        c == ';';
}

static char is_identifier_start(char c) {
    return isalpha(c) || c == '_';
}

static char is_identifier_char(char c) {
    return isalnum(c) || c == '_';
}

static TokenResult lexer_result(Lexer* lexer, TokenType type, size_t start, size_t start_line) {
    return (TokenResult){SYNC_RESULT, .result = (Token){
        type,
        &lexer->source[start],
        lexer->pos - start,
        get_file_info(lexer, start, start_line)
    }};
}

static TokenResult lexer_error(Lexer* lexer, const char* message, size_t start, size_t start_line) {
    return (TokenResult){SYNC_ERROR, .error = (SyncError){SYNC_LEXER_ERROR, message, get_file_info(lexer, start, start_line)}};
}

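// Tokens and errors are zero-copy: the Token text is a pointer into the
// original source plus a length, so no allocation happens in the lexer and
// the source buffer must outlive every token produced from it.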
TokenResult lexer_next(Lexer* lexer) {
    // Gets the next token from the source

    // Skip whitespace and `//` line comments. A lone '/' is left for the
    // operator switch below, and comments stop at the newline or end of input.
    while (isspace(peek(lexer)) || (peek(lexer) == '/' && far_peek(lexer, 1) == '/')) {
        // Skip comments
        if (peek(lexer) == '/' && far_peek(lexer, 1) == '/')
            while (peek(lexer) != '\n' && peek(lexer) != '\0') advance(lexer);
        // Skip whitespace
        while (isspace(peek(lexer))) advance(lexer);
    }

    char c = peek(lexer);
    size_t start = lexer->pos;
    size_t start_line = lexer->line;

    // End of file tokens
    if (c == '\0') {
        return lexer_result(lexer, TOKEN_EOF, start, start_line);
    }

    // Digits
    if (isdigit(c)) {
        DigitFlags flags = 0;
        if (c == '0') {
            advance(lexer);
            if (peek(lexer) == 'b' || peek(lexer) == 'B') {
                flags |= DIGIT_IS_BINARY; advance(lexer);
            } else if (peek(lexer) == 'o' || peek(lexer) == 'O') {
                flags |= DIGIT_IS_OCTAL; advance(lexer);
            } else if (peek(lexer) == 'x' || peek(lexer) == 'X') {
                flags |= DIGIT_IS_HEXADECIMAL; advance(lexer);
            } else if (is_digit_char(peek(lexer), flags)) {
                // A plain `0` may not be followed by more digits or `_` (no leading zeros)
                return lexer_error(lexer, "Invalid number format", start, start_line);
            }
        }
        while (is_digit_char(peek(lexer), flags)) advance(lexer);
        if (is_digit_after(peek(lexer))) {
            return lexer_result(lexer, TOKEN_NUMBER, start, start_line);
        } else {
            return lexer_error(lexer, "Invalid number format", start, start_line);
        }
    }

    // Characters
    if (c == '\'') {
        advance(lexer); // Opening `'`
        if (peek(lexer) == '\\') advance(lexer); // Escape backslash
        if (peek(lexer) == '\0') return lexer_error(lexer, "Invalid character format", start, start_line);
        advance(lexer); // Char
        if (peek(lexer) == '\'') {
            advance(lexer); // Closing `'`
            return lexer_result(lexer, TOKEN_CHARACTER, start, start_line);
        } else return lexer_error(lexer, "Invalid character format", start, start_line);
    }

    // Strings
    if (c == '"') {
        advance(lexer); // Opening `"`
        while (peek(lexer) != '"') {
            if (peek(lexer) == '\\') {
                advance(lexer); // Skip the escape backslash; the escaped char is consumed below
            } else if (peek(lexer) == '\n' || peek(lexer) == '\0') {
                return lexer_error(lexer, "Invalid string format", start, start_line);
            }
            advance(lexer);
        }
        advance(lexer); // Closing `"`
        return lexer_result(lexer, TOKEN_STRING, start, start_line);
    }

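    // Escape sequences are not decoded here: the string token keeps the raw
    // source text, including the surrounding quotes and any backslashes.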
    // Identifiers
    if (is_identifier_start(c)) {
        while (is_identifier_char(peek(lexer))) advance(lexer);
        return lexer_result(lexer, TOKEN_IDENTIFIER, start, start_line);
    }

    // Operators and punctuation. The character has already been consumed;
    // for two-character operators (==, >=, >>, &&, +=, ...) the second
    // character is folded into the same token.
    advance(lexer);
    switch (c) {
        case '=':
            if (peek(lexer) == '=') advance(lexer);
            return lexer_result(lexer, TOKEN_OPERATOR, start, start_line);
        case '>':
            if (peek(lexer) == '=' || peek(lexer) == '>') advance(lexer);
            return lexer_result(lexer, TOKEN_OPERATOR, start, start_line);
        case '<':
            if (peek(lexer) == '=' || peek(lexer) == '<') advance(lexer);
            return lexer_result(lexer, TOKEN_OPERATOR, start, start_line);
        case '!':
            if (peek(lexer) == '=') advance(lexer);
            return lexer_result(lexer, TOKEN_OPERATOR, start, start_line);
        case '&':
            if (peek(lexer) == '=' || peek(lexer) == '&') advance(lexer);
            return lexer_result(lexer, TOKEN_OPERATOR, start, start_line);
        case '|':
            if (peek(lexer) == '=' || peek(lexer) == '|') advance(lexer);
            return lexer_result(lexer, TOKEN_OPERATOR, start, start_line);
        case '+':
            if (peek(lexer) == '=') advance(lexer);
            return lexer_result(lexer, TOKEN_OPERATOR, start, start_line);
        case '-':
            if (peek(lexer) == '=') advance(lexer);
            return lexer_result(lexer, TOKEN_OPERATOR, start, start_line);
        case '*':
            if (peek(lexer) == '=') advance(lexer);
            return lexer_result(lexer, TOKEN_OPERATOR, start, start_line);
        case '/':
            if (peek(lexer) == '=') advance(lexer);
            return lexer_result(lexer, TOKEN_OPERATOR, start, start_line);
        case '.': return lexer_result(lexer, TOKEN_OPERATOR, start, start_line);
        case ',': return lexer_result(lexer, TOKEN_OPERATOR, start, start_line);
        case '(': return lexer_result(lexer, TOKEN_LPAREN, start, start_line);
        case ')': return lexer_result(lexer, TOKEN_RPAREN, start, start_line);
        case ';': return lexer_result(lexer, TOKEN_SEMICOLON, start, start_line);
        case '}': return lexer_result(lexer, TOKEN_RBRACE, start, start_line);
        case '{': return lexer_result(lexer, TOKEN_LBRACE, start, start_line);
        case ']': return lexer_result(lexer, TOKEN_RBRACKET, start, start_line);
        case '[': return lexer_result(lexer, TOKEN_LBRACKET, start, start_line);
        default: return lexer_error(lexer, "Unknown token", start, start_line);
    }
}
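/*
 * Usage sketch (illustrative only): a typical caller pulls tokens in a loop
 * until it sees TOKEN_EOF or an error result. The member names `kind`,
 * `type`, `text` and `length`, the file name "main.sync" and the `report()`
 * helper are placeholders for this sketch; the real definitions live in
 * "sync/types.h".
 *
 *     Lexer lexer;
 *     lexer_init(&lexer, "main.sync", source);
 *     for (;;) {
 *         TokenResult r = lexer_next(&lexer);
 *         if (r.kind == SYNC_ERROR) { report(&r.error); break; }
 *         if (r.result.type == TOKEN_EOF) break;
 *         printf("%.*s\n", (int)r.result.length, r.result.text);
 *     }
 */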