sync/src/lexer.c

213 lines
7.2 KiB
C

#include <ctype.h>
#include <string.h>
#include "sync/types.h"
#include "sync/lexer.h"
// Prepares `lexer` to scan `source` from its first character.
//
// lexer    - lexer state to (re)initialise.
// filename - name stored on the lexer; the pointer is kept, not copied.
// source   - text to scan; the pointer is kept, not copied.
//
// The cursor is placed at offset 0, which is reported as line 1, column 1.
void lexer_init(Lexer* lexer, const char* filename, const char* source) {
    // Input being scanned
    lexer->source = source;
    lexer->filename = filename;

    // Cursor: byte offset plus 1-based line/column
    lexer->pos = 0;
    lexer->line = 1;
    lexer->column = 1;
}
// Builds the location record for the span that began at offset `start` on
// line `start_line` and ends at the lexer's current cursor.
//
// `line` and `column` are taken from the lexer's current position, `length`
// is the distance from `start` to the current offset, and `lines` is the
// difference between the current line and `start_line`.
static FileInfo get_file_info(Lexer* lexer, size_t start, size_t start_line) {
    FileInfo info = {
        .filename = lexer->filename,
        .length = lexer->pos - start,
        .lines = lexer->line - start_line,
        .line = lexer->line,
        .column = lexer->column
    };
    return info;
}
// Returns the character under the cursor without consuming it.
static char peek(Lexer* lexer) {
    return *(lexer->source + lexer->pos);
}
// Returns the character `index` positions ahead of the cursor without
// consuming anything. No bounds check is performed: the caller must know
// that the requested offset still lies inside the source buffer.
static char far_peek(Lexer* lexer, size_t index) {
    return *(lexer->source + (lexer->pos + index));
}
// Consumes the character under the cursor and updates line/column tracking.
//
// A newline moves to the next line and resets the column to 1; any other
// character moves one column to the right. No end-of-input check is done
// here, so callers must not advance past the terminating '\0'.
static void advance(Lexer* lexer) {
    if (lexer->source[lexer->pos] == '\n') {
        lexer->line++;
        lexer->column = 1;
    } else {
        lexer->column++;
    }
    // Only the cursor needs to move; the character itself is not needed here.
    lexer->pos++;
}
// Bit flags selecting which digit set a numeric literal uses.
// No flag set means plain decimal digits.
typedef enum {
    DIGIT_IS_BINARY      = 0x1, // digits 0-1
    DIGIT_IS_OCTAL       = 0x2, // digits 0-7
    DIGIT_IS_HEXADECIMAL = 0x4, // digits 0-9, a-f, A-F
} DigitFlags;
// Returns 1 if `c` can begin a numeric literal (a decimal digit), else 0.
static char is_digit_start(char c) {
    // isdigit() returns an arbitrary non-zero int for a match; normalise it
    // to 0/1 so narrowing to the `char` return type cannot turn it into 0.
    // The unsigned char cast keeps negative `char` values well-defined.
    return isdigit((unsigned char)c) != 0;
}
// Returns 1 if `c` may appear inside a numeric literal of the kind selected
// by `flags`, else 0. The underscore is accepted in every base.
//
// Flags are checked in the order binary, octal, hexadecimal; with no flag
// set the literal is decimal.
static char is_digit_char(char c, DigitFlags flags) {
    // <ctype.h> functions require a value representable as unsigned char.
    const unsigned char uc = (unsigned char)c;

    if (c == '_') {
        return 1;
    }
    if (flags & DIGIT_IS_BINARY) {
        return c == '0' || c == '1';
    }
    if (flags & DIGIT_IS_OCTAL) {
        return isdigit(uc) && c < '8';
    }
    if (flags & DIGIT_IS_HEXADECIMAL) {
        return isxdigit(uc) != 0;
    }
    return isdigit(uc) != 0;
}
// Returns 1 if `c` is allowed to directly follow a numeric literal, else 0.
//
// Accepted followers are whitespace, end of input, ',', ';' and any of the
// bracket characters ( ) { } [ ].
// NOTE(review): operator characters such as '+' are not in this set, so a
// number must be separated from a following operator — confirm this is
// intended.
static char is_digit_after(char c) {
    switch (c) {
        case '\0':
        case ',':
        case ';':
        case '(':
        case ')':
        case '{':
        case '}':
        case '[':
        case ']':
            return 1;
        default:
            // Cast keeps negative `char` values well-defined for isspace().
            return isspace((unsigned char)c) != 0;
    }
}
// Returns 1 if `c` can begin an identifier (a letter or '_'), else 0.
static char is_identifier_start(char c) {
    // Cast keeps negative `char` values well-defined for isalpha().
    return c == '_' || isalpha((unsigned char)c);
}
// Returns 1 if `c` can continue an identifier (a letter, a digit or '_'),
// else 0.
static char is_identifier_char(char c) {
    // Cast keeps negative `char` values well-defined for isalnum().
    return c == '_' || isalnum((unsigned char)c);
}
// Wraps a successfully scanned token in a TokenResult tagged SYNC_RESULT.
//
// The token covers the source from offset `start` up to the current cursor.
// Its text pointer refers directly into the lexer's source buffer (nothing
// is copied), followed by its length and its location record.
static TokenResult lexer_result(Lexer* lexer, TokenType type, size_t start, size_t start_line) {
    Token token = {
        type,
        lexer->source + start,
        lexer->pos - start,
        get_file_info(lexer, start, start_line)
    };
    TokenResult outcome = {SYNC_RESULT, .result = token};
    return outcome;
}
// Wraps a lexing failure in a TokenResult tagged SYNC_ERROR.
//
// The error is a SYNC_LEXER_ERROR carrying `message` (the pointer is stored
// as-is) and the location record for the span from `start` to the cursor.
static TokenResult lexer_error(Lexer* lexer, const char* message, size_t start, size_t start_line) {
    SyncError failure = {
        SYNC_LEXER_ERROR,
        message,
        get_file_info(lexer, start, start_line)
    };
    TokenResult outcome = {SYNC_ERROR, .error = failure};
    return outcome;
}
// Skips whitespace and `//` line comments in front of the next token.
// A '/' that does not start a comment is left in place so that it can be
// lexed as an operator.
static void skip_trivia(Lexer* lexer) {
    for (;;) {
        char c = peek(lexer);
        if (isspace((unsigned char)c)) {
            advance(lexer);
        } else if (c == '/' && far_peek(lexer, 1) == '/') {
            // A comment ends at the newline, or at end of input when the
            // last line has no trailing newline.
            while (peek(lexer) != '\n' && peek(lexer) != '\0') {
                advance(lexer);
            }
        } else {
            return;
        }
    }
}

// Scans a numeric literal whose first digit is under the cursor.
// Supports decimal literals and the 0b/0o/0x prefixed forms.
static TokenResult scan_number(Lexer* lexer, size_t start, size_t start_line) {
    DigitFlags flags = 0;

    if (peek(lexer) == '0') {
        advance(lexer);
        switch (peek(lexer)) {
            case 'b': case 'B': flags = DIGIT_IS_BINARY; break;
            case 'o': case 'O': flags = DIGIT_IS_OCTAL; break;
            case 'x': case 'X': flags = DIGIT_IS_HEXADECIMAL; break;
            default: break;
        }
        if (flags != 0) {
            advance(lexer); // Radix letter
            // A bare prefix such as `0x` is not a number: require at least
            // one digit (or separator) character after it.
            if (!is_digit_char(peek(lexer), flags)) {
                return lexer_error(lexer, "Invalid number format", start, start_line);
            }
        } else if (is_digit_char(peek(lexer), flags)) {
            // A leading zero may not be followed by further decimal digits
            return lexer_error(lexer, "Invalid number format", start, start_line);
        }
    }

    while (is_digit_char(peek(lexer), flags)) {
        advance(lexer);
    }

    if (!is_digit_after(peek(lexer))) {
        return lexer_error(lexer, "Invalid number format", start, start_line);
    }
    return lexer_result(lexer, TOKEN_NUMBER, start, start_line);
}

// Scans a character literal whose opening `'` is under the cursor.
static TokenResult scan_character(Lexer* lexer, size_t start, size_t start_line) {
    advance(lexer); // Opening `'`
    if (peek(lexer) == '\\') {
        advance(lexer); // Slash
    }
    // Never step over the terminator: input ended inside the literal
    if (peek(lexer) == '\0') {
        return lexer_error(lexer, "Invalid character format", start, start_line);
    }
    advance(lexer); // Char
    if (peek(lexer) != '\'') {
        return lexer_error(lexer, "Invalid character format", start, start_line);
    }
    advance(lexer); // Closing `'`
    return lexer_result(lexer, TOKEN_CHARACTER, start, start_line);
}

// Scans a string literal whose opening `"` is under the cursor.
// A backslash escapes the following character (including a newline); an
// unescaped newline or end of input inside the string is an error.
static TokenResult scan_string(Lexer* lexer, size_t start, size_t start_line) {
    advance(lexer); // Opening `"`
    while (peek(lexer) != '"') {
        if (peek(lexer) == '\\') {
            advance(lexer); // Slash; the escaped character is consumed below
        } else if (peek(lexer) == '\n') {
            return lexer_error(lexer, "Invalid string format", start, start_line);
        }
        // Unterminated string (possibly right after a trailing backslash)
        if (peek(lexer) == '\0') {
            return lexer_error(lexer, "Invalid string format", start, start_line);
        }
        advance(lexer);
    }
    advance(lexer); // Closing `"`
    return lexer_result(lexer, TOKEN_STRING, start, start_line);
}

// Gets the next token from the source.
//
// Leading whitespace and `//` comments are skipped, then one token is
// scanned starting at the cursor. Returns a SYNC_RESULT holding the token
// (TOKEN_EOF once the terminating '\0' is reached) or a SYNC_ERROR
// describing malformed input. The cursor is left after the consumed text.
TokenResult lexer_next(Lexer* lexer) {
    skip_trivia(lexer);

    char c = peek(lexer);
    size_t start = lexer->pos;
    size_t start_line = lexer->line;

    // End of file tokens
    if (c == '\0') {
        return lexer_result(lexer, TOKEN_EOF, start, start_line);
    }
    // Digits
    if (isdigit((unsigned char)c)) {
        return scan_number(lexer, start, start_line);
    }
    // Characters
    if (c == '\'') {
        return scan_character(lexer, start, start_line);
    }
    // Strings
    if (c == '"') {
        return scan_string(lexer, start, start_line);
    }
    // Identifiers
    if (is_identifier_start(c)) {
        while (is_identifier_char(peek(lexer))) {
            advance(lexer);
        }
        return lexer_result(lexer, TOKEN_IDENTIFIER, start, start_line);
    }

    // Operators and punctuation: consume the first character, then look
    // for an optional second one.
    advance(lexer);
    switch (c) {
        // `x` or `x=`
        case '=': case '!': case '+': case '-': case '*': case '/':
            if (peek(lexer) == '=') {
                advance(lexer);
            }
            return lexer_result(lexer, TOKEN_OPERATOR, start, start_line);
        // `x`, `x=` or the doubled form `xx`
        case '>': case '<': case '&': case '|':
            if (peek(lexer) == '=' || peek(lexer) == c) {
                advance(lexer);
            }
            return lexer_result(lexer, TOKEN_OPERATOR, start, start_line);
        case '.':
        case ',':
            return lexer_result(lexer, TOKEN_OPERATOR, start, start_line);
        case '(': return lexer_result(lexer, TOKEN_LPAREN, start, start_line);
        case ')': return lexer_result(lexer, TOKEN_RPAREN, start, start_line);
        case ';': return lexer_result(lexer, TOKEN_SEMICOLON, start, start_line);
        case '}': return lexer_result(lexer, TOKEN_RBRACE, start, start_line);
        case '{': return lexer_result(lexer, TOKEN_LBRACE, start, start_line);
        case ']': return lexer_result(lexer, TOKEN_RBRACKET, start, start_line);
        case '[': return lexer_result(lexer, TOKEN_LBRACKET, start, start_line);
        default: return lexer_error(lexer, "Unknown token", start, start_line);
    }
}