#include #include #include "sync/types.h" #include "sync/lexer.h" void lexer_init(Lexer* lexer, const char* filename, const char* source) { lexer->filename = filename; lexer->source = source; lexer->pos = 0; lexer->column = 1; lexer->line = 1; } static FileInfo get_file_info(Lexer* lexer, size_t start, size_t start_line) { return (FileInfo){ .filename = lexer->filename, .line = lexer->line, .column = lexer->column, .length = lexer->pos - start, .lines = lexer->line - start_line }; } static char peek(Lexer* lexer) { return lexer->source[lexer->pos]; } static char far_peek(Lexer* lexer, size_t index) { return lexer->source[lexer->pos + index]; } static void advance(Lexer* lexer) { if (lexer->source[lexer->pos] == '\n') { lexer->line++; lexer->column = 1; } else { lexer->column++; } lexer->source[lexer->pos++]; } typedef enum { DIGIT_IS_BINARY = 1 << 0, DIGIT_IS_OCTAL = 1 << 1, DIGIT_IS_HEXADECIMAL = 1 << 2, } DigitFlags; static char is_digit_start(char c) { return isdigit(c); } static char is_digit_char(char c, DigitFlags flags) { if (flags & DIGIT_IS_BINARY) return c == '0' || c == '1' || c == '_'; if (flags & DIGIT_IS_OCTAL) return isdigit(c) && c < '8' || c == '_'; if (flags & DIGIT_IS_HEXADECIMAL) return isxdigit(c) || c == '_'; return isdigit(c) || c == '_'; } static char is_digit_after(char c) { return isspace(c) || c == '\0' || c == ',' || c == ')' || c == '(' || c == '}' || c == '{' || c == ']' || c == '[' || c == ';'; } static char is_identifier_start(char c) { return isalpha(c) || c == '_'; } static char is_identifier_char(char c) { return isalnum(c) || c == '_'; } static TokenResult lexer_result(Lexer* lexer, TokenType type, size_t start, size_t start_line) { return (TokenResult){SYNC_RESULT, .result = (Token){ type, &lexer->source[start], lexer->pos - start, get_file_info(lexer, start, start_line) }}; } static TokenResult lexer_error(Lexer* lexer, const char* message, size_t start, size_t start_line) { return (TokenResult){SYNC_ERROR, .error = (SyncError){SYNC_LEXER_ERROR, message, get_file_info(lexer, start, start_line)}}; } TokenResult lexer_next(Lexer* lexer) { // Gets the next token from the source while (isspace(peek(lexer)) || peek(lexer) == '/') { // Skip Comments if (peek(lexer) == '/' && far_peek(lexer, 1) == '/') while (peek(lexer) != '\n') advance(lexer); // Skip whitespace while (isspace(peek(lexer))) advance(lexer); } char c = peek(lexer); size_t start = lexer->pos; size_t start_line = lexer->line; // End of file tokens if (c == '\0') { return lexer_result(lexer, TOKEN_EOF, start, start_line); } // Digits if (isdigit(c)) { DigitFlags flags = 0; if (c == '0') { advance(lexer); if (peek(lexer) == 'b' || peek(lexer) == 'B') { flags |= DIGIT_IS_BINARY; advance(lexer); } else if (peek(lexer) == 'o' || peek(lexer) == 'O') { flags |= DIGIT_IS_OCTAL; advance(lexer); } else if (peek(lexer) == 'x' || peek(lexer) == 'X') { flags |= DIGIT_IS_HEXADECIMAL; advance(lexer); } else if (is_digit_char(peek(lexer), flags)) { return lexer_error(lexer, "Invalid number format", start, start_line); } } while (is_digit_char(peek(lexer), flags)) advance(lexer); if (is_digit_after(peek(lexer))) { return lexer_result(lexer, TOKEN_NUMBER, start, start_line); } else { return lexer_error(lexer, "Invalid number format", start, start_line); } } // Characters if (c == '\'') { advance(lexer); // Opening `'` if (peek(lexer) == '\\') advance(lexer); // Slash advance(lexer); // Char if (peek(lexer) == '\'') { advance(lexer); // Closing `'` return lexer_result(lexer, TOKEN_CHARACTER, start, start_line); } else return lexer_error(lexer, "Invalid character format", start, start_line); } // Strings if (c == '"') { advance(lexer); while (peek(lexer) != '"') { if (peek(lexer) == '\\') { advance(lexer); } else if (peek(lexer) == '\n') { return lexer_error(lexer, "Invalid string format", start, start_line); } advance(lexer); } advance(lexer); return lexer_result(lexer, TOKEN_STRING, start, start_line); } // Identifiers if (is_identifier_start(c)) { while (is_identifier_char(peek(lexer))) advance(lexer); return lexer_result(lexer, TOKEN_IDENTIFIER, start, start_line); } advance(lexer); switch (c) { case '=': if (peek(lexer) == '=') advance(lexer); return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); case '>': if (peek(lexer) == '=' || peek(lexer) == '>') advance(lexer); return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); case '<': if (peek(lexer) == '=' || peek(lexer) == '<') advance(lexer); return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); case '!': if (peek(lexer) == '=') advance(lexer); return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); case '&': if (peek(lexer) == '=' || peek(lexer) == '&') advance(lexer); return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); case '|': if (peek(lexer) == '=' || peek(lexer) == '|') advance(lexer); return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); case '+': if (peek(lexer) == '=') advance(lexer); return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); case '-': if (peek(lexer) == '=') advance(lexer); return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); case '*': if (peek(lexer) == '=') advance(lexer); return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); case '/': if (peek(lexer) == '=') advance(lexer); return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); case '.': return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); case ',': return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); case '(': return lexer_result(lexer, TOKEN_LPAREN, start, start_line); case ')': return lexer_result(lexer, TOKEN_RPAREN, start, start_line); case ';': return lexer_result(lexer, TOKEN_SEMICOLON, start, start_line); case '}': return lexer_result(lexer, TOKEN_RBRACE, start, start_line); case '{': return lexer_result(lexer, TOKEN_LBRACE, start, start_line); case ']': return lexer_result(lexer, TOKEN_RBRACKET, start, start_line); case '[': return lexer_result(lexer, TOKEN_LBRACKET, start, start_line); default: return lexer_error(lexer, "Unknown token", start, start_line); } }