From e174f95d5c61817de840a302e5ad50154bed490d Mon Sep 17 00:00:00 2001 From: Kyler Date: Thu, 12 Jun 2025 21:39:57 -0600 Subject: [PATCH] Expanded number token reading and added more operators and punctuation --- include/sync/lexer.h | 4 ++ src/lexer.c | 100 ++++++++++++++++++++++++++++++++++++------- 2 files changed, 89 insertions(+), 15 deletions(-) diff --git a/include/sync/lexer.h b/include/sync/lexer.h index 2477499..67c6662 100644 --- a/include/sync/lexer.h +++ b/include/sync/lexer.h @@ -12,6 +12,10 @@ typedef enum { TOKEN_LPAREN, TOKEN_RPAREN, TOKEN_SEMICOLON, + TOKEN_LBRACE, + TOKEN_RBRACE, + TOKEN_LBRACKET, + TOKEN_RBRACKET, } TokenType; typedef struct { diff --git a/src/lexer.c b/src/lexer.c index 25c9dee..223c32d 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -39,6 +39,23 @@ static char advance(Lexer *lexer) { return lexer->source[lexer->pos++]; } +typedef enum { + DIGIT_IS_BINARY = 1 << 0, + DIGIT_IS_OCTAL = 1 << 1, + DIGIT_IS_HEXADECIMAL = 1 << 2, +} DigitFlags; + +static char is_digit_start(char c) { + return isdigit(c); +} + +static char is_digit_char(char c, DigitFlags flags) { + if (flags & DIGIT_IS_BINARY) return c == '0' || c == '1' || c == '_'; + if (flags & DIGIT_IS_OCTAL) return isdigit(c) && c < '8' || c == '_'; + if (flags & DIGIT_IS_HEXADECIMAL) return isxdigit(c) || c == '_'; + return isdigit(c) || c == '_'; +} + static char is_identifier_start(char c) { return isalpha(c) || c == '_'; } @@ -47,6 +64,14 @@ static char is_identifier_char(char c) { return isalnum(c) || c == '_'; } +static TokenResult lexer_result(Lexer *lexer, TokenType type, size_t start, size_t start_line) { + return (TokenResult){SYNC_RESULT, .result = (Token){type, &lexer->source[start], lexer->pos - start, get_file_info(lexer, start, start_line)}}; +} + +static TokenResult lexer_error(Lexer *lexer, const char *message, size_t start, size_t start_line) { + return (TokenResult){SYNC_ERROR, .error = (SyncError){SYNC_LEXER_ERROR, message, get_file_info(lexer, start, start_line)}}; +} + TokenResult lexer_next(Lexer *lexer) { // Gets the next token from the source @@ -64,34 +89,79 @@ TokenResult lexer_next(Lexer *lexer) { // End of file tokens if (c == '\0') { - return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_EOF, &lexer->source[start], 0, get_file_info(lexer, start, start_line)}}; + return lexer_result(lexer, TOKEN_EOF, start, start_line); + } + + // Digits + if (isdigit(c)) { + DigitFlags flags = 0; + if (c == '0') { + advance(lexer); + if (peek(lexer) == 'b' || peek(lexer) == 'B') { + flags |= DIGIT_IS_BINARY; advance(lexer); + } else if (peek(lexer) == 'o' || peek(lexer) == 'O') { + flags |= DIGIT_IS_OCTAL; advance(lexer); + } else if (peek(lexer) == 'x' || peek(lexer) == 'X') { + flags |= DIGIT_IS_HEXADECIMAL; advance(lexer); + } else if (is_digit_char(peek(lexer), flags)) { + return lexer_error(lexer, "Invalid number format", start, start_line); + } + } + while (is_digit_char(peek(lexer), flags)) advance(lexer); + if (isspace(peek(lexer)) || peek(lexer) == ';' || peek(lexer) == '\0' || peek(lexer) == ')' || peek(lexer) == '(' || peek(lexer) == ',') { + return lexer_result(lexer, TOKEN_NUMBER, start, start_line); + } else { + return lexer_error(lexer, "Invalid number format", start, start_line); + } } // Identifiers if (is_identifier_start(c)) { while (is_identifier_char(peek(lexer))) advance(lexer); - return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_IDENTIFIER, &lexer->source[start], lexer->pos - start, get_file_info(lexer, start, start_line)}}; - } - - // Digits - if (isdigit(c)) { - while (isdigit(peek(lexer))) advance(lexer); - return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_NUMBER, &lexer->source[start], lexer->pos - start, get_file_info(lexer, start, start_line)}}; + return lexer_result(lexer, TOKEN_IDENTIFIER, start, start_line); } advance(lexer); switch (c) { case '=': - if (peek(lexer) == '=') return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_OPERATOR, &lexer->source[start], 2, get_file_info(lexer, start, start_line)}}; + if (peek(lexer) == '=') advance(lexer); + return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); + case '>': + if (peek(lexer) == '=' || peek(lexer) == '>') advance(lexer); + return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); + case '<': + if (peek(lexer) == '=' || peek(lexer) == '<') advance(lexer); + return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); + case '!': + if (peek(lexer) == '=') advance(lexer); + return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); + case '&': + if (peek(lexer) == '=' || peek(lexer) == '&') advance(lexer); + return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); + case '|': + if (peek(lexer) == '=' || peek(lexer) == '|') advance(lexer); + return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); case '+': + if (peek(lexer) == '=') advance(lexer); + return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); case '-': + if (peek(lexer) == '=') advance(lexer); + return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); case '*': + if (peek(lexer) == '=') advance(lexer); + return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); case '/': - return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_OPERATOR, &lexer->source[start], 1, get_file_info(lexer, start, start_line)}}; - case '(': return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_LPAREN, &lexer->source[start], 1, get_file_info(lexer, start, start_line)}}; - case ')': return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_RPAREN, &lexer->source[start], 1, get_file_info(lexer, start, start_line)}}; - case ';': return (TokenResult){SYNC_RESULT, .result = (Token){TOKEN_SEMICOLON, &lexer->source[start], 1, get_file_info(lexer, start, start_line)}}; - default: - return (TokenResult){SYNC_ERROR, .error = (SyncError){SYNC_LEXER_ERROR, "Unknown token", get_file_info(lexer, start, start_line)}}; + if (peek(lexer) == '=') advance(lexer); + return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); + case '.': return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); + case ',': return lexer_result(lexer, TOKEN_OPERATOR, start, start_line); + case '(': return lexer_result(lexer, TOKEN_LPAREN, start, start_line); + case ')': return lexer_result(lexer, TOKEN_RPAREN, start, start_line); + case ';': return lexer_result(lexer, TOKEN_SEMICOLON, start, start_line); + case '}': return lexer_result(lexer, TOKEN_RBRACE, start, start_line); + case '{': return lexer_result(lexer, TOKEN_LBRACE, start, start_line); + case ']': return lexer_result(lexer, TOKEN_RBRACKET, start, start_line); + case '[': return lexer_result(lexer, TOKEN_LBRACKET, start, start_line); + default: return lexer_error(lexer, "Unknown token", start, start_line); } }