Basic Lexer

2025-06-12 00:06:17 -06:00 · 2025-06-12 00:06:17 -06:00 · e67041cd0e
parent b1c1eb9e5b
commit e67041cd0e
6 changed files with 217 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
+build/
+.vscode/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -0,0 +1,25 @@
+cmake_minimum_required(VERSION 3.13)
+project(sync C)
+
+# Build as C99 and fail at configure time if the compiler cannot provide it.
+set(CMAKE_C_STANDARD 99)
+set(CMAKE_C_STANDARD_REQUIRED ON)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+
+include_directories(include)
+
+# ---- Source Library (exclude main.c) ----
+# file(GLOB) yields absolute paths rooted at the current source directory, so
+# the REMOVE_ITEM path must be built from the same root or main.c is not removed
+# when this project is pulled in with add_subdirectory().
+# CONFIGURE_DEPENDS re-runs the glob at build time so new files are picked up.
+file(GLOB SYNC_LIB_SRC CONFIGURE_DEPENDS src/*.c)
+list(REMOVE_ITEM SYNC_LIB_SRC "${CMAKE_CURRENT_SOURCE_DIR}/src/main.c")
+add_library(sync_lib STATIC ${SYNC_LIB_SRC})
+
+# ---- sync executable ----
+add_executable(sync src/main.c)
+target_link_libraries(sync PRIVATE sync_lib)
+
+# ---- Tests ----
+# Each test/test_*.c becomes its own executable and is registered with CTest,
+# so `ctest` runs the whole suite.
+enable_testing()
+file(GLOB TEST_FILES CONFIGURE_DEPENDS test/test_*.c)
+foreach(TEST_FILE ${TEST_FILES})
+    get_filename_component(TEST_NAME ${TEST_FILE} NAME_WE)
+    add_executable(${TEST_NAME} ${TEST_FILE})
+    target_link_libraries(${TEST_NAME} PRIVATE sync_lib)
+    target_include_directories(${TEST_NAME} PRIVATE include)
+    add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME})
+endforeach()
--- a/include/sync/lexer.h
+++ b/include/sync/lexer.h
@ -0,0 +1,31 @@
+#ifndef SYNC_LEXER_H
+#define SYNC_LEXER_H
+
+#include <stddef.h>
+
+// Kinds of token the lexer can produce.
+// The numeric values are spelled out because code may index tables by kind;
+// keep any such table in the same order as this list.
+typedef enum {
+    TOKEN_EOF        = 0,  // end of input
+    TOKEN_IDENTIFIER = 1,  // letter or '_' followed by letters, digits or '_'
+    TOKEN_NUMBER     = 2,  // run of decimal digits
+    TOKEN_OPERATOR   = 3,  // an operator such as `=` or `+`
+    TOKEN_LPAREN     = 4,  // '('
+    TOKEN_RPAREN     = 5,  // ')'
+    TOKEN_SEMICOLON  = 6,  // ';'
+    TOKEN_UNKNOWN    = 7,  // any other single character
+} TokenType;
+
+typedef struct { // One lexeme: a typed slice of the source text (nothing is copied).
+    TokenType type; // what kind of token this is
+    const char *start; // first character of the lexeme, pointing into the lexer's source; not NUL-terminated
+    size_t length; // number of characters in the lexeme (0 for TOKEN_EOF)
+} Token;
+
+typedef struct { // Scanner state: the input text plus a cursor into it.
+    const char *source; // input text; scanning stops at its '\0' terminator
+    size_t pos; // index into source of the next unread character
+} Lexer;
+
+void lexer_init(Lexer *lexer, const char *source); // Point lexer at source (pointer stored, text not copied) and reset the cursor to 0.
+Token lexer_next(Lexer *lexer); // Skip whitespace and `//` comments, then return the next token; yields TOKEN_EOF at the terminator.
+
+#endif // SYNC_LEXER_H
--- a/src/lexer.c
+++ b/src/lexer.c
@ -0,0 +1,75 @@
+#include <ctype.h>
+#include <string.h>
+#include "sync/lexer.h"
+
+// Bind `lexer` to `source` and rewind the cursor to the first character.
+// Only the pointer is stored; the text itself is not copied.
+void lexer_init(Lexer *lexer, const char *source) {
+    *lexer = (Lexer){ .source = source, .pos = 0 };
+}
+
+// Return the character under the cursor without consuming it.
+static char peek(Lexer *lexer) {
+    const char *cursor = lexer->source + lexer->pos;
+    return *cursor;
+}
+
+// Return the character one past the cursor without consuming anything.
+// When the cursor already sits on the '\0' terminator there is no next
+// character, so '\0' is returned instead of reading beyond the string.
+static char double_peek(Lexer *lexer) {
+    const char *cursor = lexer->source + lexer->pos;
+    if (*cursor == '\0') {
+        return '\0';
+    }
+    return cursor[1];
+}
+
+// Consume the character under the cursor and return it.
+static char advance(Lexer *lexer) {
+    char consumed = lexer->source[lexer->pos];
+    lexer->pos += 1;
+    return consumed;
+}
+
+// Non-zero if `c` may begin an identifier: a letter or an underscore.
+static int is_identifier_start(char c) {
+    // <ctype.h> functions require a value representable as unsigned char (or
+    // EOF); a negative plain char would be undefined behaviour.
+    return isalpha((unsigned char)c) || c == '_';
+}
+
+// Non-zero if `c` may continue an identifier: a letter, digit or underscore.
+static int is_identifier_char(char c) {
+    // <ctype.h> functions require a value representable as unsigned char (or
+    // EOF); a negative plain char would be undefined behaviour.
+    return isalnum((unsigned char)c) || c == '_';
+}
+
+// Scan and return the next token, moving the lexer past it.
+//
+// Whitespace and `//` line comments are skipped first. The token's `start`
+// points into the lexer's source text (nothing is copied) and `length` is the
+// number of characters it spans. At the '\0' terminator a zero-length
+// TOKEN_EOF is returned, and further calls keep returning TOKEN_EOF.
+Token lexer_next(Lexer *lexer) {
+    // Skip whitespace and comments. A lone '/' is the division operator, not a
+    // comment, so it must end this loop rather than be retried forever.
+    for (;;) {
+        char ahead = peek(lexer);
+        if (isspace((unsigned char)ahead)) {
+            advance(lexer);
+        } else if (ahead == '/' && double_peek(lexer) == '/') {
+            // Also stop at the terminator: the last line may lack a '\n'.
+            while (peek(lexer) != '\n' && peek(lexer) != '\0') {
+                advance(lexer);
+            }
+        } else {
+            break;
+        }
+    }
+
+    char c = peek(lexer);
+    size_t start = lexer->pos;
+    const char *text = &lexer->source[start];
+
+    // End of input: the cursor is not advanced, so EOF is sticky.
+    if (c == '\0') {
+        return (Token){TOKEN_EOF, text, 0};
+    }
+
+    // Identifiers
+    if (is_identifier_start(c)) {
+        while (is_identifier_char(peek(lexer))) {
+            advance(lexer);
+        }
+        return (Token){TOKEN_IDENTIFIER, text, lexer->pos - start};
+    }
+
+    // Numbers: runs of decimal digits. The casts keep <ctype.h> calls defined
+    // for characters outside the non-negative char range.
+    if (isdigit((unsigned char)c)) {
+        while (isdigit((unsigned char)peek(lexer))) {
+            advance(lexer);
+        }
+        return (Token){TOKEN_NUMBER, text, lexer->pos - start};
+    }
+
+    // Everything else is a one- or two-character token.
+    advance(lexer);
+    switch (c) {
+        case '=':
+            // "==" is one operator: consume the second '=' as well, otherwise
+            // the following call would report it again as a separate "=".
+            if (peek(lexer) == '=') {
+                advance(lexer);
+                return (Token){TOKEN_OPERATOR, text, 2};
+            }
+            return (Token){TOKEN_OPERATOR, text, 1};
+        case '+':
+        case '-':
+        case '*':
+        case '/':
+            return (Token){TOKEN_OPERATOR, text, 1};
+        case '(': return (Token){TOKEN_LPAREN, text, 1};
+        case ')': return (Token){TOKEN_RPAREN, text, 1};
+        case ';': return (Token){TOKEN_SEMICOLON, text, 1};
+        default: return (Token){TOKEN_UNKNOWN, text, 1};
+    }
+}
--- a/src/main.c
+++ b/src/main.c
@ -0,0 +1,27 @@
+#include <stdio.h>
+#include "sync/lexer.h"
+
+// Print one token as a line of the form "Token: <kind> | Text: '<lexeme>'".
+static void print_token(Token token) {
+    // Indexed by TokenType, so the order mirrors the enum declaration.
+    static const char *const kind_names[] = {
+        "EOF", "IDENTIFIER", "NUMBER", "OPERATOR",
+        "LPAREN", "RPAREN", "SEMICOLON", "UNKNOWN"
+    };
+    const char *kind = kind_names[token.type];
+    int text_len = (int)token.length;
+
+    // %.*s prints at most text_len characters: a lexeme is a slice of the
+    // source text, not a NUL-terminated string of its own.
+    printf("Token: %-15s | Text: '%.*s'\n", kind, text_len, token.start);
+}
+
+// Demo driver: lex a fixed two-line snippet and print every token,
+// including the final EOF token.
+int main(void) {
+    Lexer lexer;
+    lexer_init(&lexer, "sum = a + b123;\nprint(sum);");
+
+    for (;;) {
+        Token token = lexer_next(&lexer);
+        print_token(token);
+        if (token.type == TOKEN_EOF) {
+            break;
+        }
+    }
+
+    return 0;
+}
--- a/test/test_lexer.c
+++ b/test/test_lexer.c
@ -0,0 +1,57 @@
+#include <assert.h>
+#include <string.h>
+#include <stdio.h>
+#include "../include/sync/lexer.h"
+
+// Lexes "x = 42;" and checks the kind, length and text of every token.
+// The length is asserted explicitly because strncmp() over t.length characters
+// alone would also accept a truncated or zero-length token.
+void test_tokenize_simple_assignment(void) {
+    const char *src = "x = 42;";
+    Lexer lexer;
+    lexer_init(&lexer, src);
+
+    Token t = lexer_next(&lexer);
+    assert(t.type == TOKEN_IDENTIFIER && t.length == 1 && strncmp(t.start, "x", t.length) == 0);
+
+    t = lexer_next(&lexer);
+    assert(t.type == TOKEN_OPERATOR && t.length == 1 && strncmp(t.start, "=", t.length) == 0);
+
+    t = lexer_next(&lexer);
+    assert(t.type == TOKEN_NUMBER && t.length == 2 && strncmp(t.start, "42", t.length) == 0);
+
+    t = lexer_next(&lexer);
+    assert(t.type == TOKEN_SEMICOLON && t.length == 1 && strncmp(t.start, ";", t.length) == 0);
+
+    t = lexer_next(&lexer);
+    assert(t.type == TOKEN_EOF && t.length == 0);
+}
+
+// Lexes "print(x);" and checks the kind, length and text of every token.
+// The length is asserted explicitly because strncmp() over t.length characters
+// alone would also accept a truncated or zero-length token.
+void test_tokenize_function_call(void) {
+    const char *src = "print(x);";
+    Lexer lexer;
+    lexer_init(&lexer, src);
+
+    Token t = lexer_next(&lexer);
+    assert(t.type == TOKEN_IDENTIFIER && t.length == 5 && strncmp(t.start, "print", t.length) == 0);
+
+    t = lexer_next(&lexer);
+    assert(t.type == TOKEN_LPAREN && t.length == 1 && strncmp(t.start, "(", t.length) == 0);
+
+    t = lexer_next(&lexer);
+    assert(t.type == TOKEN_IDENTIFIER && t.length == 1 && strncmp(t.start, "x", t.length) == 0);
+
+    t = lexer_next(&lexer);
+    assert(t.type == TOKEN_RPAREN && t.length == 1 && strncmp(t.start, ")", t.length) == 0);
+
+    t = lexer_next(&lexer);
+    assert(t.type == TOKEN_SEMICOLON && t.length == 1 && strncmp(t.start, ";", t.length) == 0);
+
+    t = lexer_next(&lexer);
+    assert(t.type == TOKEN_EOF && t.length == 0);
+}
+
+// Test runner: executes every lexer test case in order. A failing assert()
+// aborts the process, so reaching the final message means all cases passed.
+int main(void) {
+    void (*const cases[])(void) = {
+        test_tokenize_simple_assignment,
+        test_tokenize_function_call,
+    };
+
+    for (size_t i = 0; i < sizeof cases / sizeof cases[0]; i++) {
+        cases[i]();
+    }
+
+    printf("All lexer tests passed.\n");
+    return 0;
+}