Basic Lexer

This commit is contained in:
Kyler Olsen 2025-06-12 00:06:17 -06:00
parent b1c1eb9e5b
commit e67041cd0e
6 changed files with 217 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
build/
.vscode/

25
CMakeLists.txt Normal file
View File

@ -0,0 +1,25 @@
# Build script for the `sync` project: a static library holding every source
# file except main.c, the `sync` executable, and one executable per test file.
cmake_minimum_required(VERSION 3.13)
project(sync C)

set(CMAKE_C_STANDARD 99)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

include_directories(include)

# ---- Source Library (exclude main.c) ----
# file(GLOB) resolves a relative pattern against CMAKE_CURRENT_SOURCE_DIR and
# yields absolute paths, so main.c has to be removed using that same base.
# CMAKE_SOURCE_DIR would point at the parent project when this directory is
# added with add_subdirectory(), leaving main.c inside the library.
file(GLOB SYNC_LIB_SRC src/*.c)
list(REMOVE_ITEM SYNC_LIB_SRC "${CMAKE_CURRENT_SOURCE_DIR}/src/main.c")
add_library(sync_lib STATIC ${SYNC_LIB_SRC})

# ---- sync executable ----
add_executable(sync src/main.c)
target_link_libraries(sync PRIVATE sync_lib)

# ---- Tests ----
# Every test/test_*.c becomes its own executable and is registered with CTest
# so that `ctest` actually runs it.
enable_testing()
file(GLOB TEST_FILES test/test_*.c)
foreach(TEST_FILE ${TEST_FILES})
  get_filename_component(TEST_NAME ${TEST_FILE} NAME_WE)
  add_executable(${TEST_NAME} ${TEST_FILE})
  target_link_libraries(${TEST_NAME} PRIVATE sync_lib)
  target_include_directories(${TEST_NAME} PRIVATE include)
  add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME})
endforeach()

31
include/sync/lexer.h Normal file
View File

@ -0,0 +1,31 @@
#ifndef SYNC_LEXER_H
#define SYNC_LEXER_H
#include <stddef.h>
/*
 * Categories of token produced by lexer_next().
 *
 * The order of the enumerators matters: print_token() in src/main.c indexes
 * a table of names by these values.
 */
typedef enum {
/* End of input: the terminating '\0' of the source was reached. */
TOKEN_EOF,
/* A letter or '_' followed by any run of letters, digits and '_'. */
TOKEN_IDENTIFIER,
/* A run of one or more decimal digits. */
TOKEN_NUMBER,
/* One of '=', '==', '+', '-', '*' or '/'. */
TOKEN_OPERATOR,
/* The character '('. */
TOKEN_LPAREN,
/* The character ')'. */
TOKEN_RPAREN,
/* The character ';'. */
TOKEN_SEMICOLON,
/* Any single character that matches none of the categories above. */
TOKEN_UNKNOWN,
} TokenType;
/*
 * A single token: a category plus the slice of the source text it covers.
 * The text is NOT NUL-terminated on its own; always use `length` with `start`.
 */
typedef struct {
/* Category of the token. */
TokenType type;
/* Pointer to the token's first character inside the lexer's source string. */
const char *start;
/* Number of characters in the token; 0 for TOKEN_EOF. */
size_t length;
} Token;
/* Scanning state for one source string. Set it up with lexer_init(). */
typedef struct {
/* NUL-terminated text being scanned; the pointer is stored, not copied. */
const char *source;
/* Index into `source` of the next character to be read. */
size_t pos;
} Lexer;
/**
 * Prepares `lexer` to scan `source` from its first character.
 *
 * @param lexer  Lexer state to initialise.
 * @param source NUL-terminated text to scan. Only the pointer is stored, and
 *               returned tokens point into it.
 */
void lexer_init(Lexer *lexer, const char *source);
/**
 * Skips whitespace and `//` line comments, then returns the next token and
 * moves the lexer past it.
 *
 * @param lexer Lexer previously set up with lexer_init().
 * @return The next token; a TOKEN_EOF token with length 0 once the
 *         terminating '\0' is reached.
 */
Token lexer_next(Lexer *lexer);
#endif // SYNC_LEXER_H

75
src/lexer.c Normal file
View File

@ -0,0 +1,75 @@
#include <ctype.h>
#include <string.h>
#include "sync/lexer.h"
/*
 * Points the lexer at `source` and rewinds it to the first character.
 * Only the pointer is stored; the text itself is not copied.
 */
void lexer_init(Lexer *lexer, const char *source) {
    *lexer = (Lexer){ .source = source, .pos = 0 };
}
/* Returns the character at the current position without consuming it. */
static char peek(Lexer *lexer) {
    const char *cursor = lexer->source + lexer->pos;
    return *cursor;
}
/*
 * Returns the character one past the current position without consuming
 * anything. The index is not range-checked here.
 */
static char double_peek(Lexer *lexer) {
    size_t lookahead = lexer->pos + 1;
    return lexer->source[lookahead];
}
/* Consumes the current character: returns it and steps the position by one. */
static char advance(Lexer *lexer) {
    char consumed = lexer->source[lexer->pos];
    lexer->pos += 1;
    return consumed;
}
/*
 * Returns non-zero when `c` may begin an identifier: a letter or '_'.
 *
 * The <ctype.h> classifiers require a value representable as unsigned char
 * (or EOF). Where plain char is signed, a byte >= 0x80 would be negative, so
 * the argument is converted first.
 */
static int is_identifier_start(char c) {
    return isalpha((unsigned char)c) || c == '_';
}
/*
 * Returns non-zero when `c` may appear inside an identifier: a letter, a
 * digit or '_'.
 *
 * The <ctype.h> classifiers require a value representable as unsigned char
 * (or EOF). Where plain char is signed, a byte >= 0x80 would be negative, so
 * the argument is converted first.
 */
static int is_identifier_char(char c) {
    return isalnum((unsigned char)c) || c == '_';
}
// Skips any mix of whitespace and `//` line comments before the next token.
static void skip_whitespace_and_comments(Lexer *lexer) {
    for (;;) {
        char c = peek(lexer);
        if (isspace((unsigned char)c)) {
            advance(lexer);
        } else if (c == '/' && double_peek(lexer) == '/') {
            // Stop at the terminator as well as at a newline, so a comment on
            // the last line cannot run past the end of the source.
            while (peek(lexer) != '\n' && peek(lexer) != '\0') advance(lexer);
        } else {
            // Anything else, including a lone '/' (the division operator),
            // starts a token and must be left for the caller.
            return;
        }
    }
}

// Gets the next token from the source.
//
// Leading whitespace and `//` comments are skipped. The returned token points
// into the lexer's source string and the lexer is left positioned just after
// it. At the terminating '\0' a TOKEN_EOF token of length 0 is returned and
// the position is not moved, so further calls keep returning TOKEN_EOF.
Token lexer_next(Lexer *lexer) {
    skip_whitespace_and_comments(lexer);

    char c = peek(lexer);
    size_t start = lexer->pos;

    // End of file tokens
    if (c == '\0') {
        return (Token){TOKEN_EOF, &lexer->source[start], 0};
    }

    // Identifiers
    if (is_identifier_start(c)) {
        while (is_identifier_char(peek(lexer))) advance(lexer);
        return (Token){TOKEN_IDENTIFIER, &lexer->source[start], lexer->pos - start};
    }

    // Digits (cast: <ctype.h> functions must not receive a negative char)
    if (isdigit((unsigned char)c)) {
        while (isdigit((unsigned char)peek(lexer))) advance(lexer);
        return (Token){TOKEN_NUMBER, &lexer->source[start], lexer->pos - start};
    }

    // Everything below is a one- or two-character token; consume the first.
    advance(lexer);
    switch (c) {
    case '=':
        if (peek(lexer) == '=') {
            // Consume the second '=' too, otherwise it would be returned
            // again as a separate token on the next call.
            advance(lexer);
            return (Token){TOKEN_OPERATOR, &lexer->source[start], 2};
        }
        /* fallthrough */
    case '+':
    case '-':
    case '*':
    case '/':
        return (Token){TOKEN_OPERATOR, &lexer->source[start], 1};
    case '(': return (Token){TOKEN_LPAREN, &lexer->source[start], 1};
    case ')': return (Token){TOKEN_RPAREN, &lexer->source[start], 1};
    case ';': return (Token){TOKEN_SEMICOLON, &lexer->source[start], 1};
    default: return (Token){TOKEN_UNKNOWN, &lexer->source[start], 1};
    }
}

27
src/main.c Normal file
View File

@ -0,0 +1,27 @@
#include <stdio.h>
#include "sync/lexer.h"
/*
 * Writes one line to stdout describing `token`: the name of its type, padded
 * to 15 columns, followed by the exact source text it covers.
 */
static void print_token(Token token) {
    /* Indexed by TokenType, so the order must match the enum. */
    static const char *const type_names[] = {
        "EOF", "IDENTIFIER", "NUMBER", "OPERATOR",
        "LPAREN", "RPAREN", "SEMICOLON", "UNKNOWN"
    };
    const char *label = type_names[token.type];
    /* "%.*s" takes its precision as an int. */
    int text_len = (int)token.length;

    printf("Token: %-15s | Text: '%.*s'\n", label, text_len, token.start);
}
/*
 * Demo driver: tokenizes a fixed sample program and prints every token,
 * including the final EOF token.
 */
int main(void) {
    Lexer lexer;
    lexer_init(&lexer, "sum = a + b123;\nprint(sum);");

    for (;;) {
        Token token = lexer_next(&lexer);
        print_token(token);
        if (token.type == TOKEN_EOF) {
            break;
        }
    }
    return 0;
}

57
test/test_lexer.c Normal file
View File

@ -0,0 +1,57 @@
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include "../include/sync/lexer.h"
/*
 * "x = 42;" must lex as IDENTIFIER(x), OPERATOR(=), NUMBER(42), SEMICOLON, EOF.
 *
 * Each check pins the token's length as well as its text. strncmp() limited
 * to t.length only compares a prefix, so on its own it would also accept a
 * truncated or empty token.
 */
void test_tokenize_simple_assignment(void) {
    const char *src = "x = 42;";
    Lexer lexer;
    lexer_init(&lexer, src);

    Token t = lexer_next(&lexer);
    assert(t.type == TOKEN_IDENTIFIER && t.length == 1 && strncmp(t.start, "x", t.length) == 0);

    t = lexer_next(&lexer);
    assert(t.type == TOKEN_OPERATOR && t.length == 1 && strncmp(t.start, "=", t.length) == 0);

    t = lexer_next(&lexer);
    assert(t.type == TOKEN_NUMBER && t.length == 2 && strncmp(t.start, "42", t.length) == 0);

    t = lexer_next(&lexer);
    assert(t.type == TOKEN_SEMICOLON && t.length == 1);

    t = lexer_next(&lexer);
    assert(t.type == TOKEN_EOF && t.length == 0);
}
/*
 * "print(x);" must lex as IDENTIFIER(print), LPAREN, IDENTIFIER(x), RPAREN,
 * SEMICOLON, EOF.
 *
 * Each check pins the token's length as well as its text. strncmp() limited
 * to t.length only compares a prefix, so on its own it would also accept a
 * truncated or empty token.
 */
void test_tokenize_function_call(void) {
    const char *src = "print(x);";
    Lexer lexer;
    lexer_init(&lexer, src);

    Token t = lexer_next(&lexer);
    assert(t.type == TOKEN_IDENTIFIER && t.length == 5 && strncmp(t.start, "print", t.length) == 0);

    t = lexer_next(&lexer);
    assert(t.type == TOKEN_LPAREN && t.length == 1);

    t = lexer_next(&lexer);
    assert(t.type == TOKEN_IDENTIFIER && t.length == 1 && strncmp(t.start, "x", t.length) == 0);

    t = lexer_next(&lexer);
    assert(t.type == TOKEN_RPAREN && t.length == 1);

    t = lexer_next(&lexer);
    assert(t.type == TOKEN_SEMICOLON && t.length == 1);

    t = lexer_next(&lexer);
    assert(t.type == TOKEN_EOF && t.length == 0);
}
/*
 * Runs every lexer test in turn. A failing assert() aborts the process, so
 * the closing message is only printed when all of them passed.
 */
int main(void) {
    test_tokenize_simple_assignment();
    test_tokenize_function_call();

    fputs("All lexer tests passed.\n", stdout);
    return 0;
}