Basic Lexer
parent b1c1eb9e5b
commit e67041cd0e
.gitignore
@@ -0,0 +1,2 @@
build/
.vscode/
CMakeLists.txt
@@ -0,0 +1,25 @@
cmake_minimum_required(VERSION 3.13)
project(sync C)

set(CMAKE_C_STANDARD 99)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

include_directories(include)

# ---- Source Library (exclude main.c) ----
file(GLOB SYNC_LIB_SRC src/*.c)
list(REMOVE_ITEM SYNC_LIB_SRC "${CMAKE_SOURCE_DIR}/src/main.c")
add_library(sync_lib STATIC ${SYNC_LIB_SRC})

# ---- sync executable ----
add_executable(sync src/main.c)
target_link_libraries(sync PRIVATE sync_lib)

# ---- Tests ----
file(GLOB TEST_FILES test/test_*.c)
foreach(TEST_FILE ${TEST_FILES})
    get_filename_component(TEST_NAME ${TEST_FILE} NAME_WE)
    add_executable(${TEST_NAME} ${TEST_FILE})
    target_link_libraries(${TEST_NAME} PRIVATE sync_lib)
    target_include_directories(${TEST_NAME} PRIVATE include)
endforeach()
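For reference, a typical out-of-source build with this CMakeLists.txt might look like the sketch below. The binary paths follow from CMAKE_RUNTIME_OUTPUT_DIRECTORY; the test executable name test_lexer is an assumption based on the test/test_*.c glob and the lexer test file added below.

    cmake -S . -B build        # configure (CMake 3.13+ supports -S/-B)
    cmake --build build        # build sync_lib, sync, and the test executables
    ./build/bin/sync           # run the demo driver
    ./build/bin/test_lexer     # run the lexer tests (assumed name; not registered with CTest)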
include/sync/lexer.h
@@ -0,0 +1,31 @@
#ifndef SYNC_LEXER_H
#define SYNC_LEXER_H

#include <stddef.h>

typedef enum {
    TOKEN_EOF,
    TOKEN_IDENTIFIER,
    TOKEN_NUMBER,
    TOKEN_OPERATOR,
    TOKEN_LPAREN,
    TOKEN_RPAREN,
    TOKEN_SEMICOLON,
    TOKEN_UNKNOWN,
} TokenType;

typedef struct {
    TokenType type;
    const char *start;   /* points into the source string; not NUL-terminated */
    size_t length;
} Token;

typedef struct {
    const char *source;
    size_t pos;
} Lexer;

void lexer_init(Lexer *lexer, const char *source);
Token lexer_next(Lexer *lexer);

#endif // SYNC_LEXER_H
src/lexer.c
@@ -0,0 +1,75 @@
#include <ctype.h>
#include <string.h>
#include "sync/lexer.h"

void lexer_init(Lexer *lexer, const char *source) {
    lexer->source = source;
    lexer->pos = 0;
}

static char peek(Lexer *lexer) {
    return lexer->source[lexer->pos];
}

static char double_peek(Lexer *lexer) {
    return lexer->source[lexer->pos + 1];
}

static char advance(Lexer *lexer) {
    return lexer->source[lexer->pos++];
}

static int is_identifier_start(char c) {
    return isalpha((unsigned char)c) || c == '_';
}

static int is_identifier_char(char c) {
    return isalnum((unsigned char)c) || c == '_';
}

// Returns the next token from the source; TOKEN_EOF marks the end of input.
Token lexer_next(Lexer *lexer) {
    // Skip whitespace and "//" line comments before the next token.
    for (;;) {
        if (isspace((unsigned char)peek(lexer))) {
            advance(lexer);
        } else if (peek(lexer) == '/' && double_peek(lexer) == '/') {
            // Skip to the end of the line, but never past the terminating NUL.
            while (peek(lexer) != '\n' && peek(lexer) != '\0') advance(lexer);
        } else {
            break;  // a lone '/' is left for the operator switch below
        }
    }

    char c = peek(lexer);
    size_t start = lexer->pos;

    // End of input
    if (c == '\0') {
        return (Token){TOKEN_EOF, &lexer->source[start], 0};
    }

    // Identifiers
    if (is_identifier_start(c)) {
        while (is_identifier_char(peek(lexer))) advance(lexer);
        return (Token){TOKEN_IDENTIFIER, &lexer->source[start], lexer->pos - start};
    }

    // Numbers
    if (isdigit((unsigned char)c)) {
        while (isdigit((unsigned char)peek(lexer))) advance(lexer);
        return (Token){TOKEN_NUMBER, &lexer->source[start], lexer->pos - start};
    }

    // Single- and two-character tokens
    advance(lexer);
    switch (c) {
        case '=':
            if (peek(lexer) == '=') {
                advance(lexer);  // consume the second '=' of "=="
                return (Token){TOKEN_OPERATOR, &lexer->source[start], 2};
            }
            /* fall through: a single '=' is also an operator */
        case '+':
        case '-':
        case '*':
        case '/':
            return (Token){TOKEN_OPERATOR, &lexer->source[start], 1};
        case '(': return (Token){TOKEN_LPAREN, &lexer->source[start], 1};
        case ')': return (Token){TOKEN_RPAREN, &lexer->source[start], 1};
        case ';': return (Token){TOKEN_SEMICOLON, &lexer->source[start], 1};
        default:  return (Token){TOKEN_UNKNOWN, &lexer->source[start], 1};
    }
}
src/main.c
@@ -0,0 +1,27 @@
#include <stdio.h>
#include "sync/lexer.h"

static void print_token(Token token) {
    // Token names indexed by TokenType; the order must match the enum in lexer.h.
    printf("Token: %-15s | Text: '%.*s'\n",
           (const char *[]){
               "EOF", "IDENTIFIER", "NUMBER", "OPERATOR",
               "LPAREN", "RPAREN", "SEMICOLON", "UNKNOWN"
           }[token.type],
           (int)token.length, token.start
    );
}

int main(void) {
    const char *source = "sum = a + b123;\nprint(sum);";

    Lexer lexer;
    lexer_init(&lexer, source);

    Token token;
    do {
        token = lexer_next(&lexer);
        print_token(token);
    } while (token.type != TOKEN_EOF);

    return 0;
}
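Built and run as above, the driver should print one line per token for the hard-coded source string, roughly:

    Token: IDENTIFIER      | Text: 'sum'
    Token: OPERATOR        | Text: '='
    Token: IDENTIFIER      | Text: 'a'
    Token: OPERATOR        | Text: '+'
    Token: IDENTIFIER      | Text: 'b123'
    Token: SEMICOLON       | Text: ';'
    Token: IDENTIFIER      | Text: 'print'
    Token: LPAREN          | Text: '('
    Token: IDENTIFIER      | Text: 'sum'
    Token: RPAREN          | Text: ')'
    Token: SEMICOLON       | Text: ';'
    Token: EOF             | Text: ''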
test/test_lexer.c
@@ -0,0 +1,57 @@
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include "../include/sync/lexer.h"

void test_tokenize_simple_assignment(void) {
    const char *src = "x = 42;";
    Lexer lexer;
    lexer_init(&lexer, src);

    Token t = lexer_next(&lexer);
    assert(t.type == TOKEN_IDENTIFIER && strncmp(t.start, "x", t.length) == 0);

    t = lexer_next(&lexer);
    assert(t.type == TOKEN_OPERATOR && strncmp(t.start, "=", t.length) == 0);

    t = lexer_next(&lexer);
    assert(t.type == TOKEN_NUMBER && strncmp(t.start, "42", t.length) == 0);

    t = lexer_next(&lexer);
    assert(t.type == TOKEN_SEMICOLON);

    t = lexer_next(&lexer);
    assert(t.type == TOKEN_EOF);
}

void test_tokenize_function_call(void) {
    const char *src = "print(x);";
    Lexer lexer;
    lexer_init(&lexer, src);

    Token t = lexer_next(&lexer);
    assert(t.type == TOKEN_IDENTIFIER && strncmp(t.start, "print", t.length) == 0);

    t = lexer_next(&lexer);
    assert(t.type == TOKEN_LPAREN);

    t = lexer_next(&lexer);
    assert(t.type == TOKEN_IDENTIFIER && strncmp(t.start, "x", t.length) == 0);

    t = lexer_next(&lexer);
    assert(t.type == TOKEN_RPAREN);

    t = lexer_next(&lexer);
    assert(t.type == TOKEN_SEMICOLON);

    t = lexer_next(&lexer);
    assert(t.type == TOKEN_EOF);
}

int main(void) {
    test_tokenize_simple_assignment();
    test_tokenize_function_call();

    printf("All lexer tests passed.\n");
    return 0;
}