980 lines
39 KiB
C
980 lines
39 KiB
C
// Kyler Olsen
|
|
// YREA SLS
|
|
// Lexer
|
|
// November 2025
|
|
|
|
#include <ctype.h>
|
|
#include <string.h>
|
|
#include <stdint.h>
|
|
#include <stddef.h>
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <math.h>
|
|
|
|
#include "sls/errors.h"
|
|
#include "sls/bool.h"
|
|
#include "sls/lexer.h"
|
|
#include "sls/string.h"
|
|
|
|
// Length that comfortably exceeds every entry of the *_TYPES_NAMES tables in
// this file (the longest entry there is 13 characters).
// NOTE(review): presumably used to size name buffers elsewhere — confirm.
const size_t TYPE_NAMES_SAFE_LENGTH = 20;
|
|
|
|
// Display names for token kinds, one string per kind.
// NOTE(review): the order presumably mirrors the token type enum declared in
// sls/lexer.h — confirm before reordering or inserting entries.
const char *TOKEN_TYPES_NAMES[] = {
    "End of File",   //  0
    "Identifier",    //  1
    "Integer",       //  2
    "Float",         //  3
    "Double",        //  4
    "Character",     //  5
    "String",        //  6
    "Boolean",       //  7
    "Array",         //  8
    "Token String",  //  9
    "Type Tuple",    // 10
};
|
|
|
|
// Number of entries in TOKEN_TYPES_NAMES, derived from the array itself so it
// stays correct when names are added or removed.
const size_t TOKEN_TYPE_COUNT = sizeof(TOKEN_TYPES_NAMES) / sizeof(*TOKEN_TYPES_NAMES);
|
|
|
|
// Display names for the element kinds an array literal can hold.
// NOTE(review): the order presumably mirrors an array element type enum
// declared in sls/lexer.h — confirm before reordering or inserting entries.
const char *ARRAY_TYPES_NAMES[] = {
    "Identifier",     //  0
    "i64",            //  1
    "i32",            //  2
    "i16",            //  3
    "i8",             //  4
    "u64",            //  5
    "u32",            //  6
    "u16",            //  7
    "u8",             //  8
    "Float",          //  9
    "Double",         // 10
    "Character",      // 11
    "String",         // 12
    "Boolean",        // 13
    "Inline Struct",  // 14
};
|
|
|
|
// Number of entries in ARRAY_TYPES_NAMES, derived from the array itself so it
// stays correct when names are added or removed.
const size_t ARRAY_TYPE_COUNT = sizeof(ARRAY_TYPES_NAMES) / sizeof(*ARRAY_TYPES_NAMES);
|
|
|
|
// Display names for the built-in integer types: signed widths first, then
// unsigned, each from widest to narrowest.
// NOTE(review): the order presumably mirrors IntegerBuiltInType — confirm
// before reordering or inserting entries.
const char *INTEGER_TYPES_NAMES[] = {
    "i64", "i32", "i16", "i8",  // signed
    "u64", "u32", "u16", "u8",  // unsigned
};
|
|
|
|
// Number of entries in INTEGER_TYPES_NAMES, derived from the array itself so
// it stays correct when names are added or removed.
const size_t INTEGER_TYPE_COUNT = sizeof(INTEGER_TYPES_NAMES) / sizeof(*INTEGER_TYPES_NAMES);
|
|
|
|
void init_lexer(LexerInfo *lexer_info, SlsStr filename, SlsStr source_code) {
    // Prepares lexer_info for a fresh pass: rewinds the cursor to the first
    // character (offset 0, line 1, column 1) and records which file and
    // source text are being lexed.
    lexer_info->pos = 0;
    lexer_info->line = 1;
    lexer_info->column = 1;

    lexer_info->source_code = source_code;
    lexer_info->filename = filename;
}
|
|
|
|
static FileInfo get_file_info(LexerInfo *lexer_info, size_t start, size_t start_line) {
    // Describes the span that began at offset `start` on line `start_line`
    // and ends at the lexer's current position:
    //   line / column - the lexer's current line and column
    //   length        - characters consumed since `start`
    //   lines         - difference between the current line and `start_line`
    FileInfo info = {
        .filename = lexer_info->filename,
        .line = lexer_info->line,
        .column = lexer_info->column,
        .length = lexer_info->pos - start,
        .lines = lexer_info->line - start_line,
    };
    return info;
}
|
|
|
|
static const char *get_token_text(LexerInfo *lexer_info, size_t start) {
    // Returns a pointer into the source buffer at offset `start`, i.e. the
    // first character of a token that began there. This is a view into the
    // source text, not a copy, and it is not terminated at the token's end.
    return &lexer_info->source_code.str[start];
}
|
|
|
|
static char peek(LexerInfo *lexer_info) {
    // Reads the character under the cursor without consuming it
    const size_t cursor = lexer_info->pos;
    return lexer_info->source_code.str[cursor];
}
|
|
|
|
static char far_peek(LexerInfo *lexer_info, size_t index) {
    // Reads the character `index` positions ahead of the cursor without
    // consuming anything. No bounds check is performed here.
    const size_t target = lexer_info->pos + index;
    return lexer_info->source_code.str[target];
}
|
|
|
|
static char seek(LexerInfo *lexer_info, size_t index) {
    // Reads the character at absolute offset `index` of the source text; the
    // cursor is neither consulted nor moved.
    return *(lexer_info->source_code.str + index);
}
|
|
|
|
static char advance(LexerInfo *lexer_info) {
    // Consumes the character under the cursor, keeps the line/column
    // bookkeeping in step, and returns the character that is now current.
    // There is no end-of-input check here: callers are expected to stop
    // before advancing past the end of the source text.
    const char consumed = lexer_info->source_code.str[lexer_info->pos];

    if (consumed != '\n') {
        // Still on the same line
        lexer_info->column += 1;
    } else {
        // Crossed a line break: next character is column 1 of the next line
        lexer_info->line += 1;
        lexer_info->column = 1;
    }

    lexer_info->pos += 1;
    return lexer_info->source_code.str[lexer_info->pos];
}
|
|
|
|
static LexerResult lexer_result(LexerInfo *lexer_info, Token token, size_t start, size_t start_line) {
    // Wraps a successfully lexed token in a freshly allocated, unlinked
    // LexerTokenResult node tagged SLS_RESULT, together with the file span
    // running from (start, start_line) to the lexer's current position.
    // Returns SLS_ERROR only when the node itself cannot be allocated.
    LexerTokenResult *node = malloc(sizeof *node);
    if (node == NULL) {
        return (LexerResult){SLS_ERROR, .error = (SlsError){SLS_STR("Failed to allocate memory."), 1}};
    }

    node->next = NULL;
    node->file_info = get_file_info(lexer_info, start, start_line);
    node->result = token;
    node->type = SLS_RESULT;

    return (LexerResult){SLS_RESULT, .result = node};
}
|
|
|
|
static LexerResult lexer_error(LexerInfo *lexer_info, SlsStr message, size_t start, size_t start_line) {
    // Reports a lexing problem as a freshly allocated, unlinked
    // LexerTokenResult node tagged SLS_ERROR (error code 1) carrying `message`
    // and the file span from (start, start_line) to the current position.
    //
    // The outer LexerResult is SLS_RESULT on purpose: producing the error
    // node succeeded. SLS_ERROR is returned only when the node itself cannot
    // be allocated.
    LexerTokenResult *node = malloc(sizeof *node);
    if (node == NULL) {
        return (LexerResult){SLS_ERROR, .error = (SlsError){SLS_STR("Failed to allocate memory."), 1}};
    }

    node->next = NULL;
    node->file_info = get_file_info(lexer_info, start, start_line);
    node->error.code = 1;
    node->error.message = message;
    node->type = SLS_ERROR;

    return (LexerResult){SLS_RESULT, .result = node};
}
|
|
|
|
// Individual bit flags that are combined to describe the type suffix of a
// numeric literal: one kind/sign flag plus one width flag.
typedef enum {
    NUMERIC_FLOAT_BASE = 0x01,  // floating-point type
    NUMERIC_UNSIGNED   = 0x02,  // unsigned integer type
    NUMERIC_64         = 0x04,  // 64-bit width
    NUMERIC_32         = 0x08,  // 32-bit width
    NUMERIC_16         = 0x10,  // 16-bit width
    NUMERIC_8          = 0x20,  //  8-bit width
} NumericTypesBase;
|
|
|
|
typedef enum {
|
|
NUMERIC_F64 = NUMERIC_64 | NUMERIC_FLOAT_BASE,
|
|
NUMERIC_F32 = NUMERIC_32 | NUMERIC_FLOAT_BASE,
|
|
NUMERIC_I64 = NUMERIC_64,
|
|
NUMERIC_I32 = NUMERIC_32,
|
|
NUMERIC_I16 = NUMERIC_16,
|
|
NUMERIC_I8 = NUMERIC_8,
|
|
NUMERIC_U64 = NUMERIC_64 | NUMERIC_UNSIGNED,
|
|
NUMERIC_U32 = NUMERIC_32 | NUMERIC_UNSIGNED,
|
|
NUMERIC_U16 = NUMERIC_16 | NUMERIC_UNSIGNED,
|
|
NUMERIC_U8 = NUMERIC_8 | NUMERIC_UNSIGNED,
|
|
} NumericTypes;
|
|
|
|
typedef struct {
|
|
SlsResultType type;
|
|
union {
|
|
IntegerBuiltInType integer_type; // type == SLS_RESULT
|
|
SlsError error; // type == SLS_ERROR
|
|
};
|
|
} IntegerTypeResult;
|
|
|
|
static IntegerTypeResult get_integer_type(NumericTypes numeric_type) {
    // Maps the flag-based numeric type onto the matching built-in integer
    // type. Any value that is not one of the eight integer combinations
    // (the float types in particular) yields an SLS_ERROR result.
    switch (numeric_type) {
        case NUMERIC_I64: return (IntegerTypeResult){SLS_RESULT, .integer_type = INTEGER_I64};
        case NUMERIC_I32: return (IntegerTypeResult){SLS_RESULT, .integer_type = INTEGER_I32};
        case NUMERIC_I16: return (IntegerTypeResult){SLS_RESULT, .integer_type = INTEGER_I16};
        case NUMERIC_I8:  return (IntegerTypeResult){SLS_RESULT, .integer_type = INTEGER_I8};
        case NUMERIC_U64: return (IntegerTypeResult){SLS_RESULT, .integer_type = INTEGER_U64};
        case NUMERIC_U32: return (IntegerTypeResult){SLS_RESULT, .integer_type = INTEGER_U32};
        case NUMERIC_U16: return (IntegerTypeResult){SLS_RESULT, .integer_type = INTEGER_U16};
        case NUMERIC_U8:  return (IntegerTypeResult){SLS_RESULT, .integer_type = INTEGER_U8};
        default:
            break;
    }
    return (IntegerTypeResult){SLS_ERROR, .error = (SlsError){SLS_STR("Lexer Error: Encountered a Float where there should not be one."), 1}};
}
|
|
|
|
static uint64_t create_binary_integer(LexerInfo *lexer_info, size_t start) {
    // Converts the binary literal that starts at offset `start` (optional
    // '-', then a two-character prefix, then digits) into its 64-bit value.
    // '_' and '.' are skipped; whitespace, '/', ':' or NUL ends the digits.
    // A leading '-' yields the two's-complement negation. Overflow past
    // 64 bits wraps silently.
    const char *token = get_token_text(lexer_info, start);
    const size_t token_length = lexer_info->pos - start;
    const int negative = (token[0] == '-');

    // Skip the two-character base prefix, plus the sign when present
    size_t index = negative ? 3 : 2;

    uint64_t value = 0;
    while (index < token_length) {
        const char ch = token[index++];
        if (isspace(ch) || ch == '/' || ch == '\0' || ch == ':') break;
        if (ch == '.' || ch == '_') continue;
        value <<= 1;
        if (ch == '1') value |= 1;
    }

    return negative ? (uint64_t)0 - value : value;
}
|
|
|
|
static uint64_t create_octal_integer(LexerInfo *lexer_info, size_t start) {
    // Converts the octal literal that starts at offset `start` (optional
    // '-', then a two-character prefix, then digits) into its 64-bit value.
    // '_' and '.' are skipped; whitespace, '/', ':' or NUL ends the digits.
    // A leading '-' yields the two's-complement negation. Overflow past
    // 64 bits wraps silently.
    const char *token = get_token_text(lexer_info, start);
    const size_t token_length = lexer_info->pos - start;
    const int negative = (token[0] == '-');

    // Skip the two-character base prefix, plus the sign when present
    size_t index = negative ? 3 : 2;

    uint64_t value = 0;
    while (index < token_length) {
        const char ch = token[index++];
        if (isspace(ch) || ch == '/' || ch == '\0' || ch == ':') break;
        if (ch == '.' || ch == '_') continue;
        value <<= 3;
        // Any character outside '1'..'7' contributes a zero digit
        if (ch >= '1' && ch <= '7') value += (uint64_t)(ch - '0');
    }

    return negative ? (uint64_t)0 - value : value;
}
|
|
|
|
static uint64_t create_decimal_integer(LexerInfo *lexer_info, size_t start) {
    // Converts the decimal literal that starts at offset `start` (optional
    // '-', then digits) into its 64-bit value. '_' is skipped; whitespace,
    // '/', ':' or NUL ends the digits. A leading '-' yields the
    // two's-complement negation. Overflow past 64 bits wraps silently.
    const char *token = get_token_text(lexer_info, start);
    const size_t token_length = lexer_info->pos - start;
    const int negative = (token[0] == '-');

    // Decimal literals have no base prefix; only the sign is skipped
    size_t index = negative ? 1 : 0;

    uint64_t value = 0;
    while (index < token_length) {
        const char ch = token[index++];
        if (isspace(ch) || ch == '/' || ch == '\0' || ch == ':') break;
        if (ch == '_') continue;
        value *= 10;
        // Any character outside '1'..'9' contributes a zero digit
        if (ch >= '1' && ch <= '9') value += (uint64_t)(ch - '0');
    }

    return negative ? (uint64_t)0 - value : value;
}
|
|
|
|
static uint64_t create_hexadecimal_integer(LexerInfo *lexer_info, size_t start) {
    // Converts the hexadecimal literal that starts at offset `start`
    // (optional '-', then a two-character prefix, then digits) into its
    // 64-bit value. '_' and '.' are skipped; whitespace, '/', ':' or NUL
    // ends the digits. Letters are accepted in either case. A leading '-'
    // yields the two's-complement negation. Overflow past 64 bits wraps
    // silently.
    const char *token = get_token_text(lexer_info, start);
    const size_t token_length = lexer_info->pos - start;
    const int negative = (token[0] == '-');

    // Skip the two-character base prefix, plus the sign when present
    size_t index = negative ? 3 : 2;

    uint64_t value = 0;
    while (index < token_length) {
        const char ch = token[index++];
        if (isspace(ch) || ch == '/' || ch == '\0' || ch == ':') break;
        if (ch == '.' || ch == '_') continue;
        value <<= 4;
        // Anything that is not a hex digit contributes a zero digit
        if (ch >= '1' && ch <= '9') {
            value += (uint64_t)(ch - '0');
        } else if (ch >= 'a' && ch <= 'f') {
            value += (uint64_t)(ch - 'a') + 10;
        } else if (ch >= 'A' && ch <= 'F') {
            value += (uint64_t)(ch - 'A') + 10;
        }
    }

    return negative ? (uint64_t)0 - value : value;
}
|
|
|
|
static LexerResult create_integer_token(LexerInfo *lexer_info, IntegerBuiltInType type, uint64_t value, size_t start, size_t start_line) {
    // Range-checks `value` (the two's-complement bits of the literal that
    // starts at offset `start`) against `type` and builds the integer token,
    // or an "Integer overflow" lexer error when the value does not fit.
    //
    // Signed types: the value reinterpreted as int64_t must lie inside the
    // type's range AND have a sign that agrees with the literal text. The
    // sign check catches magnitudes that wrapped across the 64-bit sign bit,
    // e.g. 9223372036854775808 as i64, which would otherwise silently become
    // a negative number.
    // Unsigned types: a leading '-' is always rejected, then the maximum is
    // checked (u64 has no upper check because `value` is already 64 bits).
    //
    // Magnitudes that overflowed 64 bits during digit accumulation have
    // already wrapped before reaching this function and cannot be detected
    // here.
    const int has_minus = (seek(lexer_info, start) == '-');
    const int64_t as_signed = (int64_t)value;
    const int sign_mismatch = has_minus ? (as_signed > 0) : (as_signed < 0);

    switch (type) {
        case INTEGER_I64:
            if (sign_mismatch) {
                return lexer_error(lexer_info, SLS_STR("Integer overflow: value exceeds range for i64."), start, start_line);
            }
            break;
        case INTEGER_I32:
            if (sign_mismatch || as_signed < INT32_MIN || as_signed > INT32_MAX) {
                return lexer_error(lexer_info, SLS_STR("Integer overflow: value exceeds range for i32."), start, start_line);
            }
            break;
        case INTEGER_I16:
            if (sign_mismatch || as_signed < INT16_MIN || as_signed > INT16_MAX) {
                return lexer_error(lexer_info, SLS_STR("Integer overflow: value exceeds range for i16."), start, start_line);
            }
            break;
        case INTEGER_I8:
            if (sign_mismatch || as_signed < INT8_MIN || as_signed > INT8_MAX) {
                return lexer_error(lexer_info, SLS_STR("Integer overflow: value exceeds range for i8."), start, start_line);
            }
            break;
        case INTEGER_U64:
            if (has_minus) {
                return lexer_error(lexer_info, SLS_STR("Integer overflow: value exceeds range for u64."), start, start_line);
            }
            break;
        case INTEGER_U32:
            if (has_minus || value > (uint64_t)UINT32_MAX) {
                return lexer_error(lexer_info, SLS_STR("Integer overflow: value exceeds range for u32."), start, start_line);
            }
            break;
        case INTEGER_U16:
            if (has_minus || value > (uint64_t)UINT16_MAX) {
                return lexer_error(lexer_info, SLS_STR("Integer overflow: value exceeds range for u16."), start, start_line);
            }
            break;
        case INTEGER_U8:
            if (has_minus || value > (uint64_t)UINT8_MAX) {
                return lexer_error(lexer_info, SLS_STR("Integer overflow: value exceeds range for u8."), start, start_line);
            }
            break;
        default:
            break;
    }

    return lexer_result(lexer_info, (Token){TOKEN_INTEGER, .integer_literal = (IntegerLiteral){.type = type, .value = value}}, start, start_line);
}
|
|
|
|
static double create_float(LexerInfo *lexer_info, size_t start) {
    // Converts the decimal floating-point literal that starts at offset
    // `start` (optional '-', digits, optional '.', digits) into a double.
    // '_' is skipped; whitespace, '/', ':' or NUL ends the digits.
    //
    // The fractional scale is tracked as a double. An integer scale overflows
    // after 19 fractional digits and can eventually wrap to 0, which would
    // make later fractional digits be treated as integer digits.
    const char *token = get_token_text(lexer_info, start);
    const size_t token_length = lexer_info->pos - start;
    const int negative = (token[0] == '-');

    double value = 0.0;
    double scale = 1.0;   // 10^(number of fractional digits consumed so far)
    int in_fraction = 0;  // set once the '.' has been seen

    for (size_t i = negative ? 1 : 0; i < token_length; i++) {
        const char ch = token[i];
        // <ctype.h> functions require a value representable as unsigned char
        if (isspace((unsigned char)ch) || ch == '/' || ch == '\0' || ch == ':') break;
        if (ch == '_') continue;
        if (ch == '.') {
            in_fraction = 1;
            continue;
        }

        // Any character that is not a digit contributes a zero digit
        const int digit = (ch >= '0' && ch <= '9') ? (ch - '0') : 0;
        if (in_fraction) {
            scale *= 10.0;
            value += digit / scale;
        } else {
            value = value * 10.0 + digit;
        }
    }

    return negative ? -value : value;
}
|
|
|
|
static LexerResult create_float_token(LexerInfo *lexer_info, NumericTypes type, size_t start, size_t start_line) {
    // Builds the token for a floating-point literal that starts at `start`.
    // NUMERIC_F64 produces a double token; every other `type` produces a
    // float token holding the value converted to single precision.
    const double value = create_float(lexer_info, start);

    if (type != NUMERIC_F64) {
        return lexer_result(lexer_info, (Token){TOKEN_FLOAT, .float_literal = (float)value}, start, start_line);
    }
    return lexer_result(lexer_info, (Token){TOKEN_DOUBLE, .double_literal = value}, start, start_line);
}
|
|
|
|
// Notation a numeric literal was written in, as recognised by the parse_*
// functions in this file.
typedef enum {
    NUMERIC_BINARY      = 0,
    NUMERIC_OCTAL       = 1,
    NUMERIC_DECIMAL     = 2,
    NUMERIC_HEXADECIMAL = 3,
    NUMERIC_FLOAT       = 4,
    NUMERIC_EXPONENTIAL = 5,
} NumericLiteralTypes;
|
|
|
|
static LexerResult parse_numeric_type(LexerInfo *lexer_info, char c, size_t start, size_t start_line, NumericLiteralTypes numeric_literal_type) {
    // Lexes the type suffix of a numeric literal. On entry the cursor is on
    // the ':' that separates the digits from the type name. Accepted names
    // are f64/f32 (only for decimal, float and exponential literals) and
    // i64/i32/i16/i8/u64/u32/u16/u8 (not for float or exponential literals).
    // The suffix must be followed by whitespace, '/' or end of input.
    //
    // Returns the finished integer/float token, a lexer error node for a
    // malformed suffix, or SLS_ERROR for allocation failure and for literal
    // kinds that are not implemented yet.
    const int literal_allows_float =
        numeric_literal_type == NUMERIC_DECIMAL ||
        numeric_literal_type == NUMERIC_FLOAT ||
        numeric_literal_type == NUMERIC_EXPONENTIAL;
    NumericTypes numeric_type = 0;

    // Step over the ':'; the incoming value of `c` is not needed
    c = advance(lexer_info);

    if (c == 'f') {
        numeric_type |= NUMERIC_FLOAT_BASE;
        if (!literal_allows_float) {
            return lexer_error(lexer_info, SLS_STR("Invalid numeric literal: float type not allowed."), start, start_line);
        }
        c = advance(lexer_info);
        if (c == '6' && far_peek(lexer_info, 1) == '4') {
            numeric_type |= NUMERIC_64;
        } else if (c == '3' && far_peek(lexer_info, 1) == '2') {
            numeric_type |= NUMERIC_32;
        } else {
            return lexer_error(lexer_info, SLS_STR("Invalid float type: must be of type 'f64' or 'f32'."), start, start_line);
        }
        // Consume both width digits
        advance(lexer_info);
        c = advance(lexer_info);
    } else if (c == 'i' || c == 'u') {
        if (numeric_literal_type == NUMERIC_FLOAT || numeric_literal_type == NUMERIC_EXPONENTIAL) {
            return lexer_error(lexer_info, SLS_STR("Invalid float type: must be of type 'f64' or 'f32'."), start, start_line);
        }
        if (c == 'u') numeric_type |= NUMERIC_UNSIGNED;
        c = advance(lexer_info);
        if (c == '6' && far_peek(lexer_info, 1) == '4') {
            numeric_type |= NUMERIC_64;
            advance(lexer_info);
            c = advance(lexer_info);
        } else if (c == '3' && far_peek(lexer_info, 1) == '2') {
            numeric_type |= NUMERIC_32;
            advance(lexer_info);
            c = advance(lexer_info);
        } else if (c == '1' && far_peek(lexer_info, 1) == '6') {
            numeric_type |= NUMERIC_16;
            advance(lexer_info);
            c = advance(lexer_info);
        } else if (c == '8') {
            numeric_type |= NUMERIC_8;
            c = advance(lexer_info);
        } else if (numeric_type & NUMERIC_UNSIGNED) {
            return lexer_error(lexer_info, SLS_STR("Invalid unsigned integer type: must be of type 'u64', 'u32', 'u16', and 'u8'."), start, start_line);
        } else {
            return lexer_error(lexer_info, SLS_STR("Invalid signed integer type: must be of type 'i64', 'i32', 'i16', and 'i8'."), start, start_line);
        }
    } else if (literal_allows_float) {
        return lexer_error(lexer_info, SLS_STR("Invalid numeric type: type must start with 'f', 'i', or 'u'."), start, start_line);
    } else {
        return lexer_error(lexer_info, SLS_STR("Invalid integer type: type must start with 'i' or 'u'."), start, start_line);
    }

    // The suffix must end the literal. `c` is a raw source byte, so it is
    // converted to unsigned char before being handed to <ctype.h>.
    if (!(isspace((unsigned char)c) || c == '/' || c == '\0')) {
        SlsStr error_msg = sls_format(SLS_STR("Invalid numeric literal: unexpected '%c' in numeric type."), c);
        if (error_msg.str == NULL) return (LexerResult){SLS_ERROR, .error = (SlsError){SLS_STR("Out Of Memory Error."), 1}};
        return lexer_error(lexer_info, error_msg, start, start_line);
    }

    // A plain decimal literal with a float suffix (e.g. 1:f32) is a float
    if (numeric_literal_type == NUMERIC_DECIMAL && (numeric_type & NUMERIC_FLOAT_BASE)) {
        numeric_literal_type = NUMERIC_FLOAT;
    }
    if (numeric_literal_type == NUMERIC_FLOAT) {
        return create_float_token(lexer_info, numeric_type, start, start_line);
    }

    // Pick the digit converter for the literal's base
    uint64_t (*create_value)(LexerInfo *, size_t) = NULL;
    switch (numeric_literal_type) {
        case NUMERIC_BINARY:      create_value = create_binary_integer;      break;
        case NUMERIC_OCTAL:       create_value = create_octal_integer;       break;
        case NUMERIC_DECIMAL:     create_value = create_decimal_integer;     break;
        case NUMERIC_HEXADECIMAL: create_value = create_hexadecimal_integer; break;
        default:                  break;  // exponential: not implemented
    }
    if (create_value == NULL) {
        return (LexerResult){SLS_ERROR, .error = (SlsError){SLS_STR("Lexer: Numeric Literal Not Implemented Error."), 1}};
    }

    IntegerTypeResult integer_type = get_integer_type(numeric_type);
    if (integer_type.type == SLS_ERROR) return (LexerResult){SLS_ERROR, .error = integer_type.error};

    uint64_t value = create_value(lexer_info, start);
    return create_integer_token(lexer_info, integer_type.integer_type, value, start, start_line);
}
|
|
|
|
static LexerResult parse_binary_integer(LexerInfo *lexer_info, char c, size_t start, size_t start_line) {
    // Lexes the digits of a binary literal; `c` is the first character after
    // the base prefix. Consumes '0', '1' and '_' and then expects a ':' type
    // suffix, whitespace, '/' or end of input. Literals without a suffix are
    // typed i64.
    while (c == '0' || c == '1' || c == '_') {
        c = advance(lexer_info);
    }

    if (c == ':') {
        return parse_numeric_type(lexer_info, c, start, start_line, NUMERIC_BINARY);
    }

    // `c` is a raw source byte: convert to unsigned char for <ctype.h>
    if (isspace((unsigned char)c) || c == '/' || c == '\0') {
        const uint64_t value = create_binary_integer(lexer_info, start);
        return create_integer_token(lexer_info, INTEGER_I64, value, start, start_line);
    }

    SlsStr error_msg = sls_format(SLS_STR("Invalid binary literal: unexpected '%c' in binary integer."), c);
    if (error_msg.str == NULL) return (LexerResult){SLS_ERROR, .error = (SlsError){SLS_STR("Out Of Memory Error."), 1}};
    return lexer_error(lexer_info, error_msg, start, start_line);
}
|
|
|
|
static LexerResult parse_octal_integer(LexerInfo *lexer_info, char c, size_t start, size_t start_line) {
    // Lexes the digits of an octal literal; `c` is the first character after
    // the base prefix. Consumes '0'..'7' and '_' and then expects a ':' type
    // suffix, whitespace, '/' or end of input. Literals without a suffix are
    // typed i64.
    //
    // The digit test is a plain range check, so no raw source byte is passed
    // to isdigit().
    while ((c >= '0' && c <= '7') || c == '_') {
        c = advance(lexer_info);
    }

    if (c == ':') {
        return parse_numeric_type(lexer_info, c, start, start_line, NUMERIC_OCTAL);
    }

    // `c` is a raw source byte: convert to unsigned char for <ctype.h>
    if (isspace((unsigned char)c) || c == '/' || c == '\0') {
        const uint64_t value = create_octal_integer(lexer_info, start);
        return create_integer_token(lexer_info, INTEGER_I64, value, start, start_line);
    }

    SlsStr error_msg = sls_format(SLS_STR("Invalid octal literal: unexpected '%c' in octal integer."), c);
    if (error_msg.str == NULL) return (LexerResult){SLS_ERROR, .error = (SlsError){SLS_STR("Out Of Memory Error."), 1}};
    return lexer_error(lexer_info, error_msg, start, start_line);
}
|
|
|
|
static LexerResult parse_exponential(LexerInfo *lexer_info, char c, size_t start, size_t start_line) {
    // Placeholder: exponent notation is not lexed yet, so every call fails
    // with a "not implemented" SLS_ERROR. The parameters are only referenced
    // to silence unused-parameter warnings.
    (void)lexer_info;
    (void)c;
    (void)start;
    (void)start_line;

    const SlsError not_implemented = (SlsError){SLS_STR("Lexer: Float Exponential Not Implemented Error."), 1};
    return (LexerResult){SLS_ERROR, .error = not_implemented};
}
|
|
|
|
static LexerResult parse_float(LexerInfo *lexer_info, char c, size_t start, size_t start_line) {
    // Lexes the fractional digits of a floating-point literal; `c` is the
    // first character after the '.'. Consumes digits and '_' and then expects
    // an exponent marker, a ':' type suffix, whitespace, '/' or end of input.
    // Literals without a suffix are typed as f64 (double).
    //
    // `c` is a raw source byte, so it is converted to unsigned char before
    // every <ctype.h> call.
    while (isdigit((unsigned char)c) || c == '_') {
        c = advance(lexer_info);
    }

    if (c == 'e' || c == 'E') {
        return parse_exponential(lexer_info, c, start, start_line);
    }
    if (c == ':') {
        return parse_numeric_type(lexer_info, c, start, start_line, NUMERIC_FLOAT);
    }
    if (isspace((unsigned char)c) || c == '/' || c == '\0') {
        return create_float_token(lexer_info, NUMERIC_F64, start, start_line);
    }

    SlsStr error_msg = sls_format(SLS_STR("Invalid float literal: unexpected '%c' in float."), c);
    if (error_msg.str == NULL) return (LexerResult){SLS_ERROR, .error = (SlsError){SLS_STR("Out Of Memory Error."), 1}};
    return lexer_error(lexer_info, error_msg, start, start_line);
}
|
|
|
|
static LexerResult parse_decimal_integer(LexerInfo *lexer_info, char c, size_t start, size_t start_line) {
    // Lexes a decimal literal starting at character `c`. Consumes digits and
    // '_', then dispatches on what follows:
    //   '.'        -> continue as a floating-point literal
    //   'e' / 'E'  -> exponent notation
    //   ':'        -> explicit type suffix
    //   whitespace, '/' or end of input -> finished i64 literal
    // Anything else is reported as an invalid decimal literal.
    //
    // `c` is a raw source byte, so it is converted to unsigned char before
    // every <ctype.h> call.
    while (isdigit((unsigned char)c) || c == '_') {
        c = advance(lexer_info);
    }

    if (c == '.') {
        c = advance(lexer_info);
        return parse_float(lexer_info, c, start, start_line);
    }
    if (c == 'e' || c == 'E') {
        return parse_exponential(lexer_info, c, start, start_line);
    }
    if (c == ':') {
        return parse_numeric_type(lexer_info, c, start, start_line, NUMERIC_DECIMAL);
    }
    if (isspace((unsigned char)c) || c == '/' || c == '\0') {
        const uint64_t value = create_decimal_integer(lexer_info, start);
        return create_integer_token(lexer_info, INTEGER_I64, value, start, start_line);
    }

    SlsStr error_msg = sls_format(SLS_STR("Invalid decimal literal: unexpected '%c' in decimal integer."), c);
    if (error_msg.str == NULL) return (LexerResult){SLS_ERROR, .error = (SlsError){SLS_STR("Out Of Memory Error."), 1}};
    return lexer_error(lexer_info, error_msg, start, start_line);
}
|
|
|
|
static LexerResult parse_hexadecimal_integer(LexerInfo *lexer_info, char c, size_t start, size_t start_line) {
    // Lexes the digits of a hexadecimal literal; `c` is the first character
    // after the base prefix. Consumes hex digits and '_' and then expects a
    // ':' type suffix, whitespace, '/' or end of input. Literals without a
    // suffix are typed i64.
    //
    // `c` is a raw source byte, so it is converted to unsigned char before
    // every <ctype.h> call.
    while (isxdigit((unsigned char)c) || c == '_') {
        c = advance(lexer_info);
    }

    if (c == ':') {
        return parse_numeric_type(lexer_info, c, start, start_line, NUMERIC_HEXADECIMAL);
    }
    if (isspace((unsigned char)c) || c == '/' || c == '\0') {
        const uint64_t value = create_hexadecimal_integer(lexer_info, start);
        return create_integer_token(lexer_info, INTEGER_I64, value, start, start_line);
    }

    SlsStr error_msg = sls_format(SLS_STR("Invalid hexadecimal literal: unexpected '%c' in hexadecimal integer."), c);
    if (error_msg.str == NULL) return (LexerResult){SLS_ERROR, .error = (SlsError){SLS_STR("Out Of Memory Error."), 1}};
    return lexer_error(lexer_info, error_msg, start, start_line);
}
|
|
|
|
static LexerResult parse_numeric_literal(LexerInfo *lexer_info, char c, size_t start, size_t start_line) {
    // Entry point for numeric literals. Skips an optional leading '-', then
    // picks the base from an optional "0b"/"0o"/"0x" prefix (either letter
    // case). Everything else, including a bare leading '0', is handed to the
    // decimal parser.
    if (c == '-') {
        c = advance(lexer_info);
    }

    if (c == '0') {
        c = advance(lexer_info);
        switch (c) {
            case 'b':
            case 'B':
                c = advance(lexer_info);
                return parse_binary_integer(lexer_info, c, start, start_line);
            case 'o':
            case 'O':
                c = advance(lexer_info);
                return parse_octal_integer(lexer_info, c, start, start_line);
            case 'x':
            case 'X':
                c = advance(lexer_info);
                return parse_hexadecimal_integer(lexer_info, c, start, start_line);
            default:
                break;
        }
    }

    return parse_decimal_integer(lexer_info, c, start, start_line);
}
|
|
|
|
static LexerResult parse_character_literal(LexerInfo *lexer_info, char c, size_t start, size_t start_line) {
    // Lexes the body of a character literal. `c` is the character under the
    // cursor when the function is entered (presumably the one right after
    // the opening quote — confirm against the caller in lexer_next).
    // Supported escapes: \n \r \t \\ \' \0.
    //
    // Returns a character token, or a lexer error node for an empty,
    // unclosed or malformed literal; SLS_ERROR only on allocation failure.
    if (c == '\'') {
        return lexer_error(lexer_info, SLS_STR("Invalid character literal: empty character literal."), start, start_line);
    }

    // A line break or the end of input before any character means the
    // literal was never closed. Stopping on NUL here also keeps advance()
    // from stepping past the end of the source text.
    if (c == '\n' || c == '\r' || c == '\0') {
        return lexer_error(lexer_info, SLS_STR("Invalid character literal: unclosed character literal."), start, start_line);
    }

    char value = '\0';
    if (c == '\\') {
        c = advance(lexer_info);
        switch (c) {
            case 'n':  value = '\n'; break;
            case 'r':  value = '\r'; break;
            case 't':  value = '\t'; break;
            case '\\': value = '\\'; break;
            case '\'': value = '\''; break;
            case '0':  value = '\0'; break;
            case '\0':
                // Input ended in the middle of the escape sequence
                return lexer_error(lexer_info, SLS_STR("Invalid character literal: unclosed character literal."), start, start_line);
            default: {
                // Braces are required: before C23 a declaration may not
                // directly follow a label.
                SlsStr error_msg = sls_format(SLS_STR("Invalid character literal: unknown escape sequence '\\%c'."), c);
                if (error_msg.str == NULL) return (LexerResult){SLS_ERROR, .error = (SlsError){SLS_STR("Out Of Memory Error."), 1}};
                return lexer_error(lexer_info, error_msg, start, start_line);
            }
        }
    } else {
        value = c;
    }

    // Exactly one character is allowed, so the closing quote must be next.
    // `c` is a raw source byte: convert to unsigned char for <ctype.h>.
    c = advance(lexer_info);
    if (isspace((unsigned char)c) || c == '/' || c == '\0') {
        return lexer_error(lexer_info, SLS_STR("Invalid character literal: unclosed character literal."), start, start_line);
    }
    if (c != '\'') {
        SlsStr error_msg = sls_format(SLS_STR("Invalid character literal: unexpected '%c' in character."), c);
        if (error_msg.str == NULL) return (LexerResult){SLS_ERROR, .error = (SlsError){SLS_STR("Out Of Memory Error."), 1}};
        return lexer_error(lexer_info, error_msg, start, start_line);
    }

    // Consume the closing quote
    advance(lexer_info);
    return lexer_result(lexer_info, (Token){TOKEN_CHARACTER, .character_literal = (uint8_t)value}, start, start_line);
}
|
|
|
|
static LexerResult parse_string_literal(LexerInfo *lexer_info, char c, size_t start, size_t start_line) {
    // Placeholder: string literals are not lexed yet, so every call fails
    // with a "not implemented" SLS_ERROR. The parameters are only referenced
    // to silence unused-parameter warnings.
    (void)lexer_info;
    (void)c;
    (void)start;
    (void)start_line;

    const SlsError not_implemented = (SlsError){SLS_STR("Lexer: String Literals Not Implemented Error."), 1};
    return (LexerResult){SLS_ERROR, .error = not_implemented};
}
|
|
|
|
static void skip_comments_and_whitespace(LexerInfo *lexer_info) {
    // Moves the cursor to the next character that is neither whitespace nor
    // part of a line comment. Line comments start with "//" or '#' and run
    // to the end of the line (or the end of input).
    //
    // A single '/' that does not start a comment stops the scan like any
    // other character. Treating a lone '/' as skippable would leave the
    // cursor stuck on it forever, because no branch consumes it.
    for (;;) {
        char c = peek(lexer_info);

        if (c == '#' || (c == '/' && far_peek(lexer_info, 1) == '/')) {
            // Skip the comment body; the line break itself is whitespace
            while (c != '\n' && c != '\0') {
                c = advance(lexer_info);
            }
        } else if (isspace((unsigned char)c)) {
            // Raw source bytes are converted to unsigned char for <ctype.h>
            advance(lexer_info);
        } else {
            return;
        }
    }
}
|
|
|
|
// Forward declaration: parse_token_string calls lexer_next before its
// definition appears later in this file.
static LexerResult lexer_next(LexerInfo *lexer_info);
|
|
|
|
TokenString copy_token_string(TokenString token_string) {
    // Returns a copy of `token_string` with its own token array. String
    // tokens and nested token strings are duplicated (sls_str_cpy /
    // recursive call); every other token is copied by plain assignment.
    // NOTE(review): tokens copied by assignment share any storage they point
    // to with the original (e.g. identifier names) — confirm ownership.
    //
    // If the token array cannot be allocated, an empty token string
    // (length 0, tokens NULL) is returned instead of writing through a NULL
    // pointer.
    TokenString new_string = { .length = 0, .tokens = NULL };

    // Guard the size computation against overflow
    if (token_string.length > SIZE_MAX / sizeof(Token)) {
        return new_string;
    }

    new_string.tokens = malloc(sizeof(Token) * token_string.length);
    if (new_string.tokens == NULL) {
        return new_string;
    }
    new_string.length = token_string.length;

    for (size_t i = 0; i < token_string.length; i++) {
        const Token *source = &token_string.tokens[i];
        Token *target = &new_string.tokens[i];

        if (source->type == TOKEN_STRING) {
            target->type = TOKEN_STRING;
            target->string_literal = sls_str_cpy(source->string_literal);
        } else if (source->type == TOKEN_TOKEN_STRING) {
            target->type = TOKEN_TOKEN_STRING;
            target->token_string = copy_token_string(source->token_string);
        } else {
            *target = *source;
        }
    }

    return new_string;
}
|
|
|
|
static LexerResult convert_to_token_string(LexerInfo *lexer_info, LexerTokenResult *head, size_t start, size_t start_line) {
    // Flattens the linked list of lexed tokens starting at `head` into one
    // TOKEN_TOKEN_STRING token. String tokens and nested token strings are
    // deep-copied; all other tokens are copied by assignment. The list is
    // released with clean_token_result() on every path.
    // NOTE(review): tokens copied by assignment keep any pointers held by
    // the list nodes (e.g. identifier names); whether clean_token_result()
    // frees those is not visible here — confirm.
    size_t count = 0;
    for (LexerTokenResult *node = head; node != NULL; node = node->next) {
        count += 1;
    }

    TokenString token_string = { .length = count, .tokens = NULL };
    token_string.tokens = malloc(sizeof(Token) * count);
    if (token_string.tokens == NULL && count > 0) {
        clean_token_result(head);
        return (LexerResult){SLS_ERROR, .error = (SlsError){SLS_STR("Failed to allocate memory."), 1}};
    }

    LexerTokenResult *node = head;
    for (size_t i = 0; i < count; i++) {
        Token *slot = &token_string.tokens[i];

        if (node->result.type == TOKEN_STRING) {
            slot->type = TOKEN_STRING;
            slot->string_literal = sls_str_cpy(node->result.string_literal);
        } else if (node->result.type == TOKEN_TOKEN_STRING) {
            // copy_token_string already yields a complete deep copy. It must
            // not be overwritten with the raw bytes of the source tokens,
            // which are released together with the list below.
            slot->type = TOKEN_TOKEN_STRING;
            slot->token_string = copy_token_string(node->result.token_string);
        } else {
            *slot = node->result;
        }

        node = node->next;
    }

    clean_token_result(head);

    return lexer_result(lexer_info, (Token){TOKEN_TOKEN_STRING, .token_string = token_string}, start, start_line);
}
|
|
|
|
static LexerResult parse_token_string(LexerInfo *lexer_info, char c, size_t start, size_t start_line) {
    // Lexes a token string: the tokens between the character under the
    // cursor on entry (consumed first) and the matching '}'. Tokens are
    // collected in a linked list and converted into a single
    // TOKEN_TOKEN_STRING token once the closing brace is found.
    //
    // Outcomes:
    //   - closing '}' found: the combined token string
    //   - a nested token reported a lexing error: a copy of that error node
    //     (returned as SLS_RESULT, like lexer_error does)
    //   - end of input before '}': "Unclosed token string" error node
    //   - lexer_next failure, allocation failure or watchdog: SLS_ERROR
    // The collected list is released on every path.
    LexerTokenResult *head = NULL;
    LexerTokenResult *current = NULL;
    size_t watchdog = 0;

    c = advance(lexer_info);
    while (c != '\0') {
        skip_comments_and_whitespace(lexer_info);
        c = peek(lexer_info);

        // Stop at the end of the token string
        if (c == '}') {
            advance(lexer_info);
            return convert_to_token_string(lexer_info, head, start, start_line);
        }

        // Get next token
        LexerResult result = lexer_next(lexer_info);
        if (result.type == SLS_ERROR) {
            clean_token_result(head);
            return result;
        }

        // Append the new node to the list
        if (head == NULL) {
            head = result.result;
            current = head;
        } else {
            current->next = result.result;
            current = current->next;
        }

        // lexer_next reported success but handed back no node
        if (current == NULL) {
            clean_token_result(head);
            return (LexerResult){SLS_ERROR, .error = (SlsError){SLS_STR("Unknown Error."), 1}};
        }

        if (current->type == SLS_ERROR) {
            // Hand the error to the caller in a node of its own so the whole
            // list, including the failing node, can be released here.
            LexerTokenResult *e = malloc(sizeof *e);
            if (e == NULL) {
                clean_token_result(head);
                return (LexerResult){SLS_ERROR, .error = (SlsError){SLS_STR("Failed to allocate memory."), 1}};
            }
            *e = (LexerTokenResult){
                .type = SLS_ERROR,
                .error = (SlsError){sls_str_cpy(current->error.message), 1},
                .file_info = current->file_info,
                .next = NULL
            };
            clean_token_result(head);
            return (LexerResult){SLS_RESULT, .result = e};
        }

        // Ran out of input before the closing brace
        if (current->result.type == TOKEN_EOF) break;

        c = peek(lexer_info);

        // Safety net against a lexer that stops making progress
        if (watchdog++ > 1000000) {
            clean_token_result(head);
            return (LexerResult){SLS_ERROR, .error = (SlsError){SLS_STR("Watchdog Triggered in Token String."), 1}};
        }
    }

    clean_token_result(head);
    return lexer_error(lexer_info, SLS_STR("Unclosed token string: missing closing brace '}'."), start, start_line);
}
|
|
|
|
static LexerResult parse_array_literal(LexerInfo *lexer_info, char c, size_t start, size_t start_line) {
    // Placeholder: array literals are not lexed yet, so every call fails
    // with a "not implemented" SLS_ERROR. The parameters are only referenced
    // to silence unused-parameter warnings.
    (void)lexer_info;
    (void)c;
    (void)start;
    (void)start_line;

    const SlsError not_implemented = (SlsError){SLS_STR("Lexer: Array Literals Not Implemented Error."), 1};
    return (LexerResult){SLS_ERROR, .error = not_implemented};
}
|
|
|
|
static LexerResult parse_type_tuples(LexerInfo *lexer_info, char c, size_t start, size_t start_line) {
    // Placeholder: type tuples are not lexed yet, so every call fails with a
    // "not implemented" SLS_ERROR. The parameters are only referenced to
    // silence unused-parameter warnings.
    (void)lexer_info;
    (void)c;
    (void)start;
    (void)start_line;

    const SlsError not_implemented = (SlsError){SLS_STR("Lexer: Type Tuples Not Implemented Error."), 1};
    return (LexerResult){SLS_ERROR, .error = not_implemented};
}
|
|
|
|
Boolean is_identifier_continue(LexerInfo *lexer_info, char c) {
    // Decides whether `c` may appear inside an identifier. Rejected are:
    // non-printable characters, whitespace, NUL, brackets of every kind,
    // quotes, '#', and a '/' when the character one past the lexer's cursor
    // is also '/' (the start of a line comment).
    //
    // `c` is a raw source byte and may be negative where char is signed, so
    // it is converted to unsigned char before being handed to <ctype.h>.
    const unsigned char uc = (unsigned char)c;

    if (!isprint(uc)) return FALSE;
    if (c == '/' && far_peek(lexer_info, 1) == '/') return FALSE;

    switch (c) {
        case '{': case '}':
        case '[': case ']':
        case '(': case ')':
        case '\'': case '"': case '#':
            return FALSE;
        default:
            break;
    }

    if (isspace(uc) || c == '\0') return FALSE;
    return TRUE;
}
|
|
|
|
Boolean is_identifier_start(LexerInfo *lexer_info) {
    // Decides whether an identifier can start at the cursor. A leading "::"
    // (identifier literal marker) is looked through, so the test is applied
    // to the character that follows it. The first character must be valid
    // inside an identifier and must not be a digit.
    char c = peek(lexer_info);
    if (c == ':' && far_peek(lexer_info, 1) == ':') {
        c = far_peek(lexer_info, 2);
    }

    // Raw source bytes are converted to unsigned char for <ctype.h>
    if (isdigit((unsigned char)c)) return FALSE;
    if (is_identifier_continue(lexer_info, c)) return TRUE;
    return FALSE;
}
|
|
|
|
static LexerResult parse_identifiers_and_booleans(LexerInfo *lexer_info, char c, size_t start, size_t start_line) {
    // Parses identifier, identifier literals, and boolean tokens
    //
    // lexer_info: lexer state; advanced past the characters that were consumed
    // c:          character at the lexer's current position (first character of the token)
    // start:      source offset at which the token begins
    // start_line: line on which the token begins
    //
    // Returns a boolean token for `true`/`false`, an identifier token holding a heap copy
    // of the name otherwise, a lexer error for ':' or '.' inside the name or for an empty
    // name, or an out-of-memory error when the name cannot be allocated.

    Boolean literal = FALSE;
    // Skip leading `::` for identifier literals
    if (c == ':' && far_peek(lexer_info, 1) == ':') {
        literal = TRUE;
        advance(lexer_info);
        c = advance(lexer_info);
    }
    // Offset of the first name character: the `::` prefix is not part of the name
    const size_t name_offset = start + (literal ? 2 : 0);

    // Read the name of the identifier
    size_t length = 0;
    while (is_identifier_continue(lexer_info, c)) {
        if (c == ':')
            return lexer_error(lexer_info, SLS_STR("Invalid identifier: ':' is not allowed in identifiers."), start, start_line);
        if (c == '.')
            return lexer_error(lexer_info, SLS_STR("Invalid identifier: '.' is not allowed in identifiers."), start, start_line);
        c = advance(lexer_info);
        length++;
    }

    // Never emit an identifier with an empty name (e.g. `::` directly followed by `//`)
    if (length == 0) {
        if (literal)
            return lexer_error(lexer_info, SLS_STR("Invalid identifier literal: empty identifier after '::'."), start, start_line);
        return lexer_error(lexer_info, SLS_STR("Invalid identifier: empty identifier."), start, start_line);
    }

    // Copy the name into a NUL-terminated scratch buffer, then into an owned SlsStr
    char *name_value = calloc(length + 1, sizeof *name_value);
    if (name_value == NULL)
        return (LexerResult){SLS_ERROR, .error = (SlsError){SLS_STR("Out Of Memory Error."), 1}};
    memcpy(name_value, lexer_info->source_code.str + name_offset, length);
    SlsStr name = sls_str_malloc(name_value, length);
    free(name_value);
    if (name.str == NULL)
        return (LexerResult){SLS_ERROR, .error = (SlsError){SLS_STR("Out Of Memory Error."), 1}};

    // Return as identifier or boolean tokens
    // NOTE(review): `::true` / `::false` are also lexed as booleans, as before - confirm intended.
    const Boolean is_false = (sls_str_cmp(name, SLS_STR("false")) == 0) ? TRUE : FALSE;
    const Boolean is_true = (!is_false && sls_str_cmp(name, SLS_STR("true")) == 0) ? TRUE : FALSE;
    if (is_false || is_true) {
        // Boolean tokens do not carry the name, so release the copy instead of leaking it
        sls_str_free(&name);
        if (is_true)
            return lexer_result(lexer_info, (Token){TOKEN_BOOLEAN, .boolean_literal = TRUE}, start, start_line);
        return lexer_result(lexer_info, (Token){TOKEN_BOOLEAN, .boolean_literal = FALSE}, start, start_line);
    }

    // Ownership of `name` moves into the identifier token
    return lexer_result(lexer_info, (Token){TOKEN_IDENTIFIER, .identifier = (Identifier){.is_literal = literal, .name = name}}, start, start_line);
}
|
|
|
|
static LexerResult lexer_next(LexerInfo *lexer_info) {
    // Gets the next token from the source
    //
    // Skips comments and whitespace, then dispatches on the first character of the token:
    // NUL -> EOF token, digits / `.digit` / `-digit` -> numeric literal, `'` -> character
    // literal, `"` -> string literal, `{` -> token string, `[` -> array literal,
    // `(` -> type tuple, identifier start -> identifier or boolean.
    // Unmatched closing brackets, a stray `::`, and any other character produce a lexer error.

    skip_comments_and_whitespace(lexer_info);

    // Initialize beginning variables
    char c = peek(lexer_info);
    size_t start = lexer_info->pos;
    size_t start_line = lexer_info->line;

    // End of file tokens
    if (c == '\0') return lexer_result(lexer_info, (Token){.type = TOKEN_EOF}, start, start_line);

    // Integers and Floats
    // (casts: <ctype.h> functions must not receive a negative plain `char`)
    if (isdigit((unsigned char)c)
        || ((c == '.' || c == '-') && isdigit((unsigned char)far_peek(lexer_info, 1))))
        return parse_numeric_literal(lexer_info, c, start, start_line);

    // Character Literals
    if (c == '\'') {
        c = advance(lexer_info);
        return parse_character_literal(lexer_info, c, start, start_line);
    }

    // String Literals
    if (c == '\"') return parse_string_literal(lexer_info, c, start, start_line);

    // Token Strings
    if (c == '{') return parse_token_string(lexer_info, c, start, start_line);
    if (c == '}') {
        advance(lexer_info);
        return lexer_error(lexer_info, SLS_STR("Unexpected closing brace '}' without matching opening brace."), start, start_line);
    }

    // Array Literals
    if (c == '[') return parse_array_literal(lexer_info, c, start, start_line);
    if (c == ']') {
        advance(lexer_info);
        return lexer_error(lexer_info, SLS_STR("Unexpected closing bracket ']' without matching opening bracket."), start, start_line);
    }

    // Type Tuples
    if (c == '(') return parse_type_tuples(lexer_info, c, start, start_line);
    if (c == ')') {
        advance(lexer_info);
        return lexer_error(lexer_info, SLS_STR("Unexpected closing parentheses ')' without matching opening parentheses."), start, start_line);
    }

    // Identifiers and Booleans
    if (is_identifier_start(lexer_info))
        return parse_identifiers_and_booleans(lexer_info, c, start, start_line);
    if (c == ':') {
        // Look for the second ':' BEFORE advancing: once the lexer has moved,
        // far_peek(lexer_info, 1) would inspect the character after the second colon
        // and the empty-literal diagnostic could never be selected.
        const Boolean double_colon = (far_peek(lexer_info, 1) == ':') ? TRUE : FALSE;
        advance(lexer_info);
        if (double_colon)
            return lexer_error(lexer_info, SLS_STR("Invalid identifier literal: empty identifier after '::'."), start, start_line);
        return lexer_error(lexer_info, SLS_STR("Unexpected single colon ':'."), start, start_line);
    }

    // Random Characters
    SlsStr error_msg = sls_format(SLS_STR("Unexpected character: unexpected '%c' during parsing."), c);
    if (error_msg.str == NULL) return (LexerResult){SLS_ERROR, .error = (SlsError){SLS_STR("Out Of Memory Error."), 1}};
    return lexer_error(lexer_info, error_msg, start, start_line);
}
|
|
|
|
static void clean_token_string(TokenString token_string) {
    // Releases the token array of a TokenString: string literals held by its tokens,
    // nested token strings (recursively), and finally the array itself.
    //
    // token_string is received by value, so the caller's `tokens` pointer is NOT reset
    // and must not be used after this call. (The former `token_string.tokens = NULL;`
    // only modified the local copy and was removed as a dead store.)
    // NOTE(review): identifier names are not released here - confirm who owns them.

    if (token_string.tokens == NULL) return;

    for (size_t i = 0; i < token_string.length; i++) {
        if (token_string.tokens[i].type == TOKEN_STRING)
            sls_str_free(&token_string.tokens[i].string_literal);
        else if (token_string.tokens[i].type == TOKEN_TOKEN_STRING)
            clean_token_string(token_string.tokens[i].token_string);
    }

    free(token_string.tokens);
}
|
|
|
|
void clean_token_result(LexerTokenResult *head) {
    // Deallocates a LexerTokenResult linked list
    //
    // head: first node of the list; may be NULL, in which case nothing happens.
    //
    // For every node: an error node releases its message, a string token releases its
    // string literal, a token-string token releases its token array; then the node
    // itself is freed. The list is walked iteratively so that stack usage stays constant
    // no matter how many tokens the list holds.

    while (head != NULL) {
        // Remember the successor before the node is freed
        LexerTokenResult *next = head->next;

        if (head->type == SLS_ERROR)
            sls_str_free(&head->error.message);
        else if (head->result.type == TOKEN_STRING)
            sls_str_free(&head->result.string_literal);
        else if (head->result.type == TOKEN_TOKEN_STRING)
            clean_token_string(head->result.token_string);

        free(head);
        head = next;
    }
}
|
|
|
|
LexerTokenResult *get_token(LexerTokenResult *head, size_t i) {
    // Follows up to i `next` links starting at head and returns the node reached;
    // yields a null pointer when the list ends before index i

    LexerTokenResult *node = head;
    size_t remaining = i;

    while (remaining > 0 && node) {
        node = node->next;
        remaining--;
    }

    return node;
}
|
|
|
|
LexerResult lexical_analysis(LexerInfo *lexer_info) {
    // Lexes code loaded into lexer_info
    //
    // Calls lexer_next repeatedly and chains the returned nodes into a linked list.
    // Stops after the first node that is an error node or an EOF token and returns the
    // list; if lexer_next itself reports SLS_ERROR, or hands back no node, the list built
    // so far is released and an error result is returned instead.

    LexerTokenResult *head = NULL;
    LexerTokenResult *tail = NULL;

    for (;;) {
        // Get next token
        LexerResult step = lexer_next(lexer_info);

        // Handle Errors
        if (step.type == SLS_ERROR) {
            clean_token_result(head);
            return step;
        }

        // Append the new node to the list
        if (head == NULL)
            head = step.result;
        else
            tail->next = step.result;
        tail = step.result;

        // A successful step must always deliver a node
        if (tail == NULL) {
            clean_token_result(head);
            return (LexerResult){SLS_ERROR, .error = (SlsError){SLS_STR("Unknown Error."), 1}};
        }

        // An error node or the EOF token terminates the token stream
        if (tail->type == SLS_ERROR || tail->result.type == TOKEN_EOF)
            break;
    }

    return (LexerResult) {.type = SLS_RESULT, .result = head};
}
|