Compare commits

...

2 Commits

Author SHA1 Message Date
Kyler Olsen 35a6f4537f Fixes to file and repl 2025-12-03 14:58:37 -07:00
Kyler Olsen 64f620b2ef Claude implementation of lexer.py 2025-12-03 14:57:15 -07:00
3 changed files with 701 additions and 27 deletions

View File

@ -1,6 +1,6 @@
from pathlib import Path from pathlib import Path
from sls.lexer import LexerInfo, lexical_analysis, Token from .lexer import LexerInfo, lexical_analysis, Token
from sls.interpreter import InterpreterState from .interpreter import InterpreterState
def exec_file(interpreter_state: InterpreterState, filename: str) -> bool: def exec_file(interpreter_state: InterpreterState, filename: str) -> bool:

View File

@ -1,7 +1,7 @@
from __future__ import annotations from __future__ import annotations
from dataclasses import dataclass, field from dataclasses import dataclass, field
from enum import Enum, auto from enum import Enum, auto
from typing import List, Optional, Any from typing import List, Optional, Any, Union
# ===================================================================== # =====================================================================
@ -19,7 +19,7 @@ class LexerInfo:
self.filename = filename self.filename = filename
self.source_code = source_code self.source_code = source_code
self.pos = 0 self.pos = 0
self.column = 0 self.column = 1
self.line = 1 self.line = 1
@ -90,7 +90,7 @@ class IntegerBuiltInType(Enum):
@dataclass @dataclass
class IntegerLiteral: class IntegerLiteral:
value: int # Python int is arbitrary precision value: int
type: IntegerBuiltInType type: IntegerBuiltInType
@ -132,12 +132,12 @@ class TypeTuple:
@dataclass @dataclass
class StructInline: class StructInline:
values: List[Any] # Python can store anything values: List[Any]
name: str name: str
# ===================================================================== # =====================================================================
# ArrayLiteral (replaces C unions with Optional lists) # ArrayLiteral
# ===================================================================== # =====================================================================
@dataclass @dataclass
@ -189,7 +189,7 @@ class ArrayLiteral:
# ===================================================================== # =====================================================================
# Token (Python “union” via Optional fields) # Token
# ===================================================================== # =====================================================================
@dataclass @dataclass
@ -209,24 +209,696 @@ class Token:
# ===================================================================== # =====================================================================
# Lexer Token Result / Lexer Result # File Info and Results
# ===================================================================== # =====================================================================
class SlsResultType(Enum):
    """Tag distinguishing a successful result from an error outcome."""

    # Values match what ``auto()`` assigns: consecutive integers from 1.
    RESULT = 1
    ERROR = 2
@dataclass @dataclass
class FileInfo: class FileInfo:
filename: str filename: str
line: int line: int
column: int column: int
length: int = 0
lines: int = 0
@dataclass
class LexerError(Exception):
    """Exception raised by the lexer when the source text is rejected.

    Being a dataclass, the generated ``__init__`` stores the two fields but
    does not shape the exception's printable form, so ``__str__`` is defined
    explicitly to show the message instead of the raw argument tuple.
    """

    message: str  # description of what went wrong
    file_info: FileInfo  # location data supplied by the raiser

    def __str__(self) -> str:
        """Return the error message so tracebacks and ``print`` are readable."""
        return self.message
# ===================================================================== # =====================================================================
# Function Stubs (to be implemented in Python version) # Numeric Type Flags
# ===================================================================== # =====================================================================
def lexical_analysis(lexer_info: LexerInfo) -> list[Token]: class NumericType(Enum):
return [] F64 = auto()
F32 = auto()
I64 = auto()
I32 = auto()
I16 = auto()
I8 = auto()
U64 = auto()
U32 = auto()
U16 = auto()
U8 = auto()
class NumericLiteralType(Enum):
    """Syntactic form of a numeric literal as recognised by the lexer."""

    # Values match what ``auto()`` assigns: consecutive integers from 1.
    BINARY = 1
    OCTAL = 2
    DECIMAL = 3
    HEXADECIMAL = 4
    FLOAT = 5
    EXPONENTIAL = 6
# =====================================================================
# Lexer Implementation
# =====================================================================
class Lexer:
def __init__(self, info: LexerInfo):
    """Bind the lexer to the scanning state object it reads and advances."""
    self.info = info
def peek(self) -> str:
    """Return the character under the cursor, or NUL once input is exhausted."""
    source = self.info.source_code
    cursor = self.info.pos
    return source[cursor] if cursor < len(source) else '\0'
def far_peek(self, offset: int) -> str:
    """Return the character ``offset`` positions from the cursor.

    Returns NUL when the target position lies outside the source. A negative
    target is treated as out of range rather than being allowed to wrap
    around to the end of the source through Python's negative indexing.
    """
    source = self.info.source_code
    pos = self.info.pos + offset
    if not 0 <= pos < len(source):
        return '\0'
    return source[pos]
def seek(self, index: int) -> str:
    """Return the character at absolute position ``index`` without moving.

    Returns NUL for any index outside the source, including negative ones,
    which would otherwise wrap around via Python's negative indexing.
    """
    source = self.info.source_code
    if not 0 <= index < len(source):
        return '\0'
    return source[index]
def advance(self) -> str:
    """Consume one character and return the character now under the cursor.

    Line and column counters are updated for the consumed character: a
    newline starts a new line at column 1, anything else moves one column
    right. At end of input nothing is consumed.
    """
    state = self.info
    if state.pos < len(state.source_code):
        consumed = state.source_code[state.pos]
        state.pos += 1
        if consumed == '\n':
            state.line += 1
            state.column = 1
        else:
            state.column += 1
    return self.peek()
def get_file_info(self, start: int, start_line: int) -> FileInfo:
    """Build a FileInfo for the span running from ``start`` to the cursor.

    The reported line and column are those of the cursor at call time, not
    of the span's beginning. ``length`` is the number of characters between
    ``start`` and the cursor; ``lines`` is the difference between the current
    line and ``start_line``.
    """
    state = self.info
    consumed = state.pos - start
    crossed = state.line - start_line
    return FileInfo(
        filename=state.filename,
        line=state.line,
        column=state.column,
        length=consumed,
        lines=crossed,
    )
def get_token_text(self, start: int) -> str:
    """Return the source text between ``start`` and the current cursor."""
    state = self.info
    end = state.pos
    return state.source_code[start:end]
def skip_comments_and_whitespace(self):
    """Advance the cursor past any run of whitespace and line comments.

    A comment starts with ``//`` or ``#`` and extends to the end of the line
    (or end of input). On return the cursor rests on the first character
    that is neither whitespace nor part of a comment.
    """
    while True:
        c = self.peek()

        # Skip a comment up to, but not including, its terminating newline.
        if c == '#' or (c == '/' and self.far_peek(1) == '/'):
            while self.peek() not in ('\n', '\0'):
                self.advance()
            # Re-read the cursor: the newline that ended the comment (and
            # anything after it) still has to be skipped. Falling through
            # with the stale comment character would end the loop early.
            continue

        # Skip whitespace
        if c.isspace():
            self.advance()
            continue

        break
def is_identifier_continue(self, c: str) -> bool:
    """Report whether ``c`` may appear inside an identifier.

    Rejected: non-printable characters, whitespace, NUL, the delimiters
    ``{}[]()'"#``, and a ``/`` when the character after the cursor is also
    ``/`` (the start of a line comment).
    """
    if not c.isprintable() or c.isspace() or c == '\0':
        return False
    if c in '{}[]()\'\"#':
        return False
    return not (c == '/' and self.far_peek(1) == '/')
def is_identifier_start(self) -> bool:
    """Report whether an identifier can begin at the cursor.

    A leading ``::`` is looked past, so the decision is made on the character
    that follows it. That character must not be a digit and must be allowed
    inside an identifier.
    """
    first = self.peek()
    if first == ':' and self.far_peek(1) == ':':
        first = self.far_peek(2)
    if first.isdigit():
        return False
    return self.is_identifier_continue(first)
# =====================================================================
# Integer Parsing Helpers
# =====================================================================
def create_binary_integer(self, start: int) -> int:
    """Convert the scanned binary literal beginning at ``start`` to an int.

    The ``0b`` prefix (and a leading ``-``) is skipped, ``.`` and ``_`` are
    ignored as separators, and conversion stops at whitespace, ``/``, ``:``
    or NUL. Every other character shifts the value left by one bit; only
    ``1`` sets the new bit.
    """
    text = self.get_token_text(start)
    is_negative = text[0] == '-'
    total = 0
    for ch in text[3 if is_negative else 2:]:
        if ch.isspace() or ch in '/:' or ch == '\0':
            break
        if ch in '._':
            continue
        total = total * 2 + (1 if ch == '1' else 0)
    return -total if is_negative else total
def create_octal_integer(self, start: int) -> int:
    """Convert the scanned octal literal beginning at ``start`` to an int.

    The ``0o`` prefix (and a leading ``-``) is skipped, ``.`` and ``_`` are
    ignored as separators, and conversion stops at whitespace, ``/``, ``:``
    or NUL. Every other character multiplies the value by eight; only the
    digits 0-7 contribute to it.
    """
    text = self.get_token_text(start)
    is_negative = text[0] == '-'
    total = 0
    for ch in text[3 if is_negative else 2:]:
        if ch.isspace() or ch in '/:' or ch == '\0':
            break
        if ch in '._':
            continue
        digit = int(ch) if ch.isdigit() and ch < '8' else 0
        total = total * 8 + digit
    return -total if is_negative else total
def create_decimal_integer(self, start: int) -> int:
    """Convert the scanned decimal literal beginning at ``start`` to an int.

    A leading ``-`` negates the result. Conversion stops at whitespace,
    ``/``, ``:`` or NUL. Only the ASCII digits 0-9 contribute; ``_`` and any
    other character are skipped. The digit test is deliberately ASCII-only:
    ``str.isdigit()`` also accepts characters such as superscript digits,
    for which ``int()`` raises ``ValueError``.
    """
    token = self.get_token_text(start)
    negative = token[0] == '-'
    value = 0
    for c in token[1 if negative else 0:]:
        if c.isspace() or c in '/:' or c == '\0':
            break
        if '0' <= c <= '9':
            value = value * 10 + (ord(c) - ord('0'))
    # Python ints are arbitrary precision, so no overflow handling is needed.
    return -value if negative else value
def create_hexadecimal_integer(self, start: int) -> int:
    """Convert the scanned hexadecimal literal beginning at ``start`` to an int.

    The ``0x`` prefix (and a leading ``-``) is skipped, ``.`` and ``_`` are
    ignored as separators, and conversion stops at whitespace, ``/``, ``:``
    or NUL. Every other character multiplies the value by sixteen; digits
    and the letters A-F (either case) contribute their value.
    """
    text = self.get_token_text(start)
    is_negative = text[0] == '-'
    total = 0
    for ch in text[3 if is_negative else 2:]:
        if ch.isspace() or ch in '/:' or ch == '\0':
            break
        if ch in '._':
            continue
        total *= 16
        if ch.isdigit():
            total += int(ch)
        else:
            upper = ch.upper()
            if upper in 'ABCDEF':
                total += ord(upper) - ord('A') + 10
    return -total if is_negative else total
def create_float(self, start: int) -> float:
    """Convert the scanned float literal beginning at ``start`` to a float.

    A leading ``-`` negates the result, ``_`` separators are ignored, and
    conversion stops at whitespace, ``/``, ``:`` or NUL (so a trailing type
    suffix is not read). Digits before the ``.`` form the integer part and
    digits after it the fractional part; a missing part counts as zero.

    The cleaned digits are handed to ``float()`` rather than summed one
    digit at a time, because the incremental sum accumulates rounding error
    (for example ``0.12`` came out as ``0.12000000000000001``). Only ASCII
    digits are accepted.
    """
    token = self.get_token_text(start)
    negative = token[0] == '-'

    whole_digits = []
    fraction_digits = []
    target = whole_digits
    for c in token[1 if negative else 0:]:
        if c.isspace() or c in '/:' or c == '\0':
            break
        if c == '.':
            # Everything after the point belongs to the fraction.
            target = fraction_digits
            continue
        if '0' <= c <= '9':
            target.append(c)

    whole = ''.join(whole_digits) or '0'
    fraction = ''.join(fraction_digits) or '0'
    value = float(f"{whole}.{fraction}")
    return -value if negative else value
# =====================================================================
# Integer Type Validation
# =====================================================================
def get_integer_type(self, numeric_type: NumericType) -> IntegerBuiltInType:
    """Map an integer ``NumericType`` to the same-named ``IntegerBuiltInType``.

    Raises:
        ValueError: if ``numeric_type`` is not one of the eight integer
            kinds (for example a float kind).
    """
    integer_names = ("I64", "I32", "I16", "I8", "U64", "U32", "U16", "U8")
    lookup = {
        getattr(NumericType, name): getattr(IntegerBuiltInType, name)
        for name in integer_names
    }
    resolved = lookup.get(numeric_type)
    if resolved is None:
        raise ValueError("Encountered a Float where there should not be one.")
    return resolved
def validate_integer_range(self, value: int, int_type: IntegerBuiltInType, start: int, start_line: int):
    """Check that ``value`` fits in the integer type ``int_type``.

    Signed types span ``-2**(bits-1)`` to ``2**(bits-1) - 1``; unsigned
    types span ``0`` to ``2**bits - 1``.

    Raises:
        LexerError: if ``value`` lies outside the range of ``int_type``.
    """
    bounds = {}
    for bits in (64, 32, 16, 8):
        half = 2 ** (bits - 1)
        bounds[getattr(IntegerBuiltInType, f"I{bits}")] = (-half, half - 1)
        bounds[getattr(IntegerBuiltInType, f"U{bits}")] = (0, 2 ** bits - 1)

    lowest, highest = bounds[int_type]
    if value < lowest or value > highest:
        raise LexerError(
            f"Integer overflow: value exceeds range for {int_type.name.lower()}.",
            self.get_file_info(start, start_line)
        )
def create_integer_token(self, int_type: IntegerBuiltInType, value: int, start: int, start_line: int) -> Token:
    """Range-check ``value`` and wrap it in an INTEGER token.

    The range check runs first, so a ``LexerError`` raised by it propagates
    before any token is built.
    """
    self.validate_integer_range(value, int_type, start, start_line)
    literal = IntegerLiteral(value=value, type=int_type)
    return Token(type=TokenType.INTEGER, integer_literal=literal)
def create_float_token(self, numeric_type: NumericType, start: int, start_line: int) -> Token:
    """Convert the scanned literal to a float and wrap it in a token.

    ``NumericType.F64`` yields a DOUBLE token; every other type yields a
    FLOAT token. ``start_line`` is accepted but not used here.
    """
    parsed = self.create_float(start)
    if numeric_type != NumericType.F64:
        return Token(type=TokenType.FLOAT, float_literal=parsed)
    return Token(type=TokenType.DOUBLE, double_literal=parsed)
# =====================================================================
# Numeric Type Parsing
# =====================================================================
def parse_numeric_type(self, start: int, start_line: int, literal_type: NumericLiteralType) -> NumericType:
    """Parse the type suffix that follows the ``:`` of a numeric literal.

    The callers visible in this class invoke it with the cursor on the
    ``:``. On success the cursor is left just past the suffix (``f64``,
    ``f32``, ``i64`` ... ``u8``).

    Raises:
        LexerError: if the suffix is malformed, if a float suffix follows a
            binary, octal or hexadecimal literal, or if an integer suffix
            follows a float or exponential literal.
    """
    family = self.advance()  # step past the current character onto the type letter

    if family == 'f':
        if literal_type not in (NumericLiteralType.DECIMAL, NumericLiteralType.FLOAT, NumericLiteralType.EXPONENTIAL):
            raise LexerError("Invalid numeric literal: float type not allowed.", self.get_file_info(start, start_line))
        lead = self.advance()
        for width, float_type in (('64', NumericType.F64), ('32', NumericType.F32)):
            if lead == width[0] and self.far_peek(1) == width[1]:
                self.advance()
                self.advance()
                return float_type
        raise LexerError("Invalid float type: must be of type 'f64' or 'f32'.", self.get_file_info(start, start_line))

    if family not in 'iu':
        raise LexerError("Invalid numeric type: type must start with 'f', 'i', or 'u'.", self.get_file_info(start, start_line))

    if literal_type in (NumericLiteralType.FLOAT, NumericLiteralType.EXPONENTIAL):
        raise LexerError("Invalid float type: must be of type 'f64' or 'f32'.", self.get_file_info(start, start_line))

    unsigned = family == 'u'
    lead = self.advance()
    two_digit_widths = (
        ('64', NumericType.U64, NumericType.I64),
        ('32', NumericType.U32, NumericType.I32),
        ('16', NumericType.U16, NumericType.I16),
    )
    for width, unsigned_type, signed_type in two_digit_widths:
        if lead == width[0] and self.far_peek(1) == width[1]:
            self.advance()
            self.advance()
            return unsigned_type if unsigned else signed_type
    if lead == '8':
        self.advance()
        return NumericType.U8 if unsigned else NumericType.I8

    prefix = 'unsigned' if unsigned else 'signed'
    raise LexerError(f"Invalid {prefix} integer type.", self.get_file_info(start, start_line))
# =====================================================================
# Numeric Literal Parsing
# =====================================================================
def parse_binary_integer(self, start: int, start_line: int) -> Token:
    """Scan the digits of a binary literal and build its INTEGER token.

    Consumes ``0``, ``1`` and ``_``. A following ``:`` introduces an explicit
    type suffix; whitespace, ``/`` or end of input ends an untyped literal,
    which defaults to I64.

    Raises:
        LexerError: if any other character follows the digits.
    """
    ch = self.peek()
    while ch in '01_':
        ch = self.advance()

    if ch.isspace() or ch in '/\0':
        return self.create_integer_token(
            IntegerBuiltInType.I64, self.create_binary_integer(start), start, start_line
        )
    if ch != ':':
        raise LexerError(f"Invalid binary literal: unexpected '{ch}' in binary integer.", self.get_file_info(start, start_line))

    # The suffix is parsed before the value so the token text includes it.
    suffix = self.parse_numeric_type(start, start_line, NumericLiteralType.BINARY)
    int_type = self.get_integer_type(suffix)
    return self.create_integer_token(int_type, self.create_binary_integer(start), start, start_line)
def parse_octal_integer(self, start: int, start_line: int) -> Token:
    """Scan the digits of an octal literal and build its INTEGER token.

    Consumes the ASCII digits ``0``-``7`` and ``_``. A following ``:``
    introduces an explicit type suffix; whitespace, ``/`` or end of input
    ends an untyped literal, which defaults to I64.

    The digit set is spelled out instead of using ``str.isdigit()``, which
    also accepts non-ASCII digit characters; those were consumed as part of
    the literal and then silently contributed a zero digit to the value.

    Raises:
        LexerError: if any other character follows the digits.
    """
    c = self.peek()
    while c in '01234567_':
        c = self.advance()

    if c == ':':
        numeric_type = self.parse_numeric_type(start, start_line, NumericLiteralType.OCTAL)
        int_type = self.get_integer_type(numeric_type)
        value = self.create_octal_integer(start)
        return self.create_integer_token(int_type, value, start, start_line)

    if c.isspace() or c in '/\0':
        value = self.create_octal_integer(start)
        return self.create_integer_token(IntegerBuiltInType.I64, value, start, start_line)

    raise LexerError(f"Invalid octal literal: unexpected '{c}' in octal integer.", self.get_file_info(start, start_line))
def parse_hexadecimal_integer(self, start: int, start_line: int) -> Token:
    """Scan the digits of a hexadecimal literal and build its INTEGER token.

    Consumes ``0``-``9``, ``A``-``F`` in either case, and ``_``. A following
    ``:`` introduces an explicit type suffix; whitespace, ``/`` or end of
    input ends an untyped literal, which defaults to I64.

    Raises:
        LexerError: if any other character follows the digits.
    """
    ch = self.peek()
    while ch in '0123456789ABCDEFabcdef_':
        ch = self.advance()

    if ch.isspace() or ch in '/\0':
        return self.create_integer_token(
            IntegerBuiltInType.I64, self.create_hexadecimal_integer(start), start, start_line
        )
    if ch != ':':
        raise LexerError(f"Invalid hexadecimal literal: unexpected '{ch}' in hexadecimal integer.", self.get_file_info(start, start_line))

    # The suffix is parsed before the value so the token text includes it.
    suffix = self.parse_numeric_type(start, start_line, NumericLiteralType.HEXADECIMAL)
    int_type = self.get_integer_type(suffix)
    return self.create_integer_token(int_type, self.create_hexadecimal_integer(start), start, start_line)
def parse_exponential(self, start: int, start_line: int) -> Token:
    """Placeholder for exponential-notation floats; always raises.

    Raises:
        NotImplementedError: unconditionally.
    """
    raise NotImplementedError("Float exponential not implemented yet.")
def parse_float(self, start: int, start_line: int) -> Token:
    """Scan the fractional digits of a float literal and build its token.

    Consumes ASCII digits and ``_``. ``e``/``E`` hands over to the
    exponential parser, ``:`` introduces an explicit type suffix, and
    whitespace, ``/`` or end of input ends an untyped literal, which
    defaults to F64.

    The digit set is spelled out instead of using ``str.isdigit()``, which
    also accepts characters such as superscript digits that the value
    conversion cannot turn into a number.

    Raises:
        LexerError: if any other character follows the digits.
    """
    c = self.peek()
    while c in '0123456789_':
        c = self.advance()

    if c in 'eE':
        return self.parse_exponential(start, start_line)

    if c == ':':
        numeric_type = self.parse_numeric_type(start, start_line, NumericLiteralType.FLOAT)
        return self.create_float_token(numeric_type, start, start_line)

    if c.isspace() or c in '/\0':
        return self.create_float_token(NumericType.F64, start, start_line)

    raise LexerError(f"Invalid float literal: unexpected '{c}' in float.", self.get_file_info(start, start_line))
def parse_decimal_integer(self, start: int, start_line: int) -> Token:
    """Scan a decimal literal and build its token.

    Consumes ASCII digits and ``_``. A ``.`` switches to float parsing,
    ``e``/``E`` to the exponential parser, ``:`` introduces an explicit type
    suffix, and whitespace, ``/`` or end of input ends an untyped integer,
    which defaults to I64.

    The digit set is spelled out instead of using ``str.isdigit()``, which
    also accepts characters such as superscript digits; those were consumed
    here and then made ``int()`` raise ``ValueError`` during conversion
    instead of producing a ``LexerError``.

    Raises:
        LexerError: if any other character follows the digits.
    """
    c = self.peek()
    while c in '0123456789_':
        c = self.advance()

    if c == '.':
        self.advance()
        return self.parse_float(start, start_line)

    if c in 'eE':
        return self.parse_exponential(start, start_line)

    if c == ':':
        numeric_type = self.parse_numeric_type(start, start_line, NumericLiteralType.DECIMAL)
        int_type = self.get_integer_type(numeric_type)
        value = self.create_decimal_integer(start)
        return self.create_integer_token(int_type, value, start, start_line)

    if c.isspace() or c in '/\0':
        value = self.create_decimal_integer(start)
        return self.create_integer_token(IntegerBuiltInType.I64, value, start, start_line)

    raise LexerError(f"Invalid decimal literal: unexpected '{c}' in decimal integer.", self.get_file_info(start, start_line))
def parse_numeric_literal(self, start: int, start_line: int) -> Token:
    """Dispatch a numeric literal to the parser for its base.

    An optional leading ``-`` is consumed first. A ``0`` followed by
    ``b``/``o``/``x`` (either case) selects the binary, octal or hexadecimal
    parser with the cursor placed after the prefix; everything else goes to
    the decimal parser.
    """
    ch = self.peek()
    if ch == '-':
        ch = self.advance()

    if ch == '0':
        marker = self.advance()
        prefixed_parsers = (
            ('bB', self.parse_binary_integer),
            ('oO', self.parse_octal_integer),
            ('xX', self.parse_hexadecimal_integer),
        )
        for letters, parser in prefixed_parsers:
            if marker in letters:
                self.advance()
                return parser(start, start_line)

    return self.parse_decimal_integer(start, start_line)
# =====================================================================
# Character Literal Parsing
# =====================================================================
def parse_character_literal(self, start: int, start_line: int) -> Token:
    """Parse the body of a character literal into a CHARACTER token.

    The only caller in this class consumes the opening quote before calling,
    so the cursor starts on the character itself. Supported escapes are
    ``\\n``, ``\\r``, ``\\t``, ``\\\\``, ``\\'`` and ``\\0``. The token carries
    the character's code point.

    Raises:
        LexerError: for an empty literal, an unknown escape, a literal that
            is not closed, or an unexpected character before the closing
            quote.
    """
    ch = self.peek()

    if ch == '\'':
        raise LexerError("Invalid character literal: empty character literal.", self.get_file_info(start, start_line))

    if ch == '\\':
        ch = self.advance()
        escape_codes = {
            'n': ord('\n'),
            'r': ord('\r'),
            't': ord('\t'),
            '\\': ord('\\'),
            '\'': ord('\''),
            '0': ord('\0'),
        }
        if ch not in escape_codes:
            raise LexerError(f"Invalid character literal: unknown escape sequence '\\{ch}'.", self.get_file_info(start, start_line))
        code_point = escape_codes[ch]
    elif ch in '\n\r':
        raise LexerError("Invalid character literal: unclosed character literal.", self.get_file_info(start, start_line))
    else:
        code_point = ord(ch)

    closer = self.advance()
    if closer.isspace() or closer in '/\0':
        raise LexerError("Invalid character literal: unclosed character literal.", self.get_file_info(start, start_line))
    if closer != '\'':
        raise LexerError(f"Invalid character literal: unexpected '{closer}' in character.", self.get_file_info(start, start_line))

    self.advance()  # consume the closing quote
    return Token(type=TokenType.CHARACTER, character_literal=code_point)
# =====================================================================
# String Literal Parsing (stub)
# =====================================================================
def parse_string_literal(self, start: int, start_line: int) -> Token:
    """Placeholder for string literals; always raises.

    Raises:
        NotImplementedError: unconditionally.
    """
    raise NotImplementedError("String literals not implemented yet.")
# =====================================================================
# Token String Parsing
# =====================================================================
def parse_token_string(self, start: int, start_line: int) -> Token:
    """Lex a brace-delimited run of tokens into one TOKEN_STRING token.

    The current character is consumed as the opening ``{``. Tokens are then
    read with ``lexer_next`` until the matching ``}``, so nested token
    strings are handled through recursion.

    Raises:
        LexerError: if input ends before the closing ``}``, or if the
            iteration watchdog exceeds 1,000,000 tokens.
    """
    collected = []
    self.advance()  # Skip opening '{'

    iterations = 0
    while True:
        if self.peek() == '\0':
            break

        self.skip_comments_and_whitespace()
        if self.peek() == '}':
            self.advance()
            return Token(type=TokenType.TOKEN_STRING, token_string=TokenString(tokens=collected))

        inner = self.lexer_next()
        collected.append(inner)
        if inner.type == TokenType.EOF:
            break

        iterations += 1
        if iterations > 1000000:
            raise LexerError("Watchdog triggered in token string.", self.get_file_info(start, start_line))

    raise LexerError("Unclosed token string: missing closing brace '}'.", self.get_file_info(start, start_line))
# =====================================================================
# Array and Type Tuple Parsing (stubs)
# =====================================================================
def parse_array_literal(self, start: int, start_line: int) -> Token:
    """Placeholder for array literals; always raises.

    Raises:
        NotImplementedError: unconditionally.
    """
    raise NotImplementedError("Array literals not implemented yet.")
def parse_type_tuple(self, start: int, start_line: int) -> Token:
    """Placeholder for type tuples; always raises.

    Raises:
        NotImplementedError: unconditionally.
    """
    raise NotImplementedError("Type tuples not implemented yet.")
# =====================================================================
# Identifier and Boolean Parsing
# =====================================================================
def parse_identifiers_and_booleans(self, start: int, start_line: int) -> Token:
    """Parse an identifier, identifier literal (``::name``) or boolean.

    A leading ``::`` marks an identifier literal and is not part of the
    name. The names ``true`` and ``false`` produce BOOLEAN tokens; any other
    name produces an IDENTIFIER token carrying the literal flag.

    Raises:
        LexerError: if ``:`` or ``.`` occurs inside the name.
    """
    is_literal = self.peek() == ':' and self.far_peek(1) == ':'
    if is_literal:
        # Step over both colons of the '::' prefix.
        self.advance()
        self.advance()

    pieces = []
    ch = self.peek()
    while self.is_identifier_continue(ch):
        if ch == ':':
            raise LexerError("Invalid identifier: ':' is not allowed in identifiers.", self.get_file_info(start, start_line))
        if ch == '.':
            raise LexerError("Invalid identifier: '.' is not allowed in identifiers.", self.get_file_info(start, start_line))
        pieces.append(ch)
        ch = self.advance()
    name = ''.join(pieces)

    if name in ('false', 'true'):
        return Token(type=TokenType.BOOLEAN, boolean_literal=(name == 'true'))
    return Token(type=TokenType.IDENTIFIER, identifier=Identifier(name=name, is_literal=is_literal))
# =====================================================================
# Main Lexer Logic
# =====================================================================
def lexer_next(self) -> Token:
    """Skip whitespace and comments, then lex and return the next token.

    Dispatches on the first significant character: digits (or ``.``/``-``
    followed by a digit) to the numeric parser, ``'`` to the character
    parser, ``"`` to the string parser, ``{`` / ``[`` / ``(`` to the token
    string, array and type tuple parsers, and identifier starts to the
    identifier parser. End of input yields an EOF token.

    Raises:
        LexerError: for an unmatched closing ``}``, ``]`` or ``)``, a ``::``
            with no identifier after it, a stray ``:``, or any other
            character that cannot start a token.
    """
    self.skip_comments_and_whitespace()

    c = self.peek()
    start = self.info.pos
    start_line = self.info.line

    # End of file
    if c == '\0':
        return Token(type=TokenType.EOF)

    # Numeric literals (integers and floats)
    if c.isdigit() or (c == '.' and self.far_peek(1).isdigit()) or (c == '-' and self.far_peek(1).isdigit()):
        return self.parse_numeric_literal(start, start_line)

    # Character literals
    if c == '\'':
        self.advance()
        return self.parse_character_literal(start, start_line)

    # String literals
    if c == '"':
        return self.parse_string_literal(start, start_line)

    # Token strings
    if c == '{':
        return self.parse_token_string(start, start_line)
    if c == '}':
        self.advance()
        raise LexerError("Unexpected closing brace '}' without matching opening brace.", self.get_file_info(start, start_line))

    # Array literals
    if c == '[':
        return self.parse_array_literal(start, start_line)
    if c == ']':
        self.advance()
        raise LexerError("Unexpected closing bracket ']' without matching opening bracket.", self.get_file_info(start, start_line))

    # Type tuples
    if c == '(':
        return self.parse_type_tuple(start, start_line)
    if c == ')':
        self.advance()
        raise LexerError("Unexpected closing parentheses ')' without matching opening parentheses.", self.get_file_info(start, start_line))

    # Identifiers and booleans
    if self.is_identifier_start():
        return self.parse_identifiers_and_booleans(start, start_line)

    # Check for malformed identifier literal
    if c == ':':
        self.advance()
        # After stepping over the first colon the cursor is on the character
        # that immediately followed it, so the second colon of '::' is found
        # with peek(). Looking one further ahead inspected the character
        # after the pair and made the '::' diagnostic unreachable.
        if self.peek() == ':':
            raise LexerError("Invalid identifier literal: empty identifier after '::'.", self.get_file_info(start, start_line))
        else:
            raise LexerError("Unexpected single colon ':'.", self.get_file_info(start, start_line))

    # Unknown character
    raise LexerError(f"Unexpected character: unexpected '{c}' during parsing.", self.get_file_info(start, start_line))
def lexical_analysis(self) -> List[Token]:
    """Lex the whole input and return its tokens, ending with the EOF token.

    Any ``LexerError`` raised while lexing propagates to the caller
    unchanged.
    """
    collected = []
    reached_end = False
    while not reached_end:
        token = self.lexer_next()
        collected.append(token)
        reached_end = token.type == TokenType.EOF
    return collected
# =====================================================================
# Public API
# =====================================================================
def lexical_analysis(lexer_info: LexerInfo) -> List[Token]:
    """Lex ``lexer_info`` with a fresh ``Lexer`` and return the token list."""
    return Lexer(lexer_info).lexical_analysis()

View File

@ -1,6 +1,6 @@
from .meta import print_version from .meta import print_version
from .lexer import LexerInfo, lexical_analysis from .lexer import LexerInfo, lexical_analysis
from .interpreter import InterpreterState from .interpreter import InterpreterState, StackType
REPL_FILE_NAME = "<STDIN>" REPL_FILE_NAME = "<STDIN>"
@ -16,23 +16,25 @@ def print_top_of_stack(interpreter: InterpreterState) -> None:
t = item.type t = item.type
if t == "Identifier": if t == StackType.IDENTIFIER:
print(f"#0: ::{item.value}") print(f"#0: ::{item.value}")
elif t in {"i64", "i32", "i16", "i8"}: elif t == StackType.I64:
print(f"#0: {item.value}")
elif t in {StackType.I32, StackType.I16, StackType.I8}:
print(f"#0: {item.value}:{t}") print(f"#0: {item.value}:{t}")
elif t in {"u64", "u32", "u16", "u8"}: elif t in {StackType.U64, StackType.U32, StackType.U16, StackType.U8}:
print(f"#0: {item.value}:{t}") print(f"#0: {item.value}:{t}")
elif t == "f32": elif t == StackType.F32:
print(f"#0: {item.value}:f32") print(f"#0: {item.value}:f32")
elif t == "f64": elif t == StackType.F64:
print(f"#0: {item.value}") print(f"#0: {item.value}")
elif t == "char": elif t == StackType.CHARACTER:
print(f"#0: {item.value}") print(f"#0: {item.value}")
elif t == "bool": elif t == StackType.BOOLEAN:
print("#0: TRUE" if item.value else "#0: FALSE") print("#0: TRUE" if item.value else "#0: FALSE")
elif t == "TokenString": elif t == StackType.TOKEN_STRING:
print("#0: <TOKEN STRING>") print("#0: <TOKEN STRING>")
elif t == "Callable": elif t == StackType.CALLABLE:
print("#0: <CALLABLE>") print("#0: <CALLABLE>")
else: else:
print(f"#0: <UNKNOWN {t}>") print(f"#0: <UNKNOWN {t}>")