Compare commits

..

2 Commits

Author SHA1 Message Date
Kyler Olsen 35a6f4537f Fixes to file and repl 2025-12-03 14:58:37 -07:00
Kyler Olsen 64f620b2ef Claude implementation of lexer.py 2025-12-03 14:57:15 -07:00
3 changed files with 701 additions and 27 deletions

View File

@ -1,6 +1,6 @@
from pathlib import Path
from sls.lexer import LexerInfo, lexical_analysis, Token
from sls.interpreter import InterpreterState
from .lexer import LexerInfo, lexical_analysis, Token
from .interpreter import InterpreterState
def exec_file(interpreter_state: InterpreterState, filename: str) -> bool:

View File

@ -1,7 +1,7 @@
from __future__ import annotations
from dataclasses import dataclass, field
from enum import Enum, auto
from typing import List, Optional, Any
from typing import List, Optional, Any, Union
# =====================================================================
@ -19,7 +19,7 @@ class LexerInfo:
self.filename = filename
self.source_code = source_code
self.pos = 0
self.column = 0
self.column = 1
self.line = 1
@ -90,7 +90,7 @@ class IntegerBuiltInType(Enum):
@dataclass
class IntegerLiteral:
value: int # Python int is arbitrary precision
value: int
type: IntegerBuiltInType
@ -132,12 +132,12 @@ class TypeTuple:
@dataclass
class StructInline:
values: List[Any] # Python can store anything
values: List[Any]
name: str
# =====================================================================
# ArrayLiteral (replaces C unions with Optional lists)
# ArrayLiteral
# =====================================================================
@dataclass
@ -189,7 +189,7 @@ class ArrayLiteral:
# =====================================================================
# Token (Python “union” via Optional fields)
# Token
# =====================================================================
@dataclass
@ -209,24 +209,696 @@ class Token:
# =====================================================================
# Lexer Token Result / Lexer Result
# File Info and Results
# =====================================================================
class SlsResultType(Enum):
RESULT = auto()
ERROR = auto()
@dataclass
class FileInfo:
filename: str
line: int
column: int
length: int = 0
lines: int = 0
@dataclass
class LexerError(Exception):
message: str
file_info: FileInfo
# =====================================================================
# Function Stubs (to be implemented in Python version)
# Numeric Type Flags
# =====================================================================
def lexical_analysis(lexer_info: LexerInfo) -> list[Token]:
return []
class NumericType(Enum):
F64 = auto()
F32 = auto()
I64 = auto()
I32 = auto()
I16 = auto()
I8 = auto()
U64 = auto()
U32 = auto()
U16 = auto()
U8 = auto()
class NumericLiteralType(Enum):
BINARY = auto()
OCTAL = auto()
DECIMAL = auto()
HEXADECIMAL = auto()
FLOAT = auto()
EXPONENTIAL = auto()
# =====================================================================
# Lexer Implementation
# =====================================================================
class Lexer:
def __init__(self, info: LexerInfo):
self.info = info
def peek(self) -> str:
if self.info.pos >= len(self.info.source_code):
return '\0'
return self.info.source_code[self.info.pos]
def far_peek(self, offset: int) -> str:
pos = self.info.pos + offset
if pos >= len(self.info.source_code):
return '\0'
return self.info.source_code[pos]
def seek(self, index: int) -> str:
if index >= len(self.info.source_code):
return '\0'
return self.info.source_code[index]
def advance(self) -> str:
if self.info.pos < len(self.info.source_code):
if self.info.source_code[self.info.pos] == '\n':
self.info.line += 1
self.info.column = 1
else:
self.info.column += 1
self.info.pos += 1
return self.peek()
def get_file_info(self, start: int, start_line: int) -> FileInfo:
return FileInfo(
filename=self.info.filename,
line=self.info.line,
column=self.info.column,
length=self.info.pos - start,
lines=self.info.line - start_line
)
def get_token_text(self, start: int) -> str:
return self.info.source_code[start:self.info.pos]
def skip_comments_and_whitespace(self):
while True:
c = self.peek()
# Skip comments
if (c == '/' and self.far_peek(1) == '/') or c == '#':
while self.peek() not in ('\n', '\0'):
self.advance()
# Skip whitespace
if c.isspace():
self.advance()
continue
break
def is_identifier_continue(self, c: str) -> bool:
if not c.isprintable():
return False
if c == '/' and self.far_peek(1) == '/':
return False
if c in '{}[]()\'\"#':
return False
if c.isspace() or c == '\0':
return False
return True
def is_identifier_start(self) -> bool:
c = self.peek()
if c == ':' and self.far_peek(1) == ':':
c = self.far_peek(2)
return not c.isdigit() and self.is_identifier_continue(c)
# =====================================================================
# Integer Parsing Helpers
# =====================================================================
def create_binary_integer(self, start: int) -> int:
token = self.get_token_text(start)
negative = token[0] == '-'
i = 3 if negative else 2
value = 0
while i < len(token):
c = token[i]
if c.isspace() or c in '/:' or c == '\0':
break
if c in '._':
i += 1
continue
value *= 2
if c == '1':
value += 1
i += 1
if negative:
# Python handles negative integers naturally
value = -value
return value
def create_octal_integer(self, start: int) -> int:
token = self.get_token_text(start)
negative = token[0] == '-'
i = 3 if negative else 2
value = 0
while i < len(token):
c = token[i]
if c.isspace() or c in '/:' or c == '\0':
break
if c in '._':
i += 1
continue
value *= 8
if c.isdigit() and c < '8':
value += int(c)
i += 1
if negative:
value = -value
return value
def create_decimal_integer(self, start: int) -> int:
token = self.get_token_text(start)
negative = token[0] == '-'
i = 1 if negative else 0
value = 0
while i < len(token):
c = token[i]
if c.isspace() or c in '/:' or c == '\0':
break
if c == '_':
i += 1
continue
if c.isdigit():
value *= 10
value += int(c)
i += 1
if negative:
value = -value
return value
def create_hexadecimal_integer(self, start: int) -> int:
token = self.get_token_text(start)
negative = token[0] == '-'
i = 3 if negative else 2
value = 0
while i < len(token):
c = token[i]
if c.isspace() or c in '/:' or c == '\0':
break
if c in '._':
i += 1
continue
value *= 16
if c.isdigit():
value += int(c)
elif c.upper() in 'ABCDEF':
value += ord(c.upper()) - ord('A') + 10
i += 1
if negative:
value = -value
return value
def create_float(self, start: int) -> float:
token = self.get_token_text(start)
negative = token[0] == '-'
i = 1 if negative else 0
value = 0.0
fractional = 0
while i < len(token):
c = token[i]
if c.isspace() or c in '/:' or c == '\0':
break
if c == '_':
i += 1
continue
if c == '.':
fractional = 1
i += 1
continue
if fractional == 0:
value *= 10
else:
fractional *= 10
if c.isdigit():
digit = int(c)
if fractional == 0:
value += digit
else:
value += digit / fractional
i += 1
if negative:
value = -value
return value
# =====================================================================
# Integer Type Validation
# =====================================================================
def get_integer_type(self, numeric_type: NumericType) -> IntegerBuiltInType:
type_map = {
NumericType.I64: IntegerBuiltInType.I64,
NumericType.I32: IntegerBuiltInType.I32,
NumericType.I16: IntegerBuiltInType.I16,
NumericType.I8: IntegerBuiltInType.I8,
NumericType.U64: IntegerBuiltInType.U64,
NumericType.U32: IntegerBuiltInType.U32,
NumericType.U16: IntegerBuiltInType.U16,
NumericType.U8: IntegerBuiltInType.U8,
}
if numeric_type not in type_map:
raise ValueError("Encountered a Float where there should not be one.")
return type_map[numeric_type]
def validate_integer_range(self, value: int, int_type: IntegerBuiltInType, start: int, start_line: int):
ranges = {
IntegerBuiltInType.I64: (-2**63, 2**63 - 1),
IntegerBuiltInType.I32: (-2**31, 2**31 - 1),
IntegerBuiltInType.I16: (-2**15, 2**15 - 1),
IntegerBuiltInType.I8: (-2**7, 2**7 - 1),
IntegerBuiltInType.U64: (0, 2**64 - 1),
IntegerBuiltInType.U32: (0, 2**32 - 1),
IntegerBuiltInType.U16: (0, 2**16 - 1),
IntegerBuiltInType.U8: (0, 2**8 - 1),
}
min_val, max_val = ranges[int_type]
if value < min_val or value > max_val:
type_name = int_type.name.lower()
raise LexerError(
f"Integer overflow: value exceeds range for {type_name}.",
self.get_file_info(start, start_line)
)
def create_integer_token(self, int_type: IntegerBuiltInType, value: int, start: int, start_line: int) -> Token:
self.validate_integer_range(value, int_type, start, start_line)
return Token(
type=TokenType.INTEGER,
integer_literal=IntegerLiteral(value=value, type=int_type)
)
def create_float_token(self, numeric_type: NumericType, start: int, start_line: int) -> Token:
value = self.create_float(start)
if numeric_type == NumericType.F64:
return Token(type=TokenType.DOUBLE, double_literal=value)
else:
return Token(type=TokenType.FLOAT, float_literal=value)
# =====================================================================
# Numeric Type Parsing
# =====================================================================
def parse_numeric_type(self, start: int, start_line: int, literal_type: NumericLiteralType) -> NumericType:
c = self.advance()
if c == 'f':
if literal_type not in (NumericLiteralType.DECIMAL, NumericLiteralType.FLOAT, NumericLiteralType.EXPONENTIAL):
raise LexerError("Invalid numeric literal: float type not allowed.", self.get_file_info(start, start_line))
c = self.advance()
if c == '6' and self.far_peek(1) == '4':
self.advance()
self.advance()
return NumericType.F64
elif c == '3' and self.far_peek(1) == '2':
self.advance()
self.advance()
return NumericType.F32
else:
raise LexerError("Invalid float type: must be of type 'f64' or 'f32'.", self.get_file_info(start, start_line))
elif c in 'iu':
if literal_type in (NumericLiteralType.FLOAT, NumericLiteralType.EXPONENTIAL):
raise LexerError("Invalid float type: must be of type 'f64' or 'f32'.", self.get_file_info(start, start_line))
unsigned = c == 'u'
c = self.advance()
if c == '6' and self.far_peek(1) == '4':
self.advance()
self.advance()
return NumericType.U64 if unsigned else NumericType.I64
elif c == '3' and self.far_peek(1) == '2':
self.advance()
self.advance()
return NumericType.U32 if unsigned else NumericType.I32
elif c == '1' and self.far_peek(1) == '6':
self.advance()
self.advance()
return NumericType.U16 if unsigned else NumericType.I16
elif c == '8':
self.advance()
return NumericType.U8 if unsigned else NumericType.I8
else:
prefix = 'unsigned' if unsigned else 'signed'
raise LexerError(f"Invalid {prefix} integer type.", self.get_file_info(start, start_line))
else:
raise LexerError("Invalid numeric type: type must start with 'f', 'i', or 'u'.", self.get_file_info(start, start_line))
# =====================================================================
# Numeric Literal Parsing
# =====================================================================
def parse_binary_integer(self, start: int, start_line: int) -> Token:
c = self.peek()
while c in '01_':
c = self.advance()
if c == ':':
numeric_type = self.parse_numeric_type(start, start_line, NumericLiteralType.BINARY)
int_type = self.get_integer_type(numeric_type)
value = self.create_binary_integer(start)
return self.create_integer_token(int_type, value, start, start_line)
if c.isspace() or c in '/\0':
value = self.create_binary_integer(start)
return self.create_integer_token(IntegerBuiltInType.I64, value, start, start_line)
raise LexerError(f"Invalid binary literal: unexpected '{c}' in binary integer.", self.get_file_info(start, start_line))
def parse_octal_integer(self, start: int, start_line: int) -> Token:
c = self.peek()
while c.isdigit() and c not in '89' or c == '_':
c = self.advance()
if c == ':':
numeric_type = self.parse_numeric_type(start, start_line, NumericLiteralType.OCTAL)
int_type = self.get_integer_type(numeric_type)
value = self.create_octal_integer(start)
return self.create_integer_token(int_type, value, start, start_line)
if c.isspace() or c in '/\0':
value = self.create_octal_integer(start)
return self.create_integer_token(IntegerBuiltInType.I64, value, start, start_line)
raise LexerError(f"Invalid octal literal: unexpected '{c}' in octal integer.", self.get_file_info(start, start_line))
def parse_hexadecimal_integer(self, start: int, start_line: int) -> Token:
c = self.peek()
while c in '0123456789ABCDEFabcdef_':
c = self.advance()
if c == ':':
numeric_type = self.parse_numeric_type(start, start_line, NumericLiteralType.HEXADECIMAL)
int_type = self.get_integer_type(numeric_type)
value = self.create_hexadecimal_integer(start)
return self.create_integer_token(int_type, value, start, start_line)
if c.isspace() or c in '/\0':
value = self.create_hexadecimal_integer(start)
return self.create_integer_token(IntegerBuiltInType.I64, value, start, start_line)
raise LexerError(f"Invalid hexadecimal literal: unexpected '{c}' in hexadecimal integer.", self.get_file_info(start, start_line))
def parse_exponential(self, start: int, start_line: int) -> Token:
raise NotImplementedError("Float exponential not implemented yet.")
def parse_float(self, start: int, start_line: int) -> Token:
c = self.peek()
while c.isdigit() or c == '_':
c = self.advance()
if c in 'eE':
return self.parse_exponential(start, start_line)
if c == ':':
numeric_type = self.parse_numeric_type(start, start_line, NumericLiteralType.FLOAT)
return self.create_float_token(numeric_type, start, start_line)
if c.isspace() or c in '/\0':
return self.create_float_token(NumericType.F64, start, start_line)
raise LexerError(f"Invalid float literal: unexpected '{c}' in float.", self.get_file_info(start, start_line))
def parse_decimal_integer(self, start: int, start_line: int) -> Token:
c = self.peek()
while c.isdigit() or c == '_':
c = self.advance()
if c == '.':
self.advance()
return self.parse_float(start, start_line)
if c in 'eE':
return self.parse_exponential(start, start_line)
if c == ':':
numeric_type = self.parse_numeric_type(start, start_line, NumericLiteralType.DECIMAL)
int_type = self.get_integer_type(numeric_type)
value = self.create_decimal_integer(start)
return self.create_integer_token(int_type, value, start, start_line)
if c.isspace() or c in '/\0':
value = self.create_decimal_integer(start)
return self.create_integer_token(IntegerBuiltInType.I64, value, start, start_line)
raise LexerError(f"Invalid decimal literal: unexpected '{c}' in decimal integer.", self.get_file_info(start, start_line))
def parse_numeric_literal(self, start: int, start_line: int) -> Token:
c = self.peek()
if c == '-':
c = self.advance()
if c == '0':
c = self.advance()
if c in 'bB':
self.advance()
return self.parse_binary_integer(start, start_line)
elif c in 'oO':
self.advance()
return self.parse_octal_integer(start, start_line)
elif c in 'xX':
self.advance()
return self.parse_hexadecimal_integer(start, start_line)
return self.parse_decimal_integer(start, start_line)
# =====================================================================
# Character Literal Parsing
# =====================================================================
def parse_character_literal(self, start: int, start_line: int) -> Token:
c = self.peek()
if c == '\'':
raise LexerError("Invalid character literal: empty character literal.", self.get_file_info(start, start_line))
if c == '\\':
c = self.advance()
escape_map = {
'n': '\n',
'r': '\r',
't': '\t',
'\\': '\\',
'\'': '\'',
'0': '\0'
}
if c in escape_map:
value = ord(escape_map[c])
else:
raise LexerError(f"Invalid character literal: unknown escape sequence '\\{c}'.", self.get_file_info(start, start_line))
elif c in '\n\r':
raise LexerError("Invalid character literal: unclosed character literal.", self.get_file_info(start, start_line))
else:
value = ord(c)
c = self.advance()
if c.isspace() or c in '/\0':
raise LexerError("Invalid character literal: unclosed character literal.", self.get_file_info(start, start_line))
elif c != '\'':
raise LexerError(f"Invalid character literal: unexpected '{c}' in character.", self.get_file_info(start, start_line))
self.advance()
return Token(type=TokenType.CHARACTER, character_literal=value)
# =====================================================================
# String Literal Parsing (stub)
# =====================================================================
def parse_string_literal(self, start: int, start_line: int) -> Token:
raise NotImplementedError("String literals not implemented yet.")
# =====================================================================
# Token String Parsing
# =====================================================================
def parse_token_string(self, start: int, start_line: int) -> Token:
tokens = []
self.advance() # Skip opening '{'
watchdog = 0
while self.peek() != '\0':
self.skip_comments_and_whitespace()
c = self.peek()
if c == '}':
self.advance()
return Token(type=TokenType.TOKEN_STRING, token_string=TokenString(tokens=tokens))
token = self.lexer_next()
tokens.append(token)
if token.type == TokenType.EOF:
break
watchdog += 1
if watchdog > 1000000:
raise LexerError("Watchdog triggered in token string.", self.get_file_info(start, start_line))
raise LexerError("Unclosed token string: missing closing brace '}'.", self.get_file_info(start, start_line))
# =====================================================================
# Array and Type Tuple Parsing (stubs)
# =====================================================================
def parse_array_literal(self, start: int, start_line: int) -> Token:
raise NotImplementedError("Array literals not implemented yet.")
def parse_type_tuple(self, start: int, start_line: int) -> Token:
raise NotImplementedError("Type tuples not implemented yet.")
# =====================================================================
# Identifier and Boolean Parsing
# =====================================================================
def parse_identifiers_and_booleans(self, start: int, start_line: int) -> Token:
c = self.peek()
is_literal = False
# Check for identifier literal (::)
if c == ':' and self.far_peek(1) == ':':
is_literal = True
self.advance()
self.advance()
c = self.peek()
# Read identifier name
name_chars = []
while self.is_identifier_continue(c):
if c == ':':
raise LexerError("Invalid identifier: ':' is not allowed in identifiers.", self.get_file_info(start, start_line))
if c == '.':
raise LexerError("Invalid identifier: '.' is not allowed in identifiers.", self.get_file_info(start, start_line))
name_chars.append(c)
c = self.advance()
name = ''.join(name_chars)
# Check for boolean literals
if name == 'false':
return Token(type=TokenType.BOOLEAN, boolean_literal=False)
elif name == 'true':
return Token(type=TokenType.BOOLEAN, boolean_literal=True)
else:
return Token(type=TokenType.IDENTIFIER, identifier=Identifier(name=name, is_literal=is_literal))
# =====================================================================
# Main Lexer Logic
# =====================================================================
def lexer_next(self) -> Token:
self.skip_comments_and_whitespace()
c = self.peek()
start = self.info.pos
start_line = self.info.line
# End of file
if c == '\0':
return Token(type=TokenType.EOF)
# Numeric literals (integers and floats)
if c.isdigit() or (c == '.' and self.far_peek(1).isdigit()) or (c == '-' and self.far_peek(1).isdigit()):
return self.parse_numeric_literal(start, start_line)
# Character literals
if c == '\'':
self.advance()
return self.parse_character_literal(start, start_line)
# String literals
if c == '"':
return self.parse_string_literal(start, start_line)
# Token strings
if c == '{':
return self.parse_token_string(start, start_line)
if c == '}':
self.advance()
raise LexerError("Unexpected closing brace '}' without matching opening brace.", self.get_file_info(start, start_line))
# Array literals
if c == '[':
return self.parse_array_literal(start, start_line)
if c == ']':
self.advance()
raise LexerError("Unexpected closing bracket ']' without matching opening bracket.", self.get_file_info(start, start_line))
# Type tuples
if c == '(':
return self.parse_type_tuple(start, start_line)
if c == ')':
self.advance()
raise LexerError("Unexpected closing parentheses ')' without matching opening parentheses.", self.get_file_info(start, start_line))
# Identifiers and booleans
if self.is_identifier_start():
return self.parse_identifiers_and_booleans(start, start_line)
# Check for malformed identifier literal
if c == ':':
self.advance()
if self.far_peek(1) == ':':
raise LexerError("Invalid identifier literal: empty identifier after '::'.", self.get_file_info(start, start_line))
else:
raise LexerError("Unexpected single colon ':'.", self.get_file_info(start, start_line))
# Unknown character
raise LexerError(f"Unexpected character: unexpected '{c}' during parsing.", self.get_file_info(start, start_line))
def lexical_analysis(self) -> List[Token]:
"""Main entry point for lexical analysis."""
tokens = []
while True:
try:
token = self.lexer_next()
tokens.append(token)
if token.type == TokenType.EOF:
break
except LexerError as e:
# Re-raise lexer errors
raise
return tokens
# =====================================================================
# Public API
# =====================================================================
def lexical_analysis(lexer_info: LexerInfo) -> List[Token]:
"""Convenience function matching the original C API."""
lexer = Lexer(lexer_info)
return lexer.lexical_analysis()

View File

@ -1,6 +1,6 @@
from .meta import print_version
from .lexer import LexerInfo, lexical_analysis
from .interpreter import InterpreterState
from .interpreter import InterpreterState, StackType
REPL_FILE_NAME = "<STDIN>"
@ -16,23 +16,25 @@ def print_top_of_stack(interpreter: InterpreterState) -> None:
t = item.type
if t == "Identifier":
if t == StackType.IDENTIFIER:
print(f"#0: ::{item.value}")
elif t in {"i64", "i32", "i16", "i8"}:
elif t == StackType.I64:
print(f"#0: {item.value}")
elif t in {StackType.I32, StackType.I16, StackType.I8}:
print(f"#0: {item.value}:{t}")
elif t in {"u64", "u32", "u16", "u8"}:
elif t in {StackType.U64, StackType.U32, StackType.U16, StackType.U8}:
print(f"#0: {item.value}:{t}")
elif t == "f32":
elif t == StackType.F32:
print(f"#0: {item.value}:f32")
elif t == "f64":
elif t == StackType.F64:
print(f"#0: {item.value}")
elif t == "char":
elif t == StackType.CHARACTER:
print(f"#0: {item.value}")
elif t == "bool":
elif t == StackType.BOOLEAN:
print("#0: TRUE" if item.value else "#0: FALSE")
elif t == "TokenString":
elif t == StackType.TOKEN_STRING:
print("#0: <TOKEN STRING>")
elif t == "Callable":
elif t == StackType.CALLABLE:
print("#0: <CALLABLE>")
else:
print(f"#0: <UNKNOWN {t}>")