From 64f620b2ef5e112afb388de9eed42e13c0c3e448 Mon Sep 17 00:00:00 2001 From: Kyler Date: Wed, 3 Dec 2025 14:57:15 -0700 Subject: [PATCH] Claude implementation of lexer.py --- SLS_Python/sls_py/lexer.py | 702 ++++++++++++++++++++++++++++++++++++- 1 file changed, 687 insertions(+), 15 deletions(-) diff --git a/SLS_Python/sls_py/lexer.py b/SLS_Python/sls_py/lexer.py index 0ddaef8..144fc7a 100644 --- a/SLS_Python/sls_py/lexer.py +++ b/SLS_Python/sls_py/lexer.py @@ -1,7 +1,7 @@ from __future__ import annotations from dataclasses import dataclass, field from enum import Enum, auto -from typing import List, Optional, Any +from typing import List, Optional, Any, Union # ===================================================================== @@ -19,7 +19,7 @@ class LexerInfo: self.filename = filename self.source_code = source_code self.pos = 0 - self.column = 0 + self.column = 1 self.line = 1 @@ -90,7 +90,7 @@ class IntegerBuiltInType(Enum): @dataclass class IntegerLiteral: - value: int # Python int is arbitrary precision + value: int type: IntegerBuiltInType @@ -132,12 +132,12 @@ class TypeTuple: @dataclass class StructInline: - values: List[Any] # Python can store anything + values: List[Any] name: str # ===================================================================== -# ArrayLiteral (replaces C unions with Optional lists) +# ArrayLiteral # ===================================================================== @dataclass @@ -189,7 +189,7 @@ class ArrayLiteral: # ===================================================================== -# Token (Python “union” via Optional fields) +# Token # ===================================================================== @dataclass @@ -209,24 +209,696 @@ class Token: # ===================================================================== -# Lexer Token Result / Lexer Result +# File Info and Results # ===================================================================== -class SlsResultType(Enum): - RESULT = auto() - ERROR = auto() - - 
@dataclass
class FileInfo:
    """Source location attached to a lexer diagnostic."""

    filename: str
    line: int
    column: int
    # Number of characters between the token start and the reporting position.
    length: int = 0
    # Number of line breaks crossed between the token start and the
    # reporting position.
    lines: int = 0


@dataclass
class LexerError(Exception):
    """Raised when the source text cannot be tokenised.

    Attributes:
        message: Human-readable description of the problem.
        file_info: Location of the offending text.
    """

    message: str
    file_info: FileInfo

    def __post_init__(self) -> None:
        # The dataclass-generated __init__ does not initialise Exception,
        # so do it here to make str(err) return the message.
        super().__init__(self.message)


# =====================================================================
# Numeric Type Flags
# =====================================================================

class NumericType(Enum):
    """Type suffix that may follow a numeric literal (``5:u16``)."""

    F64 = auto()
    F32 = auto()
    I64 = auto()
    I32 = auto()
    I16 = auto()
    I8 = auto()
    U64 = auto()
    U32 = auto()
    U16 = auto()
    U8 = auto()


class NumericLiteralType(Enum):
    """Syntactic form of a numeric literal."""

    BINARY = auto()
    OCTAL = auto()
    DECIMAL = auto()
    HEXADECIMAL = auto()
    FLOAT = auto()
    EXPONENTIAL = auto()


# =====================================================================
# Lexer Implementation
# =====================================================================

class Lexer:
    """Hand-written scanner that turns ``info.source_code`` into tokens.

    The cursor (``pos``, ``line``, ``column``) lives on the ``info`` object
    passed to the constructor and is mutated as characters are consumed.
    ``'\\0'`` is used as the end-of-input sentinel throughout.
    """

    def __init__(self, info: LexerInfo):
        self.info = info

    # =====================================================================
    # Cursor Helpers
    # =====================================================================

    def peek(self) -> str:
        """Return the character under the cursor, or ``'\\0'`` at end of input."""
        return self.seek(self.info.pos)

    def far_peek(self, offset: int) -> str:
        """Return the character ``offset`` positions from the cursor, or ``'\\0'``."""
        return self.seek(self.info.pos + offset)

    def seek(self, index: int) -> str:
        """Return the character at absolute ``index``, or ``'\\0'`` if out of range."""
        source = self.info.source_code
        # Reject negative indices too: Python would otherwise wrap around
        # and return a character from the end of the source.
        if 0 <= index < len(source):
            return source[index]
        return '\0'

    def advance(self) -> str:
        """Consume one character and return the new character under the cursor.

        Line and column counters are updated; a consumed ``'\\n'`` starts a
        new line at column 1. At end of input the cursor does not move.
        """
        info = self.info
        if info.pos < len(info.source_code):
            if info.source_code[info.pos] == '\n':
                info.line += 1
                info.column = 1
            else:
                info.column += 1
            info.pos += 1
        return self.peek()

    def get_file_info(self, start: int, start_line: int) -> FileInfo:
        """Build a ``FileInfo`` for the text between ``start`` and the cursor.

        ``line`` and ``column`` are the cursor's current values, i.e. where
        lexing stopped, not where the token began.
        """
        return FileInfo(
            filename=self.info.filename,
            line=self.info.line,
            column=self.info.column,
            length=self.info.pos - start,
            lines=self.info.line - start_line,
        )

    def get_token_text(self, start: int) -> str:
        """Return the source text from ``start`` up to (not including) the cursor."""
        return self.info.source_code[start:self.info.pos]

    @staticmethod
    def _is_ascii_digit(c: str) -> bool:
        """Return True only for '0'..'9'.

        ``str.isdigit`` also accepts characters such as superscripts that
        ``int()`` cannot convert, so it is not used for literal scanning.
        """
        return len(c) == 1 and '0' <= c <= '9'

    def skip_comments_and_whitespace(self):
        """Advance past any run of whitespace, ``//`` comments and ``#`` comments."""
        while True:
            c = self.peek()

            # Line comment: consume up to (not including) the newline, then
            # loop again so the newline and anything after it are skipped too.
            if c == '#' or (c == '/' and self.far_peek(1) == '/'):
                while self.peek() not in ('\n', '\0'):
                    self.advance()
                continue

            if c.isspace():
                self.advance()
                continue

            break

    def is_identifier_continue(self, c: str) -> bool:
        """Return True if ``c`` may appear inside an identifier.

        A ``/`` is rejected only when the character after the cursor is also
        ``/`` (start of a comment).
        """
        if not c.isprintable():
            return False
        if c == '/' and self.far_peek(1) == '/':
            return False
        if c in '{}[]()\'\"#':
            return False
        if c.isspace() or c == '\0':
            return False
        return True

    def is_identifier_start(self) -> bool:
        """Return True if an identifier (optionally ``::``-prefixed) starts here."""
        c = self.peek()
        if c == ':' and self.far_peek(1) == ':':
            c = self.far_peek(2)
        return not c.isdigit() and self.is_identifier_continue(c)

    # =====================================================================
    # Integer Parsing Helpers
    # =====================================================================

    def _token_to_int(self, start: int, base: int, prefix_len: int) -> int:
        """Convert the token text beginning at ``start`` to an integer.

        Args:
            start: Absolute index of the first character of the literal.
            base: Radix of the digits (2, 8, 10 or 16).
            prefix_len: Length of the radix prefix (``0b``/``0o``/``0x``)
                that follows the optional leading ``-``.

        Returns:
            The signed value. An empty digit run (e.g. ``0b``) yields 0.

        Raises:
            ValueError: if the digit run holds characters invalid for
                ``base``; the ``parse_*`` methods only consume valid ones.
        """
        text = self.get_token_text(start)
        negative = text.startswith('-')
        body = text[(1 if negative else 0) + prefix_len:]

        digits = []
        for c in body:
            # Stop at a type suffix (':'), a comment start or whitespace.
            if c.isspace() or c in '/:\0':
                break
            # Digit separators carry no value.
            if c in '._':
                continue
            digits.append(c)

        value = int(''.join(digits), base) if digits else 0
        return -value if negative else value

    def create_binary_integer(self, start: int) -> int:
        """Return the value of the ``0b`` literal that begins at ``start``."""
        return self._token_to_int(start, base=2, prefix_len=2)

    def create_octal_integer(self, start: int) -> int:
        """Return the value of the ``0o`` literal that begins at ``start``."""
        return self._token_to_int(start, base=8, prefix_len=2)

    def create_decimal_integer(self, start: int) -> int:
        """Return the value of the decimal literal that begins at ``start``."""
        return self._token_to_int(start, base=10, prefix_len=0)

    def create_hexadecimal_integer(self, start: int) -> int:
        """Return the value of the ``0x`` literal that begins at ``start``."""
        return self._token_to_int(start, base=16, prefix_len=2)

    def create_float(self, start: int) -> float:
        """Return the value of the float literal that begins at ``start``.

        The text is cut at the first type suffix, comment start or
        whitespace, ``_`` separators are dropped, and the remainder is
        converted with ``float()`` so the result is correctly rounded.
        Text without any digit yields 0.0.
        """
        text = self.get_token_text(start)
        for index, c in enumerate(text):
            if c.isspace() or c in '/:\0':
                text = text[:index]
                break
        text = text.replace('_', '')

        if not any(self._is_ascii_digit(c) for c in text):
            return 0.0
        return float(text)

    # =====================================================================
    # Integer Type Validation
    # =====================================================================

    def get_integer_type(self, numeric_type: NumericType) -> IntegerBuiltInType:
        """Map an integer ``NumericType`` to its ``IntegerBuiltInType``.

        Raises:
            ValueError: if ``numeric_type`` is a float type.
        """
        type_map = {
            NumericType.I64: IntegerBuiltInType.I64,
            NumericType.I32: IntegerBuiltInType.I32,
            NumericType.I16: IntegerBuiltInType.I16,
            NumericType.I8: IntegerBuiltInType.I8,
            NumericType.U64: IntegerBuiltInType.U64,
            NumericType.U32: IntegerBuiltInType.U32,
            NumericType.U16: IntegerBuiltInType.U16,
            NumericType.U8: IntegerBuiltInType.U8,
        }

        if numeric_type not in type_map:
            raise ValueError("Encountered a Float where there should not be one.")

        return type_map[numeric_type]

    def validate_integer_range(self, value: int, int_type: IntegerBuiltInType, start: int, start_line: int):
        """Raise ``LexerError`` if ``value`` does not fit in ``int_type``."""
        ranges = {
            IntegerBuiltInType.I64: (-2**63, 2**63 - 1),
            IntegerBuiltInType.I32: (-2**31, 2**31 - 1),
            IntegerBuiltInType.I16: (-2**15, 2**15 - 1),
            IntegerBuiltInType.I8: (-2**7, 2**7 - 1),
            IntegerBuiltInType.U64: (0, 2**64 - 1),
            IntegerBuiltInType.U32: (0, 2**32 - 1),
            IntegerBuiltInType.U16: (0, 2**16 - 1),
            IntegerBuiltInType.U8: (0, 2**8 - 1),
        }

        min_val, max_val = ranges[int_type]
        if not min_val <= value <= max_val:
            type_name = int_type.name.lower()
            raise LexerError(
                f"Integer overflow: value exceeds range for {type_name}.",
                self.get_file_info(start, start_line)
            )

    def create_integer_token(self, int_type: IntegerBuiltInType, value: int, start: int, start_line: int) -> Token:
        """Range-check ``value`` and wrap it in an INTEGER token."""
        self.validate_integer_range(value, int_type, start, start_line)
        return Token(
            type=TokenType.INTEGER,
            integer_literal=IntegerLiteral(value=value, type=int_type)
        )

    def create_float_token(self, numeric_type: NumericType, start: int, start_line: int) -> Token:
        """Build a DOUBLE token for ``F64`` and a FLOAT token otherwise."""
        value = self.create_float(start)
        if numeric_type == NumericType.F64:
            return Token(type=TokenType.DOUBLE, double_literal=value)
        return Token(type=TokenType.FLOAT, float_literal=value)

    # =====================================================================
    # Numeric Type Parsing
    # =====================================================================

    def _consume_width(self, widths):
        """Consume and return the first of ``widths`` found at the cursor, else None."""
        source = self.info.source_code
        for width in widths:
            if source.startswith(width, self.info.pos):
                for _ in width:
                    self.advance()
                return width
        return None

    def parse_numeric_type(self, start: int, start_line: int, literal_type: NumericLiteralType) -> NumericType:
        """Parse a ``:f64`` / ``:i32`` / ``:u8`` style suffix.

        The cursor must be on the ``':'``. On success it is left just past
        the width digits.

        Raises:
            LexerError: if the suffix is malformed or not allowed for
                ``literal_type``.
        """
        c = self.advance()  # step over ':' onto the type letter

        if c == 'f':
            if literal_type not in (NumericLiteralType.DECIMAL, NumericLiteralType.FLOAT, NumericLiteralType.EXPONENTIAL):
                raise LexerError("Invalid numeric literal: float type not allowed.", self.get_file_info(start, start_line))

            self.advance()
            width = self._consume_width(('64', '32'))
            if width is None:
                raise LexerError("Invalid float type: must be of type 'f64' or 'f32'.", self.get_file_info(start, start_line))
            return NumericType['F' + width]

        if c == 'i' or c == 'u':
            if literal_type in (NumericLiteralType.FLOAT, NumericLiteralType.EXPONENTIAL):
                raise LexerError("Invalid float type: must be of type 'f64' or 'f32'.", self.get_file_info(start, start_line))

            unsigned = c == 'u'
            self.advance()
            width = self._consume_width(('64', '32', '16', '8'))
            if width is None:
                prefix = 'unsigned' if unsigned else 'signed'
                raise LexerError(f"Invalid {prefix} integer type.", self.get_file_info(start, start_line))
            return NumericType[('U' if unsigned else 'I') + width]

        raise LexerError("Invalid numeric type: type must start with 'f', 'i', or 'u'.", self.get_file_info(start, start_line))

    # =====================================================================
    # Numeric Literal Parsing
    # =====================================================================

    def _finish_integer_literal(self, start: int, start_line: int, literal_type: NumericLiteralType, to_int, kind: str) -> Token:
        """Handle what follows the digits of an integer literal.

        Accepts an optional type suffix, or a terminator (whitespace, ``/``
        or end of input), in which case the type defaults to I64.
        """
        c = self.peek()

        if c == ':':
            numeric_type = self.parse_numeric_type(start, start_line, literal_type)
            # parse_numeric_type only lets a float suffix through for
            # decimal literals (e.g. ``5:f64``); emit a float token then.
            if numeric_type in (NumericType.F64, NumericType.F32):
                return self.create_float_token(numeric_type, start, start_line)
            int_type = self.get_integer_type(numeric_type)
            return self.create_integer_token(int_type, to_int(start), start, start_line)

        if c.isspace() or c in '/\0':
            return self.create_integer_token(IntegerBuiltInType.I64, to_int(start), start, start_line)

        raise LexerError(f"Invalid {kind} literal: unexpected '{c}' in {kind} integer.", self.get_file_info(start, start_line))

    def parse_binary_integer(self, start: int, start_line: int) -> Token:
        """Lex the digits of a ``0b`` literal; the cursor is just past the prefix."""
        c = self.peek()
        while c in '01_':
            c = self.advance()
        return self._finish_integer_literal(
            start, start_line, NumericLiteralType.BINARY, self.create_binary_integer, 'binary')

    def parse_octal_integer(self, start: int, start_line: int) -> Token:
        """Lex the digits of a ``0o`` literal; the cursor is just past the prefix."""
        c = self.peek()
        while c in '01234567_':
            c = self.advance()
        return self._finish_integer_literal(
            start, start_line, NumericLiteralType.OCTAL, self.create_octal_integer, 'octal')

    def parse_hexadecimal_integer(self, start: int, start_line: int) -> Token:
        """Lex the digits of a ``0x`` literal; the cursor is just past the prefix."""
        c = self.peek()
        while c in '0123456789ABCDEFabcdef_':
            c = self.advance()
        return self._finish_integer_literal(
            start, start_line, NumericLiteralType.HEXADECIMAL, self.create_hexadecimal_integer, 'hexadecimal')

    def parse_exponential(self, start: int, start_line: int) -> Token:
        raise NotImplementedError("Float exponential not implemented yet.")

    def parse_float(self, start: int, start_line: int) -> Token:
        """Lex the fractional digits of a float; the cursor is just past the '.'."""
        c = self.peek()
        while self._is_ascii_digit(c) or c == '_':
            c = self.advance()

        if c in 'eE':
            return self.parse_exponential(start, start_line)

        if c == ':':
            numeric_type = self.parse_numeric_type(start, start_line, NumericLiteralType.FLOAT)
            return self.create_float_token(numeric_type, start, start_line)

        if c.isspace() or c in '/\0':
            return self.create_float_token(NumericType.F64, start, start_line)

        raise LexerError(f"Invalid float literal: unexpected '{c}' in float.", self.get_file_info(start, start_line))

    def parse_decimal_integer(self, start: int, start_line: int) -> Token:
        """Lex a decimal literal, handing over to the float parser on '.'."""
        c = self.peek()
        while self._is_ascii_digit(c) or c == '_':
            c = self.advance()

        if c == '.':
            self.advance()
            return self.parse_float(start, start_line)

        if c in 'eE':
            return self.parse_exponential(start, start_line)

        return self._finish_integer_literal(
            start, start_line, NumericLiteralType.DECIMAL, self.create_decimal_integer, 'decimal')

    def parse_numeric_literal(self, start: int, start_line: int) -> Token:
        """Dispatch on the optional sign and radix prefix of a numeric literal."""
        c = self.peek()
        if c == '-':
            c = self.advance()

        if c == '0':
            c = self.advance()
            if c in 'bB':
                self.advance()
                return self.parse_binary_integer(start, start_line)
            if c in 'oO':
                self.advance()
                return self.parse_octal_integer(start, start_line)
            if c in 'xX':
                self.advance()
                return self.parse_hexadecimal_integer(start, start_line)

        return self.parse_decimal_integer(start, start_line)

    # =====================================================================
    # Character Literal Parsing
    # =====================================================================

    def parse_character_literal(self, start: int, start_line: int) -> Token:
        """Lex a character literal; the cursor is just past the opening quote.

        Returns a CHARACTER token whose value is the code point of the
        character (or of the character an escape sequence denotes).
        """
        c = self.peek()

        if c == '\'':
            raise LexerError("Invalid character literal: empty character literal.", self.get_file_info(start, start_line))

        if c == '\\':
            c = self.advance()
            escape_map = {
                'n': '\n',
                'r': '\r',
                't': '\t',
                '\\': '\\',
                '\'': '\'',
                '0': '\0',
            }
            if c not in escape_map:
                raise LexerError(f"Invalid character literal: unknown escape sequence '\\{c}'.", self.get_file_info(start, start_line))
            value = ord(escape_map[c])
        elif c in '\n\r':
            raise LexerError("Invalid character literal: unclosed character literal.", self.get_file_info(start, start_line))
        else:
            value = ord(c)

        c = self.advance()

        if c.isspace() or c in '/\0':
            raise LexerError("Invalid character literal: unclosed character literal.", self.get_file_info(start, start_line))
        if c != '\'':
            raise LexerError(f"Invalid character literal: unexpected '{c}' in character.", self.get_file_info(start, start_line))

        self.advance()  # consume the closing quote
        return Token(type=TokenType.CHARACTER, character_literal=value)

    # =====================================================================
    # String Literal Parsing (stub)
    # =====================================================================

    def parse_string_literal(self, start: int, start_line: int) -> Token:
        raise NotImplementedError("String literals not implemented yet.")

    # =====================================================================
    # Token String Parsing
    # =====================================================================

    def parse_token_string(self, start: int, start_line: int) -> Token:
        """Lex a ``{ ... }`` group into a TOKEN_STRING token.

        The cursor must be on the opening brace. Nested groups are handled
        by ``lexer_next`` calling back into this method.

        Raises:
            LexerError: if input ends before the closing brace.
        """
        tokens = []
        self.advance()  # Skip opening '{'

        watchdog = 0
        while self.peek() != '\0':
            self.skip_comments_and_whitespace()

            if self.peek() == '}':
                self.advance()
                return Token(type=TokenType.TOKEN_STRING, token_string=TokenString(tokens=tokens))

            token = self.lexer_next()
            tokens.append(token)

            if token.type == TokenType.EOF:
                break

            # Defensive guard against a scanner that stops making progress.
            watchdog += 1
            if watchdog > 1000000:
                raise LexerError("Watchdog triggered in token string.", self.get_file_info(start, start_line))

        raise LexerError("Unclosed token string: missing closing brace '}'.", self.get_file_info(start, start_line))

    # =====================================================================
    # Array and Type Tuple Parsing (stubs)
    # =====================================================================

    def parse_array_literal(self, start: int, start_line: int) -> Token:
        raise NotImplementedError("Array literals not implemented yet.")

    def parse_type_tuple(self, start: int, start_line: int) -> Token:
        raise NotImplementedError("Type tuples not implemented yet.")

    # =====================================================================
    # Identifier and Boolean Parsing
    # =====================================================================

    def parse_identifiers_and_booleans(self, start: int, start_line: int) -> Token:
        """Lex an identifier, an identifier literal (``::name``) or a boolean.

        Raises:
            LexerError: if the name contains ':' or '.', or is empty.
        """
        c = self.peek()
        is_literal = False

        # Check for identifier literal (::)
        if c == ':' and self.far_peek(1) == ':':
            is_literal = True
            self.advance()
            self.advance()
            c = self.peek()

        # Read identifier name
        name_chars = []
        while self.is_identifier_continue(c):
            if c == ':':
                raise LexerError("Invalid identifier: ':' is not allowed in identifiers.", self.get_file_info(start, start_line))
            if c == '.':
                raise LexerError("Invalid identifier: '.' is not allowed in identifiers.", self.get_file_info(start, start_line))
            name_chars.append(c)
            c = self.advance()

        name = ''.join(name_chars)

        # "::" directly followed by a comment passes is_identifier_start but
        # contributes no characters; refuse to emit a nameless identifier.
        if not name:
            raise LexerError("Invalid identifier literal: empty identifier after '::'.", self.get_file_info(start, start_line))

        # Check for boolean literals
        if name == 'false':
            return Token(type=TokenType.BOOLEAN, boolean_literal=False)
        if name == 'true':
            return Token(type=TokenType.BOOLEAN, boolean_literal=True)
        return Token(type=TokenType.IDENTIFIER, identifier=Identifier(name=name, is_literal=is_literal))

    # =====================================================================
    # Main Lexer Logic
    # =====================================================================

    def lexer_next(self) -> Token:
        """Skip trivia and return the next token (an EOF token at end of input).

        Raises:
            LexerError: on any character sequence that starts no token.
        """
        self.skip_comments_and_whitespace()

        c = self.peek()
        start = self.info.pos
        start_line = self.info.line

        # End of file
        if c == '\0':
            return Token(type=TokenType.EOF)

        # Numeric literals (integers and floats)
        if self._is_ascii_digit(c) or (c in '.-' and self._is_ascii_digit(self.far_peek(1))):
            return self.parse_numeric_literal(start, start_line)

        # Character literals
        if c == '\'':
            self.advance()
            return self.parse_character_literal(start, start_line)

        # String literals
        if c == '"':
            return self.parse_string_literal(start, start_line)

        # Token strings
        if c == '{':
            return self.parse_token_string(start, start_line)

        if c == '}':
            self.advance()
            raise LexerError("Unexpected closing brace '}' without matching opening brace.", self.get_file_info(start, start_line))

        # Array literals
        if c == '[':
            return self.parse_array_literal(start, start_line)

        if c == ']':
            self.advance()
            raise LexerError("Unexpected closing bracket ']' without matching opening bracket.", self.get_file_info(start, start_line))

        # Type tuples
        if c == '(':
            return self.parse_type_tuple(start, start_line)

        if c == ')':
            self.advance()
            raise LexerError("Unexpected closing parentheses ')' without matching opening parentheses.", self.get_file_info(start, start_line))

        # Identifiers and booleans
        if self.is_identifier_start():
            return self.parse_identifiers_and_booleans(start, start_line)

        # Check for malformed identifier literal
        if c == ':':
            self.advance()
            # The cursor now sits on the character after the first ':', so
            # test that character itself rather than the one beyond it.
            if self.peek() == ':':
                self.advance()
                raise LexerError("Invalid identifier literal: empty identifier after '::'.", self.get_file_info(start, start_line))
            # NOTE: is_identifier_continue accepts ':', so a lone ':' is
            # normally reported by parse_identifiers_and_booleans instead.
            raise LexerError("Unexpected single colon ':'.", self.get_file_info(start, start_line))

        # Unknown character
        raise LexerError(f"Unexpected character: unexpected '{c}' during parsing.", self.get_file_info(start, start_line))

    def lexical_analysis(self) -> List[Token]:
        """Main entry point for lexical analysis.

        Returns every token up to and including the EOF token. A
        ``LexerError`` raised while scanning propagates to the caller.
        """
        tokens = []

        while True:
            token = self.lexer_next()
            tokens.append(token)
            if token.type == TokenType.EOF:
                return tokens


# =====================================================================
# Public API
# =====================================================================

def lexical_analysis(lexer_info: LexerInfo) -> List[Token]:
    """Convenience function matching the original C API."""
    lexer = Lexer(lexer_info)
    return lexer.lexical_analysis()