From 64f620b2ef5e112afb388de9eed42e13c0c3e448 Mon Sep 17 00:00:00 2001
From: Kyler <yrea2001@gmail.com>
Date: Wed, 3 Dec 2025 14:57:15 -0700
Subject: [PATCH] Claude implementation of lexer.py

---
 SLS_Python/sls_py/lexer.py | 702 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 687 insertions(+), 15 deletions(-)

diff --git a/SLS_Python/sls_py/lexer.py b/SLS_Python/sls_py/lexer.py
index 0ddaef8..144fc7a 100644
--- a/SLS_Python/sls_py/lexer.py
+++ b/SLS_Python/sls_py/lexer.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 from dataclasses import dataclass, field
 from enum import Enum, auto
-from typing import List, Optional, Any
+from typing import List, Optional, Any, Union
 
 
 # =====================================================================
@@ -19,7 +19,7 @@ class LexerInfo:
         self.filename = filename
         self.source_code = source_code
         self.pos = 0
-        self.column = 0
+        self.column = 1
         self.line = 1
 
 
@@ -90,7 +90,7 @@ class IntegerBuiltInType(Enum):
 
 @dataclass
 class IntegerLiteral:
-    value: int          # Python int is arbitrary precision
+    value: int
     type: IntegerBuiltInType
 
 
@@ -132,12 +132,12 @@ class TypeTuple:
 
 @dataclass
 class StructInline:
-    values: List[Any]      # Python can store anything
+    values: List[Any]
     name: str
 
 
 # =====================================================================
-#  ArrayLiteral (replaces C unions with Optional lists)
+#  ArrayLiteral
 # =====================================================================
 
 @dataclass
@@ -189,7 +189,7 @@ class ArrayLiteral:
 
 
 # =====================================================================
-#  Token (Python “union” via Optional fields)
+#  Token
 # =====================================================================
 
 @dataclass
@@ -209,24 +209,696 @@ class Token:
 
 
 # =====================================================================
-#  Lexer Token Result / Lexer Result
+#  File Info and Results
 # =====================================================================
 
-class SlsResultType(Enum):
-    RESULT = auto()
-    ERROR = auto()
-
-
 @dataclass
 class FileInfo:
     filename: str
     line: int
     column: int
+    length: int = 0
+    lines: int = 0
+
+
+@dataclass
+class LexerError(Exception):  # raised by the Lexer methods in this file when input cannot be tokenized
+    message: str  # human-readable description of the problem
+    file_info: FileInfo  # location data; the raise sites in this file build it with Lexer.get_file_info()
 
 
 # =====================================================================
-#  Function Stubs (to be implemented in Python version)
+#  Numeric Type Flags
 # =====================================================================
 
-def lexical_analysis(lexer_info: LexerInfo) -> list[Token]:
-    return []
+class NumericType(Enum):  # returned by Lexer.parse_numeric_type(): the type named after ':' on a numeric literal
+    F64 = auto()  # also what Lexer.parse_float() uses when a float has no suffix
+    F32 = auto()
+    I64 = auto()
+    I32 = auto()
+    I16 = auto()
+    I8 = auto()
+    U64 = auto()
+    U32 = auto()
+    U16 = auto()
+    U8 = auto()
+
+
+class NumericLiteralType(Enum):  # literal form; Lexer.parse_numeric_type() checks the type suffix against it
+    BINARY = auto()
+    OCTAL = auto()
+    DECIMAL = auto()
+    HEXADECIMAL = auto()
+    FLOAT = auto()
+    EXPONENTIAL = auto()  # not passed by any caller visible in this patch
+
+
+# =====================================================================
+#  Lexer Implementation
+# =====================================================================
+
+class Lexer:
+    def __init__(self, info: LexerInfo) -> None:
+        self.info = info  # cursor state (pos/line/column) is read and mutated in place on this object
+
+    def peek(self) -> str:
+        """Return the character under the cursor, or NUL once the input is exhausted."""
+        src, at = self.info.source_code, self.info.pos
+        return src[at] if at < len(src) else '\0'
+
+    def far_peek(self, offset: int) -> str:
+        """Return the character `offset` places past the cursor, or NUL if that is past the end."""
+        src = self.info.source_code
+        target = self.info.pos + offset
+        return src[target] if target < len(src) else '\0'
+
+    def seek(self, index: int) -> str:
+        """Return the character at absolute position `index`, or NUL if that is past the end."""
+        src = self.info.source_code
+        return src[index] if index < len(src) else '\0'
+
+    def advance(self) -> str:
+        """Consume the current character (if any) and return the one now under the cursor."""
+        info = self.info
+        if info.pos < len(info.source_code):
+            crossed_newline = info.source_code[info.pos] == '\n'
+            info.line += 1 if crossed_newline else 0
+            info.column = 1 if crossed_newline else info.column + 1
+            info.pos += 1
+        return self.peek()
+
+    def get_file_info(self, start: int, start_line: int) -> FileInfo:
+        """Build a FileInfo at the current cursor, spanning back to `start` / `start_line`."""
+        info = self.info
+        return FileInfo(
+            filename=info.filename, line=info.line, column=info.column,
+            length=info.pos - start,
+            lines=info.line - start_line,
+        )
+
+    def get_token_text(self, start: int) -> str:
+        return self.info.source_code[start:self.info.pos]  # source text from `start` up to (not including) the cursor
+
+    def skip_comments_and_whitespace(self):
+        """Advance past any run of whitespace and `//` or `#` line comments."""
+        while True:
+            c = self.peek()
+
+            # Line comment: consume up to (not including) the newline, then
+            # re-scan; `c` is stale once the cursor has moved past the comment.
+            if (c == '/' and self.far_peek(1) == '/') or c == '#':
+                while self.peek() not in ('\n', '\0'):
+                    self.advance()
+                continue
+            if c.isspace():
+                self.advance()
+                continue
+            break
+
+    def is_identifier_continue(self, c: str) -> bool:
+        """Tell whether `c` may appear inside an identifier.
+
+        Rejected: non-printable characters (NUL included), whitespace, the
+        delimiters {}[]()'"#, and a '/' whose next source character is '/'.
+        """
+        if not c.isprintable() or c in '{}[]()\'\"#' or c.isspace() or c == '\0':
+            return False
+        # far_peek(1) is relative to the cursor, not to where `c` came from.
+        return not (c == '/' and self.far_peek(1) == '/')
+
+    def is_identifier_start(self) -> bool:
+        """Tell whether an identifier (optionally '::'-prefixed) can begin at the cursor."""
+        literal_prefix = self.peek() == ':' and self.far_peek(1) == ':'
+        first = self.far_peek(2) if literal_prefix else self.peek()
+        return not first.isdigit() and self.is_identifier_continue(first)
+
+    # =====================================================================
+    #  Integer Parsing Helpers
+    # =====================================================================
+
+    def create_binary_integer(self, start: int) -> int:
+        """Evaluate the binary literal whose text begins at `start`.
+
+        Skips the `0b` prefix (and a leading '-'), ignores '.' and '_', and
+        stops at whitespace, '/', ':' or NUL. Any other character that is not
+        '1' counts as a zero bit.
+
+        Args:
+            start: Source index of the literal's first character.
+
+        Returns:
+            The literal's value, negated when the text starts with '-'.
+        """
+        text = self.get_token_text(start)
+        is_negative = text[0] == '-'
+        magnitude = 0
+        for ch in text[3 if is_negative else 2:]:
+            if ch.isspace() or ch in '/:' or ch == '\0':
+                break
+            if ch not in '._':
+                magnitude = magnitude * 2 + (1 if ch == '1' else 0)
+        return -magnitude if is_negative else magnitude
+
+    def create_octal_integer(self, start: int) -> int:
+        """Evaluate the octal literal whose text begins at `start`.
+
+        Skips `0o` (after any leading '-'), ignores '.' and '_', stops at whitespace,
+        '/', ':' or NUL; a non-octal character contributes zero for its position.
+
+        Args:
+            start: Source index of the literal's first character.
+
+        Returns:
+            The literal's value, negated when the text starts with '-'.
+        """
+        text = self.get_token_text(start)
+        is_negative = text[0] == '-'
+        magnitude = 0
+        for ch in text[3 if is_negative else 2:]:
+            if ch.isspace() or ch in '/:' or ch == '\0':
+                break
+            if ch not in '._':
+                magnitude = magnitude * 8 + (int(ch) if ch.isdigit() and ch < '8' else 0)
+        return -magnitude if is_negative else magnitude
+
+    def create_decimal_integer(self, start: int) -> int:
+        """Evaluate the decimal literal whose text begins at `start`.
+
+        Skips a leading '-', keeps only digit characters (so '_' separators
+        drop out), and stops at whitespace, '/', ':' or NUL.
+
+        Args:
+            start: Source index of the literal's first character.
+
+        Returns:
+            The literal's value, negated when the text starts with '-'.
+        """
+        text = self.get_token_text(start)
+        is_negative = text[0] == '-'
+        magnitude = 0
+        for ch in text[1 if is_negative else 0:]:
+            if ch.isspace() or ch in '/:' or ch == '\0':
+                break
+            if ch.isdigit():
+                magnitude = magnitude * 10 + int(ch)
+        return -magnitude if is_negative else magnitude
+
+    def create_hexadecimal_integer(self, start: int) -> int:
+        """Evaluate the hexadecimal literal whose text begins at `start`.
+
+        Skips `0x` (after any leading '-'), ignores '.' and '_', stops at whitespace,
+        '/', ':' or NUL; a non-hex character contributes zero for its position.
+
+        Returns:
+            The literal's value, negated when the text starts with '-'.
+        """
+        text = self.get_token_text(start)
+        is_negative = text[0] == '-'
+        magnitude = 0
+        for ch in text[3 if is_negative else 2:]:
+            if ch.isspace() or ch in '/:' or ch == '\0':
+                break
+            if ch in '._':
+                continue
+            magnitude *= 16
+            if ch.isdigit():
+                magnitude += int(ch)
+            elif ch.upper() in 'ABCDEF':
+                magnitude += ord(ch.upper()) - ord('A') + 10
+        return -magnitude if is_negative else magnitude
+
+    def create_float(self, start: int) -> float:
+        """Convert the float literal that begins at `start` into a Python float.
+
+        The token text runs from `start` to the current position, so it may
+        still carry a trailing type suffix such as `:f32`; conversion stops at
+        the first ':' (or whitespace, '/' or NUL). Underscore digit separators
+        are dropped.
+
+        The cleaned text is handed to `float()`, which rounds correctly.
+        Summing `digit / 10**k` term by term instead would compound rounding
+        error (e.g. `1.15` would come out as 1.1500000000000001).
+
+        Args:
+            start: Index in the source of the literal's first character
+                (the leading '-' when the literal is negative).
+
+        Returns:
+            The literal's value, or 0.0 when the text contains no digits.
+
+        Raises:
+            ValueError: If the cleaned text is not a valid float literal.
+        """
+        # Keep only the characters that belong to the number itself.
+        kept = []
+        for c in self.get_token_text(start):
+            # Anything from a delimiter onwards is not part of the number.
+            if c.isspace() or c in '/:' or c == '\0':
+                break
+            if c != '_':
+                kept.append(c)
+        text = ''.join(kept)
+
+        # float() rejects digit-free text such as "." or "-"; treat it as zero.
+        if not any(ch.isdigit() for ch in text):
+            return 0.0
+        return float(text)
+
+    # =====================================================================
+    #  Integer Type Validation
+    # =====================================================================
+
+    def get_integer_type(self, numeric_type: NumericType) -> IntegerBuiltInType:
+        """Return the IntegerBuiltInType matching an integer NumericType; ValueError for F32/F64."""
+        type_map = {
+            NumericType.I64: IntegerBuiltInType.I64,
+            NumericType.I32: IntegerBuiltInType.I32,
+            NumericType.I16: IntegerBuiltInType.I16,
+            NumericType.I8: IntegerBuiltInType.I8,
+            NumericType.U64: IntegerBuiltInType.U64,
+            NumericType.U32: IntegerBuiltInType.U32,
+            NumericType.U16: IntegerBuiltInType.U16,
+            NumericType.U8: IntegerBuiltInType.U8,
+        }
+        # F64 and F32 are the only NumericType members without an entry above.
+        if numeric_type not in type_map:
+            raise ValueError("Encountered a Float where there should not be one.")
+        return type_map[numeric_type]
+
+    def validate_integer_range(self, value: int, int_type: IntegerBuiltInType, start: int, start_line: int) -> None:
+        """Raise LexerError unless `value` is within the inclusive range of `int_type`."""
+        ranges = {
+            IntegerBuiltInType.I64: (-2**63, 2**63 - 1),
+            IntegerBuiltInType.I32: (-2**31, 2**31 - 1),
+            IntegerBuiltInType.I16: (-2**15, 2**15 - 1),
+            IntegerBuiltInType.I8: (-2**7, 2**7 - 1),
+            IntegerBuiltInType.U64: (0, 2**64 - 1),
+            IntegerBuiltInType.U32: (0, 2**32 - 1),
+            IntegerBuiltInType.U16: (0, 2**16 - 1),
+            IntegerBuiltInType.U8: (0, 2**8 - 1),
+        }
+        min_val, max_val = ranges[int_type]
+        if value < min_val or value > max_val:
+            type_name = int_type.name.lower()
+            raise LexerError(
+                f"Integer overflow: value exceeds range for {type_name}.",
+                self.get_file_info(start, start_line)
+            )
+
+    def create_integer_token(self, int_type: IntegerBuiltInType, value: int, start: int, start_line: int) -> Token:
+        self.validate_integer_range(value, int_type, start, start_line)  # raises LexerError when out of range
+        return Token(
+            type=TokenType.INTEGER,
+            integer_literal=IntegerLiteral(value=value, type=int_type)
+        )
+
+    def create_float_token(self, numeric_type: NumericType, start: int, start_line: int) -> Token:
+        value = self.create_float(start)  # `start_line` is not used by this method
+        if numeric_type == NumericType.F64:
+            return Token(type=TokenType.DOUBLE, double_literal=value)
+        else:  # every other NumericType produces a FLOAT token
+            return Token(type=TokenType.FLOAT, float_literal=value)
+
+    # =====================================================================
+    #  Numeric Type Parsing
+    # =====================================================================
+
+    def parse_numeric_type(self, start: int, start_line: int, literal_type: NumericLiteralType) -> NumericType:
+        """Parse the type suffix after a numeric literal; callers invoke this with the cursor on ':'.
+
+        Returns the NumericType named by the suffix and leaves the cursor just past
+        it. Raises LexerError for an unknown suffix or one `literal_type` forbids.
+        """
+        c = self.advance()
+        if c == 'f':
+            if literal_type not in (NumericLiteralType.DECIMAL, NumericLiteralType.FLOAT, NumericLiteralType.EXPONENTIAL):
+                raise LexerError("Invalid numeric literal: float type not allowed.", self.get_file_info(start, start_line))
+            c = self.advance()
+            if c == '6' and self.far_peek(1) == '4':
+                self.advance()
+                self.advance()
+                return NumericType.F64
+            elif c == '3' and self.far_peek(1) == '2':
+                self.advance()
+                self.advance()
+                return NumericType.F32
+            else:
+                raise LexerError("Invalid float type: must be of type 'f64' or 'f32'.", self.get_file_info(start, start_line))
+        elif c in 'iu':
+            if literal_type in (NumericLiteralType.FLOAT, NumericLiteralType.EXPONENTIAL):
+                raise LexerError("Invalid float type: must be of type 'f64' or 'f32'.", self.get_file_info(start, start_line))
+            # 'u' selects the unsigned variant of the width that follows; 'i' the signed one.
+            unsigned = c == 'u'
+            c = self.advance()
+            if c == '6' and self.far_peek(1) == '4':
+                self.advance()
+                self.advance()
+                return NumericType.U64 if unsigned else NumericType.I64
+            elif c == '3' and self.far_peek(1) == '2':
+                self.advance()
+                self.advance()
+                return NumericType.U32 if unsigned else NumericType.I32
+            elif c == '1' and self.far_peek(1) == '6':
+                self.advance()
+                self.advance()
+                return NumericType.U16 if unsigned else NumericType.I16
+            elif c == '8':
+                self.advance()
+                return NumericType.U8 if unsigned else NumericType.I8
+            else:
+                prefix = 'unsigned' if unsigned else 'signed'
+                raise LexerError(f"Invalid {prefix} integer type.", self.get_file_info(start, start_line))
+        else:
+            raise LexerError("Invalid numeric type: type must start with 'f', 'i', or 'u'.", self.get_file_info(start, start_line))
+
+    # =====================================================================
+    #  Numeric Literal Parsing
+    # =====================================================================
+
+    def parse_binary_integer(self, start: int, start_line: int) -> Token:
+        """Lex the digits of a binary literal; parse_numeric_literal() has consumed the `0b` prefix."""
+        c = self.peek()
+        while c in '01_':
+            c = self.advance()
+        # ':' introduces an explicit integer type suffix.
+        if c == ':':
+            numeric_type = self.parse_numeric_type(start, start_line, NumericLiteralType.BINARY)
+            int_type = self.get_integer_type(numeric_type)
+            value = self.create_binary_integer(start)
+            return self.create_integer_token(int_type, value, start, start_line)
+        # Without a suffix the literal is typed as i64.
+        if c.isspace() or c in '/\0':
+            value = self.create_binary_integer(start)
+            return self.create_integer_token(IntegerBuiltInType.I64, value, start, start_line)
+        raise LexerError(f"Invalid binary literal: unexpected '{c}' in binary integer.", self.get_file_info(start, start_line))
+
+    def parse_octal_integer(self, start: int, start_line: int) -> Token:
+        """Lex the digits of an octal literal; parse_numeric_literal() has consumed the `0o` prefix."""
+        c = self.peek()
+        while c.isdigit() and c not in '89' or c == '_':
+            c = self.advance()
+        # ':' introduces an explicit integer type suffix.
+        if c == ':':
+            numeric_type = self.parse_numeric_type(start, start_line, NumericLiteralType.OCTAL)
+            int_type = self.get_integer_type(numeric_type)
+            value = self.create_octal_integer(start)
+            return self.create_integer_token(int_type, value, start, start_line)
+        # Without a suffix the literal is typed as i64.
+        if c.isspace() or c in '/\0':
+            value = self.create_octal_integer(start)
+            return self.create_integer_token(IntegerBuiltInType.I64, value, start, start_line)
+        raise LexerError(f"Invalid octal literal: unexpected '{c}' in octal integer.", self.get_file_info(start, start_line))
+
+    def parse_hexadecimal_integer(self, start: int, start_line: int) -> Token:
+        """Lex the digits of a hexadecimal literal; parse_numeric_literal() has consumed the `0x` prefix."""
+        c = self.peek()
+        while c in '0123456789ABCDEFabcdef_':
+            c = self.advance()
+        # ':' introduces an explicit integer type suffix.
+        if c == ':':
+            numeric_type = self.parse_numeric_type(start, start_line, NumericLiteralType.HEXADECIMAL)
+            int_type = self.get_integer_type(numeric_type)
+            value = self.create_hexadecimal_integer(start)
+            return self.create_integer_token(int_type, value, start, start_line)
+        # Without a suffix the literal is typed as i64.
+        if c.isspace() or c in '/\0':
+            value = self.create_hexadecimal_integer(start)
+            return self.create_integer_token(IntegerBuiltInType.I64, value, start, start_line)
+        raise LexerError(f"Invalid hexadecimal literal: unexpected '{c}' in hexadecimal integer.", self.get_file_info(start, start_line))
+
+    def parse_exponential(self, start: int, start_line: int) -> Token:
+        raise NotImplementedError("Float exponential not implemented yet.")  # stub: reached when digits are followed by 'e' or 'E'
+
+    def parse_float(self, start: int, start_line: int) -> Token:
+        """Lex the fractional digits of a float; parse_decimal_integer() has already consumed the '.'."""
+        c = self.peek()
+        while c.isdigit() or c == '_':
+            c = self.advance()
+        # Exponent notation is delegated to parse_exponential().
+        if c in 'eE':
+            return self.parse_exponential(start, start_line)
+        # Explicit suffix; parse_numeric_type() rejects integer types for FLOAT literals.
+        if c == ':':
+            numeric_type = self.parse_numeric_type(start, start_line, NumericLiteralType.FLOAT)
+            return self.create_float_token(numeric_type, start, start_line)
+        # No suffix: the literal defaults to f64.
+        if c.isspace() or c in '/\0':
+            return self.create_float_token(NumericType.F64, start, start_line)
+        raise LexerError(f"Invalid float literal: unexpected '{c}' in float.", self.get_file_info(start, start_line))
+
+    def parse_decimal_integer(self, start: int, start_line: int) -> Token:
+        """Lex a decimal literal; '.' hands off to parse_float() and 'e'/'E' to parse_exponential()."""
+        c = self.peek()
+        while c.isdigit() or c == '_':
+            c = self.advance()
+        if c == '.':
+            self.advance()
+            return self.parse_float(start, start_line)
+        if c in 'eE':
+            return self.parse_exponential(start, start_line)
+        if c == ':':
+            numeric_type = self.parse_numeric_type(start, start_line, NumericLiteralType.DECIMAL)
+            # parse_numeric_type() permits f32/f64 on plain digits (e.g. `5:f32`); emit a
+            # float token instead of letting get_integer_type() raise ValueError.
+            if numeric_type in (NumericType.F64, NumericType.F32):
+                return self.create_float_token(numeric_type, start, start_line)
+            int_type = self.get_integer_type(numeric_type)
+            value = self.create_decimal_integer(start)
+            return self.create_integer_token(int_type, value, start, start_line)
+        if c.isspace() or c in '/\0':
+            value = self.create_decimal_integer(start)
+            return self.create_integer_token(IntegerBuiltInType.I64, value, start, start_line)
+        raise LexerError(f"Invalid decimal literal: unexpected '{c}' in decimal integer.", self.get_file_info(start, start_line))
+
+    def parse_numeric_literal(self, start: int, start_line: int) -> Token:
+        """Consume an optional '-' and any 0b/0o/0x prefix, then dispatch to the matching parser."""
+        c = self.peek()
+        if c == '-':
+            c = self.advance()
+        if c == '0':
+            c = self.advance()
+            if c in 'bB':
+                self.advance()
+                return self.parse_binary_integer(start, start_line)
+            elif c in 'oO':
+                self.advance()
+                return self.parse_octal_integer(start, start_line)
+            elif c in 'xX':
+                self.advance()
+                return self.parse_hexadecimal_integer(start, start_line)
+        # No radix prefix (including a bare leading '0'): treat as decimal.
+        return self.parse_decimal_integer(start, start_line)
+
+    # =====================================================================
+    #  Character Literal Parsing
+    # =====================================================================
+
+    def parse_character_literal(self, start: int, start_line: int) -> Token:
+        """Lex a character literal; lexer_next() has already consumed the opening quote.
+
+        Returns a CHARACTER token holding the code point; raises LexerError if malformed.
+        """
+        c = self.peek()
+        if c == '\'':
+            raise LexerError("Invalid character literal: empty character literal.", self.get_file_info(start, start_line))
+        if c == '\\':
+            c = self.advance()
+            escape_map = {
+                'n': '\n',
+                'r': '\r',
+                't': '\t',
+                '\\': '\\',
+                '\'': '\'',
+                '0': '\0'
+            }
+            if c in escape_map:
+                value = ord(escape_map[c])
+            else:
+                raise LexerError(f"Invalid character literal: unknown escape sequence '\\{c}'.", self.get_file_info(start, start_line))
+        elif c in '\n\r':
+            raise LexerError("Invalid character literal: unclosed character literal.", self.get_file_info(start, start_line))
+        else:
+            value = ord(c)
+        c = self.advance()
+        # The very next character must be the closing quote.
+        if c.isspace() or c in '/\0':
+            raise LexerError("Invalid character literal: unclosed character literal.", self.get_file_info(start, start_line))
+        elif c != '\'':
+            raise LexerError(f"Invalid character literal: unexpected '{c}' in character.", self.get_file_info(start, start_line))
+        self.advance()
+        return Token(type=TokenType.CHARACTER, character_literal=value)
+
+    # =====================================================================
+    #  String Literal Parsing (stub)
+    # =====================================================================
+
+    def parse_string_literal(self, start: int, start_line: int) -> Token:
+        raise NotImplementedError("String literals not implemented yet.")  # stub: lexer_next() dispatches here on '"'
+
+    # =====================================================================
+    #  Token String Parsing
+    # =====================================================================
+
+    def parse_token_string(self, start: int, start_line: int) -> Token:
+        """Lex a brace-delimited token string by calling lexer_next() until '}'.
+
+        Raises LexerError if the input ends before the closing brace.
+        """
+        tokens = []
+        self.advance()  # Skip opening '{'
+        watchdog = 0
+        while self.peek() != '\0':
+            self.skip_comments_and_whitespace()
+            c = self.peek()
+            if c == '}':
+                self.advance()
+                return Token(type=TokenType.TOKEN_STRING, token_string=TokenString(tokens=tokens))
+            # A nested '{' recurses through lexer_next() back into this method.
+            token = self.lexer_next()
+            tokens.append(token)
+            if token.type == TokenType.EOF:
+                break
+            # Abort after 1,000,000 tokens rather than loop without bound.
+            watchdog += 1
+            if watchdog > 1000000:
+                raise LexerError("Watchdog triggered in token string.", self.get_file_info(start, start_line))
+        raise LexerError("Unclosed token string: missing closing brace '}'.", self.get_file_info(start, start_line))
+
+    # =====================================================================
+    #  Array and Type Tuple Parsing (stubs)
+    # =====================================================================
+
+    def parse_array_literal(self, start: int, start_line: int) -> Token:
+        raise NotImplementedError("Array literals not implemented yet.")  # stub: lexer_next() dispatches here on '['
+
+    def parse_type_tuple(self, start: int, start_line: int) -> Token:
+        raise NotImplementedError("Type tuples not implemented yet.")  # stub: lexer_next() dispatches here on '('
+
+    # =====================================================================
+    #  Identifier and Boolean Parsing
+    # =====================================================================
+
+    def parse_identifiers_and_booleans(self, start: int, start_line: int) -> Token:
+        """Lex an identifier (optionally '::'-prefixed) or one of the literals true/false.
+
+        Raises LexerError if the name contains ':' or '.'.
+        """
+        c = self.peek()
+        is_literal = False
+        # Check for identifier literal (::)
+        if c == ':' and self.far_peek(1) == ':':
+            is_literal = True
+            self.advance()
+            self.advance()
+            c = self.peek()
+        # Read identifier name
+        name_chars = []
+        while self.is_identifier_continue(c):
+            if c == ':':
+                raise LexerError("Invalid identifier: ':' is not allowed in identifiers.", self.get_file_info(start, start_line))
+            if c == '.':
+                raise LexerError("Invalid identifier: '.' is not allowed in identifiers.", self.get_file_info(start, start_line))
+            name_chars.append(c)
+            c = self.advance()
+        name = ''.join(name_chars)
+        # Boolean literals win even when '::'-prefixed (is_literal is not consulted for them)
+        if name == 'false':
+            return Token(type=TokenType.BOOLEAN, boolean_literal=False)
+        elif name == 'true':
+            return Token(type=TokenType.BOOLEAN, boolean_literal=True)
+        else:
+            return Token(type=TokenType.IDENTIFIER, identifier=Identifier(name=name, is_literal=is_literal))
+
+    # =====================================================================
+    #  Main Lexer Logic
+    # =====================================================================
+
+    def lexer_next(self) -> Token:
+        """Skip whitespace/comments, then lex and return the next token (EOF at end of input)."""
+        self.skip_comments_and_whitespace()
+        c = self.peek()
+        start = self.info.pos
+        start_line = self.info.line
+
+        # End of file
+        if c == '\0':
+            return Token(type=TokenType.EOF)
+
+        # Numeric literals (integers and floats)
+        if c.isdigit() or (c == '.' and self.far_peek(1).isdigit()) or (c == '-' and self.far_peek(1).isdigit()):
+            return self.parse_numeric_literal(start, start_line)
+
+        # Character literals
+        if c == '\'':
+            self.advance()
+            return self.parse_character_literal(start, start_line)
+
+        # String literals
+        if c == '"':
+            return self.parse_string_literal(start, start_line)
+
+        # Token strings
+        if c == '{':
+            return self.parse_token_string(start, start_line)
+
+        if c == '}':
+            self.advance()
+            raise LexerError("Unexpected closing brace '}' without matching opening brace.", self.get_file_info(start, start_line))
+
+        # Array literals
+        if c == '[':
+            return self.parse_array_literal(start, start_line)
+
+        if c == ']':
+            self.advance()
+            raise LexerError("Unexpected closing bracket ']' without matching opening bracket.", self.get_file_info(start, start_line))
+
+        # Type tuples
+        if c == '(':
+            return self.parse_type_tuple(start, start_line)
+
+        if c == ')':
+            self.advance()
+            raise LexerError("Unexpected closing parentheses ')' without matching opening parentheses.", self.get_file_info(start, start_line))
+
+        # Identifiers and booleans
+        if self.is_identifier_start():
+            return self.parse_identifiers_and_booleans(start, start_line)
+        # Malformed identifier literal. After consuming the first ':', the second
+        # colon of a '::' prefix is the current character, so test peek().
+        if c == ':':
+            self.advance()
+            if self.peek() == ':':
+                raise LexerError("Invalid identifier literal: empty identifier after '::'.", self.get_file_info(start, start_line))
+            else:
+                raise LexerError("Unexpected single colon ':'.", self.get_file_info(start, start_line))
+
+        # Unknown character
+        raise LexerError(f"Unexpected character: unexpected '{c}' during parsing.", self.get_file_info(start, start_line))
+
+    def lexical_analysis(self) -> List[Token]:
+        """Lex the whole source and return its tokens.
+
+        Returns:
+            Every token in order; the last element is the EOF token.
+
+        Raises:
+            LexerError: Propagated unchanged from lexer_next().
+        """
+        tokens = []
+        while True:
+            token = self.lexer_next()
+            tokens.append(token)
+            if token.type == TokenType.EOF:
+                break
+        return tokens
+
+
+# =====================================================================
+#  Public API
+# =====================================================================
+
+def lexical_analysis(lexer_info: LexerInfo) -> List[Token]:
+    """Function-style entry point matching the original C API.
+    Wraps `lexer_info` in a fresh Lexer and returns the tokens it produces."""
+    return Lexer(lexer_info).lexical_analysis()