from __future__ import annotations

from dataclasses import dataclass, field
from enum import Enum, auto
from typing import List, Optional, Any, Union


# =====================================================================
# Basic Types
# =====================================================================

class LexerInfo:
    """Source text plus the cursor state a lexer advances through it.

    A fresh instance points at offset 0, which is line 1, column 1.
    """

    filename: str
    source_code: str
    pos: int
    column: int
    line: int

    def __init__(self, filename: str = "", source_code: str = ""):
        self.filename, self.source_code = filename, source_code
        # ``pos`` is a 0-based offset; ``column`` and ``line`` are 1-based.
        self.pos, self.column, self.line = 0, 1, 1


# =====================================================================
# Token Types
# =====================================================================

class TokenType(Enum):
    """Discriminator stored in ``Token.type``."""

    EOF = auto()
    IDENTIFIER = auto()
    INTEGER = auto()
    FLOAT = auto()
    DOUBLE = auto()
    CHARACTER = auto()
    STRING = auto()
    BOOLEAN = auto()
    ARRAY = auto()
    TOKEN_STRING = auto()
    TYPE_TUPLE = auto()


# =====================================================================
# Array Literal Types
# =====================================================================

class ArrayType(Enum):
    """Discriminator stored in ``ArrayLiteral.type``."""

    IDENTIFIER = auto()
    I64 = auto()
    I32 = auto()
    I16 = auto()
    I8 = auto()
    U64 = auto()
    U32 = auto()
    U16 = auto()
    U8 = auto()
    FLOAT = auto()
    DOUBLE = auto()
    CHARACTER = auto()
    STRING = auto()
    BOOLEAN = auto()
    STRUCT_INLINE = auto()


# =====================================================================
# Identifier
# =====================================================================

@dataclass
class Identifier:
    """A name together with a flag marking it as an identifier literal."""

    name: str
    is_literal: bool


# =====================================================================
# Integer Literal Type
# =====================================================================

class IntegerBuiltInType(Enum):
    """Built-in integer types an integer literal can be tagged with."""

    I64 = auto()
    I32 = auto()
    I16 = auto()
    I8 = auto()
    U64 = auto()
    U32 = auto()
    U16 = auto()
    U8 = auto()


@dataclass
class IntegerLiteral:
    """An integer value paired with its built-in integer type."""

    value: int
    type: IntegerBuiltInType
# =====================================================================
# TokenString, TypeTuple, StructInline
# =====================================================================

def _clone(value):
    """Return a field-for-field copy of a dataclass instance.

    ``None`` is passed through unchanged so optional fields can be
    copied without a separate check at every call site.
    """
    # Imported locally: this helper is the only user of ``replace``.
    from dataclasses import replace

    return None if value is None else replace(value)


@dataclass
class TokenString:
    """An ordered list of tokens."""

    tokens: List["Token"] = field(default_factory=list)

    def deep_copy(self) -> TokenString:
        """Return a copy that shares no token objects with this one."""
        return TokenString(tokens=[token.deep_copy() for token in self.tokens])


@dataclass
class TypeTuple:
    """Two identifier lists: inputs and outputs."""

    input_identifiers: List[Identifier] = field(default_factory=list)
    output_identifiers: List[Identifier] = field(default_factory=list)

    def deep_copy(self) -> TypeTuple:
        """Return a copy whose identifier objects are all new instances."""
        return TypeTuple(
            input_identifiers=[_clone(ident) for ident in self.input_identifiers],
            output_identifiers=[_clone(ident) for ident in self.output_identifiers],
        )


@dataclass
class StructInline:
    """A named collection of values."""

    values: List[Any]
    name: str


# =====================================================================
# ArrayLiteral
# =====================================================================

@dataclass
class ArrayLiteral:
    """Array payload of a token.

    ``type`` selects the element kind; each optional list below holds the
    elements for one kind and is ``None`` when unused.
    """

    type: ArrayType
    identifiers: Optional[List[Identifier]] = None
    integer_literals: Optional[List[int]] = None
    float_literals: Optional[List[float]] = None
    double_literals: Optional[List[float]] = None
    character_literals: Optional[List[int]] = None
    string_literals: Optional[List[str]] = None
    boolean_literals: Optional[List[bool]] = None
    token_strings: Optional[List[TokenString]] = None
    type_tuples: Optional[List[TypeTuple]] = None
    struct_inline: Optional[StructInline] = None
    shape: Optional[List[int]] = None
    dimensions: int = 0

    def deep_copy(self) -> ArrayLiteral:
        """Return an independent copy of this array literal.

        Scalar element lists are copied, identifiers are cloned, and
        nested token strings / type tuples are copied recursively. The
        elements of ``struct_inline.values`` are of unknown type and are
        shared with the original; only the list itself is new.
        """
        def copy_list(values):
            # ``is None`` rather than truthiness, so an empty list
            # (e.g. ``shape == []``) stays an empty list.
            return None if values is None else list(values)

        struct = self.struct_inline
        return ArrayLiteral(
            type=self.type,
            identifiers=(
                None if self.identifiers is None
                else [_clone(ident) for ident in self.identifiers]
            ),
            integer_literals=copy_list(self.integer_literals),
            float_literals=copy_list(self.float_literals),
            double_literals=copy_list(self.double_literals),
            character_literals=copy_list(self.character_literals),
            string_literals=copy_list(self.string_literals),
            boolean_literals=copy_list(self.boolean_literals),
            token_strings=(
                None if self.token_strings is None
                else [ts.deep_copy() for ts in self.token_strings]
            ),
            type_tuples=(
                None if self.type_tuples is None
                else [tt.deep_copy() for tt in self.type_tuples]
            ),
            struct_inline=(
                None if struct is None
                else StructInline(values=list(struct.values), name=struct.name)
            ),
            shape=copy_list(self.shape),
            dimensions=self.dimensions,
        )


# =====================================================================
# Token
# =====================================================================

@dataclass
class Token:
    """A lexed token: a ``type`` tag plus one optional payload per kind."""

    type: TokenType
    identifier: Optional[Identifier] = None
    integer_literal: Optional[IntegerLiteral] = None
    float_literal: Optional[float] = None
    double_literal: Optional[float] = None
    character_literal: Optional[int] = None
    string_literal: Optional[str] = None
    boolean_literal: Optional[bool] = None
    array_literal: Optional[ArrayLiteral] = None
    token_string: Optional[TokenString] = None
    type_tuple: Optional[TypeTuple] = None

    def deep_copy(self) -> Token:
        """Return a copy that shares no mutable payload with this token.

        The identifier and integer literal are cloned rather than shared,
        so mutating the copy cannot affect the original.
        """
        array = self.array_literal
        nested = self.token_string
        tuple_ = self.type_tuple
        return Token(
            type=self.type,
            identifier=_clone(self.identifier),
            integer_literal=_clone(self.integer_literal),
            float_literal=self.float_literal,
            double_literal=self.double_literal,
            character_literal=self.character_literal,
            string_literal=self.string_literal,
            boolean_literal=self.boolean_literal,
            array_literal=None if array is None else array.deep_copy(),
            token_string=None if nested is None else nested.deep_copy(),
            type_tuple=None if tuple_ is None else tuple_.deep_copy(),
        )
# =====================================================================
# File Info and Results
# =====================================================================

@dataclass
class FileInfo:
    """Location record attached to lexer diagnostics.

    As filled in by ``Lexer.get_file_info``: ``line`` and ``column`` are
    the cursor position at the moment the record is created, ``length``
    is the number of characters consumed since the token start, and
    ``lines`` is the number of line breaks crossed since then.
    """

    filename: str
    line: int
    column: int
    length: int = 0
    lines: int = 0


@dataclass
class LexerError(Exception):
    """Raised when the source text cannot be tokenised."""

    message: str
    file_info: FileInfo

    def __str__(self) -> str:
        # The dataclass-generated __init__ never calls Exception.__init__,
        # so without this the text would depend on whether the error was
        # built with positional or keyword arguments.
        where = self.file_info
        return f"{where.filename}:{where.line}:{where.column}: {self.message}"


# =====================================================================
# Numeric Type Flags
# =====================================================================

class NumericType(Enum):
    """Type named by a ``:<type>`` suffix on a numeric literal."""

    F64 = auto()
    F32 = auto()
    I64 = auto()
    I32 = auto()
    I16 = auto()
    I8 = auto()
    U64 = auto()
    U32 = auto()
    U16 = auto()
    U8 = auto()


class NumericLiteralType(Enum):
    """Syntactic form of the numeric literal being parsed."""

    BINARY = auto()
    OCTAL = auto()
    DECIMAL = auto()
    HEXADECIMAL = auto()
    FLOAT = auto()
    EXPONENTIAL = auto()


# =====================================================================
# Lexer Implementation
# =====================================================================

class Lexer:
    """Character-level lexer driven by the cursor stored in ``info``.

    ``info`` must expose ``filename``, ``source_code``, ``pos``,
    ``column`` and ``line``; the lexer mutates the last three in place.
    The character ``'\\0'`` stands for "past the end of the input".
    """

    def __init__(self, info: LexerInfo):
        self.info = info

    # -----------------------------------------------------------------
    # Cursor primitives
    # -----------------------------------------------------------------

    def seek(self, index: int) -> str:
        """Return the character at absolute ``index``, or ``'\\0'`` if out of range."""
        source = self.info.source_code
        # A negative index would silently wrap to the end of the string.
        if index < 0 or index >= len(source):
            return '\0'
        return source[index]

    def peek(self) -> str:
        """Return the character under the cursor without consuming it."""
        return self.seek(self.info.pos)

    def far_peek(self, offset: int) -> str:
        """Return the character ``offset`` positions past the cursor."""
        return self.seek(self.info.pos + offset)

    def advance(self) -> str:
        """Consume one character and return the one now under the cursor.

        Line/column bookkeeping is updated for the consumed character.
        At end of input the cursor does not move.
        """
        info = self.info
        if info.pos < len(info.source_code):
            if info.source_code[info.pos] == '\n':
                info.line += 1
                info.column = 1
            else:
                info.column += 1
            info.pos += 1
        return self.peek()

    def get_file_info(self, start: int, start_line: int) -> FileInfo:
        """Build a ``FileInfo`` for the span from ``start`` to the cursor.

        Args:
            start: Offset at which the current token began.
            start_line: Line number at which the current token began.
        """
        info = self.info
        return FileInfo(
            filename=info.filename,
            line=info.line,
            column=info.column,
            length=info.pos - start,
            lines=info.line - start_line,
        )

    def get_token_text(self, start: int) -> str:
        """Return the source text from ``start`` up to the cursor."""
        return self.info.source_code[start:self.info.pos]

    def skip_comments_and_whitespace(self):
        """Move the cursor past any run of whitespace and line comments.

        A comment starts with ``//`` or ``#`` and runs to the end of the
        line. The cursor stops on the first character that is neither.
        """
        while True:
            c = self.peek()
            if c == '#' or (c == '/' and self.far_peek(1) == '/'):
                while self.peek() not in ('\n', '\0'):
                    self.advance()
                # Re-read the cursor: the newline ending the comment, and
                # anything after it, must be skipped as well.
                continue
            if c.isspace():
                self.advance()
                continue
            return

    def is_identifier_continue(self, c: str) -> bool:
        """Return True if ``c`` may appear inside an identifier.

        Note that the ``//`` test inspects the character after the
        cursor, whatever position ``c`` itself was read from.
        """
        if not c.isprintable():
            return False
        if c == '/' and self.far_peek(1) == '/':
            return False
        if c in '{}[]()\'\"#':
            return False
        if c.isspace() or c == '\0':
            return False
        return True

    def is_identifier_start(self) -> bool:
        """Return True if an identifier (optionally ``::``-prefixed) starts here."""
        c = self.peek()
        if c == ':' and self.far_peek(1) == ':':
            c = self.far_peek(2)
        return not c.isdigit() and self.is_identifier_continue(c)

    # =================================================================
    # Integer Parsing Helpers
    # =================================================================

    def _integer_from_text(self, start: int, base: int, prefix_len: int) -> int:
        """Convert the token text beginning at ``start`` to an integer.

        Args:
            start: Offset of the first character of the literal.
            base: Radix of the digits.
            prefix_len: Length of the radix prefix (``0b`` etc.) to skip
                after the optional leading ``-``.

        Conversion stops at whitespace, ``/``, ``:`` or ``'\\0'``; the
        separators ``_`` and ``.`` are ignored. The scanning methods
        guarantee every remaining character is a digit of ``base``.
        """
        text = self.get_token_text(start)
        negative = text.startswith('-')
        magnitude = 0
        for ch in text[(1 if negative else 0) + prefix_len:]:
            if ch.isspace() or ch in '/:\0':
                break
            if ch in '._':
                continue
            # Accumulate digit by digit; unlike int(text), this has no
            # limit on the number of digits, so oversized literals still
            # reach the range check instead of failing here.
            magnitude = magnitude * base + int(ch, base)
        return -magnitude if negative else magnitude

    def create_binary_integer(self, start: int) -> int:
        """Return the value of the ``0b`` literal that starts at ``start``."""
        return self._integer_from_text(start, 2, 2)

    def create_octal_integer(self, start: int) -> int:
        """Return the value of the ``0o`` literal that starts at ``start``."""
        return self._integer_from_text(start, 8, 2)

    def create_decimal_integer(self, start: int) -> int:
        """Return the value of the decimal literal that starts at ``start``."""
        return self._integer_from_text(start, 10, 0)

    def create_hexadecimal_integer(self, start: int) -> int:
        """Return the value of the ``0x`` literal that starts at ``start``."""
        return self._integer_from_text(start, 16, 2)

    def create_float(self, start: int) -> float:
        """Return the value of the float literal that starts at ``start``.

        ``_`` separators are dropped and conversion stops at whitespace,
        ``/``, ``:`` or ``'\\0'``. The remaining text is handed to
        ``float()`` so the result is correctly rounded.
        """
        kept = []
        for ch in self.get_token_text(start):
            if ch.isspace() or ch in '/:\0':
                break
            if ch != '_':
                kept.append(ch)
        literal = ''.join(kept)
        if not any(ch.isdecimal() for ch in literal):
            # Nothing to convert (e.g. empty text).
            return 0.0
        return float(literal)

    # =================================================================
    # Integer Type Validation
    # =================================================================

    def get_integer_type(self, numeric_type: NumericType) -> IntegerBuiltInType:
        """Map an integer ``NumericType`` to its ``IntegerBuiltInType``.

        Raises:
            ValueError: If ``numeric_type`` is a float type.
        """
        type_map = {
            NumericType.I64: IntegerBuiltInType.I64,
            NumericType.I32: IntegerBuiltInType.I32,
            NumericType.I16: IntegerBuiltInType.I16,
            NumericType.I8: IntegerBuiltInType.I8,
            NumericType.U64: IntegerBuiltInType.U64,
            NumericType.U32: IntegerBuiltInType.U32,
            NumericType.U16: IntegerBuiltInType.U16,
            NumericType.U8: IntegerBuiltInType.U8,
        }
        if numeric_type not in type_map:
            raise ValueError("Encountered a Float where there should not be one.")
        return type_map[numeric_type]

    def validate_integer_range(self, value: int, int_type: IntegerBuiltInType,
                               start: int, start_line: int):
        """Raise ``LexerError`` if ``value`` does not fit in ``int_type``."""
        ranges = {
            IntegerBuiltInType.I64: (-2**63, 2**63 - 1),
            IntegerBuiltInType.I32: (-2**31, 2**31 - 1),
            IntegerBuiltInType.I16: (-2**15, 2**15 - 1),
            IntegerBuiltInType.I8: (-2**7, 2**7 - 1),
            IntegerBuiltInType.U64: (0, 2**64 - 1),
            IntegerBuiltInType.U32: (0, 2**32 - 1),
            IntegerBuiltInType.U16: (0, 2**16 - 1),
            IntegerBuiltInType.U8: (0, 2**8 - 1),
        }
        min_val, max_val = ranges[int_type]
        if not min_val <= value <= max_val:
            type_name = int_type.name.lower()
            raise LexerError(
                f"Integer overflow: value exceeds range for {type_name}.",
                self.get_file_info(start, start_line)
            )

    def create_integer_token(self, int_type: IntegerBuiltInType, value: int,
                             start: int, start_line: int) -> Token:
        """Range-check ``value`` and wrap it in an INTEGER token."""
        self.validate_integer_range(value, int_type, start, start_line)
        return Token(
            type=TokenType.INTEGER,
            integer_literal=IntegerLiteral(value=value, type=int_type)
        )

    def create_float_token(self, numeric_type: NumericType, start: int,
                           start_line: int) -> Token:
        """Build a DOUBLE token for ``F64`` and a FLOAT token otherwise."""
        value = self.create_float(start)
        if numeric_type == NumericType.F64:
            return Token(type=TokenType.DOUBLE, double_literal=value)
        return Token(type=TokenType.FLOAT, float_literal=value)

    # =================================================================
    # Numeric Type Parsing
    # =================================================================

    def parse_numeric_type(self, start: int, start_line: int,
                           literal_type: NumericLiteralType) -> NumericType:
        """Parse the ``:<type>`` suffix of a numeric literal.

        The cursor must be on the ``:``. On success the cursor is left
        just past the suffix.

        Raises:
            LexerError: If the suffix is unknown, or is a float type on a
                binary/octal/hexadecimal literal, or is an integer type
                on a float/exponential literal.
        """
        kind = self.advance()  # consume ':' and look at the type letter
        if kind == 'f':
            if literal_type not in (NumericLiteralType.DECIMAL,
                                    NumericLiteralType.FLOAT,
                                    NumericLiteralType.EXPONENTIAL):
                raise LexerError("Invalid numeric literal: float type not allowed.",
                                 self.get_file_info(start, start_line))
            width = self.advance()
            if width == '6' and self.far_peek(1) == '4':
                self.advance()
                self.advance()
                return NumericType.F64
            if width == '3' and self.far_peek(1) == '2':
                self.advance()
                self.advance()
                return NumericType.F32
            raise LexerError("Invalid float type: must be of type 'f64' or 'f32'.",
                             self.get_file_info(start, start_line))

        if kind in 'iu':
            if literal_type in (NumericLiteralType.FLOAT,
                                NumericLiteralType.EXPONENTIAL):
                raise LexerError("Invalid float type: must be of type 'f64' or 'f32'.",
                                 self.get_file_info(start, start_line))
            unsigned = kind == 'u'
            width = self.advance()
            if width == '6' and self.far_peek(1) == '4':
                self.advance()
                self.advance()
                return NumericType.U64 if unsigned else NumericType.I64
            if width == '3' and self.far_peek(1) == '2':
                self.advance()
                self.advance()
                return NumericType.U32 if unsigned else NumericType.I32
            if width == '1' and self.far_peek(1) == '6':
                self.advance()
                self.advance()
                return NumericType.U16 if unsigned else NumericType.I16
            if width == '8':
                self.advance()
                return NumericType.U8 if unsigned else NumericType.I8
            prefix = 'unsigned' if unsigned else 'signed'
            raise LexerError(f"Invalid {prefix} integer type.",
                             self.get_file_info(start, start_line))

        raise LexerError("Invalid numeric type: type must start with 'f', 'i', or 'u'.",
                         self.get_file_info(start, start_line))

    # =================================================================
    # Numeric Literal Parsing
    # =================================================================

    def _complete_integer(self, start: int, start_line: int,
                          literal_type: NumericLiteralType,
                          create_value, label: str) -> Token:
        """Finish an integer literal once its digits have been scanned.

        Handles the optional type suffix, the plain (``i64``) case, and
        the error for any other trailing character. ``create_value`` is
        the converter for this radix and ``label`` names the radix in
        the error message.
        """
        c = self.peek()
        if c == ':':
            numeric_type = self.parse_numeric_type(start, start_line, literal_type)
            if numeric_type in (NumericType.F64, NumericType.F32):
                # parse_numeric_type only lets a float suffix through for
                # decimal literals: "1:f32" is the float 1.0.
                return self.create_float_token(numeric_type, start, start_line)
            int_type = self.get_integer_type(numeric_type)
            return self.create_integer_token(int_type, create_value(start),
                                             start, start_line)
        if c.isspace() or c in '/\0':
            return self.create_integer_token(IntegerBuiltInType.I64,
                                             create_value(start), start, start_line)
        raise LexerError(
            f"Invalid {label} literal: unexpected '{c}' in {label} integer.",
            self.get_file_info(start, start_line))

    def parse_binary_integer(self, start: int, start_line: int) -> Token:
        """Parse the digits of a ``0b`` literal; the cursor is past the prefix."""
        c = self.peek()
        while c in '01_':
            c = self.advance()
        return self._complete_integer(start, start_line, NumericLiteralType.BINARY,
                                      self.create_binary_integer, 'binary')

    def parse_octal_integer(self, start: int, start_line: int) -> Token:
        """Parse the digits of a ``0o`` literal; the cursor is past the prefix."""
        c = self.peek()
        # ASCII only: str.isdigit() would also accept non-ASCII digits.
        while c in '01234567_':
            c = self.advance()
        return self._complete_integer(start, start_line, NumericLiteralType.OCTAL,
                                      self.create_octal_integer, 'octal')

    def parse_hexadecimal_integer(self, start: int, start_line: int) -> Token:
        """Parse the digits of a ``0x`` literal; the cursor is past the prefix."""
        c = self.peek()
        while c in '0123456789ABCDEFabcdef_':
            c = self.advance()
        return self._complete_integer(start, start_line,
                                      NumericLiteralType.HEXADECIMAL,
                                      self.create_hexadecimal_integer, 'hexadecimal')

    def parse_exponential(self, start: int, start_line: int) -> Token:
        """Placeholder for exponent notation; always raises."""
        raise NotImplementedError("Float exponential not implemented yet.")

    def parse_float(self, start: int, start_line: int) -> Token:
        """Parse the fractional digits of a float; the cursor is past the ``.``."""
        c = self.peek()
        # isdecimal(), not isdigit(): only characters float() can convert.
        while c.isdecimal() or c == '_':
            c = self.advance()
        if c in 'eE':
            return self.parse_exponential(start, start_line)
        if c == ':':
            numeric_type = self.parse_numeric_type(start, start_line,
                                                   NumericLiteralType.FLOAT)
            return self.create_float_token(numeric_type, start, start_line)
        if c.isspace() or c in '/\0':
            return self.create_float_token(NumericType.F64, start, start_line)
        raise LexerError(f"Invalid float literal: unexpected '{c}' in float.",
                         self.get_file_info(start, start_line))

    def parse_decimal_integer(self, start: int, start_line: int) -> Token:
        """Parse a decimal literal, handing over to float parsing on ``.``."""
        c = self.peek()
        # isdecimal(), not isdigit(): characters such as superscript
        # digits satisfy isdigit() but cannot be converted by int().
        while c.isdecimal() or c == '_':
            c = self.advance()
        if c == '.':
            self.advance()
            return self.parse_float(start, start_line)
        if c in 'eE':
            return self.parse_exponential(start, start_line)
        return self._complete_integer(start, start_line, NumericLiteralType.DECIMAL,
                                      self.create_decimal_integer, 'decimal')

    def parse_numeric_literal(self, start: int, start_line: int) -> Token:
        """Dispatch on an optional ``-`` and a ``0b``/``0o``/``0x`` prefix."""
        c = self.peek()
        if c == '-':
            c = self.advance()
        if c == '0':
            c = self.advance()
            if c in 'bB':
                self.advance()
                return self.parse_binary_integer(start, start_line)
            if c in 'oO':
                self.advance()
                return self.parse_octal_integer(start, start_line)
            if c in 'xX':
                self.advance()
                return self.parse_hexadecimal_integer(start, start_line)
        return self.parse_decimal_integer(start, start_line)

    # =================================================================
    # Character Literal Parsing
    # =================================================================

    def parse_character_literal(self, start: int, start_line: int) -> Token:
        """Parse a character literal; the cursor is just past the opening quote.

        Raises:
            LexerError: If the literal is empty, unclosed, uses an
                unknown escape, or holds more than one character.
        """
        c = self.peek()
        if c == '\'':
            raise LexerError("Invalid character literal: empty character literal.",
                             self.get_file_info(start, start_line))
        if c == '\\':
            c = self.advance()
            escape_map = {
                'n': '\n', 'r': '\r', 't': '\t',
                '\\': '\\', '\'': '\'', '0': '\0'
            }
            if c not in escape_map:
                raise LexerError(
                    f"Invalid character literal: unknown escape sequence '\\{c}'.",
                    self.get_file_info(start, start_line))
            value = ord(escape_map[c])
        elif c in '\n\r':
            raise LexerError("Invalid character literal: unclosed character literal.",
                             self.get_file_info(start, start_line))
        else:
            value = ord(c)

        c = self.advance()
        if c.isspace() or c in '/\0':
            raise LexerError("Invalid character literal: unclosed character literal.",
                             self.get_file_info(start, start_line))
        if c != '\'':
            raise LexerError(
                f"Invalid character literal: unexpected '{c}' in character.",
                self.get_file_info(start, start_line))
        self.advance()
        return Token(type=TokenType.CHARACTER, character_literal=value)

    # =================================================================
    # String Literal Parsing (stub)
    # =================================================================

    def parse_string_literal(self, start: int, start_line: int) -> Token:
        """Placeholder for string literals; always raises."""
        raise NotImplementedError("String literals not implemented yet.")

    # =================================================================
    # Token String Parsing
    # =================================================================

    def parse_token_string(self, start: int, start_line: int) -> Token:
        """Parse ``{ ... }`` into a TOKEN_STRING token; the cursor is on ``{``.

        Raises:
            LexerError: If the input ends before the closing ``}``.
        """
        tokens = []
        self.advance()  # Skip opening '{'
        watchdog = 0
        while self.peek() != '\0':
            self.skip_comments_and_whitespace()
            if self.peek() == '}':
                self.advance()
                return Token(type=TokenType.TOKEN_STRING,
                             token_string=TokenString(tokens=tokens))
            token = self.lexer_next()
            tokens.append(token)
            if token.type == TokenType.EOF:
                break
            # Guard against a loop that stops making progress.
            watchdog += 1
            if watchdog > 1000000:
                raise LexerError("Watchdog triggered in token string.",
                                 self.get_file_info(start, start_line))
        raise LexerError("Unclosed token string: missing closing brace '}'.",
                         self.get_file_info(start, start_line))

    # =================================================================
    # Array and Type Tuple Parsing (stubs)
    # =================================================================

    def parse_array_literal(self, start: int, start_line: int) -> Token:
        """Placeholder for array literals; always raises."""
        raise NotImplementedError("Array literals not implemented yet.")

    def parse_type_tuple(self, start: int, start_line: int) -> Token:
        """Placeholder for type tuples; always raises."""
        raise NotImplementedError("Type tuples not implemented yet.")

    # =================================================================
    # Identifier and Boolean Parsing
    # =================================================================

    def parse_identifiers_and_booleans(self, start: int, start_line: int) -> Token:
        """Parse an identifier, an ``::identifier`` literal, or ``true``/``false``.

        Raises:
            LexerError: If the name contains ``:`` or ``.``.
        """
        c = self.peek()
        is_literal = False

        # Check for identifier literal (::)
        if c == ':' and self.far_peek(1) == ':':
            is_literal = True
            self.advance()
            self.advance()
            c = self.peek()

        # Read identifier name
        name_chars = []
        while self.is_identifier_continue(c):
            if c == ':':
                raise LexerError(
                    "Invalid identifier: ':' is not allowed in identifiers.",
                    self.get_file_info(start, start_line))
            if c == '.':
                raise LexerError(
                    "Invalid identifier: '.' is not allowed in identifiers.",
                    self.get_file_info(start, start_line))
            name_chars.append(c)
            c = self.advance()
        name = ''.join(name_chars)

        # Check for boolean literals
        if name == 'false':
            return Token(type=TokenType.BOOLEAN, boolean_literal=False)
        if name == 'true':
            return Token(type=TokenType.BOOLEAN, boolean_literal=True)
        return Token(type=TokenType.IDENTIFIER,
                     identifier=Identifier(name=name, is_literal=is_literal))

    # =================================================================
    # Main Lexer Logic
    # =================================================================

    def lexer_next(self) -> Token:
        """Return the next token, or an EOF token at end of input.

        Raises:
            LexerError: On any character sequence that starts no token.
            NotImplementedError: For strings, arrays, type tuples and
                exponent notation, which are still stubs.
        """
        self.skip_comments_and_whitespace()
        c = self.peek()
        start = self.info.pos
        start_line = self.info.line

        # End of file
        if c == '\0':
            return Token(type=TokenType.EOF)

        # Numeric literals (integers and floats)
        if c.isdecimal() or (c in '.-' and self.far_peek(1).isdecimal()):
            return self.parse_numeric_literal(start, start_line)

        # Character literals
        if c == '\'':
            self.advance()
            return self.parse_character_literal(start, start_line)

        # String literals
        if c == '"':
            return self.parse_string_literal(start, start_line)

        # Token strings
        if c == '{':
            return self.parse_token_string(start, start_line)
        if c == '}':
            self.advance()
            raise LexerError(
                "Unexpected closing brace '}' without matching opening brace.",
                self.get_file_info(start, start_line))

        # Array literals
        if c == '[':
            return self.parse_array_literal(start, start_line)
        if c == ']':
            self.advance()
            raise LexerError(
                "Unexpected closing bracket ']' without matching opening bracket.",
                self.get_file_info(start, start_line))

        # Type tuples
        if c == '(':
            return self.parse_type_tuple(start, start_line)
        if c == ')':
            self.advance()
            raise LexerError(
                "Unexpected closing parentheses ')' without matching opening parentheses.",
                self.get_file_info(start, start_line))

        # Identifiers and booleans
        if self.is_identifier_start():
            return self.parse_identifiers_and_booleans(start, start_line)

        # Check for malformed identifier literal
        if c == ':':
            self.advance()
            # After the advance the cursor is on the character that
            # follows the first ':', so that is the one to test.
            if self.peek() == ':':
                self.advance()
                raise LexerError(
                    "Invalid identifier literal: empty identifier after '::'.",
                    self.get_file_info(start, start_line))
            raise LexerError("Unexpected single colon ':'.",
                             self.get_file_info(start, start_line))

        # Unknown character
        raise LexerError(f"Unexpected character: unexpected '{c}' during parsing.",
                         self.get_file_info(start, start_line))

    def lexical_analysis(self) -> List[Token]:
        """Tokenise from the cursor to end of input.

        Returns:
            Every token in order, ending with the EOF token.

        Raises:
            LexerError: Propagated unchanged from ``lexer_next``.
        """
        tokens = []
        while True:
            token = self.lexer_next()
            tokens.append(token)
            if token.type == TokenType.EOF:
                return tokens
# =====================================================================
# Public API
# =====================================================================

def lexical_analysis(lexer_info: LexerInfo) -> List[Token]:
    """Tokenise ``lexer_info`` with a fresh ``Lexer`` and return its tokens.

    Function-style entry point; the existing docstring describes it as
    matching the original C API.
    """
    return Lexer(lexer_info).lexical_analysis()