# Source listing: 905 lines, 32 KiB, Python.
from __future__ import annotations

from dataclasses import dataclass, field, replace
from enum import Enum, auto
from typing import Any, List, Optional, Union
# =====================================================================
# Basic Types
# =====================================================================
class LexerInfo:
    """Mutable scanning state shared with a ``Lexer``.

    Holds the text being scanned together with the cursor: ``pos`` is a
    0-based index into ``source_code``; ``line`` and ``column`` are
    1-based and start at the first character.
    """

    filename: str
    source_code: str
    pos: int
    column: int
    line: int

    def __init__(self, filename: str = "", source_code: str = ""):
        """Start a fresh cursor at the beginning of *source_code*."""
        self.filename, self.source_code = filename, source_code
        # Cursor starts on the first character: index 0, line 1, column 1.
        self.pos = 0
        self.line = self.column = 1
# =====================================================================
# Token Types
# =====================================================================
class TokenType(Enum):
    """Discriminator stored in ``Token.type``."""

    # End-of-input marker.
    EOF = auto()

    # Names.
    IDENTIFIER = auto()

    # Scalar literals.
    INTEGER = auto()
    FLOAT = auto()
    DOUBLE = auto()
    CHARACTER = auto()
    STRING = auto()
    BOOLEAN = auto()

    # Composite literals.
    ARRAY = auto()
    TOKEN_STRING = auto()
    TYPE_TUPLE = auto()
# =====================================================================
# Array Literal Types
# =====================================================================
class ArrayType(Enum):
    """Element kind tag stored in ``ArrayLiteral.type``."""

    IDENTIFIER = auto()

    # Signed integer elements.
    I64 = auto()
    I32 = auto()
    I16 = auto()
    I8 = auto()

    # Unsigned integer elements.
    U64 = auto()
    U32 = auto()
    U16 = auto()
    U8 = auto()

    # Remaining scalar element kinds.
    FLOAT = auto()
    DOUBLE = auto()
    CHARACTER = auto()
    STRING = auto()
    BOOLEAN = auto()

    STRUCT_INLINE = auto()
# =====================================================================
# Identifier
# =====================================================================
@dataclass
class Identifier:
    """A name produced by the lexer."""

    # The identifier text, without any leading '::'.
    name: str
    # True when the name was written with a leading '::' in the source.
    is_literal: bool
# =====================================================================
# Integer Literal Type
# =====================================================================
class IntegerBuiltInType(Enum):
    """Width and signedness tag carried by ``IntegerLiteral.type``."""

    # Signed widths.
    I64 = auto()
    I32 = auto()
    I16 = auto()
    I8 = auto()

    # Unsigned widths.
    U64 = auto()
    U32 = auto()
    U16 = auto()
    U8 = auto()
|
|
|
|
|
|
@dataclass
class IntegerLiteral:
    """An integer value together with its declared built-in type."""

    # The parsed value as a Python int (may be negative).
    value: int
    # Which integer type the literal was lexed as.
    type: IntegerBuiltInType
# =====================================================================
# TokenString, TypeTuple, StructInline
# =====================================================================
@dataclass
class TokenString:
    """An ordered sequence of tokens (the payload of a ``{ ... }`` literal)."""

    tokens: List["Token"] = field(default_factory=list)

    def deep_copy(self) -> TokenString:
        """Return an independent copy of this token string.

        Every token is rebuilt, and every mutable payload it carries
        (identifier, integer literal, array literal, nested token string,
        type tuple) is copied as well, so mutating the copy never affects
        the original. Scalar payloads (floats, strings, booleans, ...) are
        immutable and are carried over as-is.
        """

        def nested_copy(part):
            # Composite payloads provide their own deep_copy().
            return None if part is None else part.deep_copy()

        def flat_copy(part):
            # Identifier / IntegerLiteral only hold immutable fields, so a
            # field-for-field duplicate is already a deep copy.
            return None if part is None else replace(part)

        return TokenString(tokens=[
            replace(
                token,
                identifier=flat_copy(token.identifier),
                integer_literal=flat_copy(token.integer_literal),
                array_literal=nested_copy(token.array_literal),
                token_string=nested_copy(token.token_string),
                type_tuple=nested_copy(token.type_tuple),
            )
            for token in self.tokens
        ])
|
|
|
|
|
|
@dataclass
class TypeTuple:
    """Two identifier lists: the inputs and the outputs of a type tuple."""

    input_identifiers: List[Identifier] = field(default_factory=list)
    output_identifiers: List[Identifier] = field(default_factory=list)

    def deep_copy(self) -> TypeTuple:
        """Return a copy whose lists and ``Identifier`` objects are all new."""

        def clone_all(identifiers):
            return [
                Identifier(name=ident.name, is_literal=ident.is_literal)
                for ident in identifiers
            ]

        return TypeTuple(
            input_identifiers=clone_all(self.input_identifiers),
            output_identifiers=clone_all(self.output_identifiers),
        )
|
|
|
|
|
|
@dataclass
class StructInline:
    """A named struct value written inline, held as a list of member values."""

    # Member values; element types are not constrained here.
    values: List[Any]
    # Name associated with the struct.
    name: str
# =====================================================================
# ArrayLiteral
# =====================================================================
@dataclass
class ArrayLiteral:
    """An array literal: an element-kind tag plus one payload list per kind.

    Each payload attribute is ``None`` unless it has been populated; an
    empty list is a distinct, valid value.
    """

    type: ArrayType

    # Per-kind element storage.
    identifiers: Optional[List[Identifier]] = None
    integer_literals: Optional[List[int]] = None
    float_literals: Optional[List[float]] = None
    double_literals: Optional[List[float]] = None
    character_literals: Optional[List[int]] = None
    string_literals: Optional[List[str]] = None
    boolean_literals: Optional[List[bool]] = None
    token_strings: Optional[List[TokenString]] = None
    type_tuples: Optional[List[TypeTuple]] = None
    struct_inline: Optional[StructInline] = None

    # Extent per dimension, and the number of dimensions.
    shape: Optional[List[int]] = None
    dimensions: int = 0

    def deep_copy(self) -> ArrayLiteral:
        """Return an independent copy of this array literal.

        ``None`` payloads stay ``None`` and empty lists stay empty lists
        (including ``shape``), so the copy always compares equal to the
        original. Lists of scalars are duplicated; identifiers, token
        strings and type tuples are copied element by element. The
        ``values`` list of ``struct_inline`` is duplicated, but the objects
        inside it are shared with the original.
        """

        def scalars(items):
            return None if items is None else list(items)

        def each(items, copy_one):
            return None if items is None else [copy_one(item) for item in items]

        inline = self.struct_inline
        return ArrayLiteral(
            type=self.type,
            # Identifier only holds immutable fields, so replace() with no
            # changes is a full copy.
            identifiers=each(self.identifiers, replace),
            integer_literals=scalars(self.integer_literals),
            float_literals=scalars(self.float_literals),
            double_literals=scalars(self.double_literals),
            character_literals=scalars(self.character_literals),
            string_literals=scalars(self.string_literals),
            boolean_literals=scalars(self.boolean_literals),
            token_strings=each(self.token_strings, lambda ts: ts.deep_copy()),
            type_tuples=each(self.type_tuples, lambda tt: tt.deep_copy()),
            struct_inline=(
                None if inline is None
                else replace(inline, values=list(inline.values))
            ),
            # Compare against None, not truthiness: an empty shape must
            # survive the copy as [] rather than collapse to None.
            shape=scalars(self.shape),
            dimensions=self.dimensions,
        )
# =====================================================================
# Token
# =====================================================================
@dataclass
class Token:
    """One lexical token: a ``TokenType`` tag plus optional payload slots.

    Every payload defaults to ``None``; the lexer in this file fills in
    the slot that corresponds to ``type`` and leaves the others unset.
    """

    type: TokenType

    # Name payload.
    identifier: Optional[Identifier] = None

    # Scalar literal payloads.
    integer_literal: Optional[IntegerLiteral] = None
    float_literal: Optional[float] = None
    double_literal: Optional[float] = None
    character_literal: Optional[int] = None  # code point of the character
    string_literal: Optional[str] = None
    boolean_literal: Optional[bool] = None

    # Composite literal payloads.
    array_literal: Optional[ArrayLiteral] = None
    token_string: Optional[TokenString] = None
    type_tuple: Optional[TypeTuple] = None
# =====================================================================
# File Info and Results
# =====================================================================
@dataclass
class FileInfo:
    """Source location attached to lexer diagnostics."""

    # File the location refers to.
    filename: str
    # Line and column of the location.
    line: int
    column: int
    # Size of the offending span: characters, and lines crossed.
    length: int = 0
    lines: int = 0
|
|
|
|
|
|
@dataclass
class LexerError(Exception):
    """Error raised when the lexer rejects the input.

    Attributes:
        message: Human-readable description of the problem.
        file_info: Location data recorded when the error was raised.
    """

    message: str
    file_info: FileInfo

    def __str__(self) -> str:
        """Render as ``file:line:column: message``.

        The dataclass-generated ``__init__`` does not forward anything to
        ``Exception.__init__``, so the inherited ``__str__`` would show a
        raw argument tuple (or nothing at all when the error is built with
        keyword arguments). Formatting from the fields keeps tracebacks
        and ``str(err)`` readable however the error was constructed.
        """
        info = self.file_info
        if info is None:
            return self.message
        filename = info.filename or "<input>"
        return f"{filename}:{info.line}:{info.column}: {self.message}"
# =====================================================================
# Numeric Type Flags
# =====================================================================
class NumericType(Enum):
    """Numeric type selected by a literal's ``:`` suffix (``:f64``, ``:i32``, ...)."""

    # Floating point.
    F64 = auto()
    F32 = auto()

    # Signed integers.
    I64 = auto()
    I32 = auto()
    I16 = auto()
    I8 = auto()

    # Unsigned integers.
    U64 = auto()
    U32 = auto()
    U16 = auto()
    U8 = auto()
|
|
|
|
|
|
class NumericLiteralType(Enum):
    """Lexical form in which a numeric literal was written."""

    # Integer notations.
    BINARY = auto()
    OCTAL = auto()
    DECIMAL = auto()
    HEXADECIMAL = auto()

    # Floating-point notations.
    FLOAT = auto()
    EXPONENTIAL = auto()
# =====================================================================
# Lexer Implementation
# =====================================================================
class Lexer:
    """Character-level scanner that turns ``info.source_code`` into tokens.

    All cursor state (``pos``, ``line``, ``column``) lives on the
    ``LexerInfo`` object passed to the constructor, so scanning mutates
    that object. End of input is signalled by the NUL sentinel character
    rather than by an exception.
    """

    def __init__(self, info: LexerInfo):
        """Bind the lexer to *info*; scanning continues from ``info.pos``."""
        self.info = info

    # =====================================================================
    # Cursor Primitives
    # =====================================================================

    def peek(self) -> str:
        """Return the character under the cursor, or NUL at end of input."""
        source = self.info.source_code
        position = self.info.pos
        return source[position] if position < len(source) else '\0'

    def far_peek(self, offset: int) -> str:
        """Return the character *offset* positions from the cursor.

        Positions outside the source (past the end, or before the start
        for a negative offset) yield the NUL sentinel.
        """
        return self.seek(self.info.pos + offset)

    def seek(self, index: int) -> str:
        """Return the character at absolute *index*, or NUL when out of range."""
        source = self.info.source_code
        # The lower bound matters: a negative index would otherwise wrap
        # around and read from the end of the source.
        if 0 <= index < len(source):
            return source[index]
        return '\0'

    def advance(self) -> str:
        """Consume one character, update line/column, and return the new current one.

        At end of input the cursor does not move and NUL is returned.
        """
        info = self.info
        if info.pos < len(info.source_code):
            if info.source_code[info.pos] == '\n':
                info.line += 1
                info.column = 1
            else:
                info.column += 1
            info.pos += 1
        return self.peek()

    def get_file_info(self, start: int, start_line: int) -> FileInfo:
        """Describe the span from *start* to the cursor for a diagnostic.

        ``line`` and ``column`` are the cursor's current position;
        ``length`` and ``lines`` are measured from *start* / *start_line*.
        """
        info = self.info
        return FileInfo(
            filename=info.filename,
            line=info.line,
            column=info.column,
            length=info.pos - start,
            lines=info.line - start_line,
        )

    def get_token_text(self, start: int) -> str:
        """Return the source text from *start* up to (excluding) the cursor."""
        return self.info.source_code[start:self.info.pos]

    def skip_comments_and_whitespace(self):
        """Advance past any run of whitespace and line comments.

        A comment starts with ``//`` or ``#`` and runs to the end of the
        line. On return the cursor is on the first character that is
        neither whitespace nor part of a comment (or at end of input).
        """
        while True:
            c = self.peek()

            if c == '#' or (c == '/' and self.far_peek(1) == '/'):
                while self.peek() not in ('\n', '\0'):
                    self.advance()
                # Re-examine from the newline: more whitespace or further
                # comments may follow this one.
                continue

            if c.isspace():
                self.advance()
                continue

            break

    def is_identifier_continue(self, c: str) -> bool:
        """Return True if *c* may appear inside an identifier.

        Rejected: non-printable characters, the start of a ``//`` comment
        (judged by the character after the cursor), the delimiters
        ``{}[]()'"#``, whitespace, and NUL.
        """
        if not c.isprintable():
            return False
        if c == '/' and self.far_peek(1) == '/':
            return False
        if c in '{}[]()\'\"#':
            return False
        return not (c.isspace() or c == '\0')

    def is_identifier_start(self) -> bool:
        """Return True if an identifier (optionally ``::``-prefixed) starts at the cursor."""
        c = self.peek()
        if c == ':' and self.far_peek(1) == ':':
            # For a '::' literal the name begins two characters further on.
            c = self.far_peek(2)
        return not c.isdigit() and self.is_identifier_continue(c)

    # =====================================================================
    # Integer Parsing Helpers
    # =====================================================================
    #
    # The helpers below read the already-scanned token text and fold its
    # digits into a Python int. They stop at the first whitespace, '/',
    # ':' or NUL, so a trailing ':<type>' suffix is ignored. Digits are
    # accumulated manually on purpose: int(str) enforces a maximum digit
    # count on recent Python versions, which would turn an over-long
    # literal into a ValueError instead of a range error.

    def create_binary_integer(self, start: int) -> int:
        """Return the value of the ``0b`` literal whose text begins at *start*."""
        text = self.get_token_text(start)
        is_negative = text[0] == '-'

        magnitude = 0
        # Skip the optional '-' and the two-character '0b' prefix.
        for ch in text[3 if is_negative else 2:]:
            if ch.isspace() or ch in '/:' or ch == '\0':
                break
            if ch in '._':
                continue
            magnitude = magnitude * 2 + (1 if ch == '1' else 0)

        return -magnitude if is_negative else magnitude

    def create_octal_integer(self, start: int) -> int:
        """Return the value of the ``0o`` literal whose text begins at *start*."""
        text = self.get_token_text(start)
        is_negative = text[0] == '-'

        magnitude = 0
        for ch in text[3 if is_negative else 2:]:
            if ch.isspace() or ch in '/:' or ch == '\0':
                break
            if ch in '._':
                continue
            magnitude *= 8
            if ch.isdigit() and ch < '8':
                magnitude += int(ch)

        return -magnitude if is_negative else magnitude

    def create_decimal_integer(self, start: int) -> int:
        """Return the value of the decimal literal whose text begins at *start*."""
        text = self.get_token_text(start)
        is_negative = text[0] == '-'

        magnitude = 0
        for ch in text[1 if is_negative else 0:]:
            if ch.isspace() or ch in '/:' or ch == '\0':
                break
            if ch == '_':
                continue
            if ch.isdigit():
                magnitude = magnitude * 10 + int(ch)

        return -magnitude if is_negative else magnitude

    def create_hexadecimal_integer(self, start: int) -> int:
        """Return the value of the ``0x`` literal whose text begins at *start*."""
        text = self.get_token_text(start)
        is_negative = text[0] == '-'

        magnitude = 0
        for ch in text[3 if is_negative else 2:]:
            if ch.isspace() or ch in '/:' or ch == '\0':
                break
            if ch in '._':
                continue
            magnitude *= 16
            if ch.isdigit():
                magnitude += int(ch)
            elif ch.upper() in 'ABCDEF':
                magnitude += ord(ch.upper()) - ord('A') + 10

        return -magnitude if is_negative else magnitude

    def create_float(self, start: int) -> float:
        """Return the value of the floating-point literal whose text begins at *start*.

        Underscore separators are dropped and anything from the first
        whitespace, '/', ':' or NUL onwards (such as a ':f32' suffix) is
        ignored. Conversion is delegated to ``float`` so the result is the
        correctly rounded value of the written literal; summing
        ``digit / 10**k`` terms by hand accumulates rounding error.
        """
        kept = []
        for ch in self.get_token_text(start):
            if ch.isspace() or ch in '/:' or ch == '\0':
                break
            if ch != '_':
                kept.append(ch)
        return float(''.join(kept))

    # =====================================================================
    # Integer Type Validation
    # =====================================================================

    def get_integer_type(self, numeric_type: NumericType) -> IntegerBuiltInType:
        """Map an integer ``NumericType`` to its ``IntegerBuiltInType``.

        Raises:
            ValueError: if *numeric_type* is a floating-point type.
        """
        integer_types = {
            NumericType.I64: IntegerBuiltInType.I64,
            NumericType.I32: IntegerBuiltInType.I32,
            NumericType.I16: IntegerBuiltInType.I16,
            NumericType.I8: IntegerBuiltInType.I8,
            NumericType.U64: IntegerBuiltInType.U64,
            NumericType.U32: IntegerBuiltInType.U32,
            NumericType.U16: IntegerBuiltInType.U16,
            NumericType.U8: IntegerBuiltInType.U8,
        }
        try:
            return integer_types[numeric_type]
        except KeyError:
            raise ValueError("Encountered a Float where there should not be one.") from None

    def validate_integer_range(self, value: int, int_type: IntegerBuiltInType, start: int, start_line: int):
        """Raise ``LexerError`` if *value* does not fit in *int_type*."""
        bounds = {
            IntegerBuiltInType.I64: (-2**63, 2**63 - 1),
            IntegerBuiltInType.I32: (-2**31, 2**31 - 1),
            IntegerBuiltInType.I16: (-2**15, 2**15 - 1),
            IntegerBuiltInType.I8: (-2**7, 2**7 - 1),
            IntegerBuiltInType.U64: (0, 2**64 - 1),
            IntegerBuiltInType.U32: (0, 2**32 - 1),
            IntegerBuiltInType.U16: (0, 2**16 - 1),
            IntegerBuiltInType.U8: (0, 2**8 - 1),
        }

        lowest, highest = bounds[int_type]
        if lowest <= value <= highest:
            return

        type_name = int_type.name.lower()
        raise LexerError(
            f"Integer overflow: value exceeds range for {type_name}.",
            self.get_file_info(start, start_line),
        )

    def create_integer_token(self, int_type: IntegerBuiltInType, value: int, start: int, start_line: int) -> Token:
        """Build an INTEGER token after checking *value* against *int_type*'s range."""
        self.validate_integer_range(value, int_type, start, start_line)
        literal = IntegerLiteral(value=value, type=int_type)
        return Token(type=TokenType.INTEGER, integer_literal=literal)

    def create_float_token(self, numeric_type: NumericType, start: int, start_line: int) -> Token:
        """Build a DOUBLE token for ``F64``, otherwise a FLOAT token."""
        value = self.create_float(start)
        if numeric_type == NumericType.F64:
            return Token(type=TokenType.DOUBLE, double_literal=value)
        return Token(type=TokenType.FLOAT, float_literal=value)

    # =====================================================================
    # Numeric Type Parsing
    # =====================================================================

    def parse_numeric_type(self, start: int, start_line: int, literal_type: NumericLiteralType) -> NumericType:
        """Parse the ``:<type>`` suffix of a numeric literal.

        The cursor must be on the ':'; on success it is left just past the
        type name. Float types are accepted only for decimal, float and
        exponential literals; integer types are rejected for float and
        exponential literals.

        Raises:
            LexerError: if the suffix is malformed or not allowed for
                *literal_type*.
        """
        marker = self.advance()  # step over ':' onto the type letter

        if marker == 'f':
            if literal_type not in (NumericLiteralType.DECIMAL, NumericLiteralType.FLOAT, NumericLiteralType.EXPONENTIAL):
                raise LexerError("Invalid numeric literal: float type not allowed.", self.get_file_info(start, start_line))

            first = self.advance()
            width = first + self.far_peek(1)
            if width == '64':
                float_type = NumericType.F64
            elif width == '32':
                float_type = NumericType.F32
            else:
                raise LexerError("Invalid float type: must be of type 'f64' or 'f32'.", self.get_file_info(start, start_line))

            self.advance()
            self.advance()
            return float_type

        if marker in ('i', 'u'):
            if literal_type in (NumericLiteralType.FLOAT, NumericLiteralType.EXPONENTIAL):
                raise LexerError("Invalid float type: must be of type 'f64' or 'f32'.", self.get_file_info(start, start_line))

            unsigned = marker == 'u'
            first = self.advance()
            width = first + self.far_peek(1)

            two_digit_widths = {
                '64': (NumericType.I64, NumericType.U64),
                '32': (NumericType.I32, NumericType.U32),
                '16': (NumericType.I16, NumericType.U16),
            }
            if width in two_digit_widths:
                signed_type, unsigned_type = two_digit_widths[width]
                self.advance()
                self.advance()
                return unsigned_type if unsigned else signed_type

            if first == '8':
                self.advance()
                return NumericType.U8 if unsigned else NumericType.I8

            prefix = 'unsigned' if unsigned else 'signed'
            raise LexerError(f"Invalid {prefix} integer type.", self.get_file_info(start, start_line))

        raise LexerError("Invalid numeric type: type must start with 'f', 'i', or 'u'.", self.get_file_info(start, start_line))

    # =====================================================================
    # Numeric Literal Parsing
    # =====================================================================

    def _finish_integer_literal(self, c: str, start: int, start_line: int,
                                literal_type: NumericLiteralType, create_value, label: str) -> Token:
        """Turn scanned integer digits into a token; *c* is the character that ended them.

        A ':' introduces an explicit type suffix; whitespace, '/' or end of
        input ends an untyped literal, which defaults to ``i64``. Anything
        else is an error. *create_value* is the matching ``create_*``
        helper and *label* names the notation in error messages.
        """
        if c == ':':
            numeric_type = self.parse_numeric_type(start, start_line, literal_type)
            if numeric_type in (NumericType.F64, NumericType.F32):
                # parse_numeric_type only lets a float suffix through for
                # decimal literals (e.g. '1:f32'); honour it instead of
                # failing in get_integer_type.
                return self.create_float_token(numeric_type, start, start_line)
            int_type = self.get_integer_type(numeric_type)
            return self.create_integer_token(int_type, create_value(start), start, start_line)

        if c.isspace() or c in '/\0':
            return self.create_integer_token(IntegerBuiltInType.I64, create_value(start), start, start_line)

        raise LexerError(
            f"Invalid {label} literal: unexpected '{c}' in {label} integer.",
            self.get_file_info(start, start_line),
        )

    def parse_binary_integer(self, start: int, start_line: int) -> Token:
        """Scan binary digits (cursor just past ``0b``) and build the token."""
        c = self.peek()
        while c in '01_':
            c = self.advance()
        return self._finish_integer_literal(
            c, start, start_line, NumericLiteralType.BINARY, self.create_binary_integer, "binary")

    def parse_octal_integer(self, start: int, start_line: int) -> Token:
        """Scan octal digits (cursor just past ``0o``) and build the token."""
        c = self.peek()
        while (c.isdigit() and c not in '89') or c == '_':
            c = self.advance()
        return self._finish_integer_literal(
            c, start, start_line, NumericLiteralType.OCTAL, self.create_octal_integer, "octal")

    def parse_hexadecimal_integer(self, start: int, start_line: int) -> Token:
        """Scan hexadecimal digits (cursor just past ``0x``) and build the token."""
        c = self.peek()
        while c in '0123456789ABCDEFabcdef_':
            c = self.advance()
        return self._finish_integer_literal(
            c, start, start_line, NumericLiteralType.HEXADECIMAL, self.create_hexadecimal_integer, "hexadecimal")

    def parse_exponential(self, start: int, start_line: int) -> Token:
        """Placeholder for exponent notation; always raises ``NotImplementedError``."""
        raise NotImplementedError("Float exponential not implemented yet.")

    def parse_float(self, start: int, start_line: int) -> Token:
        """Scan the fractional digits (cursor just past '.') and build the token.

        Without a type suffix the literal is a double (``f64``).
        """
        c = self.peek()
        while c.isdigit() or c == '_':
            c = self.advance()

        if c in 'eE':
            return self.parse_exponential(start, start_line)

        if c == ':':
            numeric_type = self.parse_numeric_type(start, start_line, NumericLiteralType.FLOAT)
            return self.create_float_token(numeric_type, start, start_line)

        if c.isspace() or c in '/\0':
            return self.create_float_token(NumericType.F64, start, start_line)

        raise LexerError(f"Invalid float literal: unexpected '{c}' in float.", self.get_file_info(start, start_line))

    def parse_decimal_integer(self, start: int, start_line: int) -> Token:
        """Scan decimal digits; hand over to the float or exponent parser when needed."""
        c = self.peek()
        while c.isdigit() or c == '_':
            c = self.advance()

        if c == '.':
            self.advance()
            return self.parse_float(start, start_line)

        if c in 'eE':
            return self.parse_exponential(start, start_line)

        return self._finish_integer_literal(
            c, start, start_line, NumericLiteralType.DECIMAL, self.create_decimal_integer, "decimal")

    def parse_numeric_literal(self, start: int, start_line: int) -> Token:
        """Dispatch on an optional '-' and a ``0b`` / ``0o`` / ``0x`` prefix."""
        c = self.peek()
        if c == '-':
            c = self.advance()

        if c == '0':
            c = self.advance()
            if c in 'bB':
                self.advance()
                return self.parse_binary_integer(start, start_line)
            if c in 'oO':
                self.advance()
                return self.parse_octal_integer(start, start_line)
            if c in 'xX':
                self.advance()
                return self.parse_hexadecimal_integer(start, start_line)

        return self.parse_decimal_integer(start, start_line)

    # =====================================================================
    # Character Literal Parsing
    # =====================================================================

    def parse_character_literal(self, start: int, start_line: int) -> Token:
        """Parse a character literal; the cursor is just past the opening quote.

        Supports the escapes ``\\n \\r \\t \\\\ \\' \\0``. The token carries
        the character's code point.

        Raises:
            LexerError: for an empty, unclosed or over-long literal, or an
                unknown escape sequence.
        """
        c = self.peek()

        if c == '\'':
            raise LexerError("Invalid character literal: empty character literal.", self.get_file_info(start, start_line))

        if c == '\\':
            c = self.advance()
            escape_map = {
                'n': '\n',
                'r': '\r',
                't': '\t',
                '\\': '\\',
                '\'': '\'',
                '0': '\0',
            }
            if c not in escape_map:
                raise LexerError(f"Invalid character literal: unknown escape sequence '\\{c}'.", self.get_file_info(start, start_line))
            value = ord(escape_map[c])
        elif c in '\n\r':
            raise LexerError("Invalid character literal: unclosed character literal.", self.get_file_info(start, start_line))
        else:
            value = ord(c)

        # Exactly one closing quote must follow the character.
        c = self.advance()

        if c.isspace() or c in '/\0':
            raise LexerError("Invalid character literal: unclosed character literal.", self.get_file_info(start, start_line))
        if c != '\'':
            raise LexerError(f"Invalid character literal: unexpected '{c}' in character.", self.get_file_info(start, start_line))

        self.advance()
        return Token(type=TokenType.CHARACTER, character_literal=value)

    # =====================================================================
    # String Literal Parsing (stub)
    # =====================================================================

    def parse_string_literal(self, start: int, start_line: int) -> Token:
        """Placeholder for string literals; always raises ``NotImplementedError``."""
        raise NotImplementedError("String literals not implemented yet.")

    # =====================================================================
    # Token String Parsing
    # =====================================================================

    def parse_token_string(self, start: int, start_line: int) -> Token:
        """Parse a ``{ ... }`` token string; the cursor is on the opening brace.

        Tokens are read with ``lexer_next`` (so token strings nest) until
        the matching '}' is found.

        Raises:
            LexerError: if input ends before the closing brace, or if the
                iteration watchdog trips.
        """
        collected = []
        self.advance()  # Skip opening '{'

        iterations = 0
        while self.peek() != '\0':
            self.skip_comments_and_whitespace()

            if self.peek() == '}':
                self.advance()
                return Token(type=TokenType.TOKEN_STRING, token_string=TokenString(tokens=collected))

            token = self.lexer_next()
            collected.append(token)

            if token.type == TokenType.EOF:
                break

            # Safety net against a sub-parser that fails to consume input.
            iterations += 1
            if iterations > 1000000:
                raise LexerError("Watchdog triggered in token string.", self.get_file_info(start, start_line))

        raise LexerError("Unclosed token string: missing closing brace '}'.", self.get_file_info(start, start_line))

    # =====================================================================
    # Array and Type Tuple Parsing (stubs)
    # =====================================================================

    def parse_array_literal(self, start: int, start_line: int) -> Token:
        """Placeholder for array literals; always raises ``NotImplementedError``."""
        raise NotImplementedError("Array literals not implemented yet.")

    def parse_type_tuple(self, start: int, start_line: int) -> Token:
        """Placeholder for type tuples; always raises ``NotImplementedError``."""
        raise NotImplementedError("Type tuples not implemented yet.")

    # =====================================================================
    # Identifier and Boolean Parsing
    # =====================================================================

    def parse_identifiers_and_booleans(self, start: int, start_line: int) -> Token:
        """Parse an identifier, an identifier literal (``::name``) or a boolean.

        The names ``true`` and ``false`` produce BOOLEAN tokens; every
        other name produces an IDENTIFIER token.

        Raises:
            LexerError: if the name contains ':' or '.', or if nothing
                usable follows a '::' prefix.
        """
        c = self.peek()
        is_literal = False

        # Check for identifier literal (::)
        if c == ':' and self.far_peek(1) == ':':
            is_literal = True
            self.advance()
            self.advance()
            c = self.peek()

        # Read identifier name
        name_chars = []
        while self.is_identifier_continue(c):
            if c == ':':
                raise LexerError("Invalid identifier: ':' is not allowed in identifiers.", self.get_file_info(start, start_line))
            if c == '.':
                raise LexerError("Invalid identifier: '.' is not allowed in identifiers.", self.get_file_info(start, start_line))
            name_chars.append(c)
            c = self.advance()

        if is_literal and not name_chars:
            # '::' directly followed by a '//' comment passes the start
            # check (which inspects the character from the wrong offset)
            # but leaves no name to read.
            raise LexerError("Invalid identifier literal: empty identifier after '::'.", self.get_file_info(start, start_line))

        name = ''.join(name_chars)

        # Check for boolean literals
        if name == 'false':
            return Token(type=TokenType.BOOLEAN, boolean_literal=False)
        if name == 'true':
            return Token(type=TokenType.BOOLEAN, boolean_literal=True)
        return Token(type=TokenType.IDENTIFIER, identifier=Identifier(name=name, is_literal=is_literal))

    # =====================================================================
    # Main Lexer Logic
    # =====================================================================

    def lexer_next(self) -> Token:
        """Skip trivia and return the next token (an EOF token at end of input).

        Raises:
            LexerError: on a stray closing delimiter, a malformed literal
                or an unexpected character.
            NotImplementedError: for literal kinds that are still stubs.
        """
        self.skip_comments_and_whitespace()

        c = self.peek()
        start = self.info.pos
        start_line = self.info.line

        # End of file
        if c == '\0':
            return Token(type=TokenType.EOF)

        # Numeric literals (integers and floats)
        if c.isdigit() or (c == '.' and self.far_peek(1).isdigit()) or (c == '-' and self.far_peek(1).isdigit()):
            return self.parse_numeric_literal(start, start_line)

        # Character literals
        if c == '\'':
            self.advance()
            return self.parse_character_literal(start, start_line)

        # String literals
        if c == '"':
            return self.parse_string_literal(start, start_line)

        # Token strings
        if c == '{':
            return self.parse_token_string(start, start_line)

        if c == '}':
            self.advance()
            raise LexerError("Unexpected closing brace '}' without matching opening brace.", self.get_file_info(start, start_line))

        # Array literals
        if c == '[':
            return self.parse_array_literal(start, start_line)

        if c == ']':
            self.advance()
            raise LexerError("Unexpected closing bracket ']' without matching opening bracket.", self.get_file_info(start, start_line))

        # Type tuples
        if c == '(':
            return self.parse_type_tuple(start, start_line)

        if c == ')':
            self.advance()
            raise LexerError("Unexpected closing parentheses ')' without matching opening parentheses.", self.get_file_info(start, start_line))

        # Identifiers and booleans
        if self.is_identifier_start():
            return self.parse_identifiers_and_booleans(start, start_line)

        # Check for malformed identifier literal
        if c == ':':
            # Look at the second colon before moving the cursor; checking
            # after the advance would inspect the character behind '::'.
            if self.far_peek(1) == ':':
                self.advance()
                self.advance()
                raise LexerError("Invalid identifier literal: empty identifier after '::'.", self.get_file_info(start, start_line))
            self.advance()
            raise LexerError("Unexpected single colon ':'.", self.get_file_info(start, start_line))

        # Unknown character
        raise LexerError(f"Unexpected character: unexpected '{c}' during parsing.", self.get_file_info(start, start_line))

    def lexical_analysis(self) -> List[Token]:
        """Main entry point: tokenize the whole input.

        Returns the tokens in order, ending with the EOF token. Any
        ``LexerError`` raised while scanning propagates to the caller.
        """
        tokens = []
        while True:
            token = self.lexer_next()
            tokens.append(token)
            if token.type == TokenType.EOF:
                return tokens
# =====================================================================
# Public API
# =====================================================================
def lexical_analysis(lexer_info: LexerInfo) -> List[Token]:
    """Tokenize *lexer_info* in one call.

    Convenience wrapper matching the original C API: it builds a
    ``Lexer`` around *lexer_info* and returns the result of its
    ``lexical_analysis`` method.
    """
    return Lexer(lexer_info).lexical_analysis()
|