# Kyler Olsen
# Feb 2024

"""Lexer: turns source text into a flat sequence of `Token` objects."""

from enum import Enum
from typing import ClassVar, Sequence

from .compiler_types import CompilerError, FileInfo


class LexerError(CompilerError):
    """Error raised by `lexer` when the source text cannot be tokenized."""

    _compiler_error_type = "Lexical"


class _InterTokenType(Enum):
    """State of the lexer: the kind of token currently being accumulated."""

    Generic = 'Generic'
    Directive = 'Directive'
    SingleLineComment = 'SingleLineComment'
    MultiLineComment = 'MultiLineComment'
    Word = 'Word'
    NumberLiteral = 'NumberLiteral'
    CharLiteral = 'CharLiteral'
    StringLiteral = 'StringLiteral'
    Punctuation = 'Punctuation'


class _NumberLiteralType(Enum):
    """Sub-state used while accumulating a number literal."""

    Number = 'Number'
    Real = 'Real'
    Exp = 'Exp'
    Base = 'Base'
    Binary = 'Binary'
    Octal = 'Octal'
    Hex = 'Hex'


# Token kinds that run to the end of the line and can only end there.
_OnlyNewLineTerminatedTokens = (
    _InterTokenType.Directive,
    _InterTokenType.SingleLineComment,
)

# Token kinds that are ended (and, except comments, emitted) by a line end.
_NewLineTerminatedTokens = _OnlyNewLineTerminatedTokens + (
    _InterTokenType.Word,
    _InterTokenType.NumberLiteral,
    _InterTokenType.Punctuation,
)

# Token kinds that must be closed before the end of the line.
_NewLineErrorTokens = (
    _InterTokenType.CharLiteral,
    _InterTokenType.StringLiteral,
)

_ID_Start = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "_"
)

_ID_Continue = _ID_Start + "0123456789"

# Words longer than this are rejected with "Identifier Too Long".
_Max_Word_Length = 15

_Keywords = (
    'struct', 'fn', 'enum', 'static',
    'if', 'else', 'do', 'while',
    'for', 'let', 'break', 'continue',
    'unsigned', 'int', 'fixed', 'float',
    'True', 'False', 'None',
)

_Num_Start = "0123456789"

# Sub-state selected by the first character of a number literal.
_Num_Start_Next = {
    _NumberLiteralType.Number: {
        '.': _NumberLiteralType.Real,
        '0': _NumberLiteralType.Base,
    }
}

# Characters accepted as the second character of a number literal.
_Num_Second = {
    _NumberLiteralType.Number: _Num_Start + ".eE_",
    _NumberLiteralType.Real: _Num_Start + "eE_",
    _NumberLiteralType.Base: "bBoOxX",
}

# Sub-state transitions triggered by the second character.
_Num_Second_Next = {
    _NumberLiteralType.Number: {
        '.': _NumberLiteralType.Real,
        'e': _NumberLiteralType.Exp,
        'E': _NumberLiteralType.Exp,
    },
    _NumberLiteralType.Real: {
        'e': _NumberLiteralType.Exp,
        'E': _NumberLiteralType.Exp,
    },
    _NumberLiteralType.Base: {
        'b': _NumberLiteralType.Binary,
        'B': _NumberLiteralType.Binary,
        'o': _NumberLiteralType.Octal,
        'O': _NumberLiteralType.Octal,
        'x': _NumberLiteralType.Hex,
        'X': _NumberLiteralType.Hex,
    }
}

# Characters accepted from the third character of a number literal onwards.
_Num_Continue = {
    _NumberLiteralType.Number: _Num_Start + ".eE_",
    _NumberLiteralType.Real: _Num_Start + "eE_",
    _NumberLiteralType.Exp: _Num_Start + "_",
    _NumberLiteralType.Binary: "01_",
    _NumberLiteralType.Octal: "01234567_",
    _NumberLiteralType.Hex: _Num_Start + "abcdefABCDEF_",
}

# Sub-state transitions triggered from the third character onwards.
_Num_Continue_Next = {
    _NumberLiteralType.Number: {
        '.': _NumberLiteralType.Real,
        'e': _NumberLiteralType.Exp,
        'E': _NumberLiteralType.Exp,
    },
    _NumberLiteralType.Real: {
        'e': _NumberLiteralType.Exp,
        'E': _NumberLiteralType.Exp,
    }
}

# Every character that may appear in a punctuation token.
_Punctuation_Any = "@$+-*/%~&|^<>=!?{[(}]).->,;:"

# Every complete punctuation token.
_Punctuation = (
    "++", "--", "@", "$", "+", "-",
    "*", "/", "%", "~", "&", "|",
    "^", "<<", ">>", "=", "+=", "-=",
    "*=", "/=", "%=", "&=", "|=", "^=",
    "<<=", ">>=", "!", "&&", "||", "^^",
    "==", "!=", "<", "<=", ">", ">=",
    "{", "}", "[", "]", "(", ")",
    "?", ".", "->", ",", ";", ":",
)


class Token:
    """Base class of all tokens: the token text plus its `FileInfo`."""

    # Name of the token kind; overridden by each subclass.
    _type: ClassVar[str] = 'Generic'
    # Exact source text of the token.
    _value: str
    # The FileInfo object the lexer created for this token.
    _file_info: FileInfo

    def __init__(self, value: str, file_info: FileInfo):
        self._value = value
        self._file_info = file_info

    def __str__(self) -> str:
        return f"Type: {self._type}, Value: {self.value}"

    @property
    def value(self) -> str:
        """The source text of the token."""
        return self._value

    @property
    def file_info(self) -> FileInfo:
        """The `FileInfo` passed in when the token was created."""
        return self._file_info


class Directive(Token): _type = 'Directive'
class Identifier(Token): _type = 'Identifier'
class Keyword(Token): _type = 'Keyword'
class NumberLiteral(Token): _type = 'NumberLiteral'
class CharLiteral(Token): _type = 'CharLiteral'
class StringLiteral(Token): _type = 'StringLiteral'
class Punctuation(Token): _type = 'Punctuation'


def _word_token(value: str, file_info: FileInfo) -> Token:
    """Build a `Keyword` or `Identifier` from a completed word.

    Raises:
        LexerError: if the word is longer than `_Max_Word_Length`.
    """
    if len(value) > _Max_Word_Length:
        raise LexerError("Identifier Too Long", file_info)
    if value in _Keywords:
        return Keyword(value, file_info)
    return Identifier(value, file_info)


def _punctuation_token(value: str, file_info: FileInfo) -> Token:
    """Build a `Punctuation` token from completed punctuation text.

    Raises:
        LexerError: if the text is not one of the entries of `_Punctuation`.
    """
    if value not in _Punctuation:
        raise LexerError("Invalid Punctuation", file_info)
    return Punctuation(value, file_info)


def lexer(file: str, filename: str) -> Sequence[Token]:
    """Split source text into tokens.

    The text is processed line by line (`str.splitlines`) and character by
    character with a small state machine.  Comments are discarded; characters
    that start no token (e.g. whitespace) are skipped.

    Args:
        file: The complete source text.
        filename: Name passed through to every `FileInfo` that is created.

    Returns:
        The tokens in source order.

    Raises:
        LexerError: for a char or string literal still open at the end of a
            line ("Unexpected Newline"), a word longer than 15 characters
            ("Identifier Too Long"), an over-long char literal
            ("Character Literal Too Long"), or unknown punctuation
            ("Invalid Punctuation").
    """
    tokens: list[Token] = []
    current: str = ""       # text of the token being accumulated
    current_line: int = 0   # 1-based line on which `current` started
    current_col: int = 0    # 1-based column at which `current` started
    escaped: bool = False   # previous literal character was a backslash
    number_type: _NumberLiteralType = _NumberLiteralType.Number
    token_type: _InterTokenType = _InterTokenType.Generic

    for line, line_str in enumerate(file.splitlines()):
        for col, char in enumerate(line_str):
            if token_type in _OnlyNewLineTerminatedTokens:
                current += char
            elif token_type is _InterTokenType.MultiLineComment:
                # Require at least "/*" plus one more character so that
                # "/*/" is not mistaken for a complete comment.
                if (
                    len(current) >= 3
                    and current[-1] == '*'
                    and char == '/'
                ):
                    token_type = _InterTokenType.Generic
                    continue
                current += char
            elif token_type is _InterTokenType.Word:
                if char in _ID_Continue:
                    current += char
                else:
                    fi = FileInfo(
                        filename, current_line, current_col, len(current))
                    tokens.append(_word_token(current, fi))
                    token_type = _InterTokenType.Generic
            elif token_type is _InterTokenType.NumberLiteral:
                if (
                    number_type is _NumberLiteralType.Base
                    and char not in _Num_Second[_NumberLiteralType.Base]
                ):
                    # A leading '0' that is not followed by a base prefix
                    # is an ordinary decimal number ("0.5", "0e3", ...);
                    # without this the literal would end right after '0'.
                    number_type = _NumberLiteralType.Number
                if (
                    len(current) == 1
                    and number_type in _Num_Second
                    and char in _Num_Second[number_type]
                ):
                    current += char
                    number_type = _Num_Second_Next.get(
                        number_type, {}).get(char, number_type)
                elif (
                    number_type in _Num_Continue
                    and char in _Num_Continue[number_type]
                ):
                    current += char
                    number_type = _Num_Continue_Next.get(
                        number_type, {}).get(char, number_type)
                else:
                    fi = FileInfo(
                        filename, current_line, current_col, len(current))
                    tokens.append(NumberLiteral(current, fi))
                    number_type = _NumberLiteralType.Number
                    token_type = _InterTokenType.Generic
            elif token_type is _InterTokenType.CharLiteral:
                if escaped:
                    escaped = False
                elif char == '\\':
                    escaped = True
                elif char == "'":
                    current += char
                    fi = FileInfo(
                        filename, current_line, current_col, len(current))
                    # Quotes included: 3 characters unescaped, 4 escaped.
                    if (
                        (current[1] != '\\' and len(current) > 3)
                        or len(current) > 4
                    ):
                        raise LexerError("Character Literal Too Long", fi)
                    tokens.append(CharLiteral(current, fi))
                    token_type = _InterTokenType.Generic
                    continue
                current += char
            elif token_type is _InterTokenType.StringLiteral:
                if escaped:
                    escaped = False
                elif char == '\\':
                    escaped = True
                elif char == '"':
                    current += char
                    fi = FileInfo(
                        filename, current_line, current_col, len(current))
                    tokens.append(StringLiteral(current, fi))
                    token_type = _InterTokenType.Generic
                    continue
                current += char
            elif token_type is _InterTokenType.Punctuation:
                if (
                    char in _Punctuation_Any
                    and current + char in _Punctuation
                ):
                    current += char
                else:
                    fi = FileInfo(
                        filename, current_line, current_col, len(current))
                    tokens.append(_punctuation_token(current, fi))
                    token_type = _InterTokenType.Generic

            # Not an `elif`: a character that ended the previous token is
            # examined again here as the possible start of the next one.
            if token_type is _InterTokenType.Generic:
                current = char
                current_line = line + 1
                current_col = col + 1
                escaped = False
                # One-character lookahead; '' at the end of the line, so a
                # trailing '/' or '.' no longer raises IndexError.
                next_char = line_str[col + 1:col + 2]
                if char == '#':
                    token_type = _InterTokenType.Directive
                elif char == '/' and next_char == '/':
                    token_type = _InterTokenType.SingleLineComment
                elif char == '/' and next_char == '*':
                    token_type = _InterTokenType.MultiLineComment
                elif char in _ID_Start:
                    token_type = _InterTokenType.Word
                elif (
                    char == '.'
                    # `next_char != ''` matters: '' is "in" every string.
                    and next_char != ''
                    # Only a digit makes ".5" a number; ".e" or "._x" is
                    # a '.' punctuation followed by a word.
                    and next_char in _Num_Start
                ):
                    token_type = _InterTokenType.NumberLiteral
                    if char in _Num_Start_Next[number_type]:
                        number_type = _Num_Start_Next[number_type][char]
                elif char in _Num_Start:
                    token_type = _InterTokenType.NumberLiteral
                    if char in _Num_Start_Next[number_type]:
                        number_type = _Num_Start_Next[number_type][char]
                elif char == "'":
                    token_type = _InterTokenType.CharLiteral
                elif char == '"':
                    token_type = _InterTokenType.StringLiteral
                elif char in _Punctuation_Any:
                    token_type = _InterTokenType.Punctuation

        # End of line: finish whatever token the line break terminates.
        fi = FileInfo(filename, current_line, current_col, len(current))
        if token_type in _NewLineErrorTokens:
            raise LexerError("Unexpected Newline", fi)
        if token_type is _InterTokenType.MultiLineComment:
            # Record the line break so that a '*' ending this line and a
            # '/' starting the next one do not close the comment.
            current += '\n'
        elif token_type in _NewLineTerminatedTokens:
            if token_type is _InterTokenType.Directive:
                tokens.append(Directive(current, fi))
            elif token_type is _InterTokenType.Word:
                tokens.append(_word_token(current, fi))
            elif token_type is _InterTokenType.NumberLiteral:
                tokens.append(NumberLiteral(current, fi))
                number_type = _NumberLiteralType.Number
            elif token_type is _InterTokenType.Punctuation:
                tokens.append(_punctuation_token(current, fi))
            # A single-line comment is simply dropped here.
            token_type = _InterTokenType.Generic

    return tokens