Started lexer in the compiler

This commit is contained in:
Kyler 2024-02-26 00:06:45 -07:00
parent 1dc6c9ab73
commit 83224b3acb
3 changed files with 359 additions and 8 deletions

View File

@ -2,7 +2,8 @@
*Yeahbut, aka Kyler Olsen*
It is a custom computer and instruction set architecture. It also has its own
assembly language with assembler. Custom high level language coming soon!
assembly language with assembler written in Python. Custom high level language
coming soon!
## ISA
*WIP*
@ -63,10 +64,107 @@ assembly language with assembler. Custom high level language coming soon!
## High Level Language
*WIP*
Paradigm: Multi-Paradigm: Procedural (Imperative), Structured
Designer: Kyler Olsen
First Appeared: *Future*
Typing Discipline: Typeless
Platform: ytd 12-bit computer, ytd 12-bit emulator (multi-platform)
License: *Tentatively MIT*
Filename extension: `.ytd12c`
About
- Paradigm: Multi-Paradigm: Procedural (Imperative), Structured
- Designer: Kyler Olsen
- Created: *Future*
- Typing Discipline: ~~Typeless~~ Static, Weak
- Platform: ytd 12-bit computer, ytd 12-bit emulator (multi-platform)
- License: *Tentatively MIT*
- Filename extension: `.ytd12c`
### Lexical
#### Directives
Directives start with `#` and end at the end of a line.
They are used to give additional instructions to the compiler.
#### Comments
Comments can either be single-line comments or multi-line comments.
Single-line comments start with `//` and end at the end of a line.
Multi-line comments start with `/*` and end with `*/`.
#### Identifiers
Identifiers can be up to 15 characters in length and are case sensitive.
They cannot be a keyword.
```
Identifier ::= ID_Start ID_Continue*
ID_Start ::= <Any Latin alphabet letter or an underscore: "A"-"Z", "a"-"z", "_">
ID_Continue ::= ID_Start | <Any decimal digit: "0"-"9">
```
#### Keywords
```
struct fn enum static
if else do while
for pub let break
continue True False None
unsigned int fixed float
```
#### Literals
Number Literals
String Literals
#### Punctuation
```
++ -- @ $ + -
* / % ~ & |
^ << >> = += -=
*= /= %= &= |= ^=
<<= >>= ! && || ==
!= < <= > >= ?
{ } [ ] ( )
. -> , ; :
```
### Syntax
#### Operator Operand Counts
- Unary: `++`, `--`, `@`, `$`
- Binary: `[ ]`, `+`, `-`, `*`, `/`, `%`, `~`, `&`, `|`, `^`, `<<`, `>>`, `=`,
`+=`, `-=`, `*=`, `/=`, `%=`, `&=`, `|=`, `^=`, `<<=`, `>>=`, `!`, `&&`, `||`,
`==`, `!=`, `<`, `<=`, `>`, `>=`
- Ternary: `? :`
#### Operator Precedence
- Assignment: `=`, `+=`, `-=`, `*=`, `/=`, `%=`, `&=`, `|=`, `^=`, `<<=`, `>>=`
- Ternary Operator: `? :`
- Equality and order testing: `==`, `!=`, `<`, `<=`, `>`, `>=`
- Boolean logic: `!`, `&&`, `||`
- Arithmetic: `+`, `-`, `*`, `/`, `%`
- Bitwise: `~`, `&`, `|`, `^`, `<<`, `>>`
- Increment and decrement: `++`, `--`
- Reference and dereference: `@`, `$`, `[ ]`
### Semantics
### Scratch Area
#### Keywords
- Types: `unsigned`, `int`, `fixed`, `float`
- Structural: `if`, `else`, `do`, `while`, `for`, `break`, `continue`
- Constants: `True`, `False`, `None`
- Other: `struct`, `fn`, `enum`, `static`, `pub`, `let`
#### Delimiters
```
. -> , ( )
{ } [ ] ;
:
```

View File

@ -0,0 +1,25 @@
# Kyler Olsen
# Feb 2024
class FileInfo:
    """Location of a lexical element within a source file.

    Attached to tokens and compiler errors so diagnostics can point at
    the offending file, line, column, and span length.
    """

    _filename: str
    _line: int
    _col: int
    _length: int

    def __init__(
        self,
        filename: str,
        line: int,
        col: int,
        length: int,
    ):
        self._filename = filename
        self._line = line
        self._col = col
        self._length = length

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}({self._filename!r}, "
            f"{self._line}, {self._col}, {self._length})"
        )

    # Read-only accessors, matching the property style used by Token.
    @property
    def filename(self) -> str: return self._filename

    @property
    def line(self) -> int: return self._line

    @property
    def col(self) -> int: return self._col

    @property
    def length(self) -> int: return self._length
class CompilerError(Exception):
    """Base class for every error reported by this compiler package."""

228
pytd12dk/compiler/lexer.py Normal file
View File

@ -0,0 +1,228 @@
# Kyler Olsen
# Feb 2024
from enum import Enum
from typing import ClassVar, Sequence, TextIO
from .compiler_types import CompilerError, FileInfo
class _InterTokenType(Enum):
Generic = 'Generic'
Directive = 'Directive'
SingleLineComment = 'SingleLineComment'
MultiLineComment = 'MultiLineComment'
Word = 'Word'
NumberLiteral = 'NumberLiteral'
CharLiteral = 'CharLiteral'
StringLiteral = 'StringLiteral'
Punctuation = 'Punctuation'
# States that only the end of a line can terminate (their content runs
# to the newline and is then discarded or emitted).
_OnlyNewLineTerminatedTokens = (
    _InterTokenType.Directive,
    _InterTokenType.SingleLineComment,
)
# States that a newline terminates: the pending token is emitted when
# the line ends.
_NewLineTerminatedTokens = _OnlyNewLineTerminatedTokens + (
    _InterTokenType.Word,
    _InterTokenType.NumberLiteral,
    _InterTokenType.Punctuation,
)
# States in which a newline is an error (unterminated literals).
_NewLineErrorTokens = (
    _InterTokenType.CharLiteral,
    _InterTokenType.StringLiteral,
)
# Characters that may start / continue an identifier.
_ID_Start = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" "_"
_ID_Continue = _ID_Start + "0123456789"
# Reserved words; a Word token matching one of these becomes a Keyword.
_Keywords = (
    'struct', 'fn', 'enum', 'static',
    'if', 'else', 'do', 'while',
    'for', 'pub', 'let', 'break',
    'continue', 'True', 'False', 'None',
    'unsigned', 'int', 'fixed', 'float',
)
# Number literals: characters valid as the first character, as the
# second character (base prefixes b/o/x, decimal point, exponent e/E),
# and in the remainder of the literal (digits, '.', '_', hex digits).
_Num_Start = "0123456789"
_Num_Second = _Num_Start + "box._Ee"
_Num_Continue = _Num_Start + "._" "ABCDEF" "abcdef"
# Every character that may appear inside a punctuation token, and the
# complete set of valid punctuation tokens.
_Punctuation_Any = "@$+-*/%~&|^<>=!?{}[]().->,;:"
_Punctuation = (
    "++", "--", "@", "$", "+", "-",
    "*", "/", "%", "~", "&", "|",
    "^", "<<", ">>", "=", "+=", "-=",
    "*=", "/=", "%=", "&=", "|=", "^=",
    "<<=", ">>=", "!", "&&", "||", "==",
    "!=", "<", "<=", ">", ">=", "?",
    "{", "}", "[", "]", "(", ")",
    ".", "->", ",", ";", ":",
)
class LexerError(CompilerError):
    """Raised when the lexer encounters malformed or invalid input."""
class Token:
    """A lexical element: its raw text plus its source location.

    Subclasses override the class-level ``_type`` tag to identify the
    token category; instances themselves only hold the text and the
    FileInfo describing where it came from.
    """

    _type: ClassVar[str] = 'Generic'
    _value: str
    _file_info: FileInfo

    def __init__(self, value: str, file_info: FileInfo):
        self._value = value
        self._file_info = file_info

    @property
    def value(self) -> str:
        return self._value

    @property
    def file_info(self) -> FileInfo:
        return self._file_info


class Directive(Token):
    _type = 'Directive'


class Identifier(Token):
    _type = 'Identifier'


class Keyword(Token):
    _type = 'Keyword'


class NumberLiteral(Token):
    _type = 'NumberLiteral'


class CharLiteral(Token):
    _type = 'CharLiteral'


class StringLiteral(Token):
    _type = 'StringLiteral'


class Punctuation(Token):
    _type = 'Punctuation'
def lexer(file: str | TextIO, filename: str) -> Sequence[Token]:
    """Convert source text into a flat sequence of tokens.

    :param file: the source code, as a string or an open text stream.
    :param filename: name recorded in each emitted token's FileInfo.
    :return: tokens in source order; comments are discarded.
    :raises LexerError: for a char/string literal interrupted by a
        newline or end of file, an identifier longer than 15
        characters, an unknown punctuation sequence, or an over-long
        character literal.
    """
    if not isinstance(file, str):
        file = file.read()

    tokens: list[Token] = []
    current: str = ""
    current_line: int = 0
    current_col: int = 0
    escaped: bool = False
    token_type: _InterTokenType = _InterTokenType.Generic

    def flush() -> None:
        # Emit the token accumulated in ``current``. Shared by the
        # end-of-line and end-of-file paths. Directives/comments end
        # here too; comments are simply dropped.
        nonlocal token_type
        fi = FileInfo(filename, current_line, current_col, len(current))
        if token_type is _InterTokenType.Directive:
            tokens.append(Directive(current, fi))
        elif token_type is _InterTokenType.Word:
            if len(current) > 15:
                raise LexerError("Identifier Too Long")
            if current in _Keywords:
                tokens.append(Keyword(current, fi))
            else:
                tokens.append(Identifier(current, fi))
        elif token_type is _InterTokenType.NumberLiteral:
            tokens.append(NumberLiteral(current, fi))
        elif token_type is _InterTokenType.Punctuation:
            if current not in _Punctuation:
                raise LexerError("Invalid Punctuation")
            tokens.append(Punctuation(current, fi))
        token_type = _InterTokenType.Generic

    for line, line_str in enumerate(file.splitlines()):
        if token_type in _NewLineErrorTokens:
            raise LexerError("Unexpected Newline")
        if token_type in _NewLineTerminatedTokens:
            flush()
        for col, char in enumerate(line_str):
            # One-character lookahead; empty string at end of line.
            # (Bug fix: the original indexed ``line_str[col+1]`` and
            # raised IndexError when '/', '.', etc. ended a line.)
            lookahead = line_str[col + 1] if col + 1 < len(line_str) else ""
            if token_type in _OnlyNewLineTerminatedTokens:
                current += char
            elif token_type is _InterTokenType.MultiLineComment:
                if len(current) >= 2 and current[-1] == '*' and char == '/':
                    token_type = _InterTokenType.Generic
                    continue
                current += char
            elif token_type is _InterTokenType.Word:
                if char in _ID_Continue:
                    current += char
                else:
                    if len(current) > 15:
                        raise LexerError("Identifier Too Long")
                    fi = FileInfo(
                        filename, current_line, current_col, len(current))
                    if current in _Keywords:
                        tokens.append(Keyword(current, fi))
                    else:
                        tokens.append(Identifier(current, fi))
                    token_type = _InterTokenType.Generic
            elif token_type is _InterTokenType.NumberLiteral:
                # Second character may be a base prefix or marker from
                # _Num_Second ('b', 'o', 'x', '.', 'e', ...); later
                # characters come from _Num_Continue. (Bug fix: the
                # original XOR'd a ``len(current) == 2`` test with the
                # continue set, which rejected '0x'/'0b'/'0o' prefixes
                # at the position _Num_Second was named for.)
                if (
                    (len(current) == 1 and char in _Num_Second)
                    or (len(current) != 1 and char in _Num_Continue)
                ):
                    current += char
                else:
                    fi = FileInfo(
                        filename, current_line, current_col, len(current))
                    tokens.append(NumberLiteral(current, fi))
                    token_type = _InterTokenType.Generic
            elif token_type is _InterTokenType.CharLiteral:
                if escaped:
                    escaped = False
                elif char == '\\':
                    escaped = True
                elif char == "'":
                    current += char
                    # Quotes included, a valid literal is 3 chars, or 4
                    # when the payload is escaped. (Bug fix: the
                    # original condition rejected every valid literal,
                    # and it emitted a StringLiteral token here.)
                    if (
                        len(current) > 4
                        or (len(current) > 3 and current[1] != '\\')
                    ):
                        raise LexerError("Character Literal Too Long")
                    fi = FileInfo(
                        filename, current_line, current_col, len(current))
                    tokens.append(CharLiteral(current, fi))
                    token_type = _InterTokenType.Generic
                    continue
                current += char
            elif token_type is _InterTokenType.StringLiteral:
                if escaped:
                    escaped = False
                elif char == '\\':
                    escaped = True
                elif char == '"':
                    current += char
                    fi = FileInfo(
                        filename, current_line, current_col, len(current))
                    tokens.append(StringLiteral(current, fi))
                    token_type = _InterTokenType.Generic
                    continue
                current += char
            elif token_type is _InterTokenType.Punctuation:
                if char in _Punctuation_Any:
                    current += char
                else:
                    if current not in _Punctuation:
                        raise LexerError("Invalid Punctuation")
                    fi = FileInfo(
                        filename, current_line, current_col, len(current))
                    tokens.append(Punctuation(current, fi))
                    token_type = _InterTokenType.Generic
            # A terminated token falls through so the current character
            # can start the next token.
            if token_type is _InterTokenType.Generic:
                current = char
                current_line = line + 1
                current_col = col + 1
                escaped = False
                if char == '#':
                    token_type = _InterTokenType.Directive
                elif char == '/' and lookahead == '/':
                    token_type = _InterTokenType.SingleLineComment
                elif char == '/' and lookahead == '*':
                    token_type = _InterTokenType.MultiLineComment
                elif char in _ID_Start:
                    token_type = _InterTokenType.Word
                elif char == '.' and lookahead and lookahead in _Num_Second:
                    token_type = _InterTokenType.NumberLiteral
                elif char in _Num_Start:
                    token_type = _InterTokenType.NumberLiteral
                elif char == "'":
                    token_type = _InterTokenType.CharLiteral
                elif char == '"':
                    token_type = _InterTokenType.StringLiteral
                elif char in _Punctuation_Any:
                    token_type = _InterTokenType.Punctuation

    # Flush whatever token is still open at end of input. (Bug fix: the
    # original silently dropped a trailing token when the file did not
    # end with a newline.)
    if token_type in _NewLineErrorTokens:
        raise LexerError("Unexpected End of File")
    if token_type in _NewLineTerminatedTokens:
        flush()
    return tokens