Started lexer in the compiler
This commit is contained in:
parent
1dc6c9ab73
commit
83224b3acb
|
@ -2,7 +2,8 @@
|
||||||
*Yeahbut, aka Kyler Olsen*
|
*Yeahbut, aka Kyler Olsen*
|
||||||
|
|
||||||
It is a custom computer and instruction set architecture. It also has its own
|
It is a custom computer and instruction set architecture. It also has its own
|
||||||
assembly language with assembler. Custom high level language coming soon!
|
assembly language with assembler written in Python. Custom high level language
|
||||||
|
coming soon!
|
||||||
|
|
||||||
## ISA
|
## ISA
|
||||||
*WIP*
|
*WIP*
|
||||||
|
@ -63,10 +64,107 @@ assembly language with assembler. Custom high level language coming soon!
|
||||||
## High Level Language
|
## High Level Language
|
||||||
*WIP*
|
*WIP*
|
||||||
|
|
||||||
Paradigm: Multi-Paradigm: Procedural (Imperative), Structured
|
About
|
||||||
Designer: Kyler Olsen
|
- Paradigm: Multi-Paradigm: Procedural (Imperative), Structured
|
||||||
First Appeared: *Future*
|
- Designer: Kyler Olsen
|
||||||
Typing Discipline: Typeless
|
- Created: *Future*
|
||||||
Platform: ytd 12-bit computer, ytd 12-bit emulator (multi-platform)
|
- Typing Discipline: ~~Typeless~~ Static, Weak
|
||||||
License: *Tentatively MIT*
|
- Platform: ytd 12-bit computer, ytd 12-bit emulator (multi-platform)
|
||||||
Filename extension: `.ytd12c`
|
- License: *Tentatively MIT*
|
||||||
|
- Filename extension: `.ytd12c`
|
||||||
|
|
||||||
|
### Lexical
|
||||||
|
|
||||||
|
#### Directives
|
||||||
|
|
||||||
|
Directives start with `#` and end at the end of a line.
|
||||||
|
They are used to give additional instructions to the compiler.
|
||||||
|
|
||||||
|
#### Comments
|
||||||
|
|
||||||
|
Comments can either be single-line comments or multi-line comments.
|
||||||
|
|
||||||
|
Single-line comments start with `//` and end at the end of a line.
|
||||||
|
|
||||||
|
Multi-line comments start with `/*` and end with `*/`.
|
||||||
|
|
||||||
|
|
||||||
|
#### Identifiers
|
||||||
|
|
||||||
|
Identifiers can be up to 15 characters in length and are case-sensitive.
|
||||||
|
They cannot be a keyword.
|
||||||
|
|
||||||
|
```
|
||||||
|
Identifier ::= ID_Start ID_Continue*
|
||||||
|
ID_Start ::= <Any Latin alphabet letter or an underscore: "A"-"Z", "a"-"z", "_">
|
||||||
|
ID_Continue ::= ID_Start | <Any decimal digit: "0"-"9">
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Keywords
|
||||||
|
|
||||||
|
```
|
||||||
|
struct fn enum static
|
||||||
|
if else do while
|
||||||
|
for pub let break
|
||||||
|
continue True False None
|
||||||
|
unsigned int fixed float
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Literals
|
||||||
|
|
||||||
|
Number Literals
|
||||||
|
|
||||||
|
String Literals
|
||||||
|
|
||||||
|
#### Punctuation
|
||||||
|
|
||||||
|
```
|
||||||
|
++ -- @ $ + -
|
||||||
|
* / % ~ & |
|
||||||
|
^ << >> = += -=
|
||||||
|
*= /= %= &= |= ^=
|
||||||
|
<<= >>= ! && || ==
|
||||||
|
!= < <= > >= ?
|
||||||
|
{ } [ ] ( )
|
||||||
|
. -> , ; :
|
||||||
|
```
|
||||||
|
|
||||||
|
### Syntax
|
||||||
|
|
||||||
|
#### Operator Operand Counts
|
||||||
|
|
||||||
|
- Unary: `++`, `--`, `@`, `$`
|
||||||
|
- Binary: `[ ]`, `+`, `-`, `*`, `/`, `%`, `~`, `&`, `|`, `^`, `<<`, `>>`, `=`,
|
||||||
|
`+=`, `-=`, `*=`, `/=`, `%=`, `&=`, `|=`, `^=`, `<<=`, `>>=`, `!`, `&&`, `||`,
|
||||||
|
`==`, `!=`, `<`, `<=`, `>`, `>=`
|
||||||
|
- Ternary: `? :`
|
||||||
|
|
||||||
|
#### Operator Precedence
|
||||||
|
|
||||||
|
- Assignment: `=`, `+=`, `-=`, `*=`, `/=`, `%=`, `&=`, `|=`, `^=`, `<<=`, `>>=`
|
||||||
|
- Ternary Operator: `? :`
|
||||||
|
- Equality and order testing: `==`, `!=`, `<`, `<=`, `>`, `>=`
|
||||||
|
- Boolean logic: `!`, `&&`, `||`
|
||||||
|
- Arithmetic: `+`, `-`, `*`, `/`, `%`
|
||||||
|
- Bitwise: `~`, `&`, `|`, `^`, `<<`, `>>`
|
||||||
|
- Increment and decrement: `++`, `--`
|
||||||
|
- Reference and dereference: `@`, `$`, `[ ]`
|
||||||
|
|
||||||
|
### Semantics
|
||||||
|
|
||||||
|
### Scratch Area
|
||||||
|
|
||||||
|
#### Keywords
|
||||||
|
|
||||||
|
- Types: `unsigned`, `int`, `fixed`, `float`
|
||||||
|
- Structural: `if`, `else`, `do`, `while`, `for`, `break`, `continue`
|
||||||
|
- Constants: `True`, `False`, `None`
|
||||||
|
- Other: `struct`, `fn`, `enum`, `static`, `pub`, `let`
|
||||||
|
|
||||||
|
#### Delimiters
|
||||||
|
|
||||||
|
```
|
||||||
|
. -> , ( )
|
||||||
|
{ } [ ] ;
|
||||||
|
:
|
||||||
|
```
|
||||||
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
# Kyler Olsen
|
||||||
|
# Feb 2024
|
||||||
|
|
||||||
|
|
||||||
|
class FileInfo:
    """Location of a span of source text: file name, line, column, length.

    The values are stored exactly as given; no validation or normalisation
    is performed.  They are exposed through read-only properties so that
    consumers (error reporting, later compiler stages) do not need to reach
    into the private attributes.
    """

    _filename: str
    _line: int
    _col: int
    _length: int

    def __init__(
        self,
        filename: str,
        line: int,
        col: int,
        length: int,
    ):
        """Record the location.

        Args:
            filename: Name of the source file the span belongs to.
            line: Line number of the start of the span.
            col: Column number of the start of the span.
            length: Number of characters in the span.
        """
        self._filename = filename
        self._line = line
        self._col = col
        self._length = length

    def __repr__(self) -> str:
        """Return a constructor-style representation, useful when debugging."""
        return (
            f"{type(self).__name__}({self._filename!r}, "
            f"{self._line!r}, {self._col!r}, {self._length!r})"
        )

    @property
    def filename(self) -> str:
        """Name of the source file."""
        return self._filename

    @property
    def line(self) -> int:
        """Line number of the start of the span."""
        return self._line

    @property
    def col(self) -> int:
        """Column number of the start of the span."""
        return self._col

    @property
    def length(self) -> int:
        """Length of the span in characters."""
        return self._length
|
||||||
|
|
||||||
|
|
||||||
|
class CompilerError(Exception):
    """Base class for errors raised by the compiler."""
|
|
@ -0,0 +1,228 @@
|
||||||
|
# Kyler Olsen
|
||||||
|
# Feb 2024
|
||||||
|
|
||||||
|
from enum import Enum
|
||||||
|
from typing import ClassVar, Sequence, TextIO
|
||||||
|
|
||||||
|
from .compiler_types import CompilerError, FileInfo
|
||||||
|
|
||||||
|
|
||||||
|
class _InterTokenType(Enum):
    """Internal lexer state: the kind of token currently being accumulated."""

    # No token in progress; the next character decides which state to enter.
    Generic = 'Generic'
    # Entered on '#'.
    Directive = 'Directive'
    # Entered on '//'.
    SingleLineComment = 'SingleLineComment'
    # Entered on '/*'.
    MultiLineComment = 'MultiLineComment'
    # Entered on an identifier-start character; ends up as either a
    # Keyword or an Identifier token.
    Word = 'Word'
    # Entered on a digit (or a '.' that introduces a number).
    NumberLiteral = 'NumberLiteral'
    # Entered on a single quote.
    CharLiteral = 'CharLiteral'
    # Entered on a double quote.
    StringLiteral = 'StringLiteral'
    # Entered on any character listed in _Punctuation_Any.
    Punctuation = 'Punctuation'
|
||||||
|
|
||||||
|
|
||||||
|
# States that can only be ended by the end of the line: while in one of
# these the lexer appends every character it sees.
_OnlyNewLineTerminatedTokens = (
    _InterTokenType.Directive,
    _InterTokenType.SingleLineComment,
)

# States that the end of a line terminates.  The first two are the ones
# above; the others can also be ended earlier by a non-matching character.
_NewLineTerminatedTokens = _OnlyNewLineTerminatedTokens + (
    _InterTokenType.Word,
    _InterTokenType.NumberLiteral,
    _InterTokenType.Punctuation,
)

# States that must be closed on the line they were opened on; reaching a
# new line while in one of these makes the lexer raise
# LexerError("Unexpected Newline").
_NewLineErrorTokens = (
    _InterTokenType.CharLiteral,
    _InterTokenType.StringLiteral,
)
|
||||||
|
|
||||||
|
# Characters that may begin a word (identifier or keyword): ASCII letters
# and the underscore.
_ID_Start = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" "_"

# Characters that may follow the first character of a word: everything in
# _ID_Start plus the decimal digits.
_ID_Continue = _ID_Start + "0123456789"

# Reserved words.  A completed word found in this tuple is emitted as a
# Keyword token, any other word as an Identifier token.
_Keywords = (
    'struct', 'fn', 'enum', 'static',
    'if', 'else', 'do', 'while',
    'for', 'pub', 'let', 'break',
    'continue', 'True', 'False', 'None',
    'unsigned', 'int', 'fixed', 'float',
)

# Characters that may begin a number literal: the decimal digits.
_Num_Start = "0123456789"

# Extra characters the lexer allows close to the start of a number literal.
# NOTE(review): 'b', 'o' and 'x' look like base prefixes (0b / 0o / 0x) and
# 'E' / 'e' like an exponent marker -- confirm against the language spec.
_Num_Second = _Num_Start + "box._Ee"

# Characters allowed in the remainder of a number literal: decimal digits,
# '.', '_' and the letters A-F / a-f.
_Num_Continue = _Num_Start + "._" "ABCDEF" "abcdef"

# Characters the lexer recognises as punctuation characters.  The string is
# only used for membership tests, so the characters that appear in it more
# than once ('-', '>', '.') are harmless.
_Punctuation_Any = "@$+-*/%~&|^<>=!?{}[]().->,;:"

# Every complete punctuation token.  Accumulated punctuation text that is
# not in this tuple is rejected with LexerError("Invalid Punctuation").
_Punctuation = (
    "++", "--", "@", "$", "+", "-",
    "*", "/", "%", "~", "&", "|",
    "^", "<<", ">>", "=", "+=", "-=",
    "*=", "/=", "%=", "&=", "|=", "^=",
    "<<=", ">>=", "!", "&&", "||", "==",
    "!=", "<", "<=", ">", ">=", "?",
    "{", "}", "[", "]", "(", ")",
    ".", "->", ",", ";", ":",
)
|
||||||
|
|
||||||
|
|
||||||
|
class LexerError(CompilerError):
    """Error raised by the lexer when the source text cannot be tokenised."""
|
||||||
|
|
||||||
|
|
||||||
|
class Token:
    """A piece of source text together with the location it came from.

    Subclasses override the class-level `_type` tag to name the kind of
    token they represent.
    """

    _type: ClassVar[str] = 'Generic'
    _value: str
    _file_info: FileInfo

    def __init__(self, value: str, file_info: FileInfo):
        """Store the token text and its source location unchanged."""
        self._file_info = file_info
        self._value = value

    @property
    def value(self) -> str:
        """The token text, as given to the constructor."""
        return self._value

    @property
    def file_info(self) -> FileInfo:
        """The source location, as given to the constructor."""
        return self._file_info
|
||||||
|
|
||||||
|
class Directive(Token):
    """Token type for a compiler directive."""

    _type = 'Directive'


class Identifier(Token):
    """Token type for a word that is not a keyword."""

    _type = 'Identifier'


class Keyword(Token):
    """Token type for a reserved word."""

    _type = 'Keyword'


class NumberLiteral(Token):
    """Token type for a number literal."""

    _type = 'NumberLiteral'


class CharLiteral(Token):
    """Token type for a character literal."""

    _type = 'CharLiteral'


class StringLiteral(Token):
    """Token type for a string literal."""

    _type = 'StringLiteral'


class Punctuation(Token):
    """Token type for an operator or delimiter."""

    _type = 'Punctuation'
|
||||||
|
|
||||||
|
|
||||||
|
def _make_token(
    token_type: _InterTokenType,
    text: str,
    file_info: FileInfo,
) -> Token | None:
    """Build the finished token for `text` accumulated in state `token_type`.

    Returns None for states that produce no token (the two comment states
    and the idle `Generic` state).

    Raises:
        LexerError: if a word is longer than 15 characters, or if the text
            of a punctuation token is not listed in `_Punctuation`.
    """
    if token_type is _InterTokenType.Directive:
        return Directive(text, file_info)
    if token_type is _InterTokenType.Word:
        if len(text) > 15:
            raise LexerError("Identifier Too Long")
        if text in _Keywords:
            return Keyword(text, file_info)
        return Identifier(text, file_info)
    if token_type is _InterTokenType.NumberLiteral:
        return NumberLiteral(text, file_info)
    if token_type is _InterTokenType.CharLiteral:
        return CharLiteral(text, file_info)
    if token_type is _InterTokenType.StringLiteral:
        return StringLiteral(text, file_info)
    if token_type is _InterTokenType.Punctuation:
        if text not in _Punctuation:
            raise LexerError("Invalid Punctuation")
        return Punctuation(text, file_info)
    return None


def lexer(file: str | TextIO, filename: str) -> Sequence[Token]:
    """Split source text into tokens.

    The text is scanned line by line, character by character, with a small
    state machine (`_InterTokenType`).  Comments produce no tokens.
    Characters that start no token (whitespace, but also any character the
    lexer does not recognise) are skipped silently.

    Args:
        file: The source text, or an open text stream to read it from.
        filename: File name recorded in every token's `FileInfo`.

    Returns:
        The tokens in source order.  Each token's `FileInfo` holds the
        1-based line and column of its first character and its length.

    Raises:
        LexerError: on a character or string literal that is not closed on
            the line it starts on, on a literal or multi-line comment still
            open at the end of the input, on a character literal holding
            more than one character (or one escape sequence), and on a word
            longer than 15 characters.
    """
    if not isinstance(file, str):
        file = file.read()
    tokens: list[Token] = []
    current: str = ""
    current_line: int = 0
    current_col: int = 0
    escaped: bool = False
    token_type: _InterTokenType = _InterTokenType.Generic

    def flush() -> None:
        # Emits the token in progress, if the current state produces one.
        # Reads the enclosing state at call time, so it must be called
        # before `token_type` is reset.
        fi = FileInfo(filename, current_line, current_col, len(current))
        token = _make_token(token_type, current, fi)
        if token is not None:
            tokens.append(token)

    for line, line_str in enumerate(file.splitlines()):
        # Handle the line break that precedes this line.
        if token_type in _NewLineErrorTokens:
            raise LexerError("Unexpected Newline")
        if token_type in _NewLineTerminatedTokens:
            flush()
            token_type = _InterTokenType.Generic
        elif token_type is _InterTokenType.MultiLineComment:
            # Keep the line break in the comment text so that a '*' ending
            # one line and a '/' starting the next do not form "*/".
            current += "\n"

        for col, char in enumerate(line_str):
            # One character of look-ahead; "" at the end of the line
            # (slicing never raises, unlike indexing).
            next_char = line_str[col + 1:col + 2]

            if token_type in _OnlyNewLineTerminatedTokens:
                current += char
            elif token_type is _InterTokenType.MultiLineComment:
                # len >= 3 ensures the '*' is not the one from the opening
                # "/*", so "/*/" does not close the comment.
                if len(current) >= 3 and current[-1] == '*' and char == '/':
                    token_type = _InterTokenType.Generic
                    continue
                current += char
            elif token_type is _InterTokenType.Word:
                if char in _ID_Continue:
                    current += char
                else:
                    flush()
                    token_type = _InterTokenType.Generic
            elif token_type is _InterTokenType.NumberLiteral:
                if (
                    (len(current) == 1 and char in _Num_Second) or
                    char in _Num_Continue
                ):
                    current += char
                else:
                    flush()
                    token_type = _InterTokenType.Generic
            elif token_type is _InterTokenType.CharLiteral:
                if escaped:
                    escaped = False
                elif char == '\\':
                    escaped = True
                elif char == "'":
                    current += char
                    # Both quotes are part of `current`: a plain character
                    # gives 3 characters, a backslash escape gives 4.
                    limit = 4 if current[1] == '\\' else 3
                    if len(current) > limit:
                        raise LexerError("Character Literal Too Long")
                    flush()
                    token_type = _InterTokenType.Generic
                    continue
                current += char
            elif token_type is _InterTokenType.StringLiteral:
                if escaped:
                    escaped = False
                elif char == '\\':
                    escaped = True
                elif char == '"':
                    current += char
                    flush()
                    token_type = _InterTokenType.Generic
                    continue
                current += char
            elif token_type is _InterTokenType.Punctuation:
                # Longest match: extend only while the result is still a
                # valid punctuation token, so "();" becomes "(", ")", ";".
                # Every prefix of a multi-character token is itself valid.
                if current + char in _Punctuation:
                    current += char
                else:
                    flush()
                    token_type = _InterTokenType.Generic

            # Deliberately a separate `if`: a character that has just ended
            # a token above is examined again here as a possible start of
            # the next one.
            if token_type is _InterTokenType.Generic:
                current = char
                current_line = line + 1
                current_col = col + 1
                escaped = False
                if char == '#':
                    token_type = _InterTokenType.Directive
                elif char == '/' and next_char == '/':
                    token_type = _InterTokenType.SingleLineComment
                elif char == '/' and next_char == '*':
                    token_type = _InterTokenType.MultiLineComment
                elif char in _ID_Start:
                    token_type = _InterTokenType.Word
                elif char == '.' and next_char and next_char in _Num_Start:
                    # A leading '.' starts a number only when a digit
                    # follows; otherwise it is punctuation (as in `a.b`).
                    # `next_char` must be tested for emptiness first
                    # because "" is "in" every string.
                    token_type = _InterTokenType.NumberLiteral
                elif char in _Num_Start:
                    token_type = _InterTokenType.NumberLiteral
                elif char == "'":
                    token_type = _InterTokenType.CharLiteral
                elif char == '"':
                    token_type = _InterTokenType.StringLiteral
                elif char in _Punctuation_Any:
                    token_type = _InterTokenType.Punctuation

    # End of input: nothing may be left open, and the token in progress on
    # the final line still has to be emitted.
    if (
        token_type in _NewLineErrorTokens or
        token_type is _InterTokenType.MultiLineComment
    ):
        raise LexerError("Unexpected End of File")
    flush()

    return tokens
|
Loading…
Reference in New Issue