Started lexer in the compiler
parent 1dc6c9ab73
commit 83224b3acb
@@ -2,7 +2,8 @@
*Yeahbut, aka Kyler Olsen*

It is a custom computer and instruction set architecture. It also has its own
-assembly language with assembler. Custom high level language coming soon!
+assembly language with assembler written in Python. Custom high level language
+coming soon!

## ISA

*WIP*
@@ -63,10 +64,107 @@ assembly language with assembler. Custom high level language coming soon!
## High Level Language

*WIP*

-Paradigm: Multi-Paradigm: Procedural (Imperative), Structured
-Designer: Kyler Olsen
-First Appeared: *Future*
-Typing Discipline: Typeless
-Platform: ytd 12-bit computer, ytd 12-bit emulator (multi-platform)
-License: *Tentatively MIT*
-Filename extension: `.ytd12c`
+About
+
+- Paradigm: Multi-Paradigm: Procedural (Imperative), Structured
+- Designer: Kyler Olsen
+- Created: *Future*
+- Typing Discipline: ~~Typeless~~ Static, Weak
+- Platform: ytd 12-bit computer, ytd 12-bit emulator (multi-platform)
+- License: *Tentatively MIT*
+- Filename extension: `.ytd12c`

### Lexical

#### Directives

Directives start with `#` and end at the end of a line.
They are used to give additional instructions to the compiler.

#### Comments

Comments can be either single-line or multi-line.

Single-line comments start with `//` and end at the end of a line.

Multi-line comments start with `/*` and end with `*/`.
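
As a rough illustration of these delimiters, here is a small Python sketch (not part of the compiler) that recognizes directives and both comment forms in a source string; the sample text is made up for the example. In the real compiler, directives are kept as tokens while comments are discarded.

```
import re

# Multi-line comments are matched first so that "//" or "#" inside a
# /* ... */ block is matched as part of that block.
_COMMENT_OR_DIRECTIVE = re.compile(
    r"/\*.*?\*/"      # multi-line comments: /* ... */
    r"|//[^\n]*"      # single-line comments: // to end of line
    r"|#[^\n]*",      # directives: # to end of line
    re.DOTALL,
)

sample = """#some_directive
let a;  // single-line comment
/* multi-line
   comment */ let b;
"""

print(_COMMENT_OR_DIRECTIVE.findall(sample))
```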

#### Identifiers

Identifiers can be up to 15 characters in length and are case sensitive.
They cannot be a keyword.

```
Identifier ::= ID_Start ID_Continue*
ID_Start ::= <Any latin alphabet letter or an underscore: "A"-"Z", "a"-"z", "_">
ID_Continue ::= ID_Start | <Any decimal digit: "0"-"9">
```
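
For illustration, an equivalent check in Python that combines this grammar with the length and keyword rules; the function name is illustrative and the keyword tuple mirrors the one in the lexer below.

```
import re

_KEYWORDS = (
    'struct', 'fn', 'enum', 'static', 'if', 'else', 'do', 'while',
    'for', 'pub', 'let', 'break', 'continue', 'True', 'False', 'None',
    'unsigned', 'int', 'fixed', 'float',
)

# Identifier ::= ID_Start ID_Continue*  (a letter or underscore, then
# letters, underscores, or digits), at most 15 characters total.
_IDENTIFIER = re.compile(r"[A-Za-z_][A-Za-z0-9_]*")

def is_identifier(text: str) -> bool:
    return (
        len(text) <= 15
        and text not in _KEYWORDS
        and _IDENTIFIER.fullmatch(text) is not None
    )

assert is_identifier("loop_count")
assert not is_identifier("while")                    # keyword
assert not is_identifier("a_very_long_identifier")   # longer than 15 characters
```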

#### Keywords

```
struct fn enum static
if else do while
for pub let break
continue True False None
unsigned int fixed float
```

#### Literals

Number Literals

String Literals

#### Punctuation

```
++ -- @ $ + -
* / % ~ & |
^ << >> = += -=
*= /= %= &= |= ^=
<<= >>= ! && || ==
!= < <= > >= ?
{ } [ ] ( )
. -> , ; :
```

### Syntax

#### Operator Operand Counts

- Unary: `++`, `--`, `@`, `$`
- Binary: `[ ]`, `+`, `-`, `*`, `/`, `%`, `~`, `&`, `|`, `^`, `<<`, `>>`, `=`,
  `+=`, `-=`, `*=`, `/=`, `%=`, `&=`, `|=`, `^=`, `<<=`, `>>=`, `!`, `&&`, `||`,
  `==`, `!=`, `<`, `<=`, `>`, `>=`
- Ternary: `? :`

#### Operator Precedence

- Assignment: `=`, `+=`, `-=`, `*=`, `/=`, `%=`, `&=`, `|=`, `^=`, `<<=`, `>>=`
- Ternary Operator: `? :`
- Equality and order testing: `==`, `!=`, `<`, `<=`, `>`, `>=`
- Boolean logic: `!`, `&&`, `||`
- Arithmetic: `+`, `-`, `*`, `/`, `%`
- Bitwise: `~`, `&`, `|`, `^`, `<<`, `>>`
- Increment and decrement: `++`, `--`
- Reference and dereference: `@`, `$`, `[ ]`
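
This list could eventually be encoded as a table for the parser. A minimal Python sketch, assuming the levels above are ordered from loosest to tightest binding; the names below are illustrative and not part of the commit.

```
# Loosest to tightest binding, following the list above.
OPERATOR_PRECEDENCE: tuple[tuple[str, ...], ...] = (
    ('=', '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>='),
    ('? :',),
    ('==', '!=', '<', '<=', '>', '>='),
    ('!', '&&', '||'),
    ('+', '-', '*', '/', '%'),
    ('~', '&', '|', '^', '<<', '>>'),
    ('++', '--'),
    ('@', '$', '[ ]'),
)

def precedence(op: str) -> int:
    """Return the precedence level of an operator (higher binds tighter)."""
    for level, ops in enumerate(OPERATOR_PRECEDENCE):
        if op in ops:
            return level
    raise ValueError(f"unknown operator: {op}")
```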

### Semantics

### Scratch Area

#### Keywords

- Types: `unsigned`, `int`, `fixed`, `float`
- Structural: `if`, `else`, `do`, `while`, `for`, `break`, `continue`
- Constants: `True`, `False`, `None`
- Other: `struct`, `fn`, `enum`, `static`, `pub`, `let`

#### Delimiters

```
. -> , ( )
{ } [ ] ;
:
```

@@ -0,0 +1,25 @@
# Kyler Olsen
# Feb 2024


class FileInfo:

    _filename: str
    _line: int
    _col: int
    _length: int

    def __init__(
        self,
        filename: str,
        line: int,
        col: int,
        length: int,
    ):
        self._filename = filename
        self._line = line
        self._col = col
        self._length = length


class CompilerError(Exception): pass

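These two classes are shared with the lexer below: every token carries a FileInfo, and LexerError derives from CompilerError. A minimal sketch of that relationship; the import path, file name, and ExampleError class are illustrative, not part of the commit.

```
from compiler_types import CompilerError, FileInfo  # import path is a guess

# Each token records the file, line, and column where it starts and how
# many characters it covers.
fi = FileInfo("example.ytd12c", line=3, col=7, length=5)

# Errors reported by the compiler derive from CompilerError; the lexer
# below defines LexerError the same way.
class ExampleError(CompilerError): pass

try:
    raise ExampleError("Identifier Too Long")
except CompilerError as exc:
    print(type(exc).__name__, exc)
```
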
@@ -0,0 +1,228 @@
# Kyler Olsen
# Feb 2024

from enum import Enum
from typing import ClassVar, Sequence, TextIO

from .compiler_types import CompilerError, FileInfo


class _InterTokenType(Enum):
    # Intermediate state of the token currently being scanned.
    Generic = 'Generic'
    Directive = 'Directive'
    SingleLineComment = 'SingleLineComment'
    MultiLineComment = 'MultiLineComment'
    Word = 'Word'
    NumberLiteral = 'NumberLiteral'
    CharLiteral = 'CharLiteral'
    StringLiteral = 'StringLiteral'
    Punctuation = 'Punctuation'


_OnlyNewLineTerminatedTokens = (
    _InterTokenType.Directive,
    _InterTokenType.SingleLineComment,
)

_NewLineTerminatedTokens = _OnlyNewLineTerminatedTokens + (
    _InterTokenType.Word,
    _InterTokenType.NumberLiteral,
    _InterTokenType.Punctuation,
)

_NewLineErrorTokens = (
    _InterTokenType.CharLiteral,
    _InterTokenType.StringLiteral,
)

_ID_Start = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" "_"

_ID_Continue = _ID_Start + "0123456789"

_Keywords = (
    'struct', 'fn', 'enum', 'static',
    'if', 'else', 'do', 'while',
    'for', 'pub', 'let', 'break',
    'continue', 'True', 'False', 'None',
    'unsigned', 'int', 'fixed', 'float',
)

_Num_Start = "0123456789"

# Characters allowed as the second character of a number literal
# (base prefixes b/o/x, decimal point, separator, and exponent).
_Num_Second = _Num_Start + "box._Ee"

# Characters allowed after that (digits, point, separator, hex digits).
_Num_Continue = _Num_Start + "._" "ABCDEF" "abcdef"

_Punctuation_Any = "@$+-*/%~&|^<>=!?{}[]().->,;:"

_Punctuation = (
    "++", "--", "@", "$", "+", "-",
    "*", "/", "%", "~", "&", "|",
    "^", "<<", ">>", "=", "+=", "-=",
    "*=", "/=", "%=", "&=", "|=", "^=",
    "<<=", ">>=", "!", "&&", "||", "==",
    "!=", "<", "<=", ">", ">=", "?",
    "{", "}", "[", "]", "(", ")",
    ".", "->", ",", ";", ":",
)


class LexerError(CompilerError): pass


class Token:

    _type: ClassVar[str] = 'Generic'
    _value: str
    _file_info: FileInfo

    def __init__(self, value: str, file_info: FileInfo):
        self._value = value
        self._file_info = file_info

    @property
    def value(self) -> str: return self._value

    @property
    def file_info(self) -> FileInfo: return self._file_info


class Directive(Token): _type = 'Directive'
class Identifier(Token): _type = 'Identifier'
class Keyword(Token): _type = 'Keyword'
class NumberLiteral(Token): _type = 'NumberLiteral'
class CharLiteral(Token): _type = 'CharLiteral'
class StringLiteral(Token): _type = 'StringLiteral'
class Punctuation(Token): _type = 'Punctuation'

def lexer(file: str | TextIO, filename: str) -> Sequence[Token]:
    """Split source text into a sequence of tokens."""
    if not isinstance(file, str):
        file = file.read()
    tokens: list[Token] = []
    current: str = ""
    current_line: int = 0
    current_col: int = 0
    escaped: bool = False
    token_type: _InterTokenType = _InterTokenType.Generic

    for line, line_str in enumerate(file.splitlines()):
        # Close out any token that a newline terminates, and reject tokens
        # that may not span lines, before scanning the next line.
        if token_type in _NewLineErrorTokens:
            raise LexerError("Unexpected Newline")
        if token_type in _NewLineTerminatedTokens:
            fi = FileInfo(filename, current_line, current_col, len(current))
            if token_type is _InterTokenType.Directive:
                tokens.append(Directive(current, fi))
            elif token_type is _InterTokenType.Word:
                if len(current) > 15:
                    raise LexerError("Identifier Too Long")
                if current in _Keywords:
                    tokens.append(Keyword(current, fi))
                else:
                    tokens.append(Identifier(current, fi))
            elif token_type is _InterTokenType.NumberLiteral:
                tokens.append(NumberLiteral(current, fi))
            elif token_type is _InterTokenType.Punctuation:
                if current not in _Punctuation:
                    raise LexerError("Invalid Punctuation")
                tokens.append(Punctuation(current, fi))
            token_type = _InterTokenType.Generic

        for col, char in enumerate(line_str):
            if token_type in _OnlyNewLineTerminatedTokens:
                current += char
            elif token_type is _InterTokenType.MultiLineComment:
                if len(current) >= 2 and current[-1] == '*' and char == '/':
                    token_type = _InterTokenType.Generic
                    continue
                current += char
            elif token_type is _InterTokenType.Word:
                if char in _ID_Continue:
                    current += char
                else:
                    if len(current) > 15:
                        raise LexerError("Identifier Too Long")
                    fi = FileInfo(
                        filename, current_line, current_col, len(current))
                    if current in _Keywords:
                        tokens.append(Keyword(current, fi))
                    else:
                        tokens.append(Identifier(current, fi))
                    token_type = _InterTokenType.Generic
            elif token_type is _InterTokenType.NumberLiteral:
                if (
                    (len(current) == 1 and char in _Num_Second) or
                    (char in _Num_Continue)
                ):
                    current += char
                else:
                    fi = FileInfo(
                        filename, current_line, current_col, len(current))
                    tokens.append(NumberLiteral(current, fi))
                    token_type = _InterTokenType.Generic
            elif token_type is _InterTokenType.CharLiteral:
                if escaped:
                    escaped = False
                elif char == '\\':
                    escaped = True
                elif char == "'":
                    current += char
                    # A character literal holds one (possibly escaped)
                    # character between the quotes.
                    if (
                        (current[1] != '\\' and len(current) > 3) or
                        len(current) > 4
                    ):
                        raise LexerError("Character Literal Too Long")
                    fi = FileInfo(
                        filename, current_line, current_col, len(current))
                    tokens.append(CharLiteral(current, fi))
                    token_type = _InterTokenType.Generic
                    continue
                current += char
            elif token_type is _InterTokenType.StringLiteral:
                if escaped:
                    escaped = False
                elif char == '\\':
                    escaped = True
                elif char == '"':
                    current += char
                    fi = FileInfo(
                        filename, current_line, current_col, len(current))
                    tokens.append(StringLiteral(current, fi))
                    token_type = _InterTokenType.Generic
                    continue
                current += char
            elif token_type is _InterTokenType.Punctuation:
                if char in _Punctuation_Any:
                    current += char
                else:
                    if current not in _Punctuation:
                        raise LexerError("Invalid Punctuation")
                    fi = FileInfo(
                        filename, current_line, current_col, len(current))
                    tokens.append(Punctuation(current, fi))
                    token_type = _InterTokenType.Generic

            if token_type is _InterTokenType.Generic:
                # Start a new token at this character.
                current = char
                current_line = line + 1
                current_col = col + 1
                escaped = False
                if char == '#': token_type = _InterTokenType.Directive
                elif char == '/' and col + 1 < len(line_str) and line_str[col+1] == '/':
                    token_type = _InterTokenType.SingleLineComment
                elif char == '/' and col + 1 < len(line_str) and line_str[col+1] == '*':
                    token_type = _InterTokenType.MultiLineComment
                elif char in _ID_Start:
                    token_type = _InterTokenType.Word
                elif char == '.' and col + 1 < len(line_str) and line_str[col+1] in _Num_Second:
                    token_type = _InterTokenType.NumberLiteral
                elif char in _Num_Start:
                    token_type = _InterTokenType.NumberLiteral
                elif char == "'":
                    token_type = _InterTokenType.CharLiteral
                elif char == '"':
                    token_type = _InterTokenType.StringLiteral
                elif char in _Punctuation_Any:
                    token_type = _InterTokenType.Punctuation

    # Note: a token still being scanned when the input ends is not emitted yet.
    return tokens
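
A minimal usage sketch of the lexer on a made-up source snippet. The import path and file name are guesses based on the relative imports above, the snippet only uses lexical forms described in the README, and the exact token stream depends on this in-progress implementation.

```
from compiler.lexer import lexer  # package/module path is a guess

# Made-up ytd12c-style source: a directive, comments, keywords,
# identifiers, number literals, and punctuation.
source = """#example_directive
let count = 0;
/* a multi-line
   comment */
count += 1;  // increment count
"""

for token in lexer(source, "example.ytd12c"):
    print(type(token).__name__, repr(token.value))
```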