From 83224b3acb716caf27d298ef7d10c7eb835bba18 Mon Sep 17 00:00:00 2001
From: Kyler <59854022+KylerOlsen@users.noreply.github.com>
Date: Mon, 26 Feb 2024 00:06:45 -0700
Subject: [PATCH] Started lexer in the compiler

---
 docs/ytd 12-bit computer.md | 114 +++++++++++++-
 pytd12dk/compiler/compiler_types.py | 25 +++
 pytd12dk/compiler/lexer.py | 228 ++++++++++++++++++++++++++++
 3 files changed, 359 insertions(+), 8 deletions(-)
 create mode 100644 pytd12dk/compiler/compiler_types.py
 create mode 100644 pytd12dk/compiler/lexer.py

diff --git a/docs/ytd 12-bit computer.md b/docs/ytd 12-bit computer.md
index 8af8ba1..fc2d846 100644
--- a/docs/ytd 12-bit computer.md
+++ b/docs/ytd 12-bit computer.md
@@ -2,7 +2,8 @@
 *Yeahbut, aka Kyler Olsen*
 
 It is a custom computer and instruction set architecture. It also has its own
-assembly language with assembler. Custom high level language coming soon!
+assembly language with an assembler written in Python. A custom high level
+language is coming soon!
 
 ## ISA
 *WIP*
@@ -63,10 +64,107 @@ assembly language with assembler. Custom high level language coming soon!
 ## High Level Language
 *WIP*
 
-Paradigm: Multi-Paradigm: Procedural (Imperative), Structured
-Designer: Kyler Olsen
-First Appeared: *Future*
-Typing Discipline: Typeless
-Platform: ytd 12-bit computer, ytd 12-bit emulator (multi-platform)
-License: *Tentatively MIT*
-Filename extension: `.ytd12c`
+About
+- Paradigm: Multi-Paradigm: Procedural (Imperative), Structured
+- Designer: Kyler Olsen
+- Created: *Future*
+- Typing Discipline: ~~Typeless~~ Static, Weak
+- Platform: ytd 12-bit computer, ytd 12-bit emulator (multi-platform)
+- License: *Tentatively MIT*
+- Filename extension: `.ytd12c`
+
+### Lexical
+
+#### Directives
+
+Directives start with `#` and end at the end of a line.
+They are used to give additional instructions to the compiler.
+
+#### Comments
+
+Comments can either be single-line comments or multi-line comments.
+
+Single-line comments start with `//` and end at the end of a line.
+
+Multi-line comments start with `/*` and end with `*/`.
+
+
+#### Identifiers
+
+Identifiers can be up to 15 characters in length and are case sensitive.
+They cannot be a keyword.
+
+```
+Identifier ::= ID_Start ID_Continue*
+ID_Start ::= "A"..."Z" | "a"..."z" | "_"
+ID_Continue ::= ID_Start | "0"..."9"
+```
+
+#### Keywords
+
+```
+struct fn enum static
+if else do while
+for pub let break
+continue True False None
+unsigned int fixed float
+```
+
+#### Literals
+
+Number Literals
+
+String Literals
+
+#### Punctuation
+
+```
+++ -- @ $ + -
+* / % ~ & |
+^ << >> = += -=
+*= /= %= &= |= ^=
+<<= >>= ! && || ==
+!= < <= > >= ?
+{ } [ ] ( )
+. -> , ; :
+```
+
+### Syntax
+
+#### Operator Operand Counts
+
+- Unary: `++`, `--`, `@`, `$`, `~`, `!`
+- Binary: `[ ]`, `+`, `-`, `*`, `/`, `%`, `&`, `|`, `^`, `<<`, `>>`, `=`,
+`+=`, `-=`, `*=`, `/=`, `%=`, `&=`, `|=`, `^=`, `<<=`, `>>=`, `&&`, `||`,
+`==`, `!=`, `<`, `<=`, `>`, `>=`
+- Ternary: `? :`
+
+#### Operator Precedence
+
+- Assignment: `=`, `+=`, `-=`, `*=`, `/=`, `%=`, `&=`, `|=`, `^=`, `<<=`, `>>=`
+- Ternary Operator: `? 
:` +- Equality and order testing: `==`, `!=`, `<`, `<=`, `>`, `>=` +- Boolean logic: `!`, `&&`, `||` +- Arithmetic: `+`, `-`, `*`, `/`, `%` +- Bitwise: `~`, `&`, `|`, `^`, `<<`, `>>` +- Increment and decrement: `++`, `--` +- Reference and dereference: `@`, `$`, `[ ]` + +### Semantics + +### Scratch Area + +#### Keywords + +- Types: `unsigned`, `int`, `fixed`, `float` +- Structural: `if`, `else`, `do`, `while`, `for`, `break`, `continue` +- Constants: `True`, `False`, `None` +- Other: `struct`, `fn`, `enum`, `static`, `pub`, `let` + +#### Delimiters + +``` +. -> , ( ) +{ } [ ] ; +: +``` diff --git a/pytd12dk/compiler/compiler_types.py b/pytd12dk/compiler/compiler_types.py new file mode 100644 index 0000000..22b25e6 --- /dev/null +++ b/pytd12dk/compiler/compiler_types.py @@ -0,0 +1,25 @@ +# Kyler Olsen +# Feb 2024 + + +class FileInfo: + + _filename: str + _line: int + _col: int + _length: int + + def __init__( + self, + filename: str, + line: int, + col: int, + length: int, + ): + self._filename = filename + self._line = line + self._col = col + self._length = length + + +class CompilerError(Exception): pass diff --git a/pytd12dk/compiler/lexer.py b/pytd12dk/compiler/lexer.py new file mode 100644 index 0000000..e93d8f3 --- /dev/null +++ b/pytd12dk/compiler/lexer.py @@ -0,0 +1,228 @@ +# Kyler Olsen +# Feb 2024 + +from enum import Enum +from typing import ClassVar, Sequence, TextIO + +from .compiler_types import CompilerError, FileInfo + + +class _InterTokenType(Enum): + Generic = 'Generic' + Directive = 'Directive' + SingleLineComment = 'SingleLineComment' + MultiLineComment = 'MultiLineComment' + Word = 'Word' + NumberLiteral = 'NumberLiteral' + CharLiteral = 'CharLiteral' + StringLiteral = 'StringLiteral' + Punctuation = 'Punctuation' + + +_OnlyNewLineTerminatedTokens = ( + _InterTokenType.Directive, + _InterTokenType.SingleLineComment, +) + +_NewLineTerminatedTokens = _OnlyNewLineTerminatedTokens + ( + _InterTokenType.Word, + _InterTokenType.NumberLiteral, + _InterTokenType.Punctuation, +) + +_NewLineErrorTokens = ( + _InterTokenType.CharLiteral, + _InterTokenType.StringLiteral, +) + +_ID_Start = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" "_" + +_ID_Continue = _ID_Start + "0123456789" + +_Keywords = ( + 'struct', 'fn', 'enum', 'static', + 'if', 'else', 'do', 'while', + 'for', 'pub', 'let', 'break', + 'continue', 'True', 'False', 'None', + 'unsigned', 'int', 'fixed', 'float', +) + +_Num_Start = "0123456789" + +_Num_Second = _Num_Start + "box._Ee" + +_Num_Continue = _Num_Start + "._" "ABCDEF" "abcdef" + +_Punctuation_Any = "@$+-*/%~&|^<>=!?{}[]().->,;:" + +_Punctuation = ( + "++", "--", "@", "$", "+", "-", + "*", "/", "%", "~", "&", "|", + "^", "<<", ">>", "=", "+=", "-=", + "*=", "/=", "%=", "&=", "|=", "^=", + "<<=", ">>=", "!", "&&", "||", "==", + "!=", "<", "<=", ">", ">=", "?", + "{", "}", "[", "]", "(", ")", + ".", "->", ",", ";", ":", +) + + +class LexerError(CompilerError): pass + + +class Token: + + _type: ClassVar[str] = 'Generic' + _value: str + _file_info: FileInfo + + def __init__(self, value: str, file_info: FileInfo): + self._value = value + self._file_info = file_info + + @property + def value(self) -> str: return self._value + + @property + def file_info(self) -> FileInfo: return self._file_info + +class Directive(Token): _type = 'Directive' +class Identifier(Token): _type = 'Identifier' +class Keyword(Token): _type = 'Keyword' +class NumberLiteral(Token): _type = 'NumberLiteral' +class CharLiteral(Token): _type = 'CharLiteral' +class StringLiteral(Token): 
_type = 'StringLiteral'
+class Punctuation(Token): _type = 'Punctuation'
+
+
+def lexer(file: str | TextIO, filename: str) -> Sequence[Token]:
+    if not isinstance(file, str):
+        file = file.read()
+    tokens: list[Token] = []
+    current: str = ""
+    current_line: int = 0
+    current_col: int = 0
+    escaped: bool = False
+    token_type: _InterTokenType = _InterTokenType.Generic
+
+    for line, line_str in enumerate(file.splitlines()):
+        # tokens that cannot span a newline are finished (or rejected) here
+        if token_type in _NewLineErrorTokens:
+            raise LexerError("Unexpected Newline")
+        if token_type in _NewLineTerminatedTokens:
+            fi = FileInfo(filename, current_line, current_col, len(current))
+            if token_type is _InterTokenType.Directive:
+                tokens.append(Directive(current, fi))
+            elif token_type is _InterTokenType.Word:
+                if len(current) > 15:
+                    raise LexerError("Identifier Too Long")
+                if current in _Keywords:
+                    tokens.append(Keyword(current, fi))
+                else:
+                    tokens.append(Identifier(current, fi))
+            elif token_type is _InterTokenType.NumberLiteral:
+                tokens.append(NumberLiteral(current, fi))
+            elif token_type is _InterTokenType.Punctuation:
+                if current not in _Punctuation:
+                    raise LexerError("Invalid Punctuation")
+                tokens.append(Punctuation(current, fi))
+            token_type = _InterTokenType.Generic
+
+        for col, char in enumerate(line_str):
+            if token_type in _OnlyNewLineTerminatedTokens:
+                current += char
+            elif token_type is _InterTokenType.MultiLineComment:
+                if len(current) >= 2 and current[-1] == '*' and char == '/':
+                    token_type = _InterTokenType.Generic
+                    continue
+                current += char
+            elif token_type is _InterTokenType.Word:
+                if char in _ID_Continue:
+                    current += char
+                else:
+                    if len(current) > 15:
+                        raise LexerError("Identifier Too Long")
+                    fi = FileInfo(
+                        filename, current_line, current_col, len(current))
+                    if current in _Keywords:
+                        tokens.append(Keyword(current, fi))
+                    else:
+                        tokens.append(Identifier(current, fi))
+                    token_type = _InterTokenType.Generic
+            elif token_type is _InterTokenType.NumberLiteral:
+                # the second character may be a base or exponent marker
+                # (0b, 0o, 0x, e, ...); later characters must come from
+                # _Num_Continue
+                if (
+                    (len(current) == 1 and char in _Num_Second) or
+                    (char in _Num_Continue)
+                ):
+                    current += char
+                else:
+                    fi = FileInfo(
+                        filename, current_line, current_col, len(current))
+                    tokens.append(NumberLiteral(current, fi))
+                    token_type = _InterTokenType.Generic
+            elif token_type is _InterTokenType.CharLiteral:
+                if escaped:
+                    escaped = False
+                elif char == '\\':
+                    escaped = True
+                elif char == "'":
+                    current += char
+                    # a plain literal is "'x'" (3 chars), an escaped one is
+                    # "'\x'" (4 chars)
+                    if (
+                        (current[1] != '\\' and len(current) > 3) or
+                        len(current) > 4
+                    ):
+                        raise LexerError("Character Literal Too Long")
+                    fi = FileInfo(
+                        filename, current_line, current_col, len(current))
+                    tokens.append(CharLiteral(current, fi))
+                    token_type = _InterTokenType.Generic
+                    continue
+                current += char
+            elif token_type is _InterTokenType.StringLiteral:
+                if escaped:
+                    escaped = False
+                elif char == '\\':
+                    escaped = True
+                elif char == '"':
+                    current += char
+                    fi = FileInfo(
+                        filename, current_line, current_col, len(current))
+                    tokens.append(StringLiteral(current, fi))
+                    token_type = _InterTokenType.Generic
+                    continue
+                current += char
+            elif token_type is _InterTokenType.Punctuation:
+                if char in _Punctuation_Any:
+                    current += char
+                else:
+                    if current not in _Punctuation:
+                        raise LexerError("Invalid Punctuation")
+                    fi = FileInfo(
+                        filename, current_line, current_col, len(current))
+                    tokens.append(Punctuation(current, fi))
+                    token_type = _InterTokenType.Generic
+
+            if token_type is _InterTokenType.Generic:
+                # start of a new token: classify it by its first character;
+                # look-ahead uses a slice or a bounds check so a trailing
+                # '/' or '.' cannot index past the end of the line
+                current = char
+                current_line = line + 1
+                current_col = col + 1
+                escaped = False
+                if char == '#': token_type = _InterTokenType.Directive
+                elif char == '/' and line_str[col+1:col+2] == '/':
+                    token_type = _InterTokenType.SingleLineComment
+                elif char == '/' and line_str[col+1:col+2] == '*':
+                    token_type = _InterTokenType.MultiLineComment
+                elif char in _ID_Start:
+                    token_type = _InterTokenType.Word
+                elif (char == '.' and col + 1 < len(line_str)
+                        and line_str[col+1] in _Num_Second):
+                    token_type = _InterTokenType.NumberLiteral
+                elif char in _Num_Start:
+                    token_type = _InterTokenType.NumberLiteral
+                elif char == "'":
+                    token_type = _InterTokenType.CharLiteral
+                elif char == '"':
+                    token_type = _InterTokenType.StringLiteral
+                elif char in _Punctuation_Any:
+                    token_type = _InterTokenType.Punctuation
+
+    # flush whatever token is still open when the file ends
+    if token_type in _NewLineErrorTokens:
+        raise LexerError("Unexpected End of File")
+    if token_type in _NewLineTerminatedTokens:
+        fi = FileInfo(filename, current_line, current_col, len(current))
+        if token_type is _InterTokenType.Directive:
+            tokens.append(Directive(current, fi))
+        elif token_type is _InterTokenType.Word:
+            if len(current) > 15:
+                raise LexerError("Identifier Too Long")
+            if current in _Keywords:
+                tokens.append(Keyword(current, fi))
+            else:
+                tokens.append(Identifier(current, fi))
+        elif token_type is _InterTokenType.NumberLiteral:
+            tokens.append(NumberLiteral(current, fi))
+        elif token_type is _InterTokenType.Punctuation:
+            if current not in _Punctuation:
+                raise LexerError("Invalid Punctuation")
+            tokens.append(Punctuation(current, fi))
+
+    return tokens
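
A quick way to exercise the new lexer is to hand it a source string and print the tokens it returns. The snippet below is a usage sketch only, not part of the patch: it assumes the pytd12dk package is importable as laid out above, the file name and sample source are invented, and punctuation is kept space-separated because this first version of the lexer treats each unbroken run of punctuation characters as one operator.

```
from pytd12dk.compiler.lexer import lexer

# Invented sample source; the high level language itself is still a WIP.
source = (
    "# target emulator\n"              # directive
    "let total = 0 ;\n"                # keyword, identifier, punctuation
    "total += 5 * 3 ;  // comment\n"   # compound assignment, number literals
)

for token in lexer(source, "example.ytd12c"):
    print(type(token).__name__, repr(token.value))
```

Each line of output names the token class (Directive, Keyword, Identifier, NumberLiteral or Punctuation here) followed by the raw text it was built from; comments are consumed and produce no tokens.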