Refactor lexer to implement token types and enhance token generation logic

This commit is contained in:
Kyler Olsen 2025-12-01 09:14:35 -07:00
parent a15490b521
commit e1c43f7b2e
1 changed file with 223 additions and 8 deletions

View File

@ -1,17 +1,232 @@
/// Every category of token the lexer can produce.
///
/// The enum carries no data, so it is cheap to copy, compare and hash.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenType {
    /// Input the lexer could not classify.
    Illegal,
    /// End of input.
    Eof,
    /// Identifier: starts with an ASCII letter or `_`, continues with ASCII
    /// letters, digits or `_`.
    Ident,
    /// Number literal without a `.`.
    Int,
    /// Number literal containing a `.`.
    Float,
    /// String literal delimited by double quotes.
    Str,

    // Operators
    /// `=`
    Assign,
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Asterisk,
    /// `/`
    Slash,
    /// `!`
    Bang,
    /// `<`
    Lt,
    /// `>`
    Gt,
    /// `==`
    Eq,
    /// `!=`
    NotEq,

    // Delimiters
    /// `,`
    Comma,
    /// `;`
    Semicolon,
    /// `:`
    Colon,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
}
/// A single lexical token: its category, its text, and the source position
/// the lexer reported for it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
/// Category of the token.
pub ttype: TokenType,
/// Text of the token as produced by the lexer. For string literals the
/// surrounding double quotes are not part of the lexeme.
pub lexeme: String,
/// Source line the lexer reported for this token (the lexer's line counter
/// starts at 1).
pub line: usize,
/// Source column the lexer reported for this token, counted in characters.
pub column: usize,
}
/// Scanner state for turning source text into tokens.
///
/// All fields are private; the lexer is driven through its constructor and
/// its token-producing method.
#[derive(Debug, Clone)]
pub struct Lexer {
    /// Source text stored as `char`s so positions index characters, not bytes.
    input: Vec<char>,
    /// Index in `input` of the current character (`ch`).
    pos: usize,
    /// Index in `input` of the next character to be read (just after `ch`).
    read_pos: usize,
    /// Character currently under the cursor; `None` once the input is exhausted.
    ch: Option<char>,
    /// Line counter; starts at 1 and is incremented when a `'\n'` is read.
    line: usize,
    /// Column counter; reset to 0 when a `'\n'` is read and incremented for
    /// every other character read.
    column: usize,
}
impl Lexer { impl Lexer {
pub fn new(input: impl Into<String>) -> Self { pub fn new(input: impl Into<String>) -> Self {
Lexer { input: input.into(), pos: 0 } let s = input.into();
let mut lexer = Lexer {
input: s.chars().collect(),
pos: 0,
read_pos: 0,
ch: None,
line: 1,
column: 0,
};
lexer.read_char();
lexer
} }
/// Stub: return next token as a string or None at EOF. fn read_char(&mut self) {
pub fn next_token(&mut self) -> Option<String> { if self.read_pos >= self.input.len() {
// placeholder for a real lexer implementation self.ch = None;
} else {
self.ch = Some(self.input[self.read_pos]);
}
self.pos = self.read_pos;
self.read_pos += 1;
if let Some(c) = self.ch {
if c == '\n' {
self.line += 1;
self.column = 0;
} else {
self.column += 1;
}
}
}
fn peek_char(&self) -> Option<char> {
if self.read_pos >= self.input.len() {
None None
} else {
Some(self.input[self.read_pos])
}
}
fn skip_whitespace(&mut self) {
while let Some(c) = self.ch {
if c.is_whitespace() {
self.read_char();
} else {
break;
}
}
}
fn read_identifier(&mut self) -> String {
let start = self.pos;
while let Some(c) = self.ch {
if is_identifier_char(c) {
self.read_char();
} else {
break;
}
}
self.input[start..self.pos].iter().collect()
}
fn read_number(&mut self) -> String {
let start = self.pos;
let mut seen_dot = false;
while let Some(c) = self.ch {
if c == '.' {
if seen_dot {
break;
}
seen_dot = true;
self.read_char();
} else if c.is_ascii_digit() {
self.read_char();
} else {
break;
}
}
self.input[start..self.pos].iter().collect()
}
fn read_string(&mut self) -> String {
// consume opening quote
self.read_char();
let start = self.pos;
while let Some(c) = self.ch {
if c == '"' {
break;
}
// support escape handling later
self.read_char();
}
let s: String = self.input[start..self.pos].iter().collect();
// consume closing quote
self.read_char();
s
}
pub fn next_token(&mut self) -> Token {
self.skip_whitespace();
let token = match self.ch {
Some('=') => {
if self.peek_char() == Some('=') {
self.read_char();
let lex = "==".to_string();
Token { ttype: TokenType::Eq, lexeme: lex, line: self.line, column: self.column }
} else {
Token { ttype: TokenType::Assign, lexeme: "=".to_string(), line: self.line, column: self.column }
}
}
Some('+') => { Token { ttype: TokenType::Plus, lexeme: "+".to_string(), line: self.line, column: self.column } }
Some('-') => { Token { ttype: TokenType::Minus, lexeme: "-".to_string(), line: self.line, column: self.column } }
Some('*') => { Token { ttype: TokenType::Asterisk, lexeme: "*".to_string(), line: self.line, column: self.column } }
Some('/') => { Token { ttype: TokenType::Slash, lexeme: "/".to_string(), line: self.line, column: self.column } }
Some('!') => {
if self.peek_char() == Some('=') {
self.read_char();
Token { ttype: TokenType::NotEq, lexeme: "!=".to_string(), line: self.line, column: self.column }
} else {
Token { ttype: TokenType::Bang, lexeme: "!".to_string(), line: self.line, column: self.column }
}
}
Some('<') => { Token { ttype: TokenType::Lt, lexeme: "<".to_string(), line: self.line, column: self.column } }
Some('>') => { Token { ttype: TokenType::Gt, lexeme: ">".to_string(), line: self.line, column: self.column } }
Some(',') => { Token { ttype: TokenType::Comma, lexeme: ",".to_string(), line: self.line, column: self.column } }
Some(';') => { Token { ttype: TokenType::Semicolon, lexeme: ";".to_string(), line: self.line, column: self.column } }
Some(':') => { Token { ttype: TokenType::Colon, lexeme: ":".to_string(), line: self.line, column: self.column } }
Some('(') => { Token { ttype: TokenType::LParen, lexeme: "(".to_string(), line: self.line, column: self.column } }
Some(')') => { Token { ttype: TokenType::RParen, lexeme: ")".to_string(), line: self.line, column: self.column } }
Some('{') => { Token { ttype: TokenType::LBrace, lexeme: "{".to_string(), line: self.line, column: self.column } }
Some('}') => { Token { ttype: TokenType::RBrace, lexeme: "}".to_string(), line: self.line, column: self.column } }
Some('[') => { Token { ttype: TokenType::LBracket, lexeme: "[".to_string(), line: self.line, column: self.column } }
Some(']') => { Token { ttype: TokenType::RBracket, lexeme: "]".to_string(), line: self.line, column: self.column } }
Some('"') => {
let s = self.read_string();
Token { ttype: TokenType::Str, lexeme: s, line: self.line, column: self.column }
}
Some(c) if is_letter(c) => {
let ident = self.read_identifier();
Token { ttype: TokenType::Ident, lexeme: ident, line: self.line, column: self.column }
}
Some(c) if c.is_ascii_digit() => {
let num = self.read_number();
if num.contains('.') {
Token { ttype: TokenType::Float, lexeme: num, line: self.line, column: self.column }
} else {
Token { ttype: TokenType::Int, lexeme: num, line: self.line, column: self.column }
}
}
None => { Token { ttype: TokenType::Eof, lexeme: "".to_string(), line: self.line, column: self.column } }
Some(_) => { Token { ttype: TokenType::Illegal, lexeme: self.ch.unwrap().to_string(), line: self.line, column: self.column } }
};
// advance to next char if not EOF and we didn't already advance inside readers
if self.ch.is_some() {
// For cases where read_identifier/read_number/read_string already moved position
// we don't want to skip an extra char. The read_* helpers leave `ch` at the
// character after the token. To keep behavior consistent, only call read_char
// when the token was produced from a single-char branch.
match token.ttype {
TokenType::Ident | TokenType::Int | TokenType::Float | TokenType::Str => {}
TokenType::Eof => {}
_ => { self.read_char(); }
}
}
token
} }
} }
/// Returns `true` when `c` may start an identifier: an ASCII letter or `_`.
fn is_letter(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_')
}
/// Returns `true` when `c` may appear inside an identifier: an ASCII letter,
/// an ASCII digit, or `_`.
fn is_identifier_char(c: char) -> bool {
    matches!(c, '0'..='9' | 'a'..='z' | 'A'..='Z' | '_')
}