/// The category of a lexical token produced by [`Lexer`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenType {
    /// A character the lexer does not recognize, or an unterminated string literal.
    Illegal,
    /// End of input; returned on every call once the input is exhausted.
    Eof,

    Ident,
    Int,
    Float,
    Str,

    // Operators
    Assign,
    Plus,
    Minus,
    Asterisk,
    Slash,
    Bang,
    Lt,
    Gt,
    Eq,
    NotEq,

    // Delimiters
    Comma,
    Semicolon,
    Colon,
    LParen,
    RParen,
    LBrace,
    RBrace,
    LBracket,
    RBracket,
}

/// A single token together with the source position at which it starts.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    pub ttype: TokenType,
    /// Source text of the token. For `Str` tokens this is the text between the
    /// quotes; for an unterminated string (`Illegal`) it is the raw text
    /// including the opening quote.
    pub lexeme: String,
    /// 1-based line of the token's first character.
    pub line: usize,
    /// 1-based column of the token's first character.
    pub column: usize,
}

/// A hand-written, character-at-a-time lexer.
pub struct Lexer {
    input: Vec<char>,
    pos: usize,      // current position in input (points to current char)
    read_pos: usize, // current reading position in input (after current char)
    ch: Option<char>,
    line: usize,
    column: usize,
}

impl Lexer {
    /// Creates a lexer over `input`, positioned on its first character.
    pub fn new(input: impl Into<String>) -> Self {
        let source: String = input.into();
        let mut lexer = Lexer {
            input: source.chars().collect(),
            pos: 0,
            read_pos: 0,
            ch: None,
            line: 1,
            column: 0,
        };
        lexer.read_char();
        lexer
    }

    /// Advances to the next character, keeping `line`/`column` in sync.
    ///
    /// At end of input `ch` becomes `None` and the cursor is parked at
    /// `input.len()`, so repeated calls never push the indices out of range.
    fn read_char(&mut self) {
        match self.input.get(self.read_pos).copied() {
            Some(c) => {
                self.ch = Some(c);
                self.pos = self.read_pos;
                self.read_pos += 1;
                if c == '\n' {
                    // The character after a newline is column 1 of the next line.
                    self.line += 1;
                    self.column = 0;
                } else {
                    self.column += 1;
                }
            }
            None => {
                self.ch = None;
                self.pos = self.input.len();
            }
        }
    }

    /// Returns the character after the current one without consuming anything.
    fn peek_char(&self) -> Option<char> {
        self.input.get(self.read_pos).copied()
    }

    /// Skips over any run of whitespace starting at the current character.
    fn skip_whitespace(&mut self) {
        while self.ch.map_or(false, char::is_whitespace) {
            self.read_char();
        }
    }

    /// Consumes characters while `keep` accepts them and returns the consumed text.
    fn read_while(&mut self, mut keep: impl FnMut(char) -> bool) -> String {
        let start = self.pos;
        while let Some(c) = self.ch {
            if !keep(c) {
                break;
            }
            self.read_char();
        }
        self.input[start..self.pos].iter().collect()
    }

    /// Reads an identifier starting at the current character.
    fn read_identifier(&mut self) -> String {
        self.read_while(is_identifier_char)
    }

    /// Reads a run of ASCII digits containing at most one `.`.
    fn read_number(&mut self) -> String {
        let mut seen_dot = false;
        self.read_while(|c| {
            if c == '.' && !seen_dot {
                seen_dot = true;
                true
            } else {
                c.is_ascii_digit()
            }
        })
    }

    /// Reads a double-quoted string; the current character must be the opening quote.
    ///
    /// Returns `Ok(contents)` (quotes excluded) when the closing quote is found,
    /// or `Err(raw)` with the raw text including the opening quote when the
    /// input ends first. Escape sequences are not interpreted.
    fn read_string(&mut self) -> Result<String, String> {
        // consume opening quote
        self.read_char();
        let contents = self.read_while(|c| c != '"');
        if self.ch == Some('"') {
            // consume closing quote
            self.read_char();
            Ok(contents)
        } else {
            Err(format!("\"{}", contents))
        }
    }

    /// Returns the next token, skipping leading whitespace.
    ///
    /// The token's `line`/`column` refer to its first character. Once the
    /// input is exhausted every call returns an `Eof` token.
    pub fn next_token(&mut self) -> Token {
        self.skip_whitespace();

        // Capture the position before scanning so multi-character tokens
        // report where they start rather than where they end.
        let (line, column) = (self.line, self.column);
        let current = self.ch;

        let (ttype, lexeme) = match current {
            None => (TokenType::Eof, String::new()),
            Some('"') => match self.read_string() {
                Ok(contents) => (TokenType::Str, contents),
                Err(raw) => (TokenType::Illegal, raw),
            },
            Some(c) if is_letter(c) => (TokenType::Ident, self.read_identifier()),
            Some(c) if c.is_ascii_digit() => {
                let number = self.read_number();
                let ttype = if number.contains('.') {
                    TokenType::Float
                } else {
                    TokenType::Int
                };
                (ttype, number)
            }
            Some(c) => {
                let (ttype, len) = classify_symbol(c, self.peek_char());
                // `len` is 2 only when `peek_char` returned a character, so the
                // slice below is always in bounds.
                let text: String = self.input[self.pos..self.pos + len].iter().collect();
                for _ in 0..len {
                    self.read_char();
                }
                (ttype, text)
            }
        };

        Token { ttype, lexeme, line, column }
    }
}

/// Classifies an operator/delimiter starting at `c`, given the following
/// character `next`. Returns the token type and how many characters it spans;
/// unrecognized characters are reported as a one-character `Illegal`.
fn classify_symbol(c: char, next: Option<char>) -> (TokenType, usize) {
    match (c, next) {
        ('=', Some('=')) => (TokenType::Eq, 2),
        ('!', Some('=')) => (TokenType::NotEq, 2),
        ('=', _) => (TokenType::Assign, 1),
        ('!', _) => (TokenType::Bang, 1),
        ('+', _) => (TokenType::Plus, 1),
        ('-', _) => (TokenType::Minus, 1),
        ('*', _) => (TokenType::Asterisk, 1),
        ('/', _) => (TokenType::Slash, 1),
        ('<', _) => (TokenType::Lt, 1),
        ('>', _) => (TokenType::Gt, 1),
        (',', _) => (TokenType::Comma, 1),
        (';', _) => (TokenType::Semicolon, 1),
        (':', _) => (TokenType::Colon, 1),
        ('(', _) => (TokenType::LParen, 1),
        (')', _) => (TokenType::RParen, 1),
        ('{', _) => (TokenType::LBrace, 1),
        ('}', _) => (TokenType::RBrace, 1),
        ('[', _) => (TokenType::LBracket, 1),
        (']', _) => (TokenType::RBracket, 1),
        _ => (TokenType::Illegal, 1),
    }
}

/// Returns `true` if `c` may start an identifier.
fn is_letter(c: char) -> bool {
    c.is_ascii_alphabetic() || c == '_'
}

/// Returns `true` if `c` may appear inside an identifier.
fn is_identifier_char(c: char) -> bool {
    c.is_ascii_alphanumeric() || c == '_'
}