From 08a8aadf1670e9660c180fd9cc17d5181cb1dbd4 Mon Sep 17 00:00:00 2001
From: Kyler
Date: Mon, 1 Dec 2025 23:46:46 -0700
Subject: [PATCH] Claude attempt at lexer.rs

---
 SLS_Rust/sls/src/lexer.rs | 355 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 345 insertions(+), 10 deletions(-)

diff --git a/SLS_Rust/sls/src/lexer.rs b/SLS_Rust/sls/src/lexer.rs
index 93ba757..62a9af9 100644
--- a/SLS_Rust/sls/src/lexer.rs
+++ b/SLS_Rust/sls/src/lexer.rs
@@ -17,6 +17,61 @@ impl LexerInfo {
             line: 1,
         }
     }
+
+    /// Returns the character under the cursor, or `'\0'` at end of input.
+    fn peek(&self) -> char {
+        self.far_peek(0)
+    }
+
+    /// Returns the character `offset` characters past the cursor, or `'\0'`
+    /// when that position lies beyond the end of input.
+    ///
+    /// `pos` is a byte offset into `source`, so the scan starts at the cursor
+    /// instead of walking the whole input, and it agrees with the
+    /// `source[a..b]` slices taken elsewhere in the lexer.
+    fn far_peek(&self, offset: usize) -> char {
+        self.source
+            .get(self.pos..)
+            .and_then(|rest| rest.chars().nth(offset))
+            .unwrap_or('\0')
+    }
+
+    /// Consumes one character, updating `line` and `column`, and returns the
+    /// character now under the cursor (`'\0'` at end of input).
+    ///
+    /// At end of input this is a no-op, so `pos` never exceeds `source.len()`.
+    fn advance(&mut self) -> char {
+        let current = match self.source.get(self.pos..).and_then(|rest| rest.chars().next()) {
+            Some(c) => c,
+            None => return '\0',
+        };
+        if current == '\n' {
+            self.line += 1;
+            self.column = 1;
+        } else {
+            self.column += 1;
+        }
+        // Step by the encoded width so `pos` stays on a character boundary.
+        self.pos += current.len_utf8();
+        self.peek()
+    }
+
+    /// Skips any run of whitespace and line comments (`// ...` or `# ...`).
+    fn skip_comments_and_whitespace(&mut self) {
+        loop {
+            let c = self.peek();
+            if c == '#' || (c == '/' && self.far_peek(1) == '/') {
+                // A line comment runs up to, but not including, the newline.
+                while !matches!(self.peek(), '\n' | '\0') {
+                    self.advance();
+                }
+            } else if c.is_whitespace() {
+                self.advance();
+            } else {
+                break;
+            }
+        }
+    }
 }
 
 #[derive(Debug, Clone)]
@@ -83,10 +138,7 @@ pub enum StructValue {
 #[derive(Debug, Clone)]
 pub enum Token {
     Eof,
-
     Identifier(Identifier),
-
-    // All integer sizes
     I64(i64),
     I32(i32),
     I16(i16),
@@ -95,14 +147,11 @@ pub enum Token {
     U32(u32),
     U16(u16),
     U8(u8),
-
     Float(f32),
     Double(f64),
-
     Character(u8),
     StringLiteral(String),
     Boolean(bool),
-
     Array(ShapedArray),
     TokenString(TokenString),
     TypeTuple(TypeTuple),
@@ -118,10 +167,296 @@ pub struct LexError {
 
 pub type LexResult<T> = Result<T, LexError>;
 
-pub fn get_token(_lexer: &mut LexerInfo) -> Option<Token> {
-    None
+impl LexerInfo {
+    /// Builds a `LexError` for this file at the given source position.
+    fn make_error(
+        &self,
+        message: impl Into<String>,
+        start_line: usize,
+        start_col: usize,
+    ) -> LexError {
+        LexError {
+            message: message.into(),
+            file: self.filename.clone(),
+            line: start_line,
+            column: start_col,
+        }
+    }
+
+    /// Returns whether the character `offset` characters past the cursor may
+    /// appear in an identifier: printable, non-space ASCII that is neither a
+    /// bracket, a quote, `#`, nor the start of a `//` comment.
+    fn is_identifier_char_at(&self, offset: usize) -> bool {
+        let c = self.far_peek(offset);
+        if !c.is_ascii_graphic() {
+            return false;
+        }
+        // Look just past `c` so the comment check is right for any `offset`.
+        if c == '/' && self.far_peek(offset + 1) == '/' {
+            return false;
+        }
+        !matches!(c, '{' | '}' | '[' | ']' | '(' | ')' | '\'' | '"' | '#')
+    }
+
+    /// Returns whether an identifier starts at the cursor, optionally behind
+    /// the `::` prefix that marks an identifier literal.
+    fn is_identifier_start(&self) -> bool {
+        let offset = if self.peek() == ':' && self.far_peek(1) == ':' { 2 } else { 0 };
+        !self.far_peek(offset).is_ascii_digit() && self.is_identifier_char_at(offset)
+    }
+
+    /// Lexes an identifier, optionally `::`-prefixed to mark it as a literal,
+    /// or one of the boolean keywords `true` / `false`.
+    fn parse_identifiers_and_booleans(
+        &mut self,
+        start_line: usize,
+        start_col: usize,
+    ) -> LexResult<Token> {
+        // A leading `::` marks an identifier literal and is not part of the name.
+        let is_literal = self.peek() == ':' && self.far_peek(1) == ':';
+        if is_literal {
+            self.advance();
+            self.advance();
+        }
+
+        let name_start = self.pos;
+        while self.is_identifier_char_at(0) {
+            let c = self.peek();
+            if c == ':' || c == '.' {
+                let message = format!("Invalid identifier: '{}' is not allowed in identifiers.", c);
+                return Err(self.make_error(message, start_line, start_col));
+            }
+            self.advance();
+        }
+
+        let name = self.source[name_start..self.pos].to_string();
+        Ok(match name.as_str() {
+            "false" => Token::Boolean(false),
+            "true" => Token::Boolean(true),
+            _ => Token::Identifier(Identifier { name, is_literal }),
+        })
+    }
+
+    /// Lexes the rest of a character literal; the caller has already consumed
+    /// the opening `'`.
+    fn parse_character_literal(&mut self, start_line: usize, start_col: usize) -> LexResult<Token> {
+        self.read_character_literal()
+            .map(Token::Character)
+            .map_err(|message| self.make_error(message, start_line, start_col))
+    }
+
+    /// Reads the body and closing `'` of a character literal, returning its
+    /// byte value or a message describing why the literal is malformed.
+    fn read_character_literal(&mut self) -> Result<u8, String> {
+        const UNCLOSED: &str = "Invalid character literal: unclosed character literal.";
+
+        let value = match self.peek() {
+            '\'' => return Err("Invalid character literal: empty character literal.".into()),
+            '\n' | '\r' => return Err(UNCLOSED.into()),
+            '\\' => match self.advance() {
+                'n' => b'\n',
+                'r' => b'\r',
+                't' => b'\t',
+                '\\' => b'\\',
+                '\'' => b'\'',
+                '0' => b'\0',
+                c => {
+                    return Err(format!(
+                        "Invalid character literal: unknown escape sequence '\\{}'.",
+                        c
+                    ));
+                }
+            },
+            // `Token::Character` holds one byte; casting a wider character would
+            // silently truncate it, so reject it instead.
+            c if !c.is_ascii() => {
+                return Err(format!(
+                    "Invalid character literal: '{}' is not an ASCII character.",
+                    c
+                ));
+            }
+            c => c as u8,
+        };
+
+        match self.advance() {
+            '\'' => {
+                self.advance();
+                Ok(value)
+            }
+            c if c.is_whitespace() || c == '/' || c == '\0' => Err(UNCLOSED.into()),
+            c => Err(format!("Invalid character literal: unexpected '{}' in character.", c)),
+        }
+    }
+
+    /// Lexes a `{ ... }` token string, recursively lexing the tokens inside it.
+    fn parse_token_string(&mut self, start_line: usize, start_col: usize) -> LexResult<Token> {
+        let mut tokens = Vec::new();
+        self.advance(); // skip '{'
+
+        loop {
+            self.skip_comments_and_whitespace();
+            match self.peek() {
+                '}' => {
+                    self.advance();
+                    return Ok(Token::TokenString(TokenString { tokens }));
+                }
+                '\0' => {
+                    let message = "Unclosed token string: missing closing brace '}'.";
+                    return Err(self.make_error(message, start_line, start_col));
+                }
+                // Propagate a nested token's own error so its message and
+                // position are not replaced by a generic one.
+                _ => tokens.push(self.next_token()?),
+            }
+        }
+    }
+
+    /// Lexes a numeric literal: an optionally negative decimal integer or
+    /// float, or an integer with a `0b`, `0o` or `0x` prefix.
+    fn parse_numeric_literal(
+        &mut self,
+        start: usize,
+        start_line: usize,
+        start_col: usize,
+    ) -> LexResult<Token> {
+        let negative = self.peek() == '-';
+        if negative {
+            self.advance();
+        }
+
+        let radix = match (self.peek(), self.far_peek(1)) {
+            ('0', 'b') | ('0', 'B') => 2,
+            ('0', 'o') | ('0', 'O') => 8,
+            ('0', 'x') | ('0', 'X') => 16,
+            _ => 10,
+        };
+        if radix != 10 {
+            // Skip the two-character radix prefix.
+            self.advance();
+            self.advance();
+        }
+
+        let digits_start = self.pos;
+        while self.peek() == '_' || self.peek().is_digit(radix) {
+            self.advance();
+        }
+
+        if radix == 10 && self.peek() == '.' {
+            self.advance();
+            return self.parse_float(start, start_line, start_col);
+        }
+        self.create_integer(digits_start, radix, negative, start_line, start_col)
+    }
+
+    /// Lexes the fractional digits of a float whose integer part and `.` have
+    /// already been consumed, then converts the literal starting at `start`.
+    fn parse_float(
+        &mut self,
+        start: usize,
+        start_line: usize,
+        start_col: usize,
+    ) -> LexResult<Token> {
+        while self.peek() == '_' || self.peek().is_ascii_digit() {
+            self.advance();
+        }
+
+        // `str::parse` rounds correctly, which summing digits by hand does not,
+        // but it rejects `_`, so drop the separators first.
+        let text: String = self.source[start..self.pos].chars().filter(|&c| c != '_').collect();
+        match text.parse::<f64>() {
+            Ok(value) => Ok(Token::Double(value)),
+            Err(_) => {
+                let message = format!("Invalid numeric literal: '{}'.", text);
+                Err(self.make_error(message, start_line, start_col))
+            }
+        }
+    }
+
+    /// Folds the base-`radix` digits in `source[digits_start..pos]` into an
+    /// `I64` token, ignoring `_` separators and negating when `negative`.
+    ///
+    /// Returns an error when the magnitude does not fit in 64 bits.
+    fn create_integer(
+        &self,
+        digits_start: usize,
+        radix: u32,
+        negative: bool,
+        start_line: usize,
+        start_col: usize,
+    ) -> LexResult<Token> {
+        let mut value = 0u64;
+        for c in self.source[digits_start..self.pos].chars() {
+            // Anything that is not a digit in this span is a `_` separator.
+            let digit = match c.to_digit(radix) {
+                Some(digit) => u64::from(digit),
+                None => continue,
+            };
+            value = value
+                .checked_mul(u64::from(radix))
+                .and_then(|scaled| scaled.checked_add(digit))
+                .ok_or_else(|| {
+                    let message = "Invalid numeric literal: integer is too large.";
+                    self.make_error(message, start_line, start_col)
+                })?;
+        }
+
+        // Two's-complement negation; the cast reinterprets the bits, so a
+        // magnitude above `i64::MAX` wraps instead of failing.
+        let value = if negative { value.wrapping_neg() } else { value };
+        Ok(Token::I64(value as i64))
+    }
+
+    /// Lexes the next token, skipping leading whitespace and comments, and
+    /// yields `Token::Eof` at end of input.
+    fn next_token(&mut self) -> LexResult<Token> {
+        self.skip_comments_and_whitespace();
+
+        let c = self.peek();
+        let start = self.pos;
+        let (start_line, start_col) = (self.line, self.column);
+        // A number starts with a digit, or with `.` or `-` followed by a digit.
+        let starts_number =
+            c.is_ascii_digit() || (matches!(c, '.' | '-') && self.far_peek(1).is_ascii_digit());
+
+        if c == '\0' {
+            Ok(Token::Eof)
+        } else if starts_number {
+            self.parse_numeric_literal(start, start_line, start_col)
+        } else if c == '\'' {
+            self.advance(); // skip the opening quote
+            self.parse_character_literal(start_line, start_col)
+        } else if c == '{' {
+            self.parse_token_string(start_line, start_col)
+        } else if self.is_identifier_start() {
+            self.parse_identifiers_and_booleans(start_line, start_col)
+        } else {
+            let message = format!("Unexpected character: '{}'", c);
+            Err(self.make_error(message, start_line, start_col))
+        }
+    }
 }
 
-pub fn lexical_analysis(_lexer: &mut LexerInfo) -> LexResult<Vec<Token>> {
-    Ok(Vec::new())
+/// Lexes and returns the next token, or `None` when the input at the cursor
+/// is not a valid token.
+///
+/// The error detail is discarded here; [`lexical_analysis`] reports it.
+pub fn get_token(lexer: &mut LexerInfo) -> Option<Token> {
+    lexer.next_token().ok()
+}
+
+/// Lexes the whole input into a token list that ends with `Token::Eof`.
+///
+/// # Errors
+///
+/// Returns the first `LexError` encountered instead of a truncated list.
+pub fn lexical_analysis(lexer: &mut LexerInfo) -> LexResult<Vec<Token>> {
+    let mut tokens = Vec::new();
+    loop {
+        let token = lexer.next_token()?;
+        let at_end = matches!(token, Token::Eof);
+        tokens.push(token);
+        if at_end {
+            return Ok(tokens);
+        }
+    }
 }