diff --git a/SLS_Rust/sls/src/lexer.rs b/SLS_Rust/sls/src/lexer.rs index 93ba757..9f5778d 100644 --- a/SLS_Rust/sls/src/lexer.rs +++ b/SLS_Rust/sls/src/lexer.rs @@ -17,111 +17,607 @@ impl LexerInfo { line: 1, } } + + pub fn peek(&self) -> char { + self.source[self.pos..].chars().next().unwrap_or('\0') + } + + pub fn far_peek(&self, index: usize) -> char { + self.source[self.pos..].chars().nth(index).unwrap_or('\0') + } + + pub fn seek(&self, index: usize) -> char { + self.source[index..].chars().next().unwrap_or('\0') + } + + /// Advance by one UTF-8 char and update line/column. Returns the char after advancing. + /// If already at end, returns '\0'. + pub fn advance(&mut self) -> char { + if self.pos >= self.source.len() { + return '\0'; + } + let ch = self.peek(); + if ch == '\n' { + self.line += 1; + self.column = 1; + } else { + self.column += 1; + } + // move pos forward by char len + self.pos += ch.len_utf8(); + self.peek() + } + + /// Return substring from start to current pos (slice by byte indices). + /// start is a byte index into the original source (we store start as byte index). + pub fn get_token_text(&self, start: usize) -> &str { + // clamp + let end = self.pos.min(self.source.len()); + &self.source[start..end] + } } #[derive(Debug, Clone)] -pub struct Identifier { - pub name: String, - pub is_literal: bool, -} - -#[derive(Debug, Clone)] -pub enum ArrayLiteral { - Identifiers(Vec), - I64(Vec), - I32(Vec), - I16(Vec), - I8(Vec), - U64(Vec), - U32(Vec), - U16(Vec), - U8(Vec), - Float(Vec), - Double(Vec), - Character(Vec), - Strings(Vec), - Boolean(Vec), - TokenStrings(Vec), - TypeTuples(Vec), - StructInline(StructInline), -} - -#[derive(Debug, Clone)] -pub struct ShapedArray { - pub array: ArrayLiteral, - pub shape: Vec, -} - -#[derive(Debug, Clone)] -pub struct TokenString { - pub tokens: Vec, -} - -#[derive(Debug, Clone)] -pub struct TypeTuple { - pub inputs: Vec, - pub outputs: Vec, -} - -#[derive(Debug, Clone)] -pub struct StructInline { - pub name: String, - pub values: Vec, -} - -#[derive(Debug, Clone)] -pub enum StructValue { - Integer(i64), - Float(f32), - Double(f64), - Boolean(bool), - Character(u8), - String(String), - Token(Token), -} - -#[derive(Debug, Clone)] -pub enum Token { - Eof, - - Identifier(Identifier), - - // All integer sizes - I64(i64), - I32(i32), - I16(i16), - I8(i8), - U64(u64), - U32(u32), - U16(u16), - U8(u8), - - Float(f32), - Double(f64), - - Character(u8), - StringLiteral(String), - Boolean(bool), - - Array(ShapedArray), - TokenString(TokenString), - TypeTuple(TypeTuple), -} - -#[derive(Debug, Clone)] -pub struct LexError { - pub message: String, +pub struct FileInfo { pub file: String, pub line: usize, pub column: usize, + pub length: usize, + pub lines: usize, +} + +#[derive(Debug)] +pub enum LexErrorKind { + Message(String), + NotImplemented(String), +} + +#[derive(Debug)] +pub struct LexError { + pub kind: LexErrorKind, + pub file_info: Option, +} + +impl LexError { + pub fn new(msg: impl Into, file_info: Option) -> Self { + Self { + kind: LexErrorKind::Message(msg.into()), + file_info, + } + } + + pub fn not_implemented(msg: impl Into, file_info: Option) -> Self { + Self { + kind: LexErrorKind::NotImplemented(msg.into()), + file_info, + } + } } pub type LexResult = Result; -pub fn get_token(_lexer: &mut LexerInfo) -> Option { - None +#[derive(Debug, Clone)] +pub enum IntegerType { + I64, + I32, + I16, + I8, + U64, + U32, + U16, + U8, } -pub fn lexical_analysis(_lexer: &mut LexerInfo) -> LexResult> { - Ok(Vec::new()) +#[derive(Debug, Clone)] +pub struct IntegerLiteral { + pub typ: IntegerType, + pub value: u64, // we'll keep as u64; sign info handled by type and parsing +} + +#[derive(Debug, Clone)] +pub enum Token { + EOF, + Identifier { is_literal: bool, name: String }, + Integer(IntegerLiteral), + Float(f32), + Double(f64), + Character(u8), + String(String), + Boolean(bool), + Array, // placeholder + TokenString, // placeholder + TypeTuple, // placeholder +} + +fn make_file_info(lexer: &LexerInfo, start: usize, start_line: usize) -> FileInfo { + FileInfo { + file: lexer.filename.clone(), + line: lexer.line, + column: lexer.column, + length: lexer.pos.saturating_sub(start), + lines: lexer.line.saturating_sub(start_line), + } +} + +/// Helpers for numeric parsing (create values from the slice) +fn create_binary_integer(token: &str, negative: bool) -> u64 { + // token begins like "0b..." or "-0b..." + let mut value: u64 = 0; + for ch in token.chars() { + if ch == '0' || ch == '1' { + value = (value << 1) + (if ch == '1' { 1 } else { 0 }); + } else if ch == '_' || ch == '.' || ch == ':' || ch == '-' || ch == 'b' || ch == 'B' { + continue; + } else { + break; + } + } + if negative { + // two's complement style as in C original: (~value) + 1 + (!value).wrapping_add(1) + } else { + value + } +} + +fn create_octal_integer(token: &str, negative: bool) -> u64 { + let mut value: u64 = 0; + for ch in token.chars() { + if ch >= '0' && ch <= '7' { + value = value * 8 + ((ch as u8 - b'0') as u64); + } else if ch == '_' || ch == '.' || ch == ':' || ch == '-' || ch == 'o' || ch == 'O' { + continue; + } else { + break; + } + } + if negative { + (!value).wrapping_add(1) + } else { + value + } +} + +fn create_decimal_integer(token: &str, negative: bool) -> u64 { + let mut value: u64 = 0; + for ch in token.chars() { + if ch.is_ascii_digit() { + value = value * 10 + ((ch as u8 - b'0') as u64); + } else if ch == '_' || ch == ':' || ch == '-' { + continue; + } else { + break; + } + } + if negative { + (!value).wrapping_add(1) + } else { + value + } +} + +fn create_hex_integer(token: &str, negative: bool) -> u64 { + let mut value: u64 = 0; + for ch in token.chars() { + if ch.is_ascii_hexdigit() { + value = value * 16 + + (ch.to_digit(16).unwrap_or(0) as u64); + } else if ch == '_' || ch == '.' || ch == ':' || ch == '-' || ch == 'x' || ch == 'X' { + continue; + } else { + break; + } + } + if negative { + (!value).wrapping_add(1) + } else { + value + } +} + +fn create_float_value(token: &str, negative: bool) -> f64 { + // A simple but robust approach: remove underscores and parse with Rust + let cleaned: String = token.chars().filter(|&c| c != '_').collect(); + // Accept leading '-' in cleaned string + let parsed = cleaned.parse::().unwrap_or(0.0); + if negative { -parsed } else { parsed } +} + +/// Determine integer builtin type from suffix bits similar to original +fn select_integer_type(unsigned: bool, bits: usize) -> Option { + match (unsigned, bits) { + (false, 64) => Some(IntegerType::I64), + (false, 32) => Some(IntegerType::I32), + (false, 16) => Some(IntegerType::I16), + (false, 8) => Some(IntegerType::I8), + (true, 64) => Some(IntegerType::U64), + (true, 32) => Some(IntegerType::U32), + (true, 16) => Some(IntegerType::U16), + (true, 8) => Some(IntegerType::U8), + _ => None, + } +} + +/// Top-level numeric-type-suffix parser: handles things like ":i32", ":u8", ":f64", ":f32" +fn parse_numeric_type_suffix(token: &str) -> Result<(bool, Option, bool /*is_float*/, usize /*bits*/), ()> { + // returns (unsigned, integer_type_option, is_float, bits) + let mut unsigned = false; + let mut is_float = false; + let mut bits = 0usize; + // find suffix start colon + if let Some(colon_pos) = token.find(':') { + let suffix = &token[colon_pos + 1..]; + // float? + if suffix.starts_with('f') { + is_float = true; + if suffix == "f64" { bits = 64; } + else if suffix == "f32" { bits = 32; } + else { return Err(()); } + } else if suffix.starts_with('i') || suffix.starts_with('u') { + unsigned = suffix.starts_with('u'); + if suffix == "i64" || suffix == "u64" { bits = 64; } + else if suffix == "i32" || suffix == "u32" { bits = 32; } + else if suffix == "i16" || suffix == "u16" { bits = 16; } + else if suffix == "i8" || suffix == "u8" { bits = 8; } + else { return Err(()); } + } else { + return Err(()); + } + let int_type = if is_float { None } else { select_integer_type(unsigned, bits) }; + Ok((unsigned, int_type, is_float, bits)) + } else { + // no suffix + Ok((false, Some(IntegerType::I64), false, 64)) + } +} + +/// Skip comments and whitespace similar to C version +fn skip_comments_and_whitespace(lexer: &mut LexerInfo) { + loop { + let p = lexer.peek(); + if p == '\0' { + break; + } + if p.is_whitespace() { + lexer.advance(); + continue; + } + // comment styles: // or # + if p == '/' && lexer.far_peek(1) == '/' { + // consume until newline or '\0' + while lexer.peek() != '\n' && lexer.peek() != '\0' { + lexer.advance(); + } + continue; + } + if p == '#' { + while lexer.peek() != '\n' && lexer.peek() != '\0' { + lexer.advance(); + } + continue; + } + break; + } +} + +/// Is identifier char allowed to continue? +fn is_identifier_continue(lexer: &LexerInfo, c: char) -> bool { + if !c.is_ascii_graphic() && c != ' ' { // keep printable heuristic + return false; + } + if c == '/' && lexer.far_peek(1) == '/' { return false; } + match c { + '{' | '}' | '[' | ']' | '(' | ')' | '\'' | '"' | '#' => false, + _ if c.is_whitespace() || c == '\0' => false, + _ => true, + } +} + +/// Is identifier start +fn is_identifier_start(lexer: &LexerInfo) -> bool { + let mut c = lexer.peek(); + if c == ':' && lexer.far_peek(1) == ':' { + c = lexer.far_peek(2); + } + (!c.is_ascii_digit()) && is_identifier_continue(lexer, c) +} + +/// Parse identifiers and booleans +fn parse_identifiers_and_booleans(lexer: &mut LexerInfo, start: usize, start_line: usize) -> LexResult { + let mut literal = false; + if lexer.peek() == ':' && lexer.far_peek(1) == ':' { + // skip the leading '::' + lexer.advance(); + lexer.advance(); + literal = true; + } + + // collect name + let mut name = String::new(); + while is_identifier_continue(lexer, lexer.peek()) { + let c = lexer.peek(); + if c == ':' || c == '.' { + return Err(LexError::new(format!("Invalid identifier: '{}' not allowed in identifiers.", c), Some(make_file_info(lexer, start, start_line)))); + } + name.push(c); + lexer.advance(); + } + + match name.as_str() { + "false" => Ok(Token::Boolean(false)), + "true" => Ok(Token::Boolean(true)), + _ => Ok(Token::Identifier { is_literal: literal, name }), + } +} + +/// Parse character literal +fn parse_character_literal(lexer: &mut LexerInfo, start: usize, start_line: usize) -> LexResult { + // At entry the opening quote was consumed by caller; peek() is first char of literal (or '\0') + let c = lexer.peek(); + if c == '\'' { + return Err(LexError::new("Invalid character literal: empty character literal.", Some(make_file_info(lexer, start, start_line)))); + } + let mut value: u8 = 0; + if c == '\\' { + lexer.advance(); // consume '\' + let esc = lexer.peek(); + value = match esc { + 'n' => b'\n', + 'r' => b'\r', + 't' => b'\t', + '\\' => b'\\', + '\'' => b'\'', + '0' => b'\0', + _ => { + return Err(LexError::new(format!("Invalid character literal: unknown escape sequence '\\{}'", esc), + Some(make_file_info(lexer, start, start_line)))); + } + }; + lexer.advance(); // move past the escaped char + } else if c == '\n' || c == '\r' || c == '\0' { + return Err(LexError::new("Invalid character literal: unclosed character literal.", Some(make_file_info(lexer, start, start_line)))); + } else { + value = lexer.peek() as u8; + lexer.advance(); + } + // expect closing quote + let closing = lexer.peek(); + if closing == '\0' || closing.is_whitespace() || closing == '/' { + return Err(LexError::new("Invalid character literal: unclosed character literal.", Some(make_file_info(lexer, start, start_line)))); + } else if closing != '\'' { + return Err(LexError::new(format!("Invalid character literal: unexpected '{}' in character.", closing), Some(make_file_info(lexer, start, start_line)))); + } + lexer.advance(); // consume closing ' + Ok(Token::Character(value)) +} + +/// Parse numeric literal (entry when first char is digit or '.' or '-') +fn parse_numeric_literal(lexer: &mut LexerInfo, start: usize, start_line: usize) -> LexResult { + // We'll gather the token by moving the lexer forward until end of numeric token, + // then analyze the string slice. + // Handle leading '-' + if lexer.peek() == '-' { + lexer.advance(); + } + + // If 0b, 0o, 0x style: + if lexer.peek() == '0' { + let next = lexer.far_peek(1); + if next == 'b' || next == 'B' { + // binary + lexer.advance(); // consume '0' + lexer.advance(); // consume 'b' + while matches!(lexer.peek(), '0' | '1' | '_' ) { + lexer.advance(); + } + // optional suffix :... + if lexer.peek() == ':' { + // we'll let suffix parser handle it by consuming suffix chars up to whitespace or '/' + while !lexer.peek().is_whitespace() && lexer.peek() != '/' && lexer.peek() != '\0' { + lexer.advance(); + } + } + let s = lexer.get_token_text(start); + let negative = s.starts_with('-'); + // default type i64 + let v = create_binary_integer(s, negative); + Ok(Token::Integer(IntegerLiteral { typ: IntegerType::I64, value: v })) + } else if next == 'o' || next == 'O' { + // octal + lexer.advance(); lexer.advance(); + while (lexer.peek().is_ascii_digit() && !matches!(lexer.peek(), '8' | '9')) || lexer.peek() == '_' { + lexer.advance(); + } + if lexer.peek() == ':' { + while !lexer.peek().is_whitespace() && lexer.peek() != '/' && lexer.peek() != '\0' { + lexer.advance(); + } + } + let s = lexer.get_token_text(start); + let negative = s.starts_with('-'); + let v = create_octal_integer(s, negative); + Ok(Token::Integer(IntegerLiteral { typ: IntegerType::I64, value: v })) + } else if next == 'x' || next == 'X' { + // hex + lexer.advance(); lexer.advance(); + while lexer.peek().is_ascii_hexdigit() || lexer.peek() == '_' { + lexer.advance(); + } + if lexer.peek() == ':' { + while !lexer.peek().is_whitespace() && lexer.peek() != '/' && lexer.peek() != '\0' { + lexer.advance(); + } + } + let s = lexer.get_token_text(start); + let negative = s.starts_with('-'); + let v = create_hex_integer(s, negative); + Ok(Token::Integer(IntegerLiteral { typ: IntegerType::I64, value: v })) + } + } + + // decimal or float + // consume digits and underscores + while lexer.peek().is_ascii_digit() || lexer.peek() == '_' { + lexer.advance(); + } + if lexer.peek() == '.' { + // float case: .digit or digits.digits + lexer.advance(); + while lexer.peek().is_ascii_digit() || lexer.peek() == '_' { + lexer.advance(); + } + // exponential 'e/E' not implemented: mirror C behavior -> not implemented + if lexer.peek() == 'e' || lexer.peek() == 'E' { + // not implemented exponential in C either + return Err(LexError::not_implemented("Float exponential parsing not implemented.", Some(make_file_info(lexer, start, start_line)))); + } + // optional suffix :f64 or :f32 + if lexer.peek() == ':' { + // capture suffix + while !lexer.peek().is_whitespace() && lexer.peek() != '/' && lexer.peek() != '\0' { + lexer.advance(); + } + } + let s = lexer.get_token_text(start); + let negative = s.starts_with('-'); + let value = create_float_value(s, negative); + Ok(Token::Double(value)) + } else if lexer.peek() == 'e' || lexer.peek() == 'E' { + return Err(LexError::not_implemented("Float exponential parsing not implemented.", Some(make_file_info(lexer, start, start_line)))); + } else if lexer.peek() == ':' { + // suffix that might indicate integer type or float base change + // consume suffix characters + while !lexer.peek().is_whitespace() && lexer.peek() != '/' && lexer.peek() != '\0' { + lexer.advance(); + } + let s = lexer.get_token_text(start); + let negative = s.starts_with('-'); + // check suffix + match parse_numeric_type_suffix(s) { + Ok((_unsigned, int_type_opt, is_float, _bits)) => { + if is_float { + let v = create_float_value(s, negative); + return Ok(Token::Double(v)); + } else { + let v = create_decimal_integer(s, negative); + let typ = int_type_opt.unwrap_or(IntegerType::I64); + return Ok(Token::Integer(IntegerLiteral { typ, value: v })); + } + } + Err(_) => { + return Err(LexError::new("Invalid numeric type suffix.", Some(make_file_info(lexer, start, start_line)))); + } + } + } else { + // plain decimal integer + let s = lexer.get_token_text(start); + let negative = s.starts_with('-'); + let v = create_decimal_integer(s, negative); + Ok(Token::Integer(IntegerLiteral { typ: IntegerType::I64, value: v })) + } +} + +/// parse string literal - not implemented (mirrors C behavior) +fn parse_string_literal(lexer: &mut LexerInfo, start: usize, start_line: usize) -> LexResult { + Err(LexError::not_implemented("String literals not implemented.", Some(make_file_info(lexer, start, start_line)))) +} + +/// parse token string - similar placeholder to C +fn parse_token_string(lexer: &mut LexerInfo, start: usize, start_line: usize) -> LexResult { + Err(LexError::not_implemented("Token string parsing not implemented.", Some(make_file_info(lexer, start, start_line)))) +} + +fn parse_array_literal(_lexer: &mut LexerInfo, start: usize, start_line: usize) -> LexResult { + Err(LexError::not_implemented("Array literal parsing not implemented.", Some(make_file_info(_lexer, start, start_line)))) +} + +fn parse_type_tuple(_lexer: &mut LexerInfo, start: usize, start_line: usize) -> LexResult { + Err(LexError::not_implemented("Type tuple parsing not implemented.", Some(make_file_info(_lexer, start, start_line)))) +} + +/// The main get_token function returns a Token or a LexError (wrapped by Result in lexical_analysis). +pub fn get_token(lexer: &mut LexerInfo) -> Result { + skip_comments_and_whitespace(lexer); + + let start = lexer.pos; + let start_line = lexer.line; + + let c = lexer.peek(); + if c == '\0' { + return Ok(Token::EOF); + } + + if c.is_ascii_digit() || (c == '.' && lexer.far_peek(1).is_ascii_digit()) || (c == '-' && lexer.far_peek(1).is_ascii_digit()) { + return parse_numeric_literal(lexer, start, start_line); + } + + if c == '\'' { + lexer.advance(); // consume opening ' + return parse_character_literal(lexer, start, start_line); + } + + if c == '"' { + lexer.advance(); // consume opening " + return parse_string_literal(lexer, start, start_line); + } + + if c == '{' { + lexer.advance(); + return parse_token_string(lexer, start, start_line); + } + if c == '}' { + lexer.advance(); + return Err(LexError::new("Unexpected closing brace '}' without matching opening brace.", Some(make_file_info(lexer, start, start_line)))); + } + + if c == '[' { + lexer.advance(); + return parse_array_literal(lexer, start, start_line); + } + if c == ']' { + lexer.advance(); + return Err(LexError::new("Unexpected closing bracket ']' without matching opening bracket.", Some(make_file_info(lexer, start, start_line)))); + } + + if c == '(' { + lexer.advance(); + return parse_type_tuple(lexer, start, start_line); + } + if c == ')' { + lexer.advance(); + return Err(LexError::new("Unexpected closing parentheses ')' without matching opening parentheses.", Some(make_file_info(lexer, start, start_line)))); + } + + if is_identifier_start(lexer) { + return parse_identifiers_and_booleans(lexer, start, start_line); + } + + if c == ':' { + lexer.advance(); + if lexer.peek() == ':' { + return Err(LexError::new("Invalid identifier literal: empty identifier after '::'.", Some(make_file_info(lexer, start, start_line)))); + } else { + return Err(LexError::new("Unexpected single colon ':'.", Some(make_file_info(lexer, start, start_line)))); + } + } + + Err(LexError::new(format!("Unexpected character: unexpected '{}' during parsing.", c), Some(make_file_info(lexer, start, start_line)))) +} + +/// Perform full lexical analysis, returning a Vec or LexError +pub fn lexical_analysis(lexer: &mut LexerInfo) -> LexResult> { + let mut tokens = Vec::new(); + loop { + match get_token(lexer) { + Ok(Token::EOF) => { + tokens.push(Token::EOF); + break; + } + Ok(tok) => tokens.push(tok), + Err(e) => { + return Err(e); + } + } + } + Ok(tokens) }