Refactor token_match_expectation to improve numeric and char handling

This commit is contained in:
Kyler Olsen 2025-12-01 17:27:25 -07:00
parent 483e0c3d52
commit 1875f2debd
3 changed files with 627 additions and 443 deletions

View File

@ -1,4 +1,4 @@
#[derive(Debug, Clone, PartialEq, Eq)] #[derive(Debug, Clone, PartialEq)]
pub enum TokenType { pub enum TokenType {
Illegal, Illegal,
Eof, Eof,
@ -32,12 +32,14 @@ pub enum TokenType {
RBracket, RBracket,
} }
#[derive(Debug, Clone, PartialEq, Eq)] #[derive(Debug, Clone, PartialEq)]
pub struct Token { pub struct Token {
pub ttype: TokenType, pub ttype: TokenType,
pub lexeme: String, pub lexeme: String,
pub line: usize, pub line: usize,
pub column: usize, pub column: usize,
pub numeric: Option<i128>,
pub float: Option<f64>,
} }
pub struct Lexer { pub struct Lexer {
@ -114,16 +116,10 @@ impl Lexer {
} }
fn read_number(&mut self) -> String { fn read_number(&mut self) -> String {
// Simple numeric reader: read digits, dot, underscores, and suffixes will be handled by caller
let start = self.pos; let start = self.pos;
let mut seen_dot = false;
while let Some(c) = self.ch { while let Some(c) = self.ch {
if c == '.' { if c.is_ascii_digit() || c == '.' || c == '_' || c == 'x' || c == 'b' || c == 'o' || c.is_ascii_hexdigit() {
if seen_dot {
break;
}
seen_dot = true;
self.read_char();
} else if c.is_ascii_digit() {
self.read_char(); self.read_char();
} else { } else {
break; break;
@ -157,52 +153,233 @@ impl Lexer {
if self.peek_char() == Some('=') { if self.peek_char() == Some('=') {
self.read_char(); self.read_char();
let lex = "==".to_string(); let lex = "==".to_string();
Token { ttype: TokenType::Eq, lexeme: lex, line: self.line, column: self.column } Token { ttype: TokenType::Eq, lexeme: lex, line: self.line, column: self.column, numeric: None, float: None }
} else { } else {
Token { ttype: TokenType::Assign, lexeme: "=".to_string(), line: self.line, column: self.column } Token { ttype: TokenType::Assign, lexeme: "=".to_string(), line: self.line, column: self.column, numeric: None, float: None }
} }
} }
Some('+') => { Token { ttype: TokenType::Plus, lexeme: "+".to_string(), line: self.line, column: self.column } } Some('+') => { Token { ttype: TokenType::Plus, lexeme: "+".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
Some('-') => { Token { ttype: TokenType::Minus, lexeme: "-".to_string(), line: self.line, column: self.column } } Some('*') => { Token { ttype: TokenType::Asterisk, lexeme: "*".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
Some('*') => { Token { ttype: TokenType::Asterisk, lexeme: "*".to_string(), line: self.line, column: self.column } } Some('/') => { Token { ttype: TokenType::Slash, lexeme: "/".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
Some('/') => { Token { ttype: TokenType::Slash, lexeme: "/".to_string(), line: self.line, column: self.column } }
Some('!') => { Some('!') => {
if self.peek_char() == Some('=') { if self.peek_char() == Some('=') {
self.read_char(); self.read_char();
Token { ttype: TokenType::NotEq, lexeme: "!=".to_string(), line: self.line, column: self.column } Token { ttype: TokenType::NotEq, lexeme: "!=".to_string(), line: self.line, column: self.column, numeric: None, float: None }
} else { } else {
Token { ttype: TokenType::Bang, lexeme: "!".to_string(), line: self.line, column: self.column } Token { ttype: TokenType::Bang, lexeme: "!".to_string(), line: self.line, column: self.column, numeric: None, float: None }
} }
} }
Some('<') => { Token { ttype: TokenType::Lt, lexeme: "<".to_string(), line: self.line, column: self.column } } Some('<') => { Token { ttype: TokenType::Lt, lexeme: "<".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
Some('>') => { Token { ttype: TokenType::Gt, lexeme: ">".to_string(), line: self.line, column: self.column } } Some('>') => { Token { ttype: TokenType::Gt, lexeme: ">".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
Some(',') => { Token { ttype: TokenType::Comma, lexeme: ",".to_string(), line: self.line, column: self.column } } Some(',') => { Token { ttype: TokenType::Comma, lexeme: ",".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
Some(';') => { Token { ttype: TokenType::Semicolon, lexeme: ";".to_string(), line: self.line, column: self.column } } Some(';') => { Token { ttype: TokenType::Semicolon, lexeme: ";".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
Some(':') => { Token { ttype: TokenType::Colon, lexeme: ":".to_string(), line: self.line, column: self.column } } Some(':') => { Token { ttype: TokenType::Colon, lexeme: ":".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
Some('(') => { Token { ttype: TokenType::LParen, lexeme: "(".to_string(), line: self.line, column: self.column } } Some('(') => { Token { ttype: TokenType::LParen, lexeme: "(".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
Some(')') => { Token { ttype: TokenType::RParen, lexeme: ")".to_string(), line: self.line, column: self.column } } Some(')') => { Token { ttype: TokenType::RParen, lexeme: ")".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
Some('{') => { Token { ttype: TokenType::LBrace, lexeme: "{".to_string(), line: self.line, column: self.column } } Some('{') => { Token { ttype: TokenType::LBrace, lexeme: "{".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
Some('}') => { Token { ttype: TokenType::RBrace, lexeme: "}".to_string(), line: self.line, column: self.column } } Some('}') => { Token { ttype: TokenType::RBrace, lexeme: "}".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
Some('[') => { Token { ttype: TokenType::LBracket, lexeme: "[".to_string(), line: self.line, column: self.column } } Some('[') => { Token { ttype: TokenType::LBracket, lexeme: "[".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
Some(']') => { Token { ttype: TokenType::RBracket, lexeme: "]".to_string(), line: self.line, column: self.column } } Some(']') => { Token { ttype: TokenType::RBracket, lexeme: "]".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
Some('"') => { Some('"') => {
let s = self.read_string(); let s = self.read_string();
Token { ttype: TokenType::Str, lexeme: s, line: self.line, column: self.column } Token { ttype: TokenType::Str, lexeme: s, line: self.line, column: self.column, numeric: None, float: None }
} }
Some(c) if is_letter(c) => { Some(c) if is_letter(c) => {
let ident = self.read_identifier(); let ident = self.read_identifier();
Token { ttype: TokenType::Ident, lexeme: ident, line: self.line, column: self.column } Token { ttype: TokenType::Ident, lexeme: ident, line: self.line, column: self.column, numeric: None, float: None }
}
Some(c) if c.is_ascii_digit() || (c == '-' && self.peek_char().map_or(false, |pc| pc.is_ascii_digit())) => {
// Handle optional leading '-' as part of number
let mut sign = 1i128;
if c == '-' {
sign = -1;
self.read_char();
}
// Determine base and read digits with underscores and optional suffix
let start_pos = self.pos;
// If starting with '0' and next is x/b/o, handle prefixes
let mut base = 10u32;
let mut raw_digits = String::new();
if self.ch == Some('0') && self.peek_char().map_or(false, |pc| pc == 'x' || pc == 'X' || pc == 'b' || pc == 'o') {
// consume '0'
self.read_char();
if let Some(prefix) = self.ch {
match prefix {
'x' | 'X' => base = 16,
'b' => base = 2,
'o' => base = 8,
_ => {}
}
// consume prefix
self.read_char();
// read digits appropriate for base (allow underscores)
while let Some(d) = self.ch {
if d == '_' { self.read_char(); continue; }
if base == 16 && d.is_ascii_hexdigit() { raw_digits.push(d); self.read_char(); continue; }
if base == 10 && d.is_ascii_digit() { raw_digits.push(d); self.read_char(); continue; }
if base == 8 && ('0'..='7').contains(&d) { raw_digits.push(d); self.read_char(); continue; }
if base == 2 && (d == '0' || d == '1') { raw_digits.push(d); self.read_char(); continue; }
break;
}
} }
Some(c) if c.is_ascii_digit() => {
let num = self.read_number();
if num.contains('.') {
Token { ttype: TokenType::Float, lexeme: num, line: self.line, column: self.column }
} else { } else {
Token { ttype: TokenType::Int, lexeme: num, line: self.line, column: self.column } // Decimal or float — detect invalid characters inside the literal
let mut seen_dot = false;
let mut invalid_start = false;
while let Some(d) = self.ch {
if d == '_' { self.read_char(); continue; }
if d == '.' {
if seen_dot { break; }
seen_dot = true;
raw_digits.push('.');
self.read_char();
continue;
}
if d.is_ascii_digit() {
raw_digits.push(d);
self.read_char();
continue;
}
// if we encounter a comma or alphabetic character inside a decimal
// treat the whole sequence as an invalid literal
if d == ',' || d.is_ascii_alphabetic() {
invalid_start = true;
break;
}
break;
}
if invalid_start {
// consume until whitespace or a clear delimiter to form a single Illegal token
while let Some(ch) = self.ch {
if ch.is_whitespace() { break; }
match ch {
'+' | '*' | '/' | '!' | '<' | '>' | '=' | ';' | '(' | ')' | '{' | '}' | '[' | ']' | '"' | '\'' => break,
_ => { self.read_char(); }
} }
} }
None => { Token { ttype: TokenType::Eof, lexeme: "".to_string(), line: self.line, column: self.column } } raw_digits = self.input[start_pos..self.pos].iter().collect();
Some(_) => { Token { ttype: TokenType::Illegal, lexeme: self.ch.unwrap().to_string(), line: self.line, column: self.column } } }
}
// After digits, check for optional type suffix like ':i8'
let mut suffix: Option<String> = None;
if self.ch == Some(':') {
// consume ':'
self.read_char();
let mut sstart = self.pos;
while let Some(sc) = self.ch {
if sc.is_ascii_alphanumeric() || sc == '_' { self.read_char(); } else { break; }
}
suffix = Some(self.input[sstart..self.pos].iter().collect());
}
// Now attempt to parse numeric value
let mut token = Token { ttype: TokenType::Int, lexeme: String::new(), line: self.line, column: self.column, numeric: None, float: None };
// reconstruct lexeme (include sign and any prefix)
let lexeme: String = self.input[start_pos..self.pos].iter().collect();
let full_lex = if sign < 0 { format!("-{}", lexeme) } else { lexeme.clone() };
token.lexeme = full_lex.clone();
// parse according to base
if raw_digits.contains('.') {
// float
match full_lex.replace("_", "").parse::<f64>() {
Ok(f) => { token.ttype = TokenType::Float; token.float = Some(f); }
Err(_) => { token.ttype = TokenType::Illegal; }
}
} else {
// integer: need to remove underscores and handle base
let digits = raw_digits.replace("_", "");
if digits.is_empty() {
token.ttype = TokenType::Illegal;
} else {
// if prefix was used, adjust parsing
let parsed = if base == 10 {
digits.parse::<i128>().ok()
} else {
i128::from_str_radix(&digits, base).ok()
};
if let Some(mut v) = parsed {
v *= sign;
// If base was non-decimal and a signed suffix exists, interpret
// the parsed unsigned bits as a two's-complement signed value
if base != 10 {
if let Some(ref sfx) = suffix {
match sfx.as_str() {
"i8" => {
let bits = 8u32;
let max_unsigned = (1i128 << bits) - 1;
let signed_max = i8::MAX as i128;
if v >= 0 && v <= max_unsigned && v > signed_max {
v = v - (1i128 << bits);
}
}
"i16" => {
let bits = 16u32;
let max_unsigned = (1i128 << bits) - 1;
let signed_max = i16::MAX as i128;
if v >= 0 && v <= max_unsigned && v > signed_max {
v = v - (1i128 << bits);
}
}
"i32" => {
let bits = 32u32;
let max_unsigned = (1i128 << bits) - 1;
let signed_max = i32::MAX as i128;
if v >= 0 && v <= max_unsigned && v > signed_max {
v = v - (1i128 << bits);
}
}
"i64" => {
let bits = 64u32;
let max_unsigned = (1i128 << bits) - 1;
let signed_max = i64::MAX as i128;
if v >= 0 && v <= max_unsigned && v > signed_max {
v = v - (1i128 << bits);
}
}
_ => {}
}
}
}
// validate suffix ranges if present
let mut overflow = false;
if let Some(ref s) = suffix {
match s.as_str() {
"i8" => if v < i8::MIN as i128 || v > i8::MAX as i128 { overflow = true; }
"i16" => if v < i16::MIN as i128 || v > i16::MAX as i128 { overflow = true; }
"i32" => if v < i32::MIN as i128 || v > i32::MAX as i128 { overflow = true; }
"i64" => if v < i64::MIN as i128 || v > i64::MAX as i128 { overflow = true; }
"u8" => if v < 0 || v > u8::MAX as i128 { overflow = true; }
"u16" => if v < 0 || v > u16::MAX as i128 { overflow = true; }
"u32" => if v < 0 || v > u32::MAX as i128 { overflow = true; }
"u64" => if v < 0 || v > u64::MAX as i128 { overflow = true; }
_ => {}
}
} else {
// default type is i64 per tests: check i64 range
if v < i64::MIN as i128 || v > i64::MAX as i128 { overflow = true; }
}
if overflow {
token.ttype = TokenType::Illegal;
} else {
token.numeric = Some(v);
token.ttype = TokenType::Int;
}
} else {
token.ttype = TokenType::Illegal;
}
}
}
token
}
Some('-') => { Token { ttype: TokenType::Minus, lexeme: "-".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
None => { Token { ttype: TokenType::Eof, lexeme: "".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
Some(_) => { Token { ttype: TokenType::Illegal, lexeme: self.ch.unwrap().to_string(), line: self.line, column: self.column, numeric: None, float: None } }
}; };
// advance to next char if not EOF and we didn't already advance inside readers // advance to next char if not EOF and we didn't already advance inside readers

File diff suppressed because it is too large Load Diff

View File

@ -31,16 +31,22 @@ def token_match_expectation(token_var: str, expected: dict) -> str:
val = expected.get('value') val = expected.get('value')
if ttype in ('i64','i32','i16','i8','u64','u32','u16','u8'): if ttype in ('i64','i32','i16','i8','u64','u32','u16','u8'):
# parse integer from token.lexeme and compare # check numeric field produced by the lexer
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Int);\n let parsed: i128 = {token_var}.lexeme.parse().expect(\"expected integer\");\n assert_eq!(parsed, {val});" # allow constant names like INT64_MIN to be used directly in generated code
if isinstance(val, str) and re.match(r"^[A-Za-z_][A-Za-z0-9_]*$", val):
vexpr = val
else:
vexpr = str(val)
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Int);\n assert!({token_var}.numeric.is_some(), \"expected numeric value\");\n assert_eq!({token_var}.numeric.unwrap(), {vexpr});"
elif ttype in ('f64','f32'): elif ttype in ('f64','f32'):
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Float);\n let parsed: f64 = {token_var}.lexeme.parse().expect(\"expected float\");\n assert!((parsed - {val}).abs() < 1e-12);" return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Float);\n assert!({token_var}.float.is_some(), \"expected float value\");\n assert!(({token_var}.float.unwrap() - {val}).abs() < 1e-12);"
elif ttype == 'string': elif ttype == 'string':
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Str);\n assert_eq!({token_var}.lexeme, \"{rust_string_literal(str(val))}\");" return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Str);\n assert_eq!({token_var}.lexeme, \"{rust_string_literal(str(val))}\");"
elif ttype in ('identifier', 'identifier_literal'): elif ttype in ('identifier', 'identifier_literal'):
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Ident);\n assert_eq!({token_var}.lexeme, \"{rust_string_literal(str(val))}\");" return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Ident);\n assert_eq!({token_var}.lexeme, \"{rust_string_literal(str(val))}\");"
elif ttype == 'char': elif ttype == 'char':
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Int);\n let parsed: i128 = {token_var}.lexeme.parse().expect(\"expected char code\");\n assert_eq!(parsed, {(ord(val))});" # type: ignore codepoint = ord(val) if isinstance(val, str) and len(val) == 1 else val
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Int);\n assert!({token_var}.numeric.is_some(), \"expected numeric char code\");\n assert_eq!({token_var}.numeric.unwrap(), {codepoint});"
elif ttype == 'bool': elif ttype == 'bool':
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Ident);\n assert_eq!({token_var}.lexeme, \"{'true' if val else 'false'}\");" return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Ident);\n assert_eq!({token_var}.lexeme, \"{'true' if val else 'false'}\");"
elif ttype == 'error': elif ttype == 'error':