Refactor token_match_expectation to improve numeric and char handling
This commit is contained in:
parent
483e0c3d52
commit
1875f2debd
|
|
@ -1,4 +1,4 @@
|
|||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub enum TokenType {
|
||||
Illegal,
|
||||
Eof,
|
||||
|
|
@ -32,12 +32,14 @@ pub enum TokenType {
|
|||
RBracket,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub struct Token {
|
||||
pub ttype: TokenType,
|
||||
pub lexeme: String,
|
||||
pub line: usize,
|
||||
pub column: usize,
|
||||
pub numeric: Option<i128>,
|
||||
pub float: Option<f64>,
|
||||
}
|
||||
|
||||
pub struct Lexer {
|
||||
|
|
@ -114,16 +116,10 @@ impl Lexer {
|
|||
}
|
||||
|
||||
fn read_number(&mut self) -> String {
|
||||
// Simple numeric reader: read digits, dot, underscores, and suffixes will be handled by caller
|
||||
let start = self.pos;
|
||||
let mut seen_dot = false;
|
||||
while let Some(c) = self.ch {
|
||||
if c == '.' {
|
||||
if seen_dot {
|
||||
break;
|
||||
}
|
||||
seen_dot = true;
|
||||
self.read_char();
|
||||
} else if c.is_ascii_digit() {
|
||||
if c.is_ascii_digit() || c == '.' || c == '_' || c == 'x' || c == 'b' || c == 'o' || c.is_ascii_hexdigit() {
|
||||
self.read_char();
|
||||
} else {
|
||||
break;
|
||||
|
|
@ -157,52 +153,233 @@ impl Lexer {
|
|||
if self.peek_char() == Some('=') {
|
||||
self.read_char();
|
||||
let lex = "==".to_string();
|
||||
Token { ttype: TokenType::Eq, lexeme: lex, line: self.line, column: self.column }
|
||||
Token { ttype: TokenType::Eq, lexeme: lex, line: self.line, column: self.column, numeric: None, float: None }
|
||||
} else {
|
||||
Token { ttype: TokenType::Assign, lexeme: "=".to_string(), line: self.line, column: self.column }
|
||||
Token { ttype: TokenType::Assign, lexeme: "=".to_string(), line: self.line, column: self.column, numeric: None, float: None }
|
||||
}
|
||||
}
|
||||
Some('+') => { Token { ttype: TokenType::Plus, lexeme: "+".to_string(), line: self.line, column: self.column } }
|
||||
Some('-') => { Token { ttype: TokenType::Minus, lexeme: "-".to_string(), line: self.line, column: self.column } }
|
||||
Some('*') => { Token { ttype: TokenType::Asterisk, lexeme: "*".to_string(), line: self.line, column: self.column } }
|
||||
Some('/') => { Token { ttype: TokenType::Slash, lexeme: "/".to_string(), line: self.line, column: self.column } }
|
||||
Some('+') => { Token { ttype: TokenType::Plus, lexeme: "+".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||
Some('*') => { Token { ttype: TokenType::Asterisk, lexeme: "*".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||
Some('/') => { Token { ttype: TokenType::Slash, lexeme: "/".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||
Some('!') => {
|
||||
if self.peek_char() == Some('=') {
|
||||
self.read_char();
|
||||
Token { ttype: TokenType::NotEq, lexeme: "!=".to_string(), line: self.line, column: self.column }
|
||||
Token { ttype: TokenType::NotEq, lexeme: "!=".to_string(), line: self.line, column: self.column, numeric: None, float: None }
|
||||
} else {
|
||||
Token { ttype: TokenType::Bang, lexeme: "!".to_string(), line: self.line, column: self.column }
|
||||
Token { ttype: TokenType::Bang, lexeme: "!".to_string(), line: self.line, column: self.column, numeric: None, float: None }
|
||||
}
|
||||
}
|
||||
Some('<') => { Token { ttype: TokenType::Lt, lexeme: "<".to_string(), line: self.line, column: self.column } }
|
||||
Some('>') => { Token { ttype: TokenType::Gt, lexeme: ">".to_string(), line: self.line, column: self.column } }
|
||||
Some(',') => { Token { ttype: TokenType::Comma, lexeme: ",".to_string(), line: self.line, column: self.column } }
|
||||
Some(';') => { Token { ttype: TokenType::Semicolon, lexeme: ";".to_string(), line: self.line, column: self.column } }
|
||||
Some(':') => { Token { ttype: TokenType::Colon, lexeme: ":".to_string(), line: self.line, column: self.column } }
|
||||
Some('(') => { Token { ttype: TokenType::LParen, lexeme: "(".to_string(), line: self.line, column: self.column } }
|
||||
Some(')') => { Token { ttype: TokenType::RParen, lexeme: ")".to_string(), line: self.line, column: self.column } }
|
||||
Some('{') => { Token { ttype: TokenType::LBrace, lexeme: "{".to_string(), line: self.line, column: self.column } }
|
||||
Some('}') => { Token { ttype: TokenType::RBrace, lexeme: "}".to_string(), line: self.line, column: self.column } }
|
||||
Some('[') => { Token { ttype: TokenType::LBracket, lexeme: "[".to_string(), line: self.line, column: self.column } }
|
||||
Some(']') => { Token { ttype: TokenType::RBracket, lexeme: "]".to_string(), line: self.line, column: self.column } }
|
||||
Some('<') => { Token { ttype: TokenType::Lt, lexeme: "<".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||
Some('>') => { Token { ttype: TokenType::Gt, lexeme: ">".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||
Some(',') => { Token { ttype: TokenType::Comma, lexeme: ",".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||
Some(';') => { Token { ttype: TokenType::Semicolon, lexeme: ";".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||
Some(':') => { Token { ttype: TokenType::Colon, lexeme: ":".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||
Some('(') => { Token { ttype: TokenType::LParen, lexeme: "(".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||
Some(')') => { Token { ttype: TokenType::RParen, lexeme: ")".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||
Some('{') => { Token { ttype: TokenType::LBrace, lexeme: "{".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||
Some('}') => { Token { ttype: TokenType::RBrace, lexeme: "}".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||
Some('[') => { Token { ttype: TokenType::LBracket, lexeme: "[".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||
Some(']') => { Token { ttype: TokenType::RBracket, lexeme: "]".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||
Some('"') => {
|
||||
let s = self.read_string();
|
||||
Token { ttype: TokenType::Str, lexeme: s, line: self.line, column: self.column }
|
||||
Token { ttype: TokenType::Str, lexeme: s, line: self.line, column: self.column, numeric: None, float: None }
|
||||
}
|
||||
Some(c) if is_letter(c) => {
|
||||
let ident = self.read_identifier();
|
||||
Token { ttype: TokenType::Ident, lexeme: ident, line: self.line, column: self.column }
|
||||
Token { ttype: TokenType::Ident, lexeme: ident, line: self.line, column: self.column, numeric: None, float: None }
|
||||
}
|
||||
Some(c) if c.is_ascii_digit() => {
|
||||
let num = self.read_number();
|
||||
if num.contains('.') {
|
||||
Token { ttype: TokenType::Float, lexeme: num, line: self.line, column: self.column }
|
||||
} else {
|
||||
Token { ttype: TokenType::Int, lexeme: num, line: self.line, column: self.column }
|
||||
Some(c) if c.is_ascii_digit() || (c == '-' && self.peek_char().map_or(false, |pc| pc.is_ascii_digit())) => {
|
||||
// Handle optional leading '-' as part of number
|
||||
let mut sign = 1i128;
|
||||
if c == '-' {
|
||||
sign = -1;
|
||||
self.read_char();
|
||||
}
|
||||
// Determine base and read digits with underscores and optional suffix
|
||||
let start_pos = self.pos;
|
||||
// If starting with '0' and next is x/b/o, handle prefixes
|
||||
let mut base = 10u32;
|
||||
let mut raw_digits = String::new();
|
||||
if self.ch == Some('0') && self.peek_char().map_or(false, |pc| pc == 'x' || pc == 'X' || pc == 'b' || pc == 'o') {
|
||||
// consume '0'
|
||||
self.read_char();
|
||||
if let Some(prefix) = self.ch {
|
||||
match prefix {
|
||||
'x' | 'X' => base = 16,
|
||||
'b' => base = 2,
|
||||
'o' => base = 8,
|
||||
_ => {}
|
||||
}
|
||||
// consume prefix
|
||||
self.read_char();
|
||||
// read digits appropriate for base (allow underscores)
|
||||
while let Some(d) = self.ch {
|
||||
if d == '_' { self.read_char(); continue; }
|
||||
if base == 16 && d.is_ascii_hexdigit() { raw_digits.push(d); self.read_char(); continue; }
|
||||
if base == 10 && d.is_ascii_digit() { raw_digits.push(d); self.read_char(); continue; }
|
||||
if base == 8 && ('0'..='7').contains(&d) { raw_digits.push(d); self.read_char(); continue; }
|
||||
if base == 2 && (d == '0' || d == '1') { raw_digits.push(d); self.read_char(); continue; }
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Decimal or float — detect invalid characters inside the literal
|
||||
let mut seen_dot = false;
|
||||
let mut invalid_start = false;
|
||||
while let Some(d) = self.ch {
|
||||
if d == '_' { self.read_char(); continue; }
|
||||
if d == '.' {
|
||||
if seen_dot { break; }
|
||||
seen_dot = true;
|
||||
raw_digits.push('.');
|
||||
self.read_char();
|
||||
continue;
|
||||
}
|
||||
if d.is_ascii_digit() {
|
||||
raw_digits.push(d);
|
||||
self.read_char();
|
||||
continue;
|
||||
}
|
||||
// if we encounter a comma or alphabetic character inside a decimal
|
||||
// treat the whole sequence as an invalid literal
|
||||
if d == ',' || d.is_ascii_alphabetic() {
|
||||
invalid_start = true;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if invalid_start {
|
||||
// consume until whitespace or a clear delimiter to form a single Illegal token
|
||||
while let Some(ch) = self.ch {
|
||||
if ch.is_whitespace() { break; }
|
||||
match ch {
|
||||
'+' | '*' | '/' | '!' | '<' | '>' | '=' | ';' | '(' | ')' | '{' | '}' | '[' | ']' | '"' | '\'' => break,
|
||||
_ => { self.read_char(); }
|
||||
}
|
||||
}
|
||||
raw_digits = self.input[start_pos..self.pos].iter().collect();
|
||||
}
|
||||
}
|
||||
|
||||
// After digits, check for optional type suffix like ':i8'
|
||||
let mut suffix: Option<String> = None;
|
||||
if self.ch == Some(':') {
|
||||
// consume ':'
|
||||
self.read_char();
|
||||
let mut sstart = self.pos;
|
||||
while let Some(sc) = self.ch {
|
||||
if sc.is_ascii_alphanumeric() || sc == '_' { self.read_char(); } else { break; }
|
||||
}
|
||||
suffix = Some(self.input[sstart..self.pos].iter().collect());
|
||||
}
|
||||
|
||||
// Now attempt to parse numeric value
|
||||
let mut token = Token { ttype: TokenType::Int, lexeme: String::new(), line: self.line, column: self.column, numeric: None, float: None };
|
||||
// reconstruct lexeme (include sign and any prefix)
|
||||
let lexeme: String = self.input[start_pos..self.pos].iter().collect();
|
||||
let full_lex = if sign < 0 { format!("-{}", lexeme) } else { lexeme.clone() };
|
||||
token.lexeme = full_lex.clone();
|
||||
|
||||
// parse according to base
|
||||
if raw_digits.contains('.') {
|
||||
// float
|
||||
match full_lex.replace("_", "").parse::<f64>() {
|
||||
Ok(f) => { token.ttype = TokenType::Float; token.float = Some(f); }
|
||||
Err(_) => { token.ttype = TokenType::Illegal; }
|
||||
}
|
||||
} else {
|
||||
// integer: need to remove underscores and handle base
|
||||
let digits = raw_digits.replace("_", "");
|
||||
if digits.is_empty() {
|
||||
token.ttype = TokenType::Illegal;
|
||||
} else {
|
||||
// if prefix was used, adjust parsing
|
||||
let parsed = if base == 10 {
|
||||
digits.parse::<i128>().ok()
|
||||
} else {
|
||||
i128::from_str_radix(&digits, base).ok()
|
||||
};
|
||||
if let Some(mut v) = parsed {
|
||||
v *= sign;
|
||||
// If base was non-decimal and a signed suffix exists, interpret
|
||||
// the parsed unsigned bits as a two's-complement signed value
|
||||
if base != 10 {
|
||||
if let Some(ref sfx) = suffix {
|
||||
match sfx.as_str() {
|
||||
"i8" => {
|
||||
let bits = 8u32;
|
||||
let max_unsigned = (1i128 << bits) - 1;
|
||||
let signed_max = i8::MAX as i128;
|
||||
if v >= 0 && v <= max_unsigned && v > signed_max {
|
||||
v = v - (1i128 << bits);
|
||||
}
|
||||
}
|
||||
"i16" => {
|
||||
let bits = 16u32;
|
||||
let max_unsigned = (1i128 << bits) - 1;
|
||||
let signed_max = i16::MAX as i128;
|
||||
if v >= 0 && v <= max_unsigned && v > signed_max {
|
||||
v = v - (1i128 << bits);
|
||||
}
|
||||
}
|
||||
"i32" => {
|
||||
let bits = 32u32;
|
||||
let max_unsigned = (1i128 << bits) - 1;
|
||||
let signed_max = i32::MAX as i128;
|
||||
if v >= 0 && v <= max_unsigned && v > signed_max {
|
||||
v = v - (1i128 << bits);
|
||||
}
|
||||
}
|
||||
"i64" => {
|
||||
let bits = 64u32;
|
||||
let max_unsigned = (1i128 << bits) - 1;
|
||||
let signed_max = i64::MAX as i128;
|
||||
if v >= 0 && v <= max_unsigned && v > signed_max {
|
||||
v = v - (1i128 << bits);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// validate suffix ranges if present
|
||||
let mut overflow = false;
|
||||
if let Some(ref s) = suffix {
|
||||
match s.as_str() {
|
||||
"i8" => if v < i8::MIN as i128 || v > i8::MAX as i128 { overflow = true; }
|
||||
"i16" => if v < i16::MIN as i128 || v > i16::MAX as i128 { overflow = true; }
|
||||
"i32" => if v < i32::MIN as i128 || v > i32::MAX as i128 { overflow = true; }
|
||||
"i64" => if v < i64::MIN as i128 || v > i64::MAX as i128 { overflow = true; }
|
||||
"u8" => if v < 0 || v > u8::MAX as i128 { overflow = true; }
|
||||
"u16" => if v < 0 || v > u16::MAX as i128 { overflow = true; }
|
||||
"u32" => if v < 0 || v > u32::MAX as i128 { overflow = true; }
|
||||
"u64" => if v < 0 || v > u64::MAX as i128 { overflow = true; }
|
||||
_ => {}
|
||||
}
|
||||
} else {
|
||||
// default type is i64 per tests: check i64 range
|
||||
if v < i64::MIN as i128 || v > i64::MAX as i128 { overflow = true; }
|
||||
}
|
||||
|
||||
if overflow {
|
||||
token.ttype = TokenType::Illegal;
|
||||
} else {
|
||||
token.numeric = Some(v);
|
||||
token.ttype = TokenType::Int;
|
||||
}
|
||||
} else {
|
||||
token.ttype = TokenType::Illegal;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
token
|
||||
}
|
||||
None => { Token { ttype: TokenType::Eof, lexeme: "".to_string(), line: self.line, column: self.column } }
|
||||
Some(_) => { Token { ttype: TokenType::Illegal, lexeme: self.ch.unwrap().to_string(), line: self.line, column: self.column } }
|
||||
Some('-') => { Token { ttype: TokenType::Minus, lexeme: "-".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||
None => { Token { ttype: TokenType::Eof, lexeme: "".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||
Some(_) => { Token { ttype: TokenType::Illegal, lexeme: self.ch.unwrap().to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||
};
|
||||
|
||||
// advance to next char if not EOF and we didn't already advance inside readers
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -31,16 +31,22 @@ def token_match_expectation(token_var: str, expected: dict) -> str:
|
|||
val = expected.get('value')
|
||||
|
||||
if ttype in ('i64','i32','i16','i8','u64','u32','u16','u8'):
|
||||
# parse integer from token.lexeme and compare
|
||||
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Int);\n let parsed: i128 = {token_var}.lexeme.parse().expect(\"expected integer\");\n assert_eq!(parsed, {val});"
|
||||
# check numeric field produced by the lexer
|
||||
# allow constant names like INT64_MIN to be used directly in generated code
|
||||
if isinstance(val, str) and re.match(r"^[A-Za-z_][A-Za-z0-9_]*$", val):
|
||||
vexpr = val
|
||||
else:
|
||||
vexpr = str(val)
|
||||
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Int);\n assert!({token_var}.numeric.is_some(), \"expected numeric value\");\n assert_eq!({token_var}.numeric.unwrap(), {vexpr});"
|
||||
elif ttype in ('f64','f32'):
|
||||
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Float);\n let parsed: f64 = {token_var}.lexeme.parse().expect(\"expected float\");\n assert!((parsed - {val}).abs() < 1e-12);"
|
||||
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Float);\n assert!({token_var}.float.is_some(), \"expected float value\");\n assert!(({token_var}.float.unwrap() - {val}).abs() < 1e-12);"
|
||||
elif ttype == 'string':
|
||||
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Str);\n assert_eq!({token_var}.lexeme, \"{rust_string_literal(str(val))}\");"
|
||||
elif ttype in ('identifier', 'identifier_literal'):
|
||||
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Ident);\n assert_eq!({token_var}.lexeme, \"{rust_string_literal(str(val))}\");"
|
||||
elif ttype == 'char':
|
||||
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Int);\n let parsed: i128 = {token_var}.lexeme.parse().expect(\"expected char code\");\n assert_eq!(parsed, {(ord(val))});" # type: ignore
|
||||
codepoint = ord(val) if isinstance(val, str) and len(val) == 1 else val
|
||||
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Int);\n assert!({token_var}.numeric.is_some(), \"expected numeric char code\");\n assert_eq!({token_var}.numeric.unwrap(), {codepoint});"
|
||||
elif ttype == 'bool':
|
||||
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Ident);\n assert_eq!({token_var}.lexeme, \"{'true' if val else 'false'}\");"
|
||||
elif ttype == 'error':
|
||||
|
|
|
|||
Loading…
Reference in New Issue