Refactor token_match_expectation to improve numeric and char handling
This commit is contained in:
parent
483e0c3d52
commit
1875f2debd
|
|
@ -1,4 +1,4 @@
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
#[derive(Debug, Clone, PartialEq)]
|
||||||
pub enum TokenType {
|
pub enum TokenType {
|
||||||
Illegal,
|
Illegal,
|
||||||
Eof,
|
Eof,
|
||||||
|
|
@ -32,12 +32,14 @@ pub enum TokenType {
|
||||||
RBracket,
|
RBracket,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
#[derive(Debug, Clone, PartialEq)]
|
||||||
pub struct Token {
|
pub struct Token {
|
||||||
pub ttype: TokenType,
|
pub ttype: TokenType,
|
||||||
pub lexeme: String,
|
pub lexeme: String,
|
||||||
pub line: usize,
|
pub line: usize,
|
||||||
pub column: usize,
|
pub column: usize,
|
||||||
|
pub numeric: Option<i128>,
|
||||||
|
pub float: Option<f64>,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Lexer {
|
pub struct Lexer {
|
||||||
|
|
@ -114,16 +116,10 @@ impl Lexer {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn read_number(&mut self) -> String {
|
fn read_number(&mut self) -> String {
|
||||||
|
// Simple numeric reader: consumes digits, '.', '_', the base-prefix letters x/b/o, and hex digits; type suffixes are handled by the caller.
|
||||||
let start = self.pos;
|
let start = self.pos;
|
||||||
let mut seen_dot = false;
|
|
||||||
while let Some(c) = self.ch {
|
while let Some(c) = self.ch {
|
||||||
if c == '.' {
|
if c.is_ascii_digit() || c == '.' || c == '_' || c == 'x' || c == 'b' || c == 'o' || c.is_ascii_hexdigit() {
|
||||||
if seen_dot {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
seen_dot = true;
|
|
||||||
self.read_char();
|
|
||||||
} else if c.is_ascii_digit() {
|
|
||||||
self.read_char();
|
self.read_char();
|
||||||
} else {
|
} else {
|
||||||
break;
|
break;
|
||||||
|
|
@ -157,52 +153,233 @@ impl Lexer {
|
||||||
if self.peek_char() == Some('=') {
|
if self.peek_char() == Some('=') {
|
||||||
self.read_char();
|
self.read_char();
|
||||||
let lex = "==".to_string();
|
let lex = "==".to_string();
|
||||||
Token { ttype: TokenType::Eq, lexeme: lex, line: self.line, column: self.column }
|
Token { ttype: TokenType::Eq, lexeme: lex, line: self.line, column: self.column, numeric: None, float: None }
|
||||||
} else {
|
} else {
|
||||||
Token { ttype: TokenType::Assign, lexeme: "=".to_string(), line: self.line, column: self.column }
|
Token { ttype: TokenType::Assign, lexeme: "=".to_string(), line: self.line, column: self.column, numeric: None, float: None }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Some('+') => { Token { ttype: TokenType::Plus, lexeme: "+".to_string(), line: self.line, column: self.column } }
|
Some('+') => { Token { ttype: TokenType::Plus, lexeme: "+".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||||
Some('-') => { Token { ttype: TokenType::Minus, lexeme: "-".to_string(), line: self.line, column: self.column } }
|
Some('*') => { Token { ttype: TokenType::Asterisk, lexeme: "*".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||||
Some('*') => { Token { ttype: TokenType::Asterisk, lexeme: "*".to_string(), line: self.line, column: self.column } }
|
Some('/') => { Token { ttype: TokenType::Slash, lexeme: "/".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||||
Some('/') => { Token { ttype: TokenType::Slash, lexeme: "/".to_string(), line: self.line, column: self.column } }
|
|
||||||
Some('!') => {
|
Some('!') => {
|
||||||
if self.peek_char() == Some('=') {
|
if self.peek_char() == Some('=') {
|
||||||
self.read_char();
|
self.read_char();
|
||||||
Token { ttype: TokenType::NotEq, lexeme: "!=".to_string(), line: self.line, column: self.column }
|
Token { ttype: TokenType::NotEq, lexeme: "!=".to_string(), line: self.line, column: self.column, numeric: None, float: None }
|
||||||
} else {
|
} else {
|
||||||
Token { ttype: TokenType::Bang, lexeme: "!".to_string(), line: self.line, column: self.column }
|
Token { ttype: TokenType::Bang, lexeme: "!".to_string(), line: self.line, column: self.column, numeric: None, float: None }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Some('<') => { Token { ttype: TokenType::Lt, lexeme: "<".to_string(), line: self.line, column: self.column } }
|
Some('<') => { Token { ttype: TokenType::Lt, lexeme: "<".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||||
Some('>') => { Token { ttype: TokenType::Gt, lexeme: ">".to_string(), line: self.line, column: self.column } }
|
Some('>') => { Token { ttype: TokenType::Gt, lexeme: ">".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||||
Some(',') => { Token { ttype: TokenType::Comma, lexeme: ",".to_string(), line: self.line, column: self.column } }
|
Some(',') => { Token { ttype: TokenType::Comma, lexeme: ",".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||||
Some(';') => { Token { ttype: TokenType::Semicolon, lexeme: ";".to_string(), line: self.line, column: self.column } }
|
Some(';') => { Token { ttype: TokenType::Semicolon, lexeme: ";".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||||
Some(':') => { Token { ttype: TokenType::Colon, lexeme: ":".to_string(), line: self.line, column: self.column } }
|
Some(':') => { Token { ttype: TokenType::Colon, lexeme: ":".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||||
Some('(') => { Token { ttype: TokenType::LParen, lexeme: "(".to_string(), line: self.line, column: self.column } }
|
Some('(') => { Token { ttype: TokenType::LParen, lexeme: "(".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||||
Some(')') => { Token { ttype: TokenType::RParen, lexeme: ")".to_string(), line: self.line, column: self.column } }
|
Some(')') => { Token { ttype: TokenType::RParen, lexeme: ")".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||||
Some('{') => { Token { ttype: TokenType::LBrace, lexeme: "{".to_string(), line: self.line, column: self.column } }
|
Some('{') => { Token { ttype: TokenType::LBrace, lexeme: "{".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||||
Some('}') => { Token { ttype: TokenType::RBrace, lexeme: "}".to_string(), line: self.line, column: self.column } }
|
Some('}') => { Token { ttype: TokenType::RBrace, lexeme: "}".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||||
Some('[') => { Token { ttype: TokenType::LBracket, lexeme: "[".to_string(), line: self.line, column: self.column } }
|
Some('[') => { Token { ttype: TokenType::LBracket, lexeme: "[".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||||
Some(']') => { Token { ttype: TokenType::RBracket, lexeme: "]".to_string(), line: self.line, column: self.column } }
|
Some(']') => { Token { ttype: TokenType::RBracket, lexeme: "]".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||||
Some('"') => {
|
Some('"') => {
|
||||||
let s = self.read_string();
|
let s = self.read_string();
|
||||||
Token { ttype: TokenType::Str, lexeme: s, line: self.line, column: self.column }
|
Token { ttype: TokenType::Str, lexeme: s, line: self.line, column: self.column, numeric: None, float: None }
|
||||||
}
|
}
|
||||||
Some(c) if is_letter(c) => {
|
Some(c) if is_letter(c) => {
|
||||||
let ident = self.read_identifier();
|
let ident = self.read_identifier();
|
||||||
Token { ttype: TokenType::Ident, lexeme: ident, line: self.line, column: self.column }
|
Token { ttype: TokenType::Ident, lexeme: ident, line: self.line, column: self.column, numeric: None, float: None }
|
||||||
|
}
|
||||||
|
Some(c) if c.is_ascii_digit() || (c == '-' && self.peek_char().map_or(false, |pc| pc.is_ascii_digit())) => {
|
||||||
|
// Handle optional leading '-' as part of number
|
||||||
|
let mut sign = 1i128;
|
||||||
|
if c == '-' {
|
||||||
|
sign = -1;
|
||||||
|
self.read_char();
|
||||||
|
}
|
||||||
|
// Determine base and read digits with underscores and optional suffix
|
||||||
|
let start_pos = self.pos;
|
||||||
|
// If starting with '0' and next is x/b/o, handle prefixes
|
||||||
|
let mut base = 10u32;
|
||||||
|
let mut raw_digits = String::new();
|
||||||
|
if self.ch == Some('0') && self.peek_char().map_or(false, |pc| pc == 'x' || pc == 'X' || pc == 'b' || pc == 'o') {
|
||||||
|
// consume '0'
|
||||||
|
self.read_char();
|
||||||
|
if let Some(prefix) = self.ch {
|
||||||
|
match prefix {
|
||||||
|
'x' | 'X' => base = 16,
|
||||||
|
'b' => base = 2,
|
||||||
|
'o' => base = 8,
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
// consume prefix
|
||||||
|
self.read_char();
|
||||||
|
// read digits appropriate for base (allow underscores)
|
||||||
|
while let Some(d) = self.ch {
|
||||||
|
if d == '_' { self.read_char(); continue; }
|
||||||
|
if base == 16 && d.is_ascii_hexdigit() { raw_digits.push(d); self.read_char(); continue; }
|
||||||
|
if base == 10 && d.is_ascii_digit() { raw_digits.push(d); self.read_char(); continue; }
|
||||||
|
if base == 8 && ('0'..='7').contains(&d) { raw_digits.push(d); self.read_char(); continue; }
|
||||||
|
if base == 2 && (d == '0' || d == '1') { raw_digits.push(d); self.read_char(); continue; }
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Some(c) if c.is_ascii_digit() => {
|
|
||||||
let num = self.read_number();
|
|
||||||
if num.contains('.') {
|
|
||||||
Token { ttype: TokenType::Float, lexeme: num, line: self.line, column: self.column }
|
|
||||||
} else {
|
} else {
|
||||||
Token { ttype: TokenType::Int, lexeme: num, line: self.line, column: self.column }
|
// Decimal or float — detect invalid characters inside the literal
|
||||||
|
let mut seen_dot = false;
|
||||||
|
let mut invalid_start = false;
|
||||||
|
while let Some(d) = self.ch {
|
||||||
|
if d == '_' { self.read_char(); continue; }
|
||||||
|
if d == '.' {
|
||||||
|
if seen_dot { break; }
|
||||||
|
seen_dot = true;
|
||||||
|
raw_digits.push('.');
|
||||||
|
self.read_char();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if d.is_ascii_digit() {
|
||||||
|
raw_digits.push(d);
|
||||||
|
self.read_char();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// if we encounter a comma or alphabetic character inside a decimal
|
||||||
|
// treat the whole sequence as an invalid literal
|
||||||
|
if d == ',' || d.is_ascii_alphabetic() {
|
||||||
|
invalid_start = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if invalid_start {
|
||||||
|
// consume until whitespace or a clear delimiter to form a single Illegal token
|
||||||
|
while let Some(ch) = self.ch {
|
||||||
|
if ch.is_whitespace() { break; }
|
||||||
|
match ch {
|
||||||
|
'+' | '*' | '/' | '!' | '<' | '>' | '=' | ';' | '(' | ')' | '{' | '}' | '[' | ']' | '"' | '\'' => break,
|
||||||
|
_ => { self.read_char(); }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
None => { Token { ttype: TokenType::Eof, lexeme: "".to_string(), line: self.line, column: self.column } }
|
raw_digits = self.input[start_pos..self.pos].iter().collect();
|
||||||
Some(_) => { Token { ttype: TokenType::Illegal, lexeme: self.ch.unwrap().to_string(), line: self.line, column: self.column } }
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// After digits, check for optional type suffix like ':i8'
|
||||||
|
let mut suffix: Option<String> = None;
|
||||||
|
if self.ch == Some(':') {
|
||||||
|
// consume ':'
|
||||||
|
self.read_char();
|
||||||
|
let mut sstart = self.pos;
|
||||||
|
while let Some(sc) = self.ch {
|
||||||
|
if sc.is_ascii_alphanumeric() || sc == '_' { self.read_char(); } else { break; }
|
||||||
|
}
|
||||||
|
suffix = Some(self.input[sstart..self.pos].iter().collect());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now attempt to parse numeric value
|
||||||
|
let mut token = Token { ttype: TokenType::Int, lexeme: String::new(), line: self.line, column: self.column, numeric: None, float: None };
|
||||||
|
// reconstruct lexeme (include sign and any prefix)
|
||||||
|
let lexeme: String = self.input[start_pos..self.pos].iter().collect();
|
||||||
|
let full_lex = if sign < 0 { format!("-{}", lexeme) } else { lexeme.clone() };
|
||||||
|
token.lexeme = full_lex.clone();
|
||||||
|
|
||||||
|
// parse according to base
|
||||||
|
if raw_digits.contains('.') {
|
||||||
|
// float
|
||||||
|
match full_lex.replace("_", "").parse::<f64>() {
|
||||||
|
Ok(f) => { token.ttype = TokenType::Float; token.float = Some(f); }
|
||||||
|
Err(_) => { token.ttype = TokenType::Illegal; }
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// integer: need to remove underscores and handle base
|
||||||
|
let digits = raw_digits.replace("_", "");
|
||||||
|
if digits.is_empty() {
|
||||||
|
token.ttype = TokenType::Illegal;
|
||||||
|
} else {
|
||||||
|
// if prefix was used, adjust parsing
|
||||||
|
let parsed = if base == 10 {
|
||||||
|
digits.parse::<i128>().ok()
|
||||||
|
} else {
|
||||||
|
i128::from_str_radix(&digits, base).ok()
|
||||||
|
};
|
||||||
|
if let Some(mut v) = parsed {
|
||||||
|
v *= sign;
|
||||||
|
// If base was non-decimal and a signed suffix exists, interpret
|
||||||
|
// the parsed unsigned bits as a two's-complement signed value
|
||||||
|
if base != 10 {
|
||||||
|
if let Some(ref sfx) = suffix {
|
||||||
|
match sfx.as_str() {
|
||||||
|
"i8" => {
|
||||||
|
let bits = 8u32;
|
||||||
|
let max_unsigned = (1i128 << bits) - 1;
|
||||||
|
let signed_max = i8::MAX as i128;
|
||||||
|
if v >= 0 && v <= max_unsigned && v > signed_max {
|
||||||
|
v = v - (1i128 << bits);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"i16" => {
|
||||||
|
let bits = 16u32;
|
||||||
|
let max_unsigned = (1i128 << bits) - 1;
|
||||||
|
let signed_max = i16::MAX as i128;
|
||||||
|
if v >= 0 && v <= max_unsigned && v > signed_max {
|
||||||
|
v = v - (1i128 << bits);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"i32" => {
|
||||||
|
let bits = 32u32;
|
||||||
|
let max_unsigned = (1i128 << bits) - 1;
|
||||||
|
let signed_max = i32::MAX as i128;
|
||||||
|
if v >= 0 && v <= max_unsigned && v > signed_max {
|
||||||
|
v = v - (1i128 << bits);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"i64" => {
|
||||||
|
let bits = 64u32;
|
||||||
|
let max_unsigned = (1i128 << bits) - 1;
|
||||||
|
let signed_max = i64::MAX as i128;
|
||||||
|
if v >= 0 && v <= max_unsigned && v > signed_max {
|
||||||
|
v = v - (1i128 << bits);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// validate suffix ranges if present
|
||||||
|
let mut overflow = false;
|
||||||
|
if let Some(ref s) = suffix {
|
||||||
|
match s.as_str() {
|
||||||
|
"i8" => if v < i8::MIN as i128 || v > i8::MAX as i128 { overflow = true; }
|
||||||
|
"i16" => if v < i16::MIN as i128 || v > i16::MAX as i128 { overflow = true; }
|
||||||
|
"i32" => if v < i32::MIN as i128 || v > i32::MAX as i128 { overflow = true; }
|
||||||
|
"i64" => if v < i64::MIN as i128 || v > i64::MAX as i128 { overflow = true; }
|
||||||
|
"u8" => if v < 0 || v > u8::MAX as i128 { overflow = true; }
|
||||||
|
"u16" => if v < 0 || v > u16::MAX as i128 { overflow = true; }
|
||||||
|
"u32" => if v < 0 || v > u32::MAX as i128 { overflow = true; }
|
||||||
|
"u64" => if v < 0 || v > u64::MAX as i128 { overflow = true; }
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// default type is i64 per tests: check i64 range
|
||||||
|
if v < i64::MIN as i128 || v > i64::MAX as i128 { overflow = true; }
|
||||||
|
}
|
||||||
|
|
||||||
|
if overflow {
|
||||||
|
token.ttype = TokenType::Illegal;
|
||||||
|
} else {
|
||||||
|
token.numeric = Some(v);
|
||||||
|
token.ttype = TokenType::Int;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
token.ttype = TokenType::Illegal;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
token
|
||||||
|
}
|
||||||
|
Some('-') => { Token { ttype: TokenType::Minus, lexeme: "-".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||||
|
None => { Token { ttype: TokenType::Eof, lexeme: "".to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||||
|
Some(_) => { Token { ttype: TokenType::Illegal, lexeme: self.ch.unwrap().to_string(), line: self.line, column: self.column, numeric: None, float: None } }
|
||||||
};
|
};
|
||||||
|
|
||||||
// advance to next char if not EOF and we didn't already advance inside readers
|
// advance to next char if not EOF and we didn't already advance inside readers
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -31,16 +31,22 @@ def token_match_expectation(token_var: str, expected: dict) -> str:
|
||||||
val = expected.get('value')
|
val = expected.get('value')
|
||||||
|
|
||||||
if ttype in ('i64','i32','i16','i8','u64','u32','u16','u8'):
|
if ttype in ('i64','i32','i16','i8','u64','u32','u16','u8'):
|
||||||
# parse integer from token.lexeme and compare
|
# check numeric field produced by the lexer
|
||||||
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Int);\n let parsed: i128 = {token_var}.lexeme.parse().expect(\"expected integer\");\n assert_eq!(parsed, {val});"
|
# allow constant names like INT64_MIN to be used directly in generated code
|
||||||
|
if isinstance(val, str) and re.match(r"^[A-Za-z_][A-Za-z0-9_]*$", val):
|
||||||
|
vexpr = val
|
||||||
|
else:
|
||||||
|
vexpr = str(val)
|
||||||
|
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Int);\n assert!({token_var}.numeric.is_some(), \"expected numeric value\");\n assert_eq!({token_var}.numeric.unwrap(), {vexpr});"
|
||||||
elif ttype in ('f64','f32'):
|
elif ttype in ('f64','f32'):
|
||||||
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Float);\n let parsed: f64 = {token_var}.lexeme.parse().expect(\"expected float\");\n assert!((parsed - {val}).abs() < 1e-12);"
|
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Float);\n assert!({token_var}.float.is_some(), \"expected float value\");\n assert!(({token_var}.float.unwrap() - {val}).abs() < 1e-12);"
|
||||||
elif ttype == 'string':
|
elif ttype == 'string':
|
||||||
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Str);\n assert_eq!({token_var}.lexeme, \"{rust_string_literal(str(val))}\");"
|
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Str);\n assert_eq!({token_var}.lexeme, \"{rust_string_literal(str(val))}\");"
|
||||||
elif ttype in ('identifier', 'identifier_literal'):
|
elif ttype in ('identifier', 'identifier_literal'):
|
||||||
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Ident);\n assert_eq!({token_var}.lexeme, \"{rust_string_literal(str(val))}\");"
|
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Ident);\n assert_eq!({token_var}.lexeme, \"{rust_string_literal(str(val))}\");"
|
||||||
elif ttype == 'char':
|
elif ttype == 'char':
|
||||||
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Int);\n let parsed: i128 = {token_var}.lexeme.parse().expect(\"expected char code\");\n assert_eq!(parsed, {(ord(val))});" # type: ignore
|
codepoint = ord(val) if isinstance(val, str) and len(val) == 1 else val
|
||||||
|
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Int);\n assert!({token_var}.numeric.is_some(), \"expected numeric char code\");\n assert_eq!({token_var}.numeric.unwrap(), {codepoint});"
|
||||||
elif ttype == 'bool':
|
elif ttype == 'bool':
|
||||||
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Ident);\n assert_eq!({token_var}.lexeme, \"{'true' if val else 'false'}\");"
|
return f"assert_eq!({token_var}.ttype, sls::lexer::TokenType::Ident);\n assert_eq!({token_var}.lexeme, \"{'true' if val else 'false'}\");"
|
||||||
elif ttype == 'error':
|
elif ttype == 'error':
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue