ChatGPT attempt at lexer.rs

This commit is contained in:
Kyler Olsen 2025-12-01 23:46:05 -07:00
parent ae077ef433
commit 349cb29762
1 changed file with 591 additions and 95 deletions

View File

@ -17,111 +17,607 @@ impl LexerInfo {
line: 1,
}
}
/// Returns the character at the current position without consuming it.
/// Yields '\0' when there is no character left to read.
pub fn peek(&self) -> char {
    let rest = &self.source[self.pos..];
    match rest.chars().next() {
        Some(ch) => ch,
        None => '\0',
    }
}
/// Looks `index` characters ahead of the current position without consuming
/// anything (`index == 0` is the current character).
/// Yields '\0' when the lookahead runs past the end of the source.
pub fn far_peek(&self, index: usize) -> char {
    let mut upcoming = self.source[self.pos..].chars().skip(index);
    upcoming.next().unwrap_or('\0')
}
/// Returns the character that starts at byte offset `index` of the source.
///
/// Yields '\0' (the same end-of-input sentinel used by `peek`) when `index`
/// is past the end of the source or does not fall on a UTF-8 character
/// boundary, instead of panicking on the slice operation.
pub fn seek(&self, index: usize) -> char {
    self.source
        // `get` is the checked form of `[index..]`: `None` for an invalid range.
        .get(index..)
        .and_then(|rest| rest.chars().next())
        .unwrap_or('\0')
}
/// Consumes one character, keeps the line/column counters in sync, and returns
/// the character that is current afterwards.
/// At end of input nothing is consumed and '\0' is returned.
pub fn advance(&mut self) -> char {
    if self.pos >= self.source.len() {
        return '\0';
    }
    let consumed = self.peek();
    match consumed {
        // A newline starts the next line at column 1.
        '\n' => {
            self.line += 1;
            self.column = 1;
        }
        _ => self.column += 1,
    }
    // `pos` is a byte offset, so step by the encoded width of the character.
    self.pos += consumed.len_utf8();
    self.peek()
}
/// Returns the source text between byte offset `start` and the current position.
/// The end of the slice is clamped to the length of the source.
pub fn get_token_text(&self, start: usize) -> &str {
    let end = std::cmp::min(self.pos, self.source.len());
    &self.source[start..end]
}
}
/// Name payload carried by `Token::Identifier`.
#[derive(Debug, Clone)]
pub struct Identifier {
/// The identifier's text.
pub name: String,
/// NOTE(review): presumably set for identifiers written with a leading `::` — confirm.
pub is_literal: bool,
}
/// Element storage for an array literal: one variant per supported element type.
#[derive(Debug, Clone)]
pub enum ArrayLiteral {
Identifiers(Vec<Identifier>),
I64(Vec<i64>),
I32(Vec<i32>),
I16(Vec<i16>),
I8(Vec<i8>),
U64(Vec<u64>),
U32(Vec<u32>),
U16(Vec<u16>),
U8(Vec<u8>),
Float(Vec<f32>),
Double(Vec<f64>),
// Characters are stored as single bytes.
Character(Vec<u8>),
Strings(Vec<String>),
Boolean(Vec<bool>),
TokenStrings(Vec<TokenString>),
TypeTuples(Vec<TypeTuple>),
// Unlike the other variants this holds one value, not a Vec.
StructInline(StructInline),
}
/// An array literal together with its shape; payload of `Token::Array`.
#[derive(Debug, Clone)]
pub struct ShapedArray {
/// The array's elements.
pub array: ArrayLiteral,
/// NOTE(review): presumably the extent of each dimension — confirm.
pub shape: Vec<usize>,
}
/// A sequence of tokens carried as a single value (payload of `Token::TokenString`).
#[derive(Debug, Clone)]
pub struct TokenString {
/// The contained tokens, in order.
pub tokens: Vec<Token>,
}
/// A pair of identifier lists: inputs and outputs (payload of `Token::TypeTuple`).
#[derive(Debug, Clone)]
pub struct TypeTuple {
/// Identifiers on the input side.
pub inputs: Vec<Identifier>,
/// Identifiers on the output side.
pub outputs: Vec<Identifier>,
}
/// A named list of values, used as an `ArrayLiteral::StructInline` payload.
#[derive(Debug, Clone)]
pub struct StructInline {
/// The name attached to the value list.
pub name: String,
/// The values, in order.
pub values: Vec<StructValue>,
}
/// One value inside a `StructInline`.
#[derive(Debug, Clone)]
pub enum StructValue {
Integer(i64),
Float(f32),
Double(f64),
Boolean(bool),
// A character is stored as a single byte.
Character(u8),
String(String),
// Any other value is carried as a full token.
Token(Token),
}
/// A lexical token, with one variant per literal kind plus `Eof` and `Identifier`.
#[derive(Debug, Clone)]
pub enum Token {
// End of input.
Eof,
Identifier(Identifier),
// All integer sizes
I64(i64),
I32(i32),
I16(i16),
I8(i8),
U64(u64),
U32(u32),
U16(u16),
U8(u8),
Float(f32),
Double(f64),
// A character literal is stored as a single byte.
Character(u8),
StringLiteral(String),
Boolean(bool),
Array(ShapedArray),
TokenString(TokenString),
TypeTuple(TypeTuple),
}
#[derive(Debug, Clone)]
pub struct LexError {
pub message: String,
/// Source location attached to a `LexError`.
pub struct FileInfo {
/// Name of the file being lexed.
pub file: String,
/// Line recorded for the report.
pub line: usize,
/// Column recorded for the report.
pub column: usize,
/// Length of the reported span.
pub length: usize,
/// Number of lines covered by the reported span.
pub lines: usize,
}
/// Category of a lexing failure; each variant carries the human-readable message.
#[derive(Debug)]
pub enum LexErrorKind {
// An ordinary lexing error.
Message(String),
// The input uses a construct the lexer does not support yet.
NotImplemented(String),
}
/// Error produced by the lexer: what went wrong and, optionally, where.
#[derive(Debug)]
pub struct LexError {
/// The error category and message.
pub kind: LexErrorKind,
/// Location of the error, when one is available.
pub file_info: Option<FileInfo>,
}
impl LexError {
pub fn new(msg: impl Into<String>, file_info: Option<FileInfo>) -> Self {
Self {
kind: LexErrorKind::Message(msg.into()),
file_info,
}
}
pub fn not_implemented(msg: impl Into<String>, file_info: Option<FileInfo>) -> Self {
Self {
kind: LexErrorKind::NotImplemented(msg.into()),
file_info,
}
}
}
/// Result alias used throughout the lexer: `Ok(T)` or a `LexError`.
pub type LexResult<T> = Result<T, LexError>;
pub fn get_token(_lexer: &mut LexerInfo) -> Option<Token> {
None
/// Width and signedness of an integer literal: `I*` variants are signed,
/// `U*` variants are unsigned, and the number is the bit width.
#[derive(Debug, Clone)]
pub enum IntegerType {
I64,
I32,
I16,
I8,
U64,
U32,
U16,
U8,
}
pub fn lexical_analysis(_lexer: &mut LexerInfo) -> LexResult<Vec<Token>> {
Ok(Vec::new())
/// An integer literal: its declared type plus the raw 64-bit value.
#[derive(Debug, Clone)]
pub struct IntegerLiteral {
/// Declared integer type of the literal.
pub typ: IntegerType,
/// Raw value bits; `typ` determines how they are interpreted.
pub value: u64, // we'll keep as u64; sign info handled by type and parsing
}
/// A lexical token.
#[derive(Debug, Clone)]
pub enum Token {
// End of input.
EOF,
// An identifier's text plus its `is_literal` flag.
Identifier { is_literal: bool, name: String },
Integer(IntegerLiteral),
Float(f32),
Double(f64),
// A character literal is stored as a single byte.
Character(u8),
String(String),
Boolean(bool),
// The three variants below carry no payload yet.
Array, // placeholder
TokenString, // placeholder
TypeTuple, // placeholder
}
/// Captures the lexer's current location for an error report.
///
/// `start` is the byte offset and `start_line` the line at which the token
/// began; they are used to compute how many bytes and lines the token covers.
/// The reported line and column are the lexer's current ones.
fn make_file_info(lexer: &LexerInfo, start: usize, start_line: usize) -> FileInfo {
    // Saturating so a start beyond the current position yields 0.
    let length = lexer.pos.saturating_sub(start);
    let lines = lexer.line.saturating_sub(start_line);
    FileInfo {
        file: lexer.filename.clone(),
        line: lexer.line,
        column: lexer.column,
        length,
        lines,
    }
}
/// Computes the value of a binary literal such as `0b1010` or `-0b1_0`.
///
/// Every '0'/'1' contributes one bit. The characters `_ . : - b B` are skipped,
/// and any other character ends the scan. When `negative` is true the result
/// is the two's-complement negation of the magnitude.
fn create_binary_integer(token: &str, negative: bool) -> u64 {
    let mut magnitude: u64 = 0;
    for ch in token.chars() {
        match ch {
            '0' => magnitude <<= 1,
            '1' => magnitude = (magnitude << 1) + 1,
            '_' | '.' | ':' | '-' | 'b' | 'B' => {}
            _ => break,
        }
    }
    if !negative {
        return magnitude;
    }
    // Two's complement: invert and add one.
    (!magnitude).wrapping_add(1)
}
/// Computes the value of an octal literal such as `0o17` or `-0o1_0:i32`.
///
/// Digits '0'..='7' are accumulated. The characters `_ . - o O` are skipped.
/// A ':' (start of the type suffix) or any other character ends the scan.
/// Accumulation wraps modulo 2^64 instead of panicking on over-long literals.
/// When `negative` is true the result is the two's-complement negation of the
/// magnitude.
fn create_octal_integer(token: &str, negative: bool) -> u64 {
    let mut value: u64 = 0;
    for ch in token.chars() {
        match ch {
            '0'..='7' => {
                let digit = u64::from(ch as u8 - b'0');
                // Wrapping keeps an over-long literal from aborting the lexer.
                value = value.wrapping_mul(8).wrapping_add(digit);
            }
            '_' | '.' | '-' | 'o' | 'O' => {}
            // ':' begins the type suffix; nothing after it belongs to the value.
            _ => break,
        }
    }
    if negative {
        (!value).wrapping_add(1)
    } else {
        value
    }
}
/// Computes the value of a decimal literal such as `42`, `-1_000` or `7:u8`.
///
/// ASCII digits are accumulated. '_' and '-' are skipped. A ':' (start of the
/// type suffix) or any other character ends the scan.
/// Accumulation wraps modulo 2^64 instead of panicking on over-long literals.
/// When `negative` is true the result is the two's-complement negation of the
/// magnitude.
fn create_decimal_integer(token: &str, negative: bool) -> u64 {
    let mut value: u64 = 0;
    for ch in token.chars() {
        match ch {
            '0'..='9' => {
                let digit = u64::from(ch as u8 - b'0');
                // Wrapping keeps an over-long literal from aborting the lexer.
                value = value.wrapping_mul(10).wrapping_add(digit);
            }
            '_' | '-' => {}
            // ':' begins the type suffix; nothing after it belongs to the value.
            _ => break,
        }
    }
    if negative {
        (!value).wrapping_add(1)
    } else {
        value
    }
}
/// Computes the value of a hexadecimal literal such as `0xFF` or `-0x1_0:i32`.
///
/// Hex digits are accumulated. The characters `_ . - x X` are skipped.
/// A ':' ends the scan, so a type suffix like `:f32` (whose characters are
/// themselves hex digits) is never folded into the value. Any other character
/// also ends the scan.
/// Accumulation wraps modulo 2^64 instead of panicking on over-long literals.
/// When `negative` is true the result is the two's-complement negation of the
/// magnitude.
fn create_hex_integer(token: &str, negative: bool) -> u64 {
    let mut value: u64 = 0;
    for ch in token.chars() {
        if let Some(digit) = ch.to_digit(16) {
            // Wrapping keeps an over-long literal from aborting the lexer.
            value = value.wrapping_mul(16).wrapping_add(u64::from(digit));
            continue;
        }
        match ch {
            '_' | '.' | '-' | 'x' | 'X' => {}
            // ':' begins the type suffix; nothing after it belongs to the value.
            _ => break,
        }
    }
    if negative {
        (!value).wrapping_add(1)
    } else {
        value
    }
}
/// Computes the value of a floating-point literal such as `1.5`, `-1_000.25`
/// or `2.0:f32`.
///
/// Any `:suffix` is ignored and '_' separators are removed before parsing.
/// The sign is applied exactly once: the result is negative when `negative`
/// is true or when the text itself starts with '-'. Text that cannot be
/// parsed yields 0.0.
fn create_float_value(token: &str, negative: bool) -> f64 {
    // Everything from the first ':' onwards is the type suffix.
    let body = token.split(':').next().unwrap_or(token);
    // Remove the sign from the text so it cannot be applied twice.
    let has_sign = body.starts_with('-');
    let unsigned = if has_sign { &body[1..] } else { body };
    let cleaned: String = unsigned.chars().filter(|&c| c != '_').collect();
    let magnitude = cleaned.parse::<f64>().unwrap_or(0.0);
    if negative || has_sign {
        -magnitude
    } else {
        magnitude
    }
}
/// Maps a signedness flag and a bit width to the matching `IntegerType`.
/// Returns `None` for any width other than 8, 16, 32 or 64.
fn select_integer_type(unsigned: bool, bits: usize) -> Option<IntegerType> {
    let chosen = if unsigned {
        match bits {
            64 => IntegerType::U64,
            32 => IntegerType::U32,
            16 => IntegerType::U16,
            8 => IntegerType::U8,
            _ => return None,
        }
    } else {
        match bits {
            64 => IntegerType::I64,
            32 => IntegerType::I32,
            16 => IntegerType::I16,
            8 => IntegerType::I8,
            _ => return None,
        }
    };
    Some(chosen)
}
/// Interprets the `:suffix` of a numeric literal (`:i32`, `:u8`, `:f64`, ...).
///
/// Returns `(unsigned, integer_type, is_float, bits)`. Without a ':' the
/// literal defaults to a signed 64-bit integer. For float suffixes
/// `integer_type` is `None`. Any suffix other than `f32`, `f64`,
/// `i8`..`i64` or `u8`..`u64` yields `Err(())`.
fn parse_numeric_type_suffix(token: &str) -> Result<(bool, Option<IntegerType>, bool /*is_float*/, usize /*bits*/), ()> {
    // The suffix is everything after the first ':'.
    let suffix = match token.find(':') {
        Some(colon) => &token[colon + 1..],
        None => return Ok((false, Some(IntegerType::I64), false, 64)),
    };
    let (unsigned, is_float, bits) = match suffix {
        "f64" => (false, true, 64),
        "f32" => (false, true, 32),
        "i64" => (false, false, 64),
        "i32" => (false, false, 32),
        "i16" => (false, false, 16),
        "i8" => (false, false, 8),
        "u64" => (true, false, 64),
        "u32" => (true, false, 32),
        "u16" => (true, false, 16),
        "u8" => (true, false, 8),
        _ => return Err(()),
    };
    let int_type = if is_float {
        None
    } else {
        select_integer_type(unsigned, bits)
    };
    Ok((unsigned, int_type, is_float, bits))
}
/// Advances the lexer past any run of whitespace and line comments.
/// A comment starts with `//` or `#` and extends to the end of the line.
/// Stops at the first character that is neither, or at end of input.
fn skip_comments_and_whitespace(lexer: &mut LexerInfo) {
    loop {
        let current = lexer.peek();
        let at_comment = current == '#' || (current == '/' && lexer.far_peek(1) == '/');
        if at_comment {
            // Leave the newline itself for the whitespace branch.
            while !matches!(lexer.peek(), '\n' | '\0') {
                lexer.advance();
            }
        } else if current != '\0' && current.is_whitespace() {
            lexer.advance();
        } else {
            break;
        }
    }
}
/// Reports whether `c` may appear inside an identifier.
///
/// Only ASCII graphic characters qualify. Brackets, braces, parentheses,
/// quotes and '#' are excluded, and so is a '/' when the character after the
/// lexer's current position is also '/'.
fn is_identifier_continue(lexer: &LexerInfo, c: char) -> bool {
    if !c.is_ascii_graphic() {
        return false;
    }
    let starts_comment = c == '/' && lexer.far_peek(1) == '/';
    let is_delimiter = matches!(c, '{' | '}' | '[' | ']' | '(' | ')' | '\'' | '"' | '#');
    !(starts_comment || is_delimiter)
}
/// Reports whether an identifier starts at the lexer's current position.
///
/// A leading `::` (identifier-literal prefix) is skipped before the first
/// character is examined. That character must not be an ASCII digit, must be
/// allowed by `is_identifier_continue`, and must not begin a `//` comment.
fn is_identifier_start(lexer: &LexerInfo) -> bool {
    let has_prefix = lexer.peek() == ':' && lexer.far_peek(1) == ':';
    let offset = if has_prefix { 2 } else { 0 };
    let c = lexer.far_peek(offset);
    // `is_identifier_continue` looks for a comment relative to the lexer
    // position, which is wrong once the `::` prefix has been skipped; check at
    // the real offset so `::` followed by `//` is not an empty identifier.
    if c == '/' && lexer.far_peek(offset + 1) == '/' {
        return false;
    }
    !c.is_ascii_digit() && is_identifier_continue(lexer, c)
}
/// Lexes an identifier, optionally prefixed with `::`, or a boolean keyword.
///
/// The names `true` and `false` become `Token::Boolean`. Anything else becomes
/// `Token::Identifier`, with `is_literal` set when the `::` prefix was present.
/// A ':' or '.' inside the name is reported as an error.
fn parse_identifiers_and_booleans(lexer: &mut LexerInfo, start: usize, start_line: usize) -> LexResult<Token> {
    let is_literal = lexer.peek() == ':' && lexer.far_peek(1) == ':';
    if is_literal {
        // Step over both colons of the prefix.
        lexer.advance();
        lexer.advance();
    }

    let mut name = String::new();
    loop {
        let c = lexer.peek();
        if !is_identifier_continue(lexer, c) {
            break;
        }
        if matches!(c, ':' | '.') {
            let location = make_file_info(lexer, start, start_line);
            return Err(LexError::new(
                format!("Invalid identifier: '{}' not allowed in identifiers.", c),
                Some(location),
            ));
        }
        name.push(c);
        lexer.advance();
    }

    let token = if name == "true" {
        Token::Boolean(true)
    } else if name == "false" {
        Token::Boolean(false)
    } else {
        Token::Identifier { is_literal, name }
    };
    Ok(token)
}
/// Lexes the remainder of a character literal; the opening quote has already
/// been consumed by the caller.
///
/// Supports the escapes `\n \r \t \\ \' \0`. The result is a single byte, so a
/// character above U+00FF is rejected rather than silently truncated.
///
/// # Errors
/// Returns a `LexError` for an empty literal, an unknown escape, a character
/// that does not fit in a byte, or a missing/unexpected closing quote.
fn parse_character_literal(lexer: &mut LexerInfo, start: usize, start_line: usize) -> LexResult<Token> {
    let c = lexer.peek();
    let value: u8 = match c {
        '\'' => {
            return Err(LexError::new("Invalid character literal: empty character literal.", Some(make_file_info(lexer, start, start_line))));
        }
        '\n' | '\r' | '\0' => {
            return Err(LexError::new("Invalid character literal: unclosed character literal.", Some(make_file_info(lexer, start, start_line))));
        }
        '\\' => {
            lexer.advance(); // consume the backslash
            let esc = lexer.peek();
            let byte = match esc {
                'n' => b'\n',
                'r' => b'\r',
                't' => b'\t',
                '\\' => b'\\',
                '\'' => b'\'',
                '0' => b'\0',
                _ => {
                    return Err(LexError::new(format!("Invalid character literal: unknown escape sequence '\\{}'", esc),
                        Some(make_file_info(lexer, start, start_line))));
                }
            };
            lexer.advance(); // consume the escaped character
            byte
        }
        _ => {
            // `c as u8` would silently drop the high bits of a wider character.
            if (c as u32) > 0xFF {
                return Err(LexError::new(format!("Invalid character literal: '{}' does not fit in a single byte.", c),
                    Some(make_file_info(lexer, start, start_line))));
            }
            lexer.advance();
            c as u8
        }
    };

    // The literal must be closed by a quote right after its single character.
    let closing = lexer.peek();
    if closing == '\0' || closing.is_whitespace() || closing == '/' {
        return Err(LexError::new("Invalid character literal: unclosed character literal.", Some(make_file_info(lexer, start, start_line))));
    }
    if closing != '\'' {
        return Err(LexError::new(format!("Invalid character literal: unexpected '{}' in character.", closing), Some(make_file_info(lexer, start, start_line))));
    }
    lexer.advance(); // consume the closing quote
    Ok(Token::Character(value))
}
/// Consumes an optional `:suffix` after a numeric literal: if the current
/// character is ':', everything up to whitespace, '/' or end of input is taken.
fn consume_numeric_suffix(lexer: &mut LexerInfo) {
    if lexer.peek() != ':' {
        return;
    }
    while !lexer.peek().is_whitespace() && lexer.peek() != '/' && lexer.peek() != '\0' {
        lexer.advance();
    }
}

/// Consumes a run of ASCII decimal digits and '_' separators.
fn consume_decimal_digits(lexer: &mut LexerInfo) {
    while lexer.peek().is_ascii_digit() || lexer.peek() == '_' {
        lexer.advance();
    }
}

/// Parse numeric literal (entry when first char is digit or '.' or '-').
///
/// Handles, in order: an optional leading '-', the prefixed integer forms
/// `0b`/`0o`/`0x`, then decimal integers and floats, each optionally followed
/// by a `:type` suffix. `start` and `start_line` mark where the token began.
///
/// Current limitations kept from the original code:
/// - prefixed integers consume their suffix but are always typed `I64`;
/// - floats always produce `Token::Double`, and the suffix after a literal
///   with a decimal point is not validated.
///
/// # Errors
/// Returns a "not implemented" `LexError` for exponent notation and an
/// ordinary `LexError` for an unknown type suffix on a decimal literal.
fn parse_numeric_literal(lexer: &mut LexerInfo, start: usize, start_line: usize) -> LexResult<Token> {
    if lexer.peek() == '-' {
        lexer.advance();
    }

    // Prefixed integers: 0b..., 0o..., 0x...
    if lexer.peek() == '0' {
        let marker = lexer.far_peek(1);
        if matches!(marker, 'b' | 'B' | 'o' | 'O' | 'x' | 'X') {
            lexer.advance(); // consume '0'
            lexer.advance(); // consume the radix marker
            loop {
                let d = lexer.peek();
                let accepted = d == '_'
                    || match marker {
                        'b' | 'B' => matches!(d, '0' | '1'),
                        'o' | 'O' => matches!(d, '0'..='7'),
                        _ => d.is_ascii_hexdigit(),
                    };
                if !accepted {
                    break;
                }
                lexer.advance();
            }
            consume_numeric_suffix(lexer);

            let text = lexer.get_token_text(start);
            let negative = text.starts_with('-');
            // Hand over only the part before the suffix so suffix characters
            // (e.g. the 'f' of ":f32") can never be read as digits.
            let body = text.split(':').next().unwrap_or(text);
            let value = match marker {
                'b' | 'B' => create_binary_integer(body, negative),
                'o' | 'O' => create_octal_integer(body, negative),
                _ => create_hex_integer(body, negative),
            };
            // Each branch must return explicitly; falling through would lex
            // the same text again as a decimal literal.
            return Ok(Token::Integer(IntegerLiteral { typ: IntegerType::I64, value }));
        }
    }

    // Decimal integer or float: integer part first.
    consume_decimal_digits(lexer);

    match lexer.peek() {
        '.' => {
            // Float: ".digits" or "digits.digits".
            lexer.advance();
            consume_decimal_digits(lexer);
            if matches!(lexer.peek(), 'e' | 'E') {
                return Err(LexError::not_implemented("Float exponential parsing not implemented.", Some(make_file_info(lexer, start, start_line))));
            }
            consume_numeric_suffix(lexer);

            let text = lexer.get_token_text(start);
            let negative = text.starts_with('-');
            let body = text.split(':').next().unwrap_or(text);
            // Pass the bare magnitude; the sign travels in `negative` only.
            let magnitude = if negative { &body[1..] } else { body };
            Ok(Token::Double(create_float_value(magnitude, negative)))
        }
        'e' | 'E' => Err(LexError::not_implemented("Float exponential parsing not implemented.", Some(make_file_info(lexer, start, start_line)))),
        ':' => {
            // Decimal digits followed by a type suffix.
            consume_numeric_suffix(lexer);

            let text = lexer.get_token_text(start);
            let negative = text.starts_with('-');
            let body = text.split(':').next().unwrap_or(text);
            match parse_numeric_type_suffix(text) {
                Ok((_unsigned, _int_type, true, _bits)) => {
                    let magnitude = if negative { &body[1..] } else { body };
                    Ok(Token::Double(create_float_value(magnitude, negative)))
                }
                Ok((_unsigned, int_type, false, _bits)) => {
                    let value = create_decimal_integer(body, negative);
                    let typ = int_type.unwrap_or(IntegerType::I64);
                    Ok(Token::Integer(IntegerLiteral { typ, value }))
                }
                Err(()) => Err(LexError::new("Invalid numeric type suffix.", Some(make_file_info(lexer, start, start_line)))),
            }
        }
        _ => {
            // Plain decimal integer, typed as I64.
            let text = lexer.get_token_text(start);
            let negative = text.starts_with('-');
            let value = create_decimal_integer(text, negative);
            Ok(Token::Integer(IntegerLiteral { typ: IntegerType::I64, value }))
        }
    }
}
/// String literals are not supported yet: always returns a "not implemented"
/// error carrying the lexer's current location.
fn parse_string_literal(lexer: &mut LexerInfo, start: usize, start_line: usize) -> LexResult<Token> {
    let location = make_file_info(lexer, start, start_line);
    Err(LexError::not_implemented("String literals not implemented.", Some(location)))
}
/// Token strings are not supported yet: always returns a "not implemented"
/// error carrying the lexer's current location.
fn parse_token_string(lexer: &mut LexerInfo, start: usize, start_line: usize) -> LexResult<Token> {
    let location = make_file_info(lexer, start, start_line);
    Err(LexError::not_implemented("Token string parsing not implemented.", Some(location)))
}
/// Array literals are not supported yet: always returns a "not implemented"
/// error carrying the lexer's current location.
fn parse_array_literal(_lexer: &mut LexerInfo, start: usize, start_line: usize) -> LexResult<Token> {
    let location = make_file_info(_lexer, start, start_line);
    Err(LexError::not_implemented("Array literal parsing not implemented.", Some(location)))
}
/// Type tuples are not supported yet: always returns a "not implemented"
/// error carrying the lexer's current location.
fn parse_type_tuple(_lexer: &mut LexerInfo, start: usize, start_line: usize) -> LexResult<Token> {
    let location = make_file_info(_lexer, start, start_line);
    Err(LexError::not_implemented("Type tuple parsing not implemented.", Some(location)))
}
/// Lexes and returns the next token.
///
/// Whitespace and comments are skipped first. At end of input `Token::EOF` is
/// returned. Otherwise the first character selects the sub-lexer: numbers,
/// character literals, strings, token strings, arrays, type tuples, or
/// identifiers/booleans.
///
/// # Errors
/// Returns a `LexError` for an unmatched closing delimiter, a stray ':' or
/// `::`, any other unexpected character, and whatever the selected sub-lexer
/// reports.
pub fn get_token(lexer: &mut LexerInfo) -> Result<Token, LexError> {
    skip_comments_and_whitespace(lexer);
    let start = lexer.pos;
    let start_line = lexer.line;
    let c = lexer.peek();

    if c == '\0' {
        return Ok(Token::EOF);
    }

    // A number starts with a digit, or with '.' or '-' directly before a digit.
    let starts_number =
        c.is_ascii_digit() || (matches!(c, '.' | '-') && lexer.far_peek(1).is_ascii_digit());
    if starts_number {
        return parse_numeric_literal(lexer, start, start_line);
    }

    match c {
        // Opening delimiters are consumed here; the sub-lexer reads the rest.
        '\'' => {
            lexer.advance();
            parse_character_literal(lexer, start, start_line)
        }
        '"' => {
            lexer.advance();
            parse_string_literal(lexer, start, start_line)
        }
        '{' => {
            lexer.advance();
            parse_token_string(lexer, start, start_line)
        }
        '[' => {
            lexer.advance();
            parse_array_literal(lexer, start, start_line)
        }
        '(' => {
            lexer.advance();
            parse_type_tuple(lexer, start, start_line)
        }
        // A closing delimiter seen here has no matching opener.
        '}' | ']' | ')' => {
            lexer.advance();
            let message = match c {
                '}' => "Unexpected closing brace '}' without matching opening brace.",
                ']' => "Unexpected closing bracket ']' without matching opening bracket.",
                _ => "Unexpected closing parentheses ')' without matching opening parentheses.",
            };
            Err(LexError::new(message, Some(make_file_info(lexer, start, start_line))))
        }
        _ if is_identifier_start(lexer) => parse_identifiers_and_booleans(lexer, start, start_line),
        ':' => {
            lexer.advance();
            let message = if lexer.peek() == ':' {
                "Invalid identifier literal: empty identifier after '::'."
            } else {
                "Unexpected single colon ':'."
            };
            Err(LexError::new(message, Some(make_file_info(lexer, start, start_line))))
        }
        _ => Err(LexError::new(
            format!("Unexpected character: unexpected '{}' during parsing.", c),
            Some(make_file_info(lexer, start, start_line)),
        )),
    }
}
/// Lexes the whole input and returns every token in order, ending with
/// `Token::EOF`. The first lexing error aborts the run and is returned as-is.
pub fn lexical_analysis(lexer: &mut LexerInfo) -> LexResult<Vec<Token>> {
    let mut tokens = Vec::new();
    loop {
        let token = get_token(lexer)?;
        let reached_end = matches!(token, Token::EOF);
        // The EOF token is kept as the last element of the stream.
        tokens.push(token);
        if reached_end {
            return Ok(tokens);
        }
    }
}