Claude attempt at lexer.rs

This commit is contained in:
Kyler Olsen 2025-12-01 23:46:46 -07:00
parent ae077ef433
commit 08a8aadf16
1 changed file with 448 additions and 10 deletions

View File

@ -17,6 +17,47 @@ impl LexerInfo {
line: 1,
}
}
/// Returns the character under the cursor without consuming it.
///
/// `self.pos` is treated as a **byte** offset into `self.source`, which is
/// how the rest of the lexer already uses it when slicing
/// `self.source[start..self.pos]`. Returns `'\0'` when the cursor is at or
/// past the end of the input.
fn peek(&self) -> char {
    self.far_peek(0)
}
/// Returns the character `offset` characters after the cursor without
/// consuming anything (`offset == 0` is the character under the cursor).
///
/// Returns `'\0'` when that position is past the end of the input, or when
/// the cursor does not sit on a character boundary.
fn far_peek(&self, offset: usize) -> char {
    self.source
        .get(self.pos..)
        .and_then(|rest| rest.chars().nth(offset))
        .unwrap_or('\0')
}
/// Consumes the character under the cursor and returns the character that
/// follows it (`'\0'` at end of input).
///
/// Line/column bookkeeping: consuming `'\n'` moves to column 1 of the next
/// line; any other character advances the column by one. The cursor moves
/// by the consumed character's UTF-8 length so that it always stays on a
/// character boundary. At end of input this is a no-op, which keeps
/// `self.pos` within the bounds of `self.source`.
fn advance(&mut self) -> char {
    let current = self
        .source
        .get(self.pos..)
        .and_then(|rest| rest.chars().next());
    if let Some(ch) = current {
        if ch == '\n' {
            self.line += 1;
            self.column = 1;
        } else {
            self.column += 1;
        }
        self.pos += ch.len_utf8();
    }
    self.peek()
}
/// Moves the cursor past any run of line comments and whitespace.
///
/// A line comment starts with `//` or `#` and extends up to (but not
/// including) the next newline or the end of input. The method returns as
/// soon as the cursor rests on a character that is not whitespace.
fn skip_comments_and_whitespace(&mut self) {
    loop {
        let head = self.peek();
        let at_comment = (head == '/' && self.far_peek(1) == '/') || head == '#';
        if at_comment {
            // Consume the comment body; the terminating newline is left for
            // the whitespace pass below.
            loop {
                let ch = self.peek();
                if ch == '\n' || ch == '\0' {
                    break;
                }
                self.advance();
            }
        }
        if !self.peek().is_whitespace() {
            return;
        }
        while self.peek().is_whitespace() {
            self.advance();
        }
    }
}
}
#[derive(Debug, Clone)]
@ -83,10 +124,7 @@ pub enum StructValue {
#[derive(Debug, Clone)]
pub enum Token {
Eof,
Identifier(Identifier),
// All integer sizes
I64(i64),
I32(i32),
I16(i16),
@ -95,14 +133,11 @@ pub enum Token {
U32(u32),
U16(u16),
U8(u8),
Float(f32),
Double(f64),
Character(u8),
StringLiteral(String),
Boolean(bool),
Array(ShapedArray),
TokenString(TokenString),
TypeTuple(TypeTuple),
@ -118,10 +153,413 @@ pub struct LexError {
pub type LexResult<T> = Result<T, LexError>;
pub fn get_token(_lexer: &mut LexerInfo) -> Option<Token> {
None
impl LexerInfo {
/// Builds a `LexError` for this lexer's file.
///
/// `start_line` / `start_col` are stored as the error's `line` / `column`;
/// the file name is cloned from `self.filename`.
fn make_error(&self, message: impl Into<String>, start_line: usize, start_col: usize) -> LexError {
    let message = message.into();
    let file = self.filename.clone();
    LexError {
        message,
        file,
        line: start_line,
        column: start_col,
    }
}
/// Reports whether `c` may appear inside an identifier.
///
/// Only printable, non-space ASCII is accepted, minus the bracket, quote
/// and `#` characters. A `/` is rejected when the character one past the
/// cursor is also `/` (the start of a line comment); note that this check
/// looks at the cursor, so it is only meaningful when `c` is the character
/// currently under the cursor.
fn is_identifier_continue(&self, c: char) -> bool {
    // `is_ascii_graphic` already excludes non-ASCII and whitespace.
    if !c.is_ascii_graphic() {
        return false;
    }
    let starts_line_comment = c == '/' && self.far_peek(1) == '/';
    if starts_line_comment {
        return false;
    }
    match c {
        '{' | '}' | '[' | ']' | '(' | ')' | '\'' | '"' | '#' => false,
        _ => true,
    }
}
/// Reports whether an identifier can begin at the cursor.
///
/// A leading `::` is looked through, so the decision is made on the
/// character after it. That character must not be an ASCII digit and must
/// satisfy `is_identifier_continue`.
fn is_identifier_start(&self) -> bool {
    let has_literal_prefix = self.peek() == ':' && self.far_peek(1) == ':';
    let first = if has_literal_prefix {
        self.far_peek(2)
    } else {
        self.peek()
    };
    if first.is_ascii_digit() {
        return false;
    }
    self.is_identifier_continue(first)
}
/// Lexes an identifier (optionally prefixed with `::`) or a boolean keyword
/// starting at the cursor.
///
/// * `start` – offset of the token's first character; currently unused
///   because the name is sliced from after any `::` prefix.
/// * `start_line` / `start_col` – position reported in any error.
///
/// Returns `Token::Boolean` for the names `true` / `false`, otherwise
/// `Token::Identifier` with `is_literal` set when the `::` prefix was
/// present.
///
/// # Errors
///
/// Fails when the name contains `:` or `.`, or when no name characters
/// follow the optional `::` prefix (for example `::` immediately followed
/// by a `//` comment).
fn parse_identifiers_and_booleans(&mut self, start: usize, start_line: usize, start_col: usize) -> LexResult<Token> {
    let _ = start;
    let mut c = self.peek();
    // A leading `::` marks an identifier literal and is not part of the name.
    let literal = c == ':' && self.far_peek(1) == ':';
    if literal {
        self.advance();
        c = self.advance();
    }
    // Read the name.
    let name_start = self.pos;
    while self.is_identifier_continue(c) {
        match c {
            ':' => {
                return Err(self.make_error("Invalid identifier: ':' is not allowed in identifiers.", start_line, start_col));
            }
            '.' => {
                return Err(self.make_error("Invalid identifier: '.' is not allowed in identifiers.", start_line, start_col));
            }
            _ => {}
        }
        c = self.advance();
    }
    // Nothing was consumed: reject instead of emitting a nameless identifier.
    if self.pos == name_start {
        return Err(self.make_error("Invalid identifier: identifier name is empty.", start_line, start_col));
    }
    // Slicing relies on `self.pos` being a valid byte offset into the source.
    let name = &self.source[name_start..self.pos];
    Ok(match name {
        "false" => Token::Boolean(false),
        "true" => Token::Boolean(true),
        _ => Token::Identifier(Identifier { name: name.to_string(), is_literal: literal }),
    })
}
/// Lexes the body of a character literal; the opening `'` has already been
/// consumed by the caller.
///
/// Supports the escapes `\n`, `\r`, `\t`, `\\`, `\'` and `\0`. The literal's
/// value is returned as a single byte in `Token::Character`.
///
/// * `start_line` / `start_col` – position reported in any error.
///
/// # Errors
///
/// Fails on an empty literal (`''`), an unknown escape, a line break or end
/// of input before the closing quote, more than one character between the
/// quotes, or a character whose code point does not fit in one byte
/// (previously such characters were silently truncated).
fn parse_character_literal(&mut self, start_line: usize, start_col: usize) -> LexResult<Token> {
    let first = self.peek();
    let value = match first {
        '\'' => {
            return Err(self.make_error("Invalid character literal: empty character literal.", start_line, start_col));
        }
        '\\' => {
            let escaped = self.advance();
            match escaped {
                'n' => b'\n',
                'r' => b'\r',
                't' => b'\t',
                '\\' => b'\\',
                '\'' => b'\'',
                '0' => b'\0',
                _ => return Err(self.make_error(format!("Invalid character literal: unknown escape sequence '\\{}'.", escaped), start_line, start_col)),
            }
        }
        '\n' | '\r' => {
            return Err(self.make_error("Invalid character literal: unclosed character literal.", start_line, start_col));
        }
        // `as u8` would keep only the low 8 bits of the code point.
        wide if u32::from(wide) > 0xFF => {
            return Err(self.make_error(format!("Invalid character literal: '{}' does not fit in a single byte.", wide), start_line, start_col));
        }
        plain => plain as u8,
    };
    // The character after the value must be the closing quote.
    let closing = self.advance();
    if closing.is_whitespace() || closing == '/' || closing == '\0' {
        return Err(self.make_error("Invalid character literal: unclosed character literal.", start_line, start_col));
    } else if closing != '\'' {
        return Err(self.make_error(format!("Invalid character literal: unexpected '{}' in character.", closing), start_line, start_col));
    }
    self.advance();
    Ok(Token::Character(value))
}
/// Lexes a brace-delimited token string; the cursor is on the opening `{`.
///
/// Tokens are read with `get_token` until the matching `}` is found and
/// are returned wrapped in `Token::TokenString`.
///
/// * `start` – offset of the opening brace (not used here).
/// * `start_line` / `start_col` – position reported in any error.
///
/// # Errors
///
/// Fails when the input ends before the closing `}` or when `get_token`
/// cannot produce a token.
fn parse_token_string(&mut self, start: usize, start_line: usize, start_col: usize) -> LexResult<Token> {
    const UNCLOSED: &str = "Unclosed token string: missing closing brace '}'.";
    let mut collected = Vec::new();
    self.advance(); // step over '{'
    loop {
        self.skip_comments_and_whitespace();
        match self.peek() {
            '}' => {
                self.advance();
                return Ok(Token::TokenString(TokenString { tokens: collected }));
            }
            '\0' => return Err(self.make_error(UNCLOSED, start_line, start_col)),
            _ => {}
        }
        match get_token(self) {
            None => return Err(self.make_error("Failed to parse token in token string.", start_line, start_col)),
            Some(Token::Eof) => return Err(self.make_error(UNCLOSED, start_line, start_col)),
            Some(inner) => collected.push(inner),
        }
    }
}
/// Entry point for numeric literals: consumes an optional `-` sign and a
/// `0b` / `0o` / `0x` radix prefix (either case), then hands off to the
/// matching radix parser. Anything without such a prefix is parsed as a
/// decimal number.
///
/// `start`, `start_line` and `start_col` describe where the token began and
/// are forwarded unchanged.
fn parse_numeric_literal(&mut self, start: usize, start_line: usize, start_col: usize) -> LexResult<Token> {
    let mut head = self.peek();
    if head == '-' {
        head = self.advance();
    }
    if head != '0' {
        return self.parse_decimal_integer(start, start_line, start_col);
    }
    // Step over the leading zero and inspect what follows it.
    let radix_marker = self.advance();
    match radix_marker {
        'b' | 'B' => {
            self.advance();
            self.parse_binary_integer(start, start_line, start_col)
        }
        'o' | 'O' => {
            self.advance();
            self.parse_octal_integer(start, start_line, start_col)
        }
        'x' | 'X' => {
            self.advance();
            self.parse_hexadecimal_integer(start, start_line, start_col)
        }
        _ => self.parse_decimal_integer(start, start_line, start_col),
    }
}
/// Lexes the digits of a binary literal; the sign and `0b` prefix have
/// already been consumed. `_` is accepted as a digit separator.
///
/// * `start` – offset of the token's first character (sign or `0`), used
///   to recover the full token text for conversion.
/// * `start_line` / `start_col` – position reported in any error.
///
/// Returns the value as `Token::I64` (the `u64` bit pattern produced by
/// `create_binary_integer`, reinterpreted as `i64`).
///
/// # Errors
///
/// Fails when no binary digit follows the prefix (e.g. `0b`, `0b_`, `0b2`).
fn parse_binary_integer(&mut self, start: usize, start_line: usize, start_col: usize) -> LexResult<Token> {
    let mut saw_digit = false;
    let mut c = self.peek();
    while matches!(c, '0' | '1' | '_') {
        saw_digit |= c != '_';
        c = self.advance();
    }
    if !saw_digit {
        return Err(self.make_error("Invalid binary literal: expected at least one binary digit after the prefix.", start_line, start_col));
    }
    let value = self.create_binary_integer(start);
    Ok(Token::I64(value as i64))
}
/// Lexes the digits of an octal literal; the sign and `0o` prefix have
/// already been consumed. `_` is accepted as a digit separator.
///
/// * `start` – offset of the token's first character (sign or `0`), used
///   to recover the full token text for conversion.
/// * `start_line` / `start_col` – position reported in any error.
///
/// Returns the value as `Token::I64` (the `u64` bit pattern produced by
/// `create_octal_integer`, reinterpreted as `i64`).
///
/// # Errors
///
/// Fails when no octal digit follows the prefix (e.g. `0o`, `0o_`, `0o9`).
fn parse_octal_integer(&mut self, start: usize, start_line: usize, start_col: usize) -> LexResult<Token> {
    let mut saw_digit = false;
    let mut c = self.peek();
    while matches!(c, '0'..='7' | '_') {
        saw_digit |= c != '_';
        c = self.advance();
    }
    if !saw_digit {
        return Err(self.make_error("Invalid octal literal: expected at least one octal digit after the prefix.", start_line, start_col));
    }
    let value = self.create_octal_integer(start);
    Ok(Token::I64(value as i64))
}
/// Lexes a run of decimal digits (with `_` separators) at the cursor.
///
/// If the run is followed by `.`, the dot is consumed and the rest is
/// handled by `parse_float`; otherwise the token text starting at `start`
/// is converted with `create_decimal_integer` and returned as `Token::I64`.
fn parse_decimal_integer(&mut self, start: usize, start_line: usize, start_col: usize) -> LexResult<Token> {
    loop {
        let ch = self.peek();
        if !(ch.is_ascii_digit() || ch == '_') {
            break;
        }
        self.advance();
    }
    if self.peek() == '.' {
        self.advance();
        return self.parse_float(start, start_line, start_col);
    }
    let magnitude = self.create_decimal_integer(start);
    Ok(Token::I64(magnitude as i64))
}
/// Lexes the digits of a hexadecimal literal; the sign and `0x` prefix have
/// already been consumed. `_` is accepted as a digit separator.
///
/// * `start` – offset of the token's first character (sign or `0`), used
///   to recover the full token text for conversion.
/// * `start_line` / `start_col` – position reported in any error.
///
/// Returns the value as `Token::I64` (the `u64` bit pattern produced by
/// `create_hexadecimal_integer`, reinterpreted as `i64`).
///
/// # Errors
///
/// Fails when no hexadecimal digit follows the prefix (e.g. `0x`, `0x_`,
/// `0xg`).
fn parse_hexadecimal_integer(&mut self, start: usize, start_line: usize, start_col: usize) -> LexResult<Token> {
    let mut saw_digit = false;
    let mut c = self.peek();
    while c.is_ascii_hexdigit() || c == '_' {
        saw_digit |= c != '_';
        c = self.advance();
    }
    if !saw_digit {
        return Err(self.make_error("Invalid hexadecimal literal: expected at least one hexadecimal digit after the prefix.", start_line, start_col));
    }
    let value = self.create_hexadecimal_integer(start);
    Ok(Token::I64(value as i64))
}
/// Lexes the fractional digits of a floating-point literal; the integer
/// part and the `.` have already been consumed.
///
/// Digits and `_` separators are consumed, then the whole token text from
/// `start` is converted with `create_float` and returned as
/// `Token::Double`.
fn parse_float(&mut self, start: usize, start_line: usize, start_col: usize) -> LexResult<Token> {
    loop {
        let ch = self.peek();
        if !(ch.is_ascii_digit() || ch == '_') {
            break;
        }
        self.advance();
    }
    let parsed = self.create_float(start);
    Ok(Token::Double(parsed))
}
/// Converts the binary literal spanning `self.source[start..self.pos]`
/// into its 64-bit value.
///
/// The token is expected to look like `[-]0b<digits>`; the two prefix
/// characters (three with a sign) are skipped. `_` and `.` are ignored and
/// conversion stops at whitespace, `/` or `:`. Any character other than
/// `1` counts as a zero bit.
///
/// For a negative literal the two's-complement negation is returned, so
/// casting the result to `i64` yields the signed value. Arithmetic is
/// wrapping: digits beyond 64 bits are shifted out rather than causing an
/// overflow panic.
fn create_binary_integer(&self, start: usize) -> u64 {
    let token = &self.source[start..self.pos];
    let negative = token.starts_with('-');
    let prefix_len = if negative { 3 } else { 2 };
    let mut value = 0u64;
    // `get` keeps a too-short token from panicking; it simply has no digits.
    for c in token.get(prefix_len..).unwrap_or("").chars() {
        if c == '_' || c == '.' {
            continue;
        }
        if c.is_whitespace() || c == '/' || c == ':' {
            break;
        }
        value = value.wrapping_mul(2).wrapping_add(u64::from(c == '1'));
    }
    if negative {
        value.wrapping_neg()
    } else {
        value
    }
}
/// Converts the octal literal spanning `self.source[start..self.pos]` into
/// its 64-bit value.
///
/// The token is expected to look like `[-]0o<digits>`; the two prefix
/// characters (three with a sign) are skipped. `_` and `.` are ignored and
/// conversion stops at whitespace, `/` or `:`. A character that is not an
/// octal digit contributes 0.
///
/// For a negative literal the two's-complement negation is returned, so
/// casting the result to `i64` yields the signed value. Arithmetic is
/// wrapping: an over-long literal wraps modulo 2^64 rather than causing an
/// overflow panic.
fn create_octal_integer(&self, start: usize) -> u64 {
    let token = &self.source[start..self.pos];
    let negative = token.starts_with('-');
    let prefix_len = if negative { 3 } else { 2 };
    let mut value = 0u64;
    // `get` keeps a too-short token from panicking; it simply has no digits.
    for c in token.get(prefix_len..).unwrap_or("").chars() {
        if c == '_' || c == '.' {
            continue;
        }
        if c.is_whitespace() || c == '/' || c == ':' {
            break;
        }
        let digit = u64::from(c.to_digit(8).unwrap_or(0));
        value = value.wrapping_mul(8).wrapping_add(digit);
    }
    if negative {
        value.wrapping_neg()
    } else {
        value
    }
}
/// Converts the decimal literal spanning `self.source[start..self.pos]`
/// into its 64-bit value.
///
/// The token is expected to look like `[-]<digits>`; a leading `-` is
/// skipped. `_` is ignored and conversion stops at whitespace, `/` or `:`.
/// A character that is not a decimal digit contributes 0.
///
/// For a negative literal the two's-complement negation is returned, so
/// casting the result to `i64` yields the signed value. Arithmetic is
/// wrapping: a literal larger than `u64::MAX` wraps modulo 2^64 rather
/// than causing an overflow panic.
fn create_decimal_integer(&self, start: usize) -> u64 {
    let token = &self.source[start..self.pos];
    let negative = token.starts_with('-');
    let digits = if negative { &token[1..] } else { token };
    let mut value = 0u64;
    for c in digits.chars() {
        if c == '_' {
            continue;
        }
        if c.is_whitespace() || c == '/' || c == ':' {
            break;
        }
        let digit = u64::from(c.to_digit(10).unwrap_or(0));
        value = value.wrapping_mul(10).wrapping_add(digit);
    }
    if negative {
        value.wrapping_neg()
    } else {
        value
    }
}
/// Converts the hexadecimal literal spanning `self.source[start..self.pos]`
/// into its 64-bit value.
///
/// The token is expected to look like `[-]0x<digits>`; the two prefix
/// characters (three with a sign) are skipped. `_` and `.` are ignored and
/// conversion stops at whitespace, `/` or `:`. A character that is not a
/// hexadecimal digit contributes 0.
///
/// For a negative literal the two's-complement negation is returned, so
/// casting the result to `i64` yields the signed value. Arithmetic is
/// wrapping: digits beyond 64 bits are shifted out rather than causing an
/// overflow panic.
fn create_hexadecimal_integer(&self, start: usize) -> u64 {
    let token = &self.source[start..self.pos];
    let negative = token.starts_with('-');
    let prefix_len = if negative { 3 } else { 2 };
    let mut value = 0u64;
    // `get` keeps a too-short token from panicking; it simply has no digits.
    for c in token.get(prefix_len..).unwrap_or("").chars() {
        if c == '_' || c == '.' {
            continue;
        }
        if c.is_whitespace() || c == '/' || c == ':' {
            break;
        }
        let digit = u64::from(c.to_digit(16).unwrap_or(0));
        value = value.wrapping_mul(16).wrapping_add(digit);
    }
    if negative {
        value.wrapping_neg()
    } else {
        value
    }
}
/// Converts the floating-point literal spanning
/// `self.source[start..self.pos]` into an `f64`.
///
/// `_` separators are removed and the text is cut at the first whitespace,
/// `/` or `:`. The remainder (optional `-`, digits, one `.`, digits) is
/// handed to the standard library's `f64` parser, which rounds correctly
/// and has no limit on the number of fractional digits. The previous
/// hand-rolled accumulation overflowed its integer scale factor after 20
/// fractional digits and accumulated rounding error.
///
/// Returns `0.0` if the cleaned text is not a valid float; the scanning
/// code in `parse_decimal_integer` / `parse_float` is expected to prevent
/// that.
fn create_float(&self, start: usize) -> f64 {
    let token = &self.source[start..self.pos];
    let cleaned: String = token
        .chars()
        .take_while(|&c| !(c.is_whitespace() || c == '/' || c == ':'))
        .filter(|&c| c != '_')
        .collect();
    cleaned.parse::<f64>().unwrap_or(0.0)
}
}
pub fn lexical_analysis(_lexer: &mut LexerInfo) -> LexResult<Vec<Token>> {
Ok(Vec::new())
/// Lexes the next token, keeping the `LexError` when lexing fails.
///
/// Leading comments and whitespace are skipped first. The token kind is
/// chosen from the first character:
/// * end of input (`'\0'`) yields `Token::Eof`;
/// * a digit, or `.` / `-` directly followed by a digit, starts a numeric
///   literal;
/// * `'` starts a character literal;
/// * `{` starts a token string;
/// * anything accepted by `is_identifier_start` starts an identifier or
///   boolean;
/// * any other character is reported as unexpected.
fn try_get_token(lexer: &mut LexerInfo) -> LexResult<Token> {
    lexer.skip_comments_and_whitespace();
    let c = lexer.peek();
    let start = lexer.pos;
    let start_line = lexer.line;
    let start_col = lexer.column;
    if c == '\0' {
        return Ok(Token::Eof);
    }
    if c.is_ascii_digit() || (c == '.' && lexer.far_peek(1).is_ascii_digit()) || (c == '-' && lexer.far_peek(1).is_ascii_digit()) {
        lexer.parse_numeric_literal(start, start_line, start_col)
    } else if c == '\'' {
        // The character-literal parser expects the opening quote consumed.
        lexer.advance();
        lexer.parse_character_literal(start_line, start_col)
    } else if c == '{' {
        lexer.parse_token_string(start, start_line, start_col)
    } else if lexer.is_identifier_start() {
        lexer.parse_identifiers_and_booleans(start, start_line, start_col)
    } else {
        Err(lexer.make_error(format!("Unexpected character: '{}'", c), start_line, start_col))
    }
}

/// Lexes the next token from `lexer`.
///
/// Returns `Some(Token::Eof)` at end of input and `None` when the input at
/// the cursor cannot be lexed. The error details are discarded here; use
/// `lexical_analysis` to obtain them.
pub fn get_token(lexer: &mut LexerInfo) -> Option<Token> {
    try_get_token(lexer).ok()
}

/// Lexes the whole input into a token list that ends with `Token::Eof`.
///
/// # Errors
///
/// Returns the first `LexError` encountered. Previously a lexing failure
/// ended the loop silently and a truncated token list was returned as
/// `Ok`.
pub fn lexical_analysis(lexer: &mut LexerInfo) -> LexResult<Vec<Token>> {
    let mut tokens = Vec::new();
    loop {
        let token = try_get_token(lexer)?;
        let finished = matches!(token, Token::Eof);
        tokens.push(token);
        if finished {
            return Ok(tokens);
        }
    }
}