From b5d552376ecec7b9383641ab78fd2780b59433c9 Mon Sep 17 00:00:00 2001 From: John Date: Fri, 10 Oct 2025 14:45:08 -0400 Subject: [PATCH] Move integer and char parsing out of the parser and back into the lexer --- src/lexer.rs | 30 +++++++++++++++++------ src/main.rs | 7 +++--- src/parser.rs | 67 +++++++++++++++++++++++++++++++++------------------ src/token.rs | 47 +++++++++++++++++++++++++++++++++++- 4 files changed, 114 insertions(+), 37 deletions(-) diff --git a/src/lexer.rs b/src/lexer.rs index 46ca7a3..589a4e5 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -75,15 +75,20 @@ impl<'t> Lexer<'t> { LexError { pos: self.head, res } } + pub fn as_str(&self) -> (&'t str, Span) { + let span = Span(self.head, self.tail); + (&self.text[Range::from(span)], span) + } + /// Produces a Token pub fn produce(&mut self, kind: TKind) -> Token { self.advance_tail(); - let span = Span(self.head, self.tail); + let (lexeme, span) = self.as_str(); self.head = self.tail; - Token { lexeme: self.text[Range::from(span)].to_owned(), kind, span } + Token { lexeme: Lexeme::String(lexeme.to_owned()), kind, span } } - pub fn produce_with_lexeme(&mut self, kind: TKind, lexeme: String) -> Token { + pub fn produce_with_lexeme(&mut self, kind: TKind, lexeme: Lexeme) -> Token { self.advance_tail(); let span = Span(self.head, self.tail); self.head = self.tail; @@ -226,9 +231,10 @@ impl<'t> Lexer<'t> { pub fn identifier(&mut self) -> Result { while self.consume().peek().is_some_and(is_xid_continue) {} + let (lexeme, _span) = self.as_str(); let token = self.produce(TKind::Identifier); Ok(Token { - kind: match token.lexeme.as_str() { + kind: match lexeme { "as" => TKind::As, "break" => TKind::Break, "const" => TKind::Const, @@ -236,6 +242,7 @@ impl<'t> Lexer<'t> { "else" => TKind::Else, "false" => TKind::False, "fn" => TKind::Fn, + "for" => TKind::For, "if" => TKind::If, "let" => TKind::Let, "loop" => TKind::Loop, @@ -261,7 +268,7 @@ impl<'t> Lexer<'t> { None => '\0', }; if self.take().is_some_and(|c| c == '\'') { - Ok(self.produce_with_lexeme(TKind::Character, c.into())) + Ok(self.produce_with_lexeme(TKind::Character, Lexeme::Char(c))) } else { Err(self.error("Unterminated character")) } @@ -279,7 +286,7 @@ impl<'t> Lexer<'t> { }) } lexeme.shrink_to_fit(); - Ok(self.produce_with_lexeme(TKind::String, lexeme)) + Ok(self.produce_with_lexeme(TKind::String, Lexeme::String(lexeme))) } pub fn escape(&mut self) -> Result { @@ -318,10 +325,17 @@ impl<'t> Lexer<'t> { } pub fn digits(&mut self) -> Result { - while self.peek().is_some_and(|c| c.is_digit(BASE)) { + let mut int: u128 = 0; + while let Some(c) = self.peek() { + int = match c.to_digit(BASE).ok_or(c) { + Err('_') => int, + Ok(c) => int.wrapping_mul(BASE as _).wrapping_add(c as _), + _ => break, + }; self.consume(); } - Ok(self.produce(TKind::Integer)) + + Ok(self.produce_with_lexeme(TKind::Integer, Lexeme::Integer(int, BASE))) } pub fn digit(&mut self) -> Result { diff --git a/src/main.rs b/src/main.rs index 597b353..4b4898f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -75,10 +75,9 @@ fn lex() -> Result<(), Box> { println!("\x1b[31m{e}\x1b[0m"); break Ok(Response::Deny); } - Ok(Token { lexeme, kind, span: Span { head, tail } }) => println!( - "{kind:?}\x1b[11G {head:<4} {tail:<4} {}", - lexeme.escape_debug() - ), + Ok(Token { lexeme, kind, span: Span { head, tail } }) => { + println!("{kind:?}\x1b[11G {head:<4} {tail:<4} {lexeme:?}") + } } } })?; diff --git a/src/parser.rs b/src/parser.rs index 7b9ec92..ffe2334 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -3,7 +3,7 @@ use crate::{ ast::*, lexer::{LexError, Lexer}, span::Span, - token::{TKind, Token}, + token::{Lexeme, TKind, Token}, }; use std::{error::Error, fmt::Display, vec}; @@ -90,7 +90,7 @@ impl<'t> Parser<'t> { } /// Consumes the currently-peeked [Token], returning its lexeme without cloning. - pub fn take_lexeme(&mut self) -> Option { + pub fn take_lexeme(&mut self) -> Option { self.take().map(|tok| tok.lexeme) } @@ -196,19 +196,23 @@ impl<'t> Parse<'t> for Literal { TKind::Character => Literal::Char( p.take_lexeme() .expect("should have Token") - .chars() - .next() + .char() .expect("should have one char in char literal"), ), TKind::Integer => { let Token { lexeme, kind: _, span } = p.take().expect("should have Token"); // TODO: more complex int parsing let int = lexeme - .parse() - .map_err(|_| ParseError::Expected(TKind::Integer, span))?; - Literal::Int(int) + .int() + .ok_or(ParseError::Expected(TKind::Integer, span))?; + Literal::Int(int as _) } - TKind::String => Literal::Str(p.take_lexeme().expect("should have Token")), + TKind::String => Literal::Str({ + let Token { lexeme, kind: _, span } = p.take().expect("should have Token"); + lexeme + .string() + .ok_or(ParseError::Expected(TKind::String, span))? + }), _ => Err(ParseError::Expected(TKind::Integer, tok.span))?, }) } @@ -246,10 +250,14 @@ impl<'t> Parse<'t> for Pat { TKind::True | TKind::False | TKind::Character | TKind::Integer | TKind::String => { Pat::Lit(p.parse(())?) } - TKind::Identifier => match tok.lexeme.as_str() { - "_" => p.consume().then(Pat::Ignore), + TKind::Identifier => match tok.lexeme.str() { + Some("_") => p.consume().then(Pat::Ignore), _ => { - let name = p.take_lexeme().expect("should have Token"); + let name = p + .take_lexeme() + .expect("should have Token") + .string() + .expect("Identifier token should have String"); match p.peek().map(|t| t.kind)? { TKind::LParen => Pat::TupStruct(name, p.parse(PPrec::Tuple)?), TKind::LCurly => Pat::Struct( @@ -262,7 +270,7 @@ impl<'t> Parse<'t> for Pat { } } }, - TKind::Grave => Pat::MetId(p.consume().next()?.lexeme), + TKind::Grave => Pat::MetId(p.consume().next()?.lexeme.to_string()), TKind::DotDot => Pat::Rest(match p.consume().peek_if(TKind::Identifier) { Some(_) => Some(p.parse(level)?), None => None, @@ -314,9 +322,14 @@ impl<'t> Parse<'t> for Ty { let tok = p.peek()?; let head = match tok.kind { - TKind::Identifier => match tok.lexeme.as_str() { - "_" => p.consume().then(Ty::Infer), - _ => Ty::Named(p.take_lexeme().expect("should have Token")), + TKind::Identifier => match tok.lexeme.str() { + Some("_") => p.consume().then(Ty::Infer), + _ => Ty::Named( + p.take_lexeme() + .expect("should have Token") + .string() + .expect("Identifier token should have String"), + ), }, TKind::LBrack => { let ty = p.consume().parse(level)?; @@ -550,7 +563,7 @@ impl<'t> Parse<'t> for Fn { fn parse(p: &mut Parser<'t>, _level: Self::Prec) -> PResult { match p.consume().next_if(TKind::Identifier) { Ok(Token { lexeme, .. }) => Ok(Self( - Some(lexeme), + lexeme.string(), p.parse(PPrec::Typed)?, p.parse(Prec::Body.next())?, )), @@ -603,12 +616,18 @@ impl<'t> Parse<'t> for MatchArm { impl<'t> Parse<'t> for MakeArm { type Prec = (); fn parse(p: &mut Parser<'t>, _level: ()) -> PResult { - Ok(MakeArm(p.next_if(TKind::Identifier)?.lexeme, { - p.next_if(TKind::Colon) - .ok() - .map(|_| p.parse(Prec::Body.value())) - .transpose()? - })) + Ok(MakeArm( + p.next_if(TKind::Identifier)? + .lexeme + .string() + .expect("Identifier should have String"), + { + p.next_if(TKind::Colon) + .ok() + .map(|_| p.parse(Prec::Body.value())) + .transpose()? + }, + )) } } @@ -639,8 +658,8 @@ impl<'t> Parse<'t> for Expr { Ps::End if level == prec.next() => Expr::Op(Op::Tuple, vec![]), Ps::End => Err(ParseError::NotPrefix(tok.kind, span))?, - Ps::Id => Expr::Id(p.take_lexeme().expect("should have ident")), - Ps::Mid => Expr::MetId(p.consume().next()?.lexeme), + Ps::Id => Expr::Id(p.take_lexeme().expect("should have ident").to_string()), + Ps::Mid => Expr::MetId(p.consume().next()?.lexeme.to_string()), Ps::Lit => Expr::Lit(p.parse(())?), Ps::Let => Expr::Let(p.parse(())?), Ps::Const => Expr::Const(p.parse(())?), diff --git a/src/token.rs b/src/token.rs index 93e444c..c866613 100644 --- a/src/token.rs +++ b/src/token.rs @@ -4,11 +4,55 @@ use crate::span::Span; #[derive(Clone, Debug)] pub struct Token { - pub lexeme: String, + pub lexeme: Lexeme, pub kind: TKind, pub span: Span, } +#[derive(Clone, Debug)] +pub enum Lexeme { + String(String), + Integer(u128, u32), + Char(char), +} + +impl Lexeme { + pub fn string(self) -> Option { + match self { + Self::String(s) => Some(s), + _ => None, + } + } + pub fn str(&self) -> Option<&str> { + match self { + Self::String(s) => Some(s), + _ => None, + } + } + pub fn int(&self) -> Option { + match self { + Self::Integer(i, _) => Some(*i), + _ => None, + } + } + pub fn char(&self) -> Option { + match self { + Self::Char(c) => Some(*c), + _ => None, + } + } +} + +impl std::fmt::Display for Lexeme { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::String(v) => v.fmt(f), + Self::Integer(v, _) => v.fmt(f), + Self::Char(v) => v.fmt(f), + } + } +} + #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum TKind { Comment, @@ -19,6 +63,7 @@ pub enum TKind { Else, False, Fn, + For, If, Let, Loop,