Move integer and char parsing out of the parser and back into the lexer

This commit is contained in:
2025-10-10 14:45:08 -04:00
parent 0cbb800c19
commit b5d552376e
4 changed files with 114 additions and 37 deletions

View File

@@ -75,15 +75,20 @@ impl<'t> Lexer<'t> {
LexError { pos: self.head, res } LexError { pos: self.head, res }
} }
/// Returns the slice of the source text lying between `head` and `tail`,
/// paired with the [Span] built from those same two positions.
pub fn as_str(&self) -> (&'t str, Span) {
    let span = Span(self.head, self.tail);
    let text = &self.text[Range::from(span)];
    (text, span)
}
/// Produces a Token /// Produces a Token
// NOTE(review): every row of this listing fuses the pre-commit text (left)
// with the post-commit text (right).
// Both versions advance the tail, build the token from the text between
// `head` and `tail`, and then move `head` up to `tail`.
// Post-commit, the text and span are obtained from `as_str()` and the owned
// copy of the text is wrapped in `Lexeme::String`.
pub fn produce(&mut self, kind: TKind) -> Token { pub fn produce(&mut self, kind: TKind) -> Token {
self.advance_tail(); self.advance_tail();
let span = Span(self.head, self.tail); let (lexeme, span) = self.as_str();
self.head = self.tail; self.head = self.tail;
Token { lexeme: self.text[Range::from(span)].to_owned(), kind, span } Token { lexeme: Lexeme::String(lexeme.to_owned()), kind, span }
} }
pub fn produce_with_lexeme(&mut self, kind: TKind, lexeme: String) -> Token { pub fn produce_with_lexeme(&mut self, kind: TKind, lexeme: Lexeme) -> Token {
self.advance_tail(); self.advance_tail();
let span = Span(self.head, self.tail); let span = Span(self.head, self.tail);
self.head = self.tail; self.head = self.tail;
@@ -226,9 +231,10 @@ impl<'t> Lexer<'t> {
pub fn identifier(&mut self) -> Result<Token, LexError> { pub fn identifier(&mut self) -> Result<Token, LexError> {
while self.consume().peek().is_some_and(is_xid_continue) {} while self.consume().peek().is_some_and(is_xid_continue) {}
let (lexeme, _span) = self.as_str();
let token = self.produce(TKind::Identifier); let token = self.produce(TKind::Identifier);
Ok(Token { Ok(Token {
kind: match token.lexeme.as_str() { kind: match lexeme {
"as" => TKind::As, "as" => TKind::As,
"break" => TKind::Break, "break" => TKind::Break,
"const" => TKind::Const, "const" => TKind::Const,
@@ -236,6 +242,7 @@ impl<'t> Lexer<'t> {
"else" => TKind::Else, "else" => TKind::Else,
"false" => TKind::False, "false" => TKind::False,
"fn" => TKind::Fn, "fn" => TKind::Fn,
"for" => TKind::For,
"if" => TKind::If, "if" => TKind::If,
"let" => TKind::Let, "let" => TKind::Let,
"loop" => TKind::Loop, "loop" => TKind::Loop,
@@ -261,7 +268,7 @@ impl<'t> Lexer<'t> {
None => '\0', None => '\0',
}; };
if self.take().is_some_and(|c| c == '\'') { if self.take().is_some_and(|c| c == '\'') {
Ok(self.produce_with_lexeme(TKind::Character, c.into())) Ok(self.produce_with_lexeme(TKind::Character, Lexeme::Char(c)))
} else { } else {
Err(self.error("Unterminated character")) Err(self.error("Unterminated character"))
} }
@@ -279,7 +286,7 @@ impl<'t> Lexer<'t> {
}) })
} }
lexeme.shrink_to_fit(); lexeme.shrink_to_fit();
Ok(self.produce_with_lexeme(TKind::String, lexeme)) Ok(self.produce_with_lexeme(TKind::String, Lexeme::String(lexeme)))
} }
pub fn escape(&mut self) -> Result<char, LexError> { pub fn escape(&mut self) -> Result<char, LexError> {
@@ -318,10 +325,17 @@ impl<'t> Lexer<'t> {
} }
// Lexes a run of digits in the given BASE and produces a `TKind::Integer` token.
// NOTE(review): rows here fuse the pre-commit text (left) with the post-commit
// text (right). Pre-commit, the digits were only consumed and the token was
// made with `produce`. Post-commit, the value is accumulated into a `u128`
// and attached to the token as `Lexeme::Integer(int, BASE)`.
pub fn digits<const BASE: u32>(&mut self) -> Result<Token, LexError> { pub fn digits<const BASE: u32>(&mut self) -> Result<Token, LexError> {
while self.peek().is_some_and(|c| c.is_digit(BASE)) { let mut int: u128 = 0;
while let Some(c) = self.peek() {
int = match c.to_digit(BASE).ok_or(c) {
// An underscore is consumed but leaves the accumulated value unchanged.
Err('_') => int,
// Shift in the next digit; a value too large for u128 wraps around silently.
Ok(c) => int.wrapping_mul(BASE as _).wrapping_add(c as _),
// Any other character ends the literal and is left unconsumed.
_ => break,
};
self.consume(); self.consume();
} }
Ok(self.produce(TKind::Integer))
Ok(self.produce_with_lexeme(TKind::Integer, Lexeme::Integer(int, BASE)))
} }
pub fn digit<const BASE: u32>(&mut self) -> Result<u32, LexError> { pub fn digit<const BASE: u32>(&mut self) -> Result<u32, LexError> {

View File

@@ -75,10 +75,9 @@ fn lex() -> Result<(), Box<dyn Error>> {
println!("\x1b[31m{e}\x1b[0m"); println!("\x1b[31m{e}\x1b[0m");
break Ok(Response::Deny); break Ok(Response::Deny);
} }
Ok(Token { lexeme, kind, span: Span { head, tail } }) => println!( Ok(Token { lexeme, kind, span: Span { head, tail } }) => {
"{kind:?}\x1b[11G {head:<4} {tail:<4} {}", println!("{kind:?}\x1b[11G {head:<4} {tail:<4} {lexeme:?}")
lexeme.escape_debug() }
),
} }
} }
})?; })?;

View File

@@ -3,7 +3,7 @@ use crate::{
ast::*, ast::*,
lexer::{LexError, Lexer}, lexer::{LexError, Lexer},
span::Span, span::Span,
token::{TKind, Token}, token::{Lexeme, TKind, Token},
}; };
use std::{error::Error, fmt::Display, vec}; use std::{error::Error, fmt::Display, vec};
@@ -90,7 +90,7 @@ impl<'t> Parser<'t> {
} }
/// Consumes the currently-peeked [Token], returning its lexeme without cloning. /// Consumes the currently-peeked [Token], returning its lexeme without cloning.
// Yields `None` whenever `self.take()` yields `None`.
// NOTE(review): rows here fuse the pre-commit text (left) with the post-commit
// text (right); post-commit the return type is `Option<Lexeme>` instead of
// `Option<String>`.
pub fn take_lexeme(&mut self) -> Option<String> { pub fn take_lexeme(&mut self) -> Option<Lexeme> {
self.take().map(|tok| tok.lexeme) self.take().map(|tok| tok.lexeme)
} }
@@ -196,19 +196,23 @@ impl<'t> Parse<'t> for Literal {
TKind::Character => Literal::Char( TKind::Character => Literal::Char(
p.take_lexeme() p.take_lexeme()
.expect("should have Token") .expect("should have Token")
.chars() .char()
.next()
.expect("should have one char in char literal"), .expect("should have one char in char literal"),
), ),
TKind::Integer => { TKind::Integer => {
let Token { lexeme, kind: _, span } = p.take().expect("should have Token"); let Token { lexeme, kind: _, span } = p.take().expect("should have Token");
// TODO: more complex int parsing // TODO: more complex int parsing
let int = lexeme let int = lexeme
.parse() .int()
.map_err(|_| ParseError::Expected(TKind::Integer, span))?; .ok_or(ParseError::Expected(TKind::Integer, span))?;
Literal::Int(int) Literal::Int(int as _)
} }
TKind::String => Literal::Str(p.take_lexeme().expect("should have Token")), TKind::String => Literal::Str({
let Token { lexeme, kind: _, span } = p.take().expect("should have Token");
lexeme
.string()
.ok_or(ParseError::Expected(TKind::String, span))?
}),
_ => Err(ParseError::Expected(TKind::Integer, tok.span))?, _ => Err(ParseError::Expected(TKind::Integer, tok.span))?,
}) })
} }
@@ -246,10 +250,14 @@ impl<'t> Parse<'t> for Pat {
TKind::True | TKind::False | TKind::Character | TKind::Integer | TKind::String => { TKind::True | TKind::False | TKind::Character | TKind::Integer | TKind::String => {
Pat::Lit(p.parse(())?) Pat::Lit(p.parse(())?)
} }
TKind::Identifier => match tok.lexeme.as_str() { TKind::Identifier => match tok.lexeme.str() {
"_" => p.consume().then(Pat::Ignore), Some("_") => p.consume().then(Pat::Ignore),
_ => { _ => {
let name = p.take_lexeme().expect("should have Token"); let name = p
.take_lexeme()
.expect("should have Token")
.string()
.expect("Identifier token should have String");
match p.peek().map(|t| t.kind)? { match p.peek().map(|t| t.kind)? {
TKind::LParen => Pat::TupStruct(name, p.parse(PPrec::Tuple)?), TKind::LParen => Pat::TupStruct(name, p.parse(PPrec::Tuple)?),
TKind::LCurly => Pat::Struct( TKind::LCurly => Pat::Struct(
@@ -262,7 +270,7 @@ impl<'t> Parse<'t> for Pat {
} }
} }
}, },
TKind::Grave => Pat::MetId(p.consume().next()?.lexeme), TKind::Grave => Pat::MetId(p.consume().next()?.lexeme.to_string()),
TKind::DotDot => Pat::Rest(match p.consume().peek_if(TKind::Identifier) { TKind::DotDot => Pat::Rest(match p.consume().peek_if(TKind::Identifier) {
Some(_) => Some(p.parse(level)?), Some(_) => Some(p.parse(level)?),
None => None, None => None,
@@ -314,9 +322,14 @@ impl<'t> Parse<'t> for Ty {
let tok = p.peek()?; let tok = p.peek()?;
let head = match tok.kind { let head = match tok.kind {
TKind::Identifier => match tok.lexeme.as_str() { TKind::Identifier => match tok.lexeme.str() {
"_" => p.consume().then(Ty::Infer), Some("_") => p.consume().then(Ty::Infer),
_ => Ty::Named(p.take_lexeme().expect("should have Token")), _ => Ty::Named(
p.take_lexeme()
.expect("should have Token")
.string()
.expect("Identifier token should have String"),
),
}, },
TKind::LBrack => { TKind::LBrack => {
let ty = p.consume().parse(level)?; let ty = p.consume().parse(level)?;
@@ -550,7 +563,7 @@ impl<'t> Parse<'t> for Fn {
fn parse(p: &mut Parser<'t>, _level: Self::Prec) -> PResult<Self> { fn parse(p: &mut Parser<'t>, _level: Self::Prec) -> PResult<Self> {
match p.consume().next_if(TKind::Identifier) { match p.consume().next_if(TKind::Identifier) {
Ok(Token { lexeme, .. }) => Ok(Self( Ok(Token { lexeme, .. }) => Ok(Self(
Some(lexeme), lexeme.string(),
p.parse(PPrec::Typed)?, p.parse(PPrec::Typed)?,
p.parse(Prec::Body.next())?, p.parse(Prec::Body.next())?,
)), )),
@@ -603,12 +616,18 @@ impl<'t> Parse<'t> for MatchArm {
// Parses a `MakeArm`: a required identifier, then an optional value that is
// only parsed (at `Prec::Body.value()`) when a colon token follows.
// NOTE(review): rows here fuse the pre-commit text (left) with the post-commit
// text (right). Post-commit, the identifier's lexeme is converted with
// `.string()` and the `expect` panics if that conversion yields `None`.
impl<'t> Parse<'t> for MakeArm { impl<'t> Parse<'t> for MakeArm {
type Prec = (); type Prec = ();
fn parse(p: &mut Parser<'t>, _level: ()) -> PResult<Self> { fn parse(p: &mut Parser<'t>, _level: ()) -> PResult<Self> {
Ok(MakeArm(p.next_if(TKind::Identifier)?.lexeme, { Ok(MakeArm(
p.next_if(TKind::Colon) p.next_if(TKind::Identifier)?
.ok() .lexeme
.map(|_| p.parse(Prec::Body.value())) .string()
.transpose()? .expect("Identifier should have String"),
})) {
p.next_if(TKind::Colon)
.ok()
.map(|_| p.parse(Prec::Body.value()))
.transpose()?
},
))
} }
} }
@@ -639,8 +658,8 @@ impl<'t> Parse<'t> for Expr {
Ps::End if level == prec.next() => Expr::Op(Op::Tuple, vec![]), Ps::End if level == prec.next() => Expr::Op(Op::Tuple, vec![]),
Ps::End => Err(ParseError::NotPrefix(tok.kind, span))?, Ps::End => Err(ParseError::NotPrefix(tok.kind, span))?,
Ps::Id => Expr::Id(p.take_lexeme().expect("should have ident")), Ps::Id => Expr::Id(p.take_lexeme().expect("should have ident").to_string()),
Ps::Mid => Expr::MetId(p.consume().next()?.lexeme), Ps::Mid => Expr::MetId(p.consume().next()?.lexeme.to_string()),
Ps::Lit => Expr::Lit(p.parse(())?), Ps::Lit => Expr::Lit(p.parse(())?),
Ps::Let => Expr::Let(p.parse(())?), Ps::Let => Expr::Let(p.parse(())?),
Ps::Const => Expr::Const(p.parse(())?), Ps::Const => Expr::Const(p.parse(())?),

View File

@@ -4,11 +4,55 @@ use crate::span::Span;
// A token: its lexeme payload, its kind, and its source span.
// NOTE(review): rows here fuse the pre-commit text (left) with the post-commit
// text (right); post-commit the `lexeme` field is a `Lexeme` rather than a
// `String`.
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub struct Token { pub struct Token {
pub lexeme: String, pub lexeme: Lexeme,
pub kind: TKind, pub kind: TKind,
pub span: Span, pub span: Span,
} }
/// The payload carried by a token: owned text, an integer value, or a single
/// character.
#[derive(Clone, Debug)]
pub enum Lexeme {
    /// Owned text.
    String(String),
    /// An integer value plus a `u32`; the lexer's `digits` stores its `BASE` there.
    Integer(u128, u32),
    /// A single character.
    Char(char),
}

impl Lexeme {
    /// Consumes the lexeme, yielding its owned text when it is a
    /// [`Lexeme::String`] and `None` for every other variant.
    pub fn string(self) -> Option<String> {
        if let Self::String(text) = self {
            Some(text)
        } else {
            None
        }
    }

    /// Borrows the text of a [`Lexeme::String`]; `None` for every other variant.
    pub fn str(&self) -> Option<&str> {
        if let Self::String(text) = self {
            Some(text.as_str())
        } else {
            None
        }
    }

    /// Returns the value of a [`Lexeme::Integer`], ignoring its second field;
    /// `None` for every other variant.
    pub fn int(&self) -> Option<u128> {
        if let Self::Integer(value, _) = self {
            Some(*value)
        } else {
            None
        }
    }

    /// Returns the character of a [`Lexeme::Char`]; `None` for every other variant.
    pub fn char(&self) -> Option<char> {
        if let Self::Char(ch) = self {
            Some(*ch)
        } else {
            None
        }
    }
}

/// Formats the payload with its own `Display` impl, forwarding the caller's
/// formatter (and so its width/fill/flags). The second field of an integer
/// lexeme is not printed.
impl std::fmt::Display for Lexeme {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::String(text) => std::fmt::Display::fmt(text, f),
            Self::Integer(value, _) => std::fmt::Display::fmt(value, f),
            Self::Char(ch) => std::fmt::Display::fmt(ch, f),
        }
    }
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)] #[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TKind { pub enum TKind {
Comment, Comment,
@@ -19,6 +63,7 @@ pub enum TKind {
Else, Else,
False, False,
Fn, Fn,
For,
If, If,
Let, Let,
Loop, Loop,