Initial Commit

This commit is contained in:
2025-08-28 02:26:06 -04:00
committed by Val
commit c83218d750
17 changed files with 2276 additions and 0 deletions

516
src/parser.rs Normal file
View File

@@ -0,0 +1,516 @@
//! The parser takes a stream of [Token]s from the [Lexer], and turns them into [crate::ast] nodes.
use crate::{
ast::*,
lexer::{LexError, Lexer},
span::Span,
token::{TKind, Token},
};
use std::{error::Error, fmt::Display, vec};
pub mod numeric;
/// An error produced while turning tokens into AST nodes.
///
/// Apart from [ParseError::FromLexer], every variant carries a [TKind] and the
/// [Span] of the token at which the problem was detected.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ParseError {
/// The lexer failed while scanning for the next token.
FromLexer(LexError),
/// A token of the given kind was expected; the span is that of the token found instead.
Expected(TKind, Span),
/// A token of the given kind was found where a pattern was being parsed.
NotPattern(TKind, Span),
/// A token of the given kind cannot begin an expression.
NotPrefix(TKind, Span),
/// A token of the given kind is not an infix operator.
NotInfix(TKind, Span),
/// A token of the given kind is not a postfix operator.
NotPostfix(TKind, Span),
}
impl Error for ParseError {}
impl Display for ParseError {
    /// Writes a human-readable description of the error.
    ///
    /// Lexer errors are displayed exactly as the lexer formats them. Every
    /// other variant is rendered as `{location}: {message}`, with the token
    /// kind shown in its `Debug` form.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::FromLexer(e) => e.fmt(f),
            Self::Expected(tk, loc) => write!(f, "{loc}: Expected {tk:?}."),
            Self::NotPattern(tk, loc) => write!(f, "{loc}: {tk:?} is not valid in a pattern."),
            Self::NotPrefix(tk, loc) => write!(f, "{loc}: {tk:?} is not a prefix operator."),
            // "an infix": the article was previously "a", which is ungrammatical.
            Self::NotInfix(tk, loc) => write!(f, "{loc}: {tk:?} is not an infix operator."),
            Self::NotPostfix(tk, loc) => write!(f, "{loc}: {tk:?} is not a postfix operator."),
        }
    }
}
/// The result of a parsing operation: either a `T` or the [ParseError] that stopped it.
pub type PResult<T> = Result<T, ParseError>;
/// Pulls [Token]s from a [Lexer] and buffers at most one of them as lookahead.
#[derive(Debug)]
pub struct Parser<'t> {
/// The lexer that tokens are scanned from.
pub lexer: Lexer<'t>,
/// The token that has been peeked but not yet consumed, if any.
pub next_tok: Option<Token>,
/// The span of the most recently peeked token ([Span::default] before the first peek).
pub last_loc: Span,
}
impl<'t> Parser<'t> {
/// Constructs a new Parser
pub fn new(lexer: Lexer<'t>) -> Self {
Self { lexer, next_tok: None, last_loc: Span::default() }
}
/// The identity function. This exists to make production chaining easier.
pub fn then<T>(&self, t: T) -> T {
t
}
pub fn span(&self) -> Span {
self.last_loc
}
/// Parses a value that implements the [Parse] trait.
pub fn parse<T: Parse<'t>>(&mut self, level: usize) -> PResult<T> {
Parse::parse(self, level)
}
/// Peeks the next [Token]. Returns [ParseError::FromLexer] on lexer error.
pub fn peek(&mut self) -> PResult<&Token> {
let next_tok = match self.next_tok.take() {
Some(tok) => tok,
None => match self.lexer.scan() {
Ok(tok) => tok,
Err(e) => Err(ParseError::FromLexer(e))?,
},
};
self.last_loc = next_tok.span;
self.next_tok = Some(next_tok);
Ok(self.next_tok.as_ref().expect("should have token"))
}
/// Peeks the next token if it matches the `expected` [TKind]
pub fn peek_if(&mut self, expected: TKind) -> Option<&Token> {
self.peek().into_iter().find(|tok| tok.kind == expected)
}
/// Consumes and returns the currently-peeked [Token].
pub fn take(&mut self) -> Option<Token> {
self.next_tok.take()
}
/// Consumes the currently-peeked [Token], returning its lexeme without cloning.
pub fn take_lexeme(&mut self) -> Option<String> {
self.take().map(|tok| tok.lexeme)
}
#[allow(clippy::should_implement_trait)]
pub fn next(&mut self) -> PResult<Token> {
self.peek()?;
Ok(self.take().expect("should have token here"))
}
/// Consumes and returns the next [Token] if it matches the `expected` [TKind]
pub fn next_if(&mut self, expected: TKind) -> PResult<Token> {
let token = self.peek()?;
if token.kind == expected {
Ok(self.take().expect("should have token here"))
} else {
Err(ParseError::Expected(expected, token.span))
}
}
/// Parses a list of P separated by `sep` tokens, ending in an `end` token.
/// ```nobnf
/// List<T> = (T `sep`)* T? `end` ;
/// ```
pub fn list<P: Parse<'t>>(
&mut self,
mut elems: Vec<P>,
sep: TKind,
end: TKind,
) -> PResult<Vec<P>> {
while self.peek_if(end).is_none() {
elems.push(self.parse(0)?);
if self.next_if(sep).is_err() {
break;
}
}
self.next_if(end)?;
Ok(elems)
}
/// Parses into an [`Option<P>`] if the next token is `next`
pub fn opt_if<P: Parse<'t>>(&mut self, level: usize, next: TKind) -> PResult<Option<P>> {
Ok(match self.next_if(next) {
Ok(_) => Some(self.parse(level)?),
Err(_) => None,
})
}
/// Parses an expression into a vec unless the next token is `end`
pub fn opt<P: Parse<'t>>(&mut self, level: usize, end: TKind) -> PResult<Option<P>> {
let out = match self.peek_if(end) {
None => Some(self.parse(level)?),
Some(_) => None,
};
self.next_if(end)?;
Ok(out)
}
/// Consumes the currently peeked token without returning it.
pub fn consume(&mut self) -> &mut Self {
self.next_tok = None;
self
}
}
/// A syntax element that can be read from a [Parser]'s token stream.
pub trait Parse<'t> {
/// Parses a `Self` from `p`.
///
/// `level` is the operator binding level used when parsing expressions;
/// implementations that have no use for it are free to ignore it.
fn parse(p: &mut Parser<'t>, level: usize) -> PResult<Self>
where Self: Sized;
}
impl<'t> Parse<'t> for Literal {
    /// Parses a single literal token: `true`, `false`, a character, an integer, or a string.
    ///
    /// The binding level is irrelevant for literals and is ignored.
    ///
    /// # Errors
    /// - [ParseError::FromLexer] if the next token cannot be scanned.
    /// - [ParseError::Expected] with [TKind::Character] if a character token has an empty lexeme.
    /// - [ParseError::Expected] with [TKind::Integer] if an integer lexeme fails to parse,
    ///   or if the next token is not a literal at all (in which case it is left unconsumed).
    fn parse(p: &mut Parser<'t>, _level: usize) -> PResult<Self> {
        // Copy out what we need so the lookahead borrow ends before `p` is used again.
        let &Token { kind, span, .. } = p.peek()?;
        Ok(match kind {
            TKind::True => p.consume().then(Literal::Bool(true)),
            TKind::False => p.consume().then(Literal::Bool(false)),
            TKind::Character => {
                let lexeme = p.take_lexeme().expect("should have Token");
                // `String::remove(0)` would panic on an empty lexeme; report an error instead.
                let first = lexeme
                    .chars()
                    .next()
                    .ok_or(ParseError::Expected(TKind::Character, span))?;
                Literal::Char(first)
            }
            TKind::Integer => {
                let lexeme = p.take_lexeme().expect("should have Token");
                // TODO: more complex int parsing
                let int = lexeme
                    .parse()
                    .map_err(|_| ParseError::Expected(TKind::Integer, span))?;
                Literal::Int(int)
            }
            TKind::String => Literal::Str(p.take_lexeme().expect("should have Token")),
            // There is no dedicated "literal" token kind, so Integer stands in for it here.
            _ => return Err(ParseError::Expected(TKind::Integer, span)),
        })
    }
}
impl<'t> Parse<'t> for Pat {
fn parse(p: &mut Parser<'t>, level: usize) -> PResult<Self> {
let tok = p.peek()?;
match tok.kind {
TKind::Comment => p.consume().parse(level),
TKind::True | TKind::False | TKind::Character | TKind::Integer | TKind::String => {
Ok(Pat::Lit(p.parse(0)?))
}
TKind::Identifier => match tok.lexeme.as_str() {
"_" => Ok(p.consume().then(Pat::Ignore)),
_ => Ok(Pat::Name(p.take_lexeme().expect("should have Token"))),
},
TKind::Grave => Ok(Pat::MetId(p.consume().next_if(TKind::Identifier)?.lexeme)),
TKind::DotDot => Ok(Pat::Rest(match p.consume().peek_if(TKind::Identifier) {
Some(_) => Some(p.parse(level)?),
None => None,
})),
TKind::LParen => Ok(Pat::Tuple(p.consume().list(
vec![],
TKind::Comma,
TKind::RParen,
)?)),
TKind::LBrack => Ok(Pat::Slice(p.consume().list(
vec![],
TKind::Comma,
TKind::RBrack,
)?)),
_ => Err(ParseError::NotPattern(tok.kind, tok.span)),
}
}
}
impl<'t> Parse<'t> for MatchArm {
    /// Parses one match arm: `|`-separated alternatives, a fat arrow, then the arm's body.
    ///
    /// A single leading `|` before the first alternative is allowed and ignored.
    fn parse(p: &mut Parser<'t>, _level: usize) -> PResult<Self> {
        // The leading bar is optional, so a mismatch here is not an error.
        let _ = p.next_if(TKind::Bar);
        let alternatives = p.list(Vec::new(), TKind::Bar, TKind::FatArrow)?;
        let body = p.parse(0)?;
        Ok(MatchArm(alternatives, body))
    }
}
impl<'t> Parse<'t> for MakeArm {
    /// Parses one arm of a make-expression: an identifier, optionally followed by `:` and a value.
    ///
    /// # Errors
    /// Returns [ParseError::Expected] if the arm does not start with an
    /// identifier, and propagates errors from parsing the value.
    fn parse(p: &mut Parser<'t>, level: usize) -> PResult<Self> {
        let name = p.next_if(TKind::Identifier)?.lexeme;
        // The value is only parsed when a `:` follows the name.
        let value = p.opt_if(level, TKind::Colon)?;
        Ok(MakeArm(name, value))
    }
}
/// Operator precedence classes, listed from loosest-binding to tightest-binding.
///
/// Each class occupies two consecutive numeric levels (see [Prec::value]), which
/// leaves room for [Prec::prev] and [Prec::next] to encode associativity.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
enum Prec {
    Min,
    Do,
    Assign,
    Tuple,
    Make,
    Body,
    Logical,
    LogOr,
    LogAnd,
    Compare,
    Range,
    Binary,
    Shift,
    Factor,
    Term,
    Project,
    Unary,
    Extend,
    Max,
}

impl Prec {
    /// The numeric level of the loosest precedence class.
    pub const MIN: usize = Prec::Min.value();

    /// The base numeric level of this class: twice its position in the enum.
    pub const fn value(self) -> usize {
        2 * (self as usize)
    }

    /// The level an operator of this class is compared against to decide
    /// whether it may continue the expression currently being parsed.
    ///
    /// For [Prec::Assign] this is one above the base level; for every other
    /// class it is the base level itself.
    pub const fn prev(self) -> usize {
        let base = self.value();
        if matches!(self, Self::Assign) { base + 1 } else { base }
    }

    /// The level at which the operand following an operator of this class is parsed.
    ///
    /// For [Prec::Assign] this is the base level (below [Prec::prev]); for
    /// every other class it is one above the base level.
    pub const fn next(self) -> usize {
        let base = self.value();
        if matches!(self, Self::Assign) { base } else { base + 1 }
    }
}
/// Classifies `token` as the start of an expression.
///
/// Returns the operation the token introduces together with its precedence class.
///
/// # Errors
/// Returns [ParseError::NotPrefix] if the token cannot begin an expression.
fn from_prefix(token: &Token) -> PResult<(Op, Prec)> {
    let found = match token.kind {
        // Atoms
        TKind::True | TKind::False | TKind::Character | TKind::Integer | TKind::String => {
            (Op::Lit, Prec::Max)
        }
        TKind::Identifier => (Op::Id, Prec::Max),
        TKind::Grave => (Op::Mid, Prec::Max),

        // Keywords
        TKind::Do => (Op::Do, Prec::Do),
        TKind::Macro => (Op::Macro, Prec::Assign),
        TKind::Fn => (Op::Fn, Prec::Body),
        TKind::Match => (Op::Match, Prec::Body),
        TKind::Let => (Op::Let, Prec::Body),
        TKind::Const => (Op::Const, Prec::Body),
        TKind::Loop => (Op::Loop, Prec::Body),
        TKind::If => (Op::If, Prec::Body),
        TKind::While => (Op::While, Prec::Body),
        TKind::Break => (Op::Break, Prec::Body),
        TKind::Return => (Op::Return, Prec::Body),

        // Opening delimiters, and the closers that end an enclosing construct
        TKind::LBrack => (Op::Array, Prec::Min),
        TKind::LCurly => (Op::Block, Prec::Min),
        TKind::LParen => (Op::Group, Prec::Min),
        TKind::RBrack | TKind::RCurly | TKind::RParen => (Op::End, Prec::Min),

        // Closures: `|args|` takes a parameter list, `||` takes none.
        // Both map to `Op::Lambda`; the precedence tells them apart.
        TKind::Bar => (Op::Lambda, Prec::Min),
        TKind::BarBar => (Op::Lambda, Prec::Max),

        // Unary operators
        // NOTE: `TKind::AmpAmp` is not handled as a prefix operator yet.
        TKind::Amp => (Op::Refer, Prec::Max),
        TKind::Bang => (Op::Not, Prec::Unary),
        TKind::Minus => (Op::Neg, Prec::Unary),
        TKind::Star => (Op::Deref, Prec::Unary),
        TKind::BangBang | TKind::Plus => (Op::Identity, Prec::Unary),

        // Ranges with no start
        TKind::DotDot => (Op::RangeEx, Prec::Range),
        TKind::DotDotEq => (Op::RangeIn, Prec::Range),

        kind => return Err(ParseError::NotPrefix(kind, token.span)),
    };
    Ok(found)
}
fn from_infix(token: &Token) -> PResult<(Op, Prec)> {
Ok(match token.kind {
TKind::Semi => (Op::Do, Prec::Do), // the inspiration
TKind::RParen => (Op::End, Prec::Do),
TKind::Comma => (Op::Tuple, Prec::Tuple),
TKind::Eq => (Op::Set, Prec::Assign),
TKind::XorXor => (Op::LogXor, Prec::Logical),
TKind::AmpAmp => (Op::LogAnd, Prec::LogAnd),
TKind::BarBar => (Op::LogOr, Prec::LogOr),
TKind::Lt => (Op::Lt, Prec::Compare),
TKind::LtEq => (Op::Leq, Prec::Compare),
TKind::EqEq => (Op::Eq, Prec::Compare),
TKind::BangEq => (Op::Neq, Prec::Compare),
TKind::GtEq => (Op::Geq, Prec::Compare),
TKind::Gt => (Op::Gt, Prec::Compare),
TKind::DotDot => (Op::RangeEx, Prec::Range),
TKind::DotDotEq => (Op::RangeIn, Prec::Range),
TKind::Amp => (Op::And, Prec::Binary),
TKind::Xor => (Op::Xor, Prec::Binary),
TKind::Bar => (Op::Or, Prec::Binary),
TKind::LtLt => (Op::Shl, Prec::Shift),
TKind::GtGt => (Op::Shr, Prec::Shift),
TKind::Plus => (Op::Add, Prec::Factor),
TKind::Minus => (Op::Sub, Prec::Factor),
TKind::Star => (Op::Mul, Prec::Term),
TKind::Slash => (Op::Div, Prec::Term),
TKind::Rem => (Op::Rem, Prec::Term),
TKind::Dot => (Op::Dot, Prec::Project),
TKind::ColonColon => (Op::Path, Prec::Max),
kind => Err(ParseError::NotInfix(kind, token.span))?,
})
}
/// Classifies `token` as an operator that follows an expression.
///
/// Returns the operation together with its precedence class.
///
/// # Errors
/// Returns [ParseError::NotPostfix] if the token is not a postfix operator.
fn from_postfix(token: &Token) -> PResult<(Op, Prec)> {
    let found = match token.kind {
        TKind::Question => (Op::Try, Prec::Unary),
        // Each opening delimiter starts a suffix applied to the preceding expression.
        TKind::LParen => (Op::Call, Prec::Extend),
        TKind::LBrack => (Op::Index, Prec::Extend),
        TKind::LCurly => (Op::Make, Prec::Make),
        kind => return Err(ParseError::NotPostfix(kind, token.span)),
    };
    Ok(found)
}
/// Decides whether a repeated infix operator extends the previous operation's
/// argument list instead of nesting a new binary operation around it.
///
/// Only a run of the *same* operator can merge, and only for `Do`, `Tuple`
/// and `Path`. Every other operator — including `Dot` and the comparison
/// operators — always nests.
fn should_coagulate(prev: Op, op: Op) -> bool {
    prev == op && matches!(prev, Op::Do | Op::Tuple | Op::Path)
}
impl<'t> Parse<'t> for Expr {
/// Parses an [Expr]ession.
///
/// The `level` parameter indicates the operator binding level of the expression.
///
/// Parsing happens in three phases:
/// 1. Prefix: the first token is classified with [from_prefix] and the head of the
///    expression is built from it.
/// 2. Postfix: operators recognized by [from_postfix] are applied to the head for as
///    long as `level <= prec.prev()` holds for them.
/// 3. Infix: operators recognized by [from_infix] are applied under the same
///    condition, with their right-hand side parsed at `prec.next()`.
///
/// A token that fails either loop's condition is left unconsumed for the caller.
///
/// # Errors
/// Returns [ParseError::FromLexer] if the first token cannot be scanned, and
/// [ParseError::NotPrefix] if it cannot begin an expression. Errors from nested
/// parses are propagated.
fn parse(p: &mut Parser<'t>, level: usize) -> PResult<Self> {
const MIN: usize = Prec::MIN;
// Discard any comment tokens in front of the expression.
while p.next_if(TKind::Comment).is_ok() {}
// Prefix
let tok = p.peek()?;
// `span` is the span of the expression's first token; later spans are merged into it.
let ((op, prec), span) = (from_prefix(tok)?, tok.span);
let mut head = match op {
// Empty is returned when a block finisher is an expr prefix.
// It's the only expr that doesn't consume.
// `Prec::Do.next()` is the level at which the operand of an `Op::Do` is parsed.
Op::End if level == Prec::Do.next() => Expr::Op(Op::Tuple, vec![]),
Op::End => Err(ParseError::NotPrefix(tok.kind, span))?,
Op::Id => Expr::Id(p.take_lexeme().expect("should have ident")),
// Grave followed by an identifier.
Op::Mid => Expr::MetId(p.consume().next_if(TKind::Identifier)?.lexeme),
// The literal token is still buffered; the Literal parser consumes it.
Op::Lit => Expr::Lit(p.parse(MIN)?),
// `let` <first operand>, then an optional `=` <initializer>.
Op::Let => Expr::Let(p.consume().parse(MIN)?, p.opt_if(prec.next(), TKind::Eq)?),
// `const` <first operand> `=` <value>; unlike `let`, the `=` is mandatory.
Op::Const => Expr::Const(p.consume().parse(prec.next())?, {
p.next_if(TKind::Eq)?;
p.parse(prec.next())?
}),
// `macro` <first operand> `=>` <second operand>
Op::Macro => Expr::Op(
op,
vec![p.consume().parse(prec.next())?, {
p.next_if(TKind::FatArrow)?;
p.parse(prec.next())?
}],
),
// `match` <scrutinee> `{` comma-separated arms `}`
Op::Match => Expr::Match(p.consume().parse(Prec::Logical.value())?, {
p.next_if(TKind::LCurly)?;
p.list(vec![], TKind::Comma, TKind::RCurly)?
}),
// `{` optional expression `}`: zero or one operand.
Op::Block => Expr::Op(
op,
p.consume().opt(MIN, TKind::RCurly)?.into_iter().collect(),
),
Op::Array => Expr::Op(op, p.consume().list(vec![], TKind::Comma, TKind::RBrack)?),
// `(` expression `)` is a group; `()` on its own is the empty tuple.
Op::Group => match p.consume().opt(MIN, TKind::RParen)? {
Some(value) => Expr::Op(Op::Group, vec![value]),
None => Expr::Op(Op::Tuple, vec![]),
},
// Always three operands: the condition, the body, and the `else` branch.
Op::If | Op::While => {
p.consume();
let exprs = vec![
// conditional restricted to Logical operators or above
p.parse(Prec::Logical.value())?,
p.parse(prec.next())?,
match p.peek() {
Ok(Token { kind: TKind::Else, .. }) => p.consume().parse(prec.next())?,
// No `else`: an empty `Op::End` operation stands in for the missing branch.
_ => Expr::Op(Op::End, vec![]).anno(span.merge(p.span())),
},
];
Expr::Op(op, exprs)
}
// `fn` `(` comma-separated parameters `)` <body>
Op::Fn => {
p.consume().next_if(TKind::LParen)?;
Expr::Fn(
p.list(vec![], TKind::Comma, TKind::RParen)?,
p.parse(prec.next())?,
)
}
// dirty hack: There are two closure operators, signaled by returned prec.
// `|` comma-separated parameters `|` <body>
Op::Lambda if prec == Prec::Min => Expr::Fn(
p.consume().list(vec![], TKind::Comma, TKind::Bar)?,
p.parse(Prec::Body.next())?,
),
// `||` <body>: a closure with an empty parameter list.
Op::Lambda => Expr::Fn(vec![], p.consume().parse(Prec::Body.next())?),
// Every remaining prefix operator takes exactly one operand.
_ => Expr::Op(op, vec![p.consume().parse(prec.next())?]),
};
// Postfix
// A lexer error, a non-postfix token, or a too-loosely-binding operator ends the loop;
// none of these is reported as an error here.
while let Ok(tok) = p.peek()
&& let Ok((op, prec)) = from_postfix(tok)
&& level <= prec.prev()
&& op != Op::End
{
let span = span.merge(p.span());
// Consume the operator token itself (`?`, `(`, `[` or `{`).
p.consume();
head = match op {
Op::Make => Expr::Make(
head.anno(span).into(),
// The `{` was already consumed above, so this `consume` discards nothing.
p.consume().list(vec![], TKind::Comma, TKind::RCurly)?,
),
// For Index and Call the head becomes the first element of the operand list.
Op::Index => Expr::Op(
op,
p.list(vec![head.anno(span)], TKind::Comma, TKind::RBrack)?,
),
Op::Call => Expr::Op(
op,
p.list(vec![head.anno(span)], TKind::Comma, TKind::RParen)?,
),
_ => Expr::Op(op, vec![head.anno(span)]),
};
}
// Infix
// Ends under the same conditions as the postfix loop, leaving the token unconsumed.
while let Ok(tok) = p.peek()
&& let Ok((op, prec)) = from_infix(tok)
&& level <= prec.prev()
&& op != Op::End
{
let span = span.merge(p.span());
p.consume();
head = match head {
// controls expression chaining vs coagulating
// Coagulating: append the new operand to the existing operation's argument list.
Expr::Op(prev, mut args) if should_coagulate(prev, op) => {
args.push(p.parse(prec.next())?);
Expr::Op(op, args)
}
// Chaining: wrap the current head and the new operand in a fresh binary operation.
head => Expr::Op(op, vec![head.anno(span), p.parse(prec.next())?]),
}
}
Ok(head)
}
}
impl<'t, P: Parse<'t> + Annotation> Parse<'t> for Anno<P> {
    /// Parses the inner value and pairs it with a [Span].
    ///
    /// The span is the parser's last-seen location before parsing, merged
    /// with its last-seen location after parsing.
    fn parse(p: &mut Parser<'t>, level: usize) -> PResult<Self>
    where
        Self: Sized,
    {
        let before = p.span();
        let inner = p.parse(level)?;
        let covered = before.merge(p.span());
        Ok(Anno(inner, covered))
    }
}
impl<'t, P: Parse<'t>> Parse<'t> for Box<P> {
    /// Parses a `P` at the given `level` and moves it onto the heap.
    fn parse(p: &mut Parser<'t>, level: usize) -> PResult<Self>
    where
        Self: Sized,
    {
        p.parse::<P>(level).map(Box::new)
    }
}