// pratt/src/lib.rs
//! A Pratt parser which aims for simplicity
//!
//! Based on [Simple but Powerful Pratt Parsing][1] by Alex Kladov
//!
//! [1]: https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html
pub mod expr {
    //! The expression tree produced by the parser.
    use crate::token::Op;
    use std::fmt;

    /// A parsed expression node.
    #[derive(Clone, Debug)]
    pub enum Expr {
        /// Integer literal, e.g. `42`.
        Int(usize),
        /// Character literal, e.g. `'a'`.
        Char(char),
        /// String literal, e.g. `"hi"`.
        Str(String),
        /// Identifier, e.g. `foo`.
        Ident(String),
        /// Prefix operator application, e.g. `-x`, `!x`.
        Unary(Op, Box<Expr>),
        /// Postfix operator application, e.g. `x?`.
        Postfix(Op, Box<Expr>),
        /// Binary operators like `a + b`, `a * b`, ...
        Binary(Op, Box<[Expr; 2]>),
        /// Index subscript, e.g. `a[i]`.
        Index(Box<[Expr; 2]>),
    }

    impl fmt::Display for Expr {
        /// Renders the expression with binary operations fully
        /// parenthesized, making the parse structure visible.
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            match self {
                Self::Int(v) => write!(f, "{v}"),
                Self::Char(v) => write!(f, "'{v}'"),
                Self::Str(v) => write!(f, "\"{v}\""),
                Self::Ident(v) => write!(f, "{v}"),
                Self::Unary(op, e) => write!(f, "{op}{e}"),
                Self::Postfix(op, e) => write!(f, "{e}{op}"),
                Self::Index(e) => write!(f, "{}[{}]", e[0], e[1]),
                Self::Binary(op, e) => write!(f, "({} {op} {})", e[0], e[1]),
            }
        }
    }
}
pub mod parser {
#![allow(unused)]
use std::iter::Peekable;
use cl_lexer::Lexer;
use crate::{
expr::Expr,
token::{Op, Token, Tokenizer},
};
/// Parses `text` into an [`Expr`].
///
/// Returns `None` when the token stream ends where an operand is required.
pub fn expr(text: &str) -> Option<Expr> {
    exprec(&mut Tokenizer::new(Lexer::new(text)).peekable(), 0)
}
/// Performs the pratt precedence ascent algorithm.
///
/// `min` is the minimum left binding power an operator must have for this
/// call to consume it. Callers pass the right binding power of the operator
/// they just consumed, or 0 at the top level and inside `(...)` / `[...]`.
///
/// Returns `None` when the token stream ends where an operand is required.
///
/// NOTE(review): an unmatched `)` or `]` currently aborts via `assert_eq!`
/// (a panic) instead of returning `None`.
fn exprec<I>(lexer: &mut Peekable<I>, min: u8) -> Option<Expr>
where
    I: Iterator<Item = Token>,
{
    // Parse the "head": an atom, a parenthesized expression, or a prefix
    // operator applied to a sub-expression.
    let mut head = match lexer.next()? {
        Token::Int(d) => Expr::Int(d),
        Token::Char(c) => Expr::Char(c),
        Token::Ident(c) => Expr::Ident(c),
        Token::Str(c) => Expr::Str(c),
        Token::Op(Op::Lpa) => {
            // Grouping: precedence restarts at 0 inside the parens.
            let head = exprec(lexer, 0)?;
            assert_eq!(lexer.next()?, Token::Op(Op::Rpa));
            head
        }
        Token::Op(op) => {
            // Any other leading operator must be a valid prefix operator.
            let ((), after) = prefix(op)?;
            Expr::Unary(op, Box::new(exprec(lexer, after)?))
        }
    };
    // Precedence-climbing loop: keep absorbing postfix/infix operators
    // while their left binding power is at least `min`.
    loop {
        let op = match lexer.peek() {
            None => break,
            Some(Token::Op(op)) => *op,
            Some(t) => {
                // A non-operator token cannot continue an expression;
                // leave it in the stream and return what we have so far.
                eprintln!("Bad token: {t}");
                return Some(head);
            }
        };
        // Postfix operators (`[`, `?`) are tried before infix ones.
        if let Some((before, ())) = postfix(op) {
            if before < min {
                break;
            }
            lexer.next().expect("should not change since peeked");
            head = match op {
                Op::Lbk => {
                    // Index subscript: precedence restarts at 0 inside `[...]`.
                    let tail = exprec(lexer, 0)?;
                    assert_eq!(lexer.next(), Some(Token::Op(Op::Rbk)));
                    Expr::Index(Box::new([head, tail]))
                }
                _ => Expr::Postfix(op, Box::new(head)),
            };
            continue;
        }
        if let Some((before, after)) = infix(op) {
            if before < min {
                break;
            }
            lexer.next().expect("should not change since peeked");
            // Recurse for the right operand using the operator's right
            // binding power; associativity falls out of (before, after).
            let tail = exprec(lexer, after)?;
            head = Expr::Binary(op, [head, tail].into());
            continue;
        }
        // Operator with neither a postfix nor an infix role
        // (e.g. `)` or `]` closing an enclosing call): stop here.
        break;
    }
    Some(head)
}
/// Looks up the prefix (unary) binding power of `op`.
///
/// Only `-` and `!` may appear in prefix position; every other operator
/// yields `None`.
fn prefix(op: Op) -> Option<((), u8)> {
    match op {
        Op::Sub | Op::Not => Prec::Unary.prefix(),
        _ => None,
    }
}
/// Looks up the infix binding powers `(left, right)` of `op`.
///
/// Returns `None` for operators with no infix role: the delimiters, the
/// postfix-only `?`, and `!` (which maps to `Prec::Unary`, whose
/// [`Prec::infix`] row is `None`).
fn infix(op: Op) -> Option<(u8, u8)> {
    let prec = match op {
        Op::Dot => Prec::Member,
        Op::Not => Prec::Unary,
        Op::Mul | Op::Div | Op::Rem => Prec::Term,
        Op::Add | Op::Sub => Prec::Factor,
        Op::Shl | Op::Shr => Prec::Shift,
        Op::Ban | Op::Bor | Op::Bxr => Prec::Bitwise,
        Op::Lan | Op::Lor | Op::Lxr => Prec::Logic,
        Op::Inc | Op::Exc => Prec::Range,
        Op::Lt | Op::Lte | Op::Eq | Op::Neq | Op::Gte | Op::Gt => Prec::Compare,
        // Delimiters and the postfix-only `?` never act as infix operators.
        Op::Lpa | Op::Rpa | Op::Lbk | Op::Rbk | Op::Huh => return None,
    };
    prec.infix()
}
/// Looks up the postfix binding power of `op`.
///
/// Only `[` (indexing) and `?` may appear in postfix position.
fn postfix(op: Op) -> Option<(u8, ())> {
    let prec = match op {
        Op::Lbk => Prec::Index,
        Op::Huh => Prec::Postfix,
        _ => return None,
    };
    prec.postfix()
}
/// Precedence tiers, weakest-binding first.
///
/// Declaration order *is* precedence: [`Prec::level`] derives each
/// operator's binding power from the variant's discriminant, so a
/// variant declared later binds tighter.
///
/// Fix: `Index` previously sat between `Range` and `Logic`, far below
/// `Unary`/`Postfix`, which made `a + b[i]` parse as `(a + b)[i]`.
/// Postfix operators must bind tighter than every prefix/infix tier,
/// so `Index` now sits alongside `Postfix`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
enum Prec {
    Compare,
    Range,
    Logic,
    Bitwise,
    Shift,
    Factor,  // additive: `+` `-`
    Term,    // multiplicative: `*` `/` `%`
    Unary,   // prefix `-` `!`
    Index,   // postfix `[...]`
    Postfix, // postfix `?`
    Member, // left-associative
}
impl Prec {
    /// Base binding power: the discriminant doubled, leaving room for the
    /// `level` / `level + 1` pair that encodes associativity without
    /// colliding with the next tier.
    #[inline]
    fn level(self) -> u8 {
        (self as u8) << 1
    }
    /// Prefix binding power `((), right)`. Only `Unary` has one.
    fn prefix(self) -> Option<((), u8)> {
        match self {
            Self::Unary => Some(((), self.level())),
            _ => None,
        }
    }
    /// Infix binding powers `(left, right)`.
    ///
    /// `left < right` is the left-associative encoding: the recursive call's
    /// `min` (the `right` value) rejects another operator of the same tier,
    /// so the loop in `exprec` folds it in from the left.
    ///
    /// Fix: `Member` used to return `(level + 1, level)` — the
    /// *right*-associative encoding, parsing `a.b.c` as `a.(b.c)` — even
    /// though the variant is documented as left-associative. Member access
    /// now folds from the left like the other binary operators.
    fn infix(self) -> Option<(u8, u8)> {
        let level = self.level();
        match self {
            Self::Unary => None,
            _ => Some((level, level + 1)),
        }
    }
    /// Postfix binding power `(left, ())` for `Index` and `Postfix`.
    fn postfix(self) -> Option<(u8, ())> {
        match self {
            Self::Index | Self::Postfix => Some((self.level(), ())),
            _ => None,
        }
    }
}
}
pub mod token {
//! Custom token type, plus a [Tokenizer] iterator adapter for cl-lexer's token type
use cl_token::{token_type::Op as Tkop, *};
/// Adapts cl-lexer's fallible token stream into this crate's [`Token`]s.
pub struct Tokenizer<'t> {
    lexer: cl_lexer::lexer_iter::LexerIter<'t>,
}

impl<'t> Tokenizer<'t> {
    /// Wraps a [`cl_lexer::Lexer`] for iteration.
    pub fn new(lexer: cl_lexer::Lexer<'t>) -> Self {
        let lexer = lexer.into_iter();
        Self { lexer }
    }
}
impl Iterator for Tokenizer<'_> {
    type Item = Token;
    /// Yields the next [`Token`], silently skipping comments and invalid
    /// tokens.
    ///
    /// Returns `None` at end of input, on a lexer error, or on a token
    /// kind that has no [`Op`] mapping.
    fn next(&mut self) -> Option<Self::Item> {
        // Loop instead of recursing (the original called `self.next()` to
        // skip a token) so a long run of comment/invalid tokens cannot
        // overflow the stack — Rust does not guarantee tail calls.
        loop {
            let token = self.lexer.next()?.ok()?;
            let (ty, data) = (token.ty(), token.into_data());
            // Literal-carrying tokens map directly onto Token variants.
            match data {
                TokenData::Integer(v) => return Some(Token::Int(v as _)),
                TokenData::Character(v) => return Some(Token::Char(v)),
                TokenData::Identifier(v) => return Some(Token::Ident(v.into_string())),
                TokenData::String(v) => return Some(Token::Str(v.to_owned())),
                _ => {}
            }
            // Everything else is either an operator, skippable noise
            // (comments / invalid tokens), or the end of the usable stream.
            match ty.try_into() {
                Ok(op) => return Some(Token::Op(op)),
                Err(Er::Invalid) => continue,
                Err(Er::NotAnOp) => return None,
            }
        }
    }
}
/// A lexical token fed to the parser.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Token {
    /// Integer literal.
    Int(usize),
    /// Character literal.
    Char(char),
    /// Identifier.
    Ident(String),
    /// String literal.
    Str(String),
    /// Operator or delimiter.
    Op(Op),
}
impl std::fmt::Display for Token {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Token::Int(v) => write!(f, "{v}"),
Token::Char(v) => write!(f, "'{v}'"),
Token::Ident(v) => write!(f, "{v}"),
Token::Str(v) => write!(f, "\"{v}\""),
Token::Op(v) => write!(f, "{v}"),
}
}
}
/// Generates an operator enum together with its `Display` and
/// `TryFrom<cl_token::TokenKind>` impls.
///
/// Each variant is declared as `#["text"] Name = pattern`, where:
/// - `"text"` becomes both the `Display` output and the variant's doc
///   comment, and
/// - `pattern` is the `cl_token::TokenKind` that converts into the variant.
macro_rules! operator {
    (
        $(#[$Meta:meta])*
        $vis:vis enum $Name:ident {
            $(
                $(#[$meta:meta])*
                #[$rep:literal]
                $name:ident = $try_from:pat
            ),*$(,)?
        }
    ) => {
        $(#[$Meta])*
        $vis enum $Name {$(
            $(#[$meta])*
            // Reuse the display literal as the variant's doc comment.
            #[doc = $rep]
            $name,
        )*}
        impl ::core::fmt::Display for $Name {
            fn fmt(
                &self, f: &mut ::core::fmt::Formatter<'_>
            ) -> ::core::fmt::Result {
                // Map the variant to its literal, then Display that &str.
                match self { $($Name::$name => $rep,)* }.fmt(f)
            }
        }
        impl TryFrom<cl_token::TokenKind> for $Name {
            type Error = $crate::token::Er;
            fn try_from(value: cl_token::TokenKind) -> Result<Self, Self::Error> {
                match value {
                    // Comments and invalid tokens are skippable noise,
                    // reported separately from "not an operator at all".
                    cl_token::TokenKind::Comment |
                    cl_token::TokenKind::Invalid => Err(Er::Invalid),
                    $($try_from => Ok($Name::$name),)*
                    _ => Err(Er::NotAnOp)
                }
            }
        }
    };
}
operator! {
    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    pub enum Op {
        // Delimiters — no binding power of their own
        #["("] Lpa = TokenKind::Op(Tkop::LParen),
        #[")"] Rpa = TokenKind::Op(Tkop::RParen),
        #["["] Lbk = TokenKind::Op(Tkop::LBrack),
        #["]"] Rbk = TokenKind::Op(Tkop::RBrack),
        // Member access (Prec::Member)
        #["."] Dot = TokenKind::Op(Tkop::Dot),
        // Multiplicative (mapped to Prec::Term in parser::infix)
        #["*"] Mul = TokenKind::Op(Tkop::Star),
        #["/"] Div = TokenKind::Op(Tkop::Slash),
        #["%"] Rem = TokenKind::Op(Tkop::Rem),
        // Additive (mapped to Prec::Factor in parser::infix)
        #["+"] Add = TokenKind::Op(Tkop::Plus),
        #["-"] Sub = TokenKind::Op(Tkop::Minus),
        // Shift (Prec::Shift)
        #["<<"] Shl = TokenKind::Op(Tkop::LtLt),
        #[">>"] Shr = TokenKind::Op(Tkop::GtGt),
        // Bitwise (Prec::Bitwise)
        #["&"] Ban = TokenKind::Op(Tkop::Amp),
        #["|"] Bor = TokenKind::Op(Tkop::Bar),
        #["^"] Bxr = TokenKind::Op(Tkop::Xor),
        // Logic (Prec::Logic)
        #["&&"] Lan = TokenKind::Op(Tkop::AmpAmp),
        #["||"] Lor = TokenKind::Op(Tkop::BarBar),
        #["^^"] Lxr = TokenKind::Op(Tkop::XorXor),
        // Range (Prec::Range): inclusive `..=` and exclusive `..`
        #["..="] Inc = TokenKind::Op(Tkop::DotDotEq),
        #[".."] Exc = TokenKind::Op(Tkop::DotDot),
        // Comparison (Prec::Compare)
        #["<"] Lt = TokenKind::Op(Tkop::Lt),
        #["<="] Lte = TokenKind::Op(Tkop::LtEq),
        #["=="] Eq = TokenKind::Op(Tkop::EqEq),
        #["!="] Neq = TokenKind::Op(Tkop::BangEq),
        #[">="] Gte = TokenKind::Op(Tkop::GtEq),
        #[">"] Gt = TokenKind::Op(Tkop::Gt),
        // Prefix-only (Prec::Unary)
        #["!"] Not = TokenKind::Op(Tkop::Bang),
        // Postfix-only (Prec::Postfix)
        #["?"] Huh = TokenKind::Op(Tkop::Question),
    }
}
/// Why a `cl_token::TokenKind` could not be converted into an [`Op`].
///
/// Derives added so callers can `unwrap`/`expect` and compare the
/// `TryFrom` error (an error type without `Debug` cannot be unwrapped).
#[doc(hidden)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Er {
    /// A comment or invalid token: skippable noise, not a hard error.
    Invalid,
    /// A valid token kind with no operator mapping (e.g. a literal).
    NotAnOp,
}
}