From 5c6a588250a3496700f6d7644da596192864d320 Mon Sep 17 00:00:00 2001 From: John Date: Fri, 12 Apr 2024 13:57:50 -0500 Subject: [PATCH] Implement a simple but powerful pratt parser based on matklad's minipratt --- .gitignore | 1 + Cargo.lock | 404 ++++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 9 ++ src/lib.rs | 340 +++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 13 ++ 5 files changed, 767 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 src/lib.rs create mode 100644 src/main.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..7505939 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,404 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "argh" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7af5ba06967ff7214ce4c7419c7d185be7ecd6cc4965a8f6e1d8ce0398aad219" +dependencies = [ + "argh_derive", + "argh_shared", +] + +[[package]] +name = "argh_derive" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56df0aeedf6b7a2fc67d06db35b09684c3e8da0c95f8f27685cb17e08413d87a" +dependencies = [ + "argh_shared", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "argh_shared" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5693f39141bda5760ecc4111ab08da40565d1771038c4a0250f03457ec707531" +dependencies = [ + "serde", +] + +[[package]] +name = "autocfg" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "cl-ast" +version = "0.0.4" +dependencies = [ + "cl-structures", +] + +[[package]] +name = "cl-interpret" +version = "0.0.4" +dependencies = [ + "cl-ast", + "cl-structures", +] + +[[package]] +name = "cl-lexer" +version = "0.0.4" +dependencies = [ + "cl-structures", + "cl-token", + "unicode-ident", +] + +[[package]] +name = "cl-parser" +version = "0.0.4" +dependencies = [ + "cl-ast", + "cl-lexer", + "cl-structures", + "cl-token", +] + +[[package]] +name = "cl-repl" +version = "0.0.4" +dependencies = [ + "argh", + "cl-ast", + "cl-interpret", + "cl-lexer", + "cl-parser", + "cl-token", + "crossterm", +] + +[[package]] +name = "cl-structures" +version = "0.0.4" + +[[package]] +name = "cl-token" +version = "0.0.4" + +[[package]] +name = "crossterm" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df" +dependencies = [ + "bitflags 2.5.0", + "crossterm_winapi", + "libc", + "mio", + "parking_lot", + "signal-hook", + "signal-hook-mio", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" +dependencies = [ + "winapi", +] + +[[package]] +name = "libc" +version = "0.2.153" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" + +[[package]] +name = "lock_api" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" + +[[package]] +name = "mio" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +dependencies = [ + "libc", + "log", + "wasi", + "windows-sys", +] + +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "pratt" +version = "0.1.0" +dependencies = [ + "cl-lexer", + "cl-repl", + "cl-token", +] + +[[package]] +name = "proc-macro2" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "serde" +version = "1.0.197" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.197" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "signal-hook" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8621587d4798caf8eb44879d42e56b9a93ea5dcd315a6487c357130095b62801" +dependencies = [ + "libc", + "signal-hook-registry", +] + +[[package]] +name = "signal-hook-mio" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af" +dependencies = [ + "libc", + "mio", + "signal-hook", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" +dependencies = [ + "libc", +] + +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + +[[package]] +name = "syn" +version = "2.0.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44cfb93f38070beee36b3fef7d4f5a16f27751d94b187b666a5cc5e9b0d30687" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..473c4a8 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "pratt" +version = "0.1.0" +edition = "2021" + +[dependencies] +cl-lexer = { path = "../conlang/cl-lexer" } +cl-token = { path = "../conlang/cl-token" } +cl-repl = { path = "../conlang/cl-repl" } diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..a366d8e --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,340 @@ +//! A Pratt parser which aims for simplicity +//! +//! Based on [Simple but Powerful Pratt Parsing][1] by Alex Kladov +//! +//! [1]: https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html + +pub mod expr { + use crate::token::Op; + use std::fmt; + + #[derive(Clone, Debug)] + pub enum Expr { + Int(usize), + Char(char), + Str(String), + Ident(String), + Unary(Op, Box), + Postfix(Op, Box), + // Binary operators like `a + b`, `a * b`, ... + Binary(Op, Box<[Expr; 2]>), + Index(Box<[Expr; 2]>), + } + impl fmt::Display for Expr { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Expr::Int(v) => write!(f, "{v}"), + Expr::Str(v) => write!(f, "\"{v}\""), + Expr::Char(v) => write!(f, "'{v}'"), + Expr::Ident(v) => write!(f, "{v}"), + Expr::Unary(op, e) => write!(f, "{op}{e}"), + Expr::Postfix(op, e) => write!(f, "{e}{op}"), + Expr::Binary(op, e) => write!(f, "({} {op} {})", e[0], e[1]), + Expr::Index(e) => write!(f, "{}[{}]", e[0], e[1]), + } + } + } +} + +pub mod parser { + #![allow(unused)] + use std::iter::Peekable; + + use cl_lexer::Lexer; + + use crate::{ + expr::Expr, + token::{Op, Token, Tokenizer}, + }; + + pub fn expr(text: &str) -> Option { + let mut lexer = Tokenizer::new(Lexer::new(text)).peekable(); + exprec(&mut lexer, 0) + } + + /// Performs the pratt precedence ascent algorithm + fn exprec(lexer: &mut Peekable, min: u8) -> Option + where + I: Iterator, + { + let mut head = match lexer.next()? { + Token::Int(d) => Expr::Int(d), + Token::Char(c) => Expr::Char(c), + Token::Ident(c) => Expr::Ident(c), + Token::Str(c) => Expr::Str(c), + Token::Op(Op::Lpa) => { + let head = exprec(lexer, 0)?; + assert_eq!(lexer.next()?, Token::Op(Op::Rpa)); + head + } + Token::Op(op) => { + let ((), after) = prefix(op)?; + Expr::Unary(op, Box::new(exprec(lexer, after)?)) + } + }; + + loop { + let op = match lexer.peek() { + None => break, + Some(Token::Op(op)) => *op, + Some(t) => { + eprintln!("Bad token: {t}"); + return Some(head); + } + }; + + if let Some((before, ())) = postfix(op) { + if before < min { + break; + } + lexer.next().expect("should not change since peeked"); + + head = match op { + Op::Lbk => { + let tail = exprec(lexer, 0)?; + assert_eq!(lexer.next(), Some(Token::Op(Op::Rbk))); + Expr::Index(Box::new([head, tail])) + } + _ => Expr::Postfix(op, Box::new(head)), + }; + continue; + } + + if let Some((before, after)) = infix(op) { + if before < min { + break; + } + lexer.next().expect("should not change since peeked"); + + let tail = exprec(lexer, after)?; + head = Expr::Binary(op, [head, tail].into()); + continue; + } + break; + } + Some(head) + } + + fn prefix(op: Op) -> Option<((), u8)> { + match op { + Op::Sub | Op::Not => Prec::Unary, + _ => None?, + } + .prefix() + } + fn infix(op: Op) -> Option<(u8, u8)> { + match op { + Op::Dot => Prec::Member, + Op::Not => Prec::Unary, + Op::Mul | Op::Div | Op::Rem => Prec::Term, + Op::Add | Op::Sub => Prec::Factor, + Op::Shl | Op::Shr => Prec::Shift, + Op::Ban | Op::Bor | Op::Bxr => Prec::Bitwise, + Op::Lan | Op::Lor | Op::Lxr => Prec::Logic, + Op::Inc | Op::Exc => Prec::Range, + Op::Lt | Op::Lte | Op::Eq | Op::Neq | Op::Gte | Op::Gt => Prec::Compare, + Op::Lpa => None?, + Op::Rpa => None?, + Op::Lbk => None?, + Op::Rbk => None?, + Op::Huh => None?, + } + .infix() + } + fn postfix(op: Op) -> Option<(u8, ())> { + match op { + Op::Lbk => Prec::Index, + Op::Huh => Prec::Postfix, + _ => None?, + } + .postfix() + } + + #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] + enum Prec { + Compare, + Range, + Index, + Logic, + Bitwise, + Shift, + Factor, + Term, + Unary, + Postfix, + Member, // left-associative + } + impl Prec { + #[inline] + fn level(self) -> u8 { + (self as u8) << 1 + } + fn prefix(self) -> Option<((), u8)> { + match self { + Self::Unary => Some(((), self.level())), + _ => None, + } + } + fn infix(self) -> Option<(u8, u8)> { + let level = self.level(); + match self { + Self::Unary => None, + Self::Member => Some((level + 1, level)), + _ => Some((level, level + 1)), + } + } + fn postfix(self) -> Option<(u8, ())> { + match self { + Self::Index | Self::Postfix => Some((self.level(), ())), + _ => None, + } + } + } +} + +pub mod token { + //! Custom token type, plus a [Tokenizer] iterator adapter for cl-lexer's token type + use cl_token::*; + + pub struct Tokenizer<'t> { + lexer: cl_lexer::lexer_iter::LexerIter<'t>, + } + impl<'t> Tokenizer<'t> { + pub fn new(lexer: cl_lexer::Lexer<'t>) -> Self { + Self { + lexer: lexer.into_iter(), + } + } + } + impl Iterator for Tokenizer<'_> { + type Item = Token; + + fn next(&mut self) -> Option { + let token = self.lexer.next()?.ok()?; + let (ty, data) = (token.ty(), token.into_data()); + + match data { + Data::Integer(v) => return Some(Token::Int(v as _)), + Data::Character(v) => return Some(Token::Char(v)), + Data::Identifier(v) => return Some(Token::Ident(v.into_string())), + Data::String(v) => return Some(Token::Str(v.to_owned())), + _ => {} + } + + match ty.try_into() { + Ok(op) => Some(Token::Op(op)), + Err(Er::Invalid) => self.next(), + Err(Er::NotAnOp) => None, + } + } + } + + #[derive(Clone, Debug, PartialEq, Eq)] + pub enum Token { + Int(usize), + Char(char), + Ident(String), + Str(String), + Op(Op), + } + + impl std::fmt::Display for Token { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Token::Int(v) => write!(f, "{v}"), + Token::Char(v) => write!(f, "'{v}'"), + Token::Ident(v) => write!(f, "{v}"), + Token::Str(v) => write!(f, "\"{v}\""), + Token::Op(v) => write!(f, "{v}"), + } + } + } + + macro_rules! operator { + ( + $(#[$Meta:meta])* + $vis:vis enum $Name:ident { + $( + $(#[$meta:meta])* + #[$rep:literal] + $name:ident = $try_from:pat + ),*$(,)? + } + ) => { + $(#[$Meta])* + $vis enum $Name {$( + $(#[$meta])* + #[doc = $rep] + $name, + )*} + impl ::core::fmt::Display for $Name { + fn fmt( + &self, f: &mut ::core::fmt::Formatter<'_> + ) -> ::core::fmt::Result { + match self { $($Name::$name => $rep,)* }.fmt(f) + } + } + impl TryFrom for $Name { + type Error = $crate::token::Er; + fn try_from(value: cl_token::Type) -> Result { + match value { + cl_token::Type::Comment | + cl_token::Type::Invalid => Err(Er::Invalid), + $($try_from => Ok($Name::$name),)* + _ => Err(Er::NotAnOp) + } + } + } + }; + } + operator! { + #[derive(Clone, Copy, Debug, PartialEq, Eq)] + pub enum Op { + // Delimiter + #["("] Lpa = Type::LParen, + #[")"] Rpa = Type::RParen, + #["["] Lbk = Type::LBrack, + #["]"] Rbk = Type::RBrack, + // Member + #["."] Dot = Type::Dot, + // Factor + #["*"] Mul = Type::Star, + #["/"] Div = Type::Slash, + #["%"] Rem = Type::Rem, + // Term + #["+"] Add = Type::Plus, + #["-"] Sub = Type::Minus, + // Shift + #["<<"] Shl = Type::LtLt, + #[">>"] Shr = Type::GtGt, + // Bitwise + #["&"] Ban = Type::Amp, + #["|"] Bor = Type::Bar, + #["^"] Bxr = Type::Xor, + // Logic + #["&&"] Lan = Type::AmpAmp, + #["||"] Lor = Type::BarBar, + #["^^"] Lxr = Type::XorXor, + // Range + #["..="] Inc = Type::DotDotEq, + #[".."] Exc = Type::DotDot, + // Compare + #["<"] Lt = Type::Lt, + #["<="] Lte = Type::LtEq, + #["=="] Eq = Type::EqEq, + #["!="] Neq = Type::BangEq, + #[">="] Gte = Type::GtEq, + #[">"] Gt = Type::Gt, + // Unary-only + #["!"] Not = Type::Bang, + // Postfix unary + #["?"] Huh = Type::Question, + } + } + + #[doc(hidden)] + pub enum Er { + Invalid, + NotAnOp, + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..d054a63 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,13 @@ +use cl_repl::repline::Repline; +use pratt::parser; + +fn main() { + let mut rl = Repline::new("\x1b[32m", "crisp >", "what? >"); + + while let Ok(line) = rl.read() { + if let Some(expr) = parser::expr(&line) { + println!("\x1b[G\x1b[J{expr:?}"); + rl.accept(); + } + } +}