From a26a01fc178cebb2a1c2221f7eacf3befacab212 Mon Sep 17 00:00:00 2001 From: John Date: Fri, 20 Oct 2023 15:33:02 -0500 Subject: [PATCH] constr: Misuse iterators to parse tokens --- Cargo.toml | 2 +- constr/Cargo.toml | 10 ++ constr/src/lib.rs | 236 +++++++++++++++++++++++++++++ grammar.ebnf | 5 +- libconlang/Cargo.toml | 1 + libconlang/examples/parse_input.rs | 20 +-- libconlang/src/ast.rs | 21 ++- libconlang/src/parser.rs | 100 ++++++++---- libconlang/src/pretty_printer.rs | 5 +- 9 files changed, 333 insertions(+), 67 deletions(-) create mode 100644 constr/Cargo.toml create mode 100644 constr/src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index ddbd93a..006e400 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["libconlang", "lerox"] +members = ["libconlang", "lerox", "constr"] resolver = "2" [workspace.package] diff --git a/constr/Cargo.toml b/constr/Cargo.toml new file mode 100644 index 0000000..6f02797 --- /dev/null +++ b/constr/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "constr" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] diff --git a/constr/src/lib.rs b/constr/src/lib.rs new file mode 100644 index 0000000..6b55800 --- /dev/null +++ b/constr/src/lib.rs @@ -0,0 +1,236 @@ +//! [String] tools for Conlang +//#![warn(clippy::all)] +#![feature(decl_macro, const_trait_impl)] + +impl ConstrTools for T {} +pub trait ConstrTools { + /// Unescapes string escape sequences + fn unescape(self) -> UnescapeString + where Self: Iterator + Sized { + UnescapeString::new(self) + } + fn parse_int(self) -> ParseInt + where Self: Iterator + Sized { + ParseInt::new(self) + } +} + +pub use unescape_string::UnescapeString; +pub mod unescape_string { + //! TODO: Write the module-level documentation + pub struct UnescapeString> { + inner: I, + } + + impl> Iterator for UnescapeString { + type Item = I::Item; + fn next(&mut self) -> Option { + self.unescape() + } + } + + impl> UnescapeString { + pub fn new(inner: I) -> Self { + Self { inner } + } + /// Consumes an escape sequence. See the [module level documentation](self). + pub fn unescape(&mut self) -> Option { + match self.inner.next()? { + '\\' => (), + other => return Some(other), + } + Some(match self.inner.next()? { + 'a' => '\x07', + 'b' => '\x08', + 'f' => '\x0c', + 'n' => '\n', + 't' => '\t', + 'x' => self.hex_digits::<2>()?, + 'u' => self.hex_digits::<4>()?, + 'U' => self.hex_digits::<8>()?, + '0' => '\0', + byte => byte, + }) + } + fn hex_digits(&mut self) -> Option { + let mut out = 0; + for _ in 0..DIGITS { + out = (out << 4) + self.hex_digit()? as u32; + } + char::from_u32(out) + } + fn hex_digit(&mut self) -> Option { + super::base::<16>(self.inner.next()?) + } + } +} +pub use parse_int::ParseInt; +pub mod parse_int { + use std::marker::PhantomData; + + pub struct ParseInt, O> { + inner: I, + _data: PhantomData, + } + impl, O> ParseInt { + pub fn new(inner: I) -> Self { + Self { inner, _data: Default::default() } + } + fn digit(&mut self) -> Option { + let next = loop { + match self.inner.next()? { + '_' => continue, + c => break c, + } + }; + super::base::(next) + } + } + parse_int_impl!(u8, i8, u16, i16, u32, i32, u64, i64, u128, i128); + macro parse_int_impl($($T:ty),*$(,)?) {$( + impl> ParseInt { + fn digits(&mut self, init: Option) -> Option<$T> { + let mut out = match init { + Some(digit) => digit, + None => self.digit::()?, + } as $T; + while let Some(digit) = self.digit::() { + out = out.checked_mul(B as $T)?.checked_add(digit as $T)? + } + Some(out) + } + fn base(&mut self) -> Option<$T> { + match self.inner.next()? { + 'b' => self.digits::<2>(None), + 'd' => self.digits::<10>(None), + 'o' => self.digits::<8>(None), + 'x' => self.digits::<16>(None), + c => self.digits::<10>(Some(super::base::<10>(c)?)), + } + } + } + impl> Iterator for ParseInt { + type Item = $T; + fn next(&mut self) -> Option { + match self.digit::<10>()? { + 0 => self.base(), + c if (0..=9).contains(&c) => self.digits::<10>(Some(c)), + _ => None, + } + } + } + )*} +} + +/// Converts a single char [0-9A-Za-z] to their [base B](base::) equivalent. +/// +/// # May Panic +/// Panics in debug mode when B > 36 +pub const fn base(c: char) -> Option { + // TODO: Wait for a way to limit const generics at compile time + debug_assert!(B <= 36); + // Can't use Ord::min in const context yet :( + // This function also relies on wrapping arithmetic + macro wrap ($c:ident - $b:literal $(+ $ten:literal)? $(< $B:ident.min($min:literal))?) { + $c.wrapping_sub($b)$(.wrapping_add($ten))? $(< if $B < $min {$B} else {$min})? + } + let c = c as u8; + match c { + c if wrap!(c - b'0' < B.min(10)) => Some(wrap!(c - b'0')), + _ if B <= 10 => None, // cuts base<1..=10> to 4 instructions on x86 :^) + c if wrap!(c - b'A' + 10 < B.min(36)) => Some(wrap!(c - b'A' + 10)), + c if wrap!(c - b'a' + 10 < B.min(36)) => Some(wrap!(c - b'a' + 10)), + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + mod unescape_string { + use super::*; + test_unescape! { + empty = ["" => ""]; + n_newline = ["\\n" => "\n", "This is a\\ntest" => "This is a\ntest"]; + a_bell = ["\\a" => "\x07", "Ring the \\abell" => "Ring the \x07bell"]; + b_backspace = ["\\b" => "\x08"]; + f_feed = ["\\f" => "\x0c"]; + t_tab = ["\\t" => "\t"]; + _0_nul = ["\\0" => "\0"]; + x_hex = [ + "\\x41\\x41\\x41\\x41" => "AAAA", + "\x00" => "\0", + "\\x7f" => "\x7f", + "\\x80" => "\u{80}", + "\\xD0" => "\u{D0}", + ]; + } + macro test_unescape ($($f:ident = [$($test:expr => $expect:expr),*$(,)?];)*) {$( + #[test] fn $f () { + $(assert_eq!($test.chars().unescape().collect::(), dbg!($expect));)* + } + )*} + } + mod parse_int { + use super::*; + #[test] + #[should_panic] + fn base_37_panics() { + base::<37>('a'); + } + test_parse! { + parse_u8: u8 = [ + "0xc5" => Some(0xc5), + "0xc_____________________5" => Some(0xc5), + "0x7d" => Some(0x7d), + "0b10" => Some(0b10), + "0o10" => Some(0o10), + "0x10" => Some(0x10), + "0d10" => Some(10), + "10" => Some(10), + ]; + parse_u16: u16 = [ + "0xc5c5" => Some(0xc5c5), + "0x1234" => Some(0x1234), + "0x5678" => Some(0x5678), + "0x9abc" => Some(0x9abc), + "0xdef0" => Some(0xdef0), + "0xg" => None, + "0b10" => Some(0b10), + "0o10" => Some(0o10), + "0x10" => Some(0x10), + "0d10" => Some(10), + "10" => Some(10), + ]; + parse_u32: u32 = [ + "0xc5c5c5c5" => Some(0xc5c5c5c5), + "0xc5_c5_c5_c5" => Some(0xc5c5c5c5), + "1_234_567____" => Some(1234567), + "4294967295" => Some(4294967295), + "4294967296" => None, + "🦈" => None, + ]; + parse_u64: u64 = [ + "0xffffffffffffffff" => Some(0xffffffffffffffff), + "0x10000000000000000" => None, + "0xc5c5c5c5c5c5c5c5" => Some(0xc5c5c5c5c5c5c5c5), + "0x123456789abcdef0" => Some(1311768467463790320), + "0x123456789abcdefg" => Some(81985529216486895), + "0d1234567890" => Some(1234567890), + "0o12345670" => Some(2739128), + "0b10" => Some(2), + ]; + parse_u128: u128 = [ + "0x10000000000000000" => Some(0x10000000000000000), + "0xc5c5c5c5c5c5c5c5c5c5c5c5c5c5c5c5" => Some(0xc5c5c5c5c5c5c5c5c5c5c5c5c5c5c5c5), + "0o77777777777777777777777777777777" => Some(0o77777777777777777777777777777777), + ]; + } + macro test_parse ($($f:ident : $T:ty = [$($test:expr => $expect:expr),*$(,)?];)*) {$( + #[test] fn $f () { + type Test = $T; + $(assert_eq!(($test.chars().parse_int() as ParseInt<_, Test>).next(), dbg!($expect));)* + } + )*} + } +} diff --git a/grammar.ebnf b/grammar.ebnf index 2cfb3e6..59bd24d 100644 --- a/grammar.ebnf +++ b/grammar.ebnf @@ -5,10 +5,11 @@ Start = Expr ; Literal = STRING | CHARACTER | FLOAT | INTEGER | Bool ; Bool = "true" | "false" ; Identifier = IDENTIFIER ; + (* # Expressions *) (* expression *) -Expr = Ignore Block = '{' Expr? '}' ; +Expr = Ignore ; Group = '(' Expr? ')' ; Primary = Item | Identifier | Literal | Block | Group | Branch ; @@ -26,9 +27,9 @@ Unary = (UnaryOp)* Primary ; (* expression::math::operator *) IgnoreOp = ';' ; -CompareOp = '<' | "<=" | "==" | "!=" | ">=" | '>' ; AssignOp = '=' | "+=" | "-=" | "*=" | "/=" | "&=" | "|=" | "^=" |"<<=" |">>=" ; +CompareOp = '<' | "<=" | "==" | "!=" | ">=" | '>' ; LogicOp = "&&" | "||" | "^^" ; BitwiseOp = '&' | '|' | '^' ; diff --git a/libconlang/Cargo.toml b/libconlang/Cargo.toml index 3dcdc02..8d109cf 100644 --- a/libconlang/Cargo.toml +++ b/libconlang/Cargo.toml @@ -9,4 +9,5 @@ license.workspace = true [dependencies] lerox = { path = "../lerox" } +constr = { path = "../constr" } unicode-xid = "0.2.4" diff --git a/libconlang/examples/parse_input.rs b/libconlang/examples/parse_input.rs index b71f9f1..73c7d9b 100644 --- a/libconlang/examples/parse_input.rs +++ b/libconlang/examples/parse_input.rs @@ -41,25 +41,11 @@ fn take_stdin() -> Result<(), Box> { } fn parse(file: &str, path: Option<&Path>) { + use conlang::parser::error::Error; match Parser::from(Lexer::new(file)).parse() { Ok(ast) => ast.print(), - Err(e) => { - println!("{e:?}"); - if let Some(t) = e.start() { - print_token(path, file, t) - } - } + Err(e) if e.start().is_some() => println!("{:?}:{}", path.unwrap_or(Path::new("-")), e), + Err(e) => println!("{e}"), } println!(); } - -fn print_token(path: Option<&Path>, file: &str, t: conlang::token::Token) { - let path = path.unwrap_or(Path::new("")); - println!( - "{path:?}:{:02}:{:02}: {} ({})", - t.line(), - t.col(), - &file[t.range()], - t.ty(), - ) -} diff --git a/libconlang/src/ast.rs b/libconlang/src/ast.rs index 916ff9e..6a3a032 100644 --- a/libconlang/src/ast.rs +++ b/libconlang/src/ast.rs @@ -97,9 +97,7 @@ mod visitor { } impl + ?Sized, R> Walk for Expr { fn walk(&self, visitor: &mut T) -> R { - match self { - Expr::Ignore(i) => visitor.visit_ignore(i), - } + visitor.visit_ignore(&self.ignore) } } impl + ?Sized, R> Walk for Primary { @@ -157,10 +155,10 @@ mod visitor { } } /// Visit a [Group] expression - fn visit_group(&mut self, expr: &Group) -> R { - match &expr.expr { - Some(expr) => self.visit_expr(expr), - None => self.visit_empty(), + fn visit_group(&mut self, group: &Group) -> R { + match group { + Group::Expr(expr) => self.visit_expr(expr), + Group::Empty => self.visit_empty(), } } @@ -421,8 +419,8 @@ pub mod expression { /// # Syntax /// [`Expr`] := [`math::Ignore`] #[derive(Clone, Debug)] - pub enum Expr { - Ignore(math::Ignore), + pub struct Expr { + pub ignore: math::Ignore, } /// A [Primary] Expression is the expression with the highest precedence (i.e. the deepest @@ -454,8 +452,9 @@ pub mod expression { /// # Syntax /// [`Group`] := `'('` [`Expr`]? `')'` #[derive(Clone, Debug)] - pub struct Group { - pub expr: Option>, + pub enum Group { + Expr(Box), + Empty, } pub mod math { diff --git a/libconlang/src/parser.rs b/libconlang/src/parser.rs index d162230..de75857 100644 --- a/libconlang/src/parser.rs +++ b/libconlang/src/parser.rs @@ -6,10 +6,12 @@ use super::{ lexer::Lexer, token::{Keyword, Token, Type}, }; +use constr::ConstrTools; use error::{Error, Reason::*, *}; -mod error { +pub mod error { use super::{Token, Type}; + use std::fmt::Display; #[derive(Clone, Debug, Default, PartialEq, Eq)] pub enum Reason { @@ -31,6 +33,32 @@ mod error { Unspecified, } use Reason::*; + + impl Display for Reason { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Expected(t) => write!(f, "Expected {t}"), + Self::NotIdentifier => Display::fmt("Not an identifier", f), + Self::NotLiteral => Display::fmt("Not a literal", f), + Self::NotString => Display::fmt("Not a string", f), + Self::NotChar => Display::fmt("Not a char", f), + Self::NotBool => Display::fmt("Not a bool", f), + Self::NotFloat => Display::fmt("Not a float", f), + Self::FloatExponentOverflow => Display::fmt("Float exponent too large", f), + Self::FloatMantissaOverflow => Display::fmt("Float mantissa too large", f), + Self::NotInt => Display::fmt("Not an integer", f), + Self::IntOverflow => Display::fmt("Integer too large", f), + Self::NotControlFlow => Display::fmt("Control flow expression was incomplete", f), + Self::NotBranch => Display::fmt("Branch expression was incomplete", f), + Self::EndOfFile => Display::fmt("Got end of file", f), + Self::Unspecified => Display::fmt( + "Unspecified error. You are permitted to slap the code author.", + f, + ), + } + } + } + /// [Parser] [Result] pub type PResult = Result; #[derive(Clone, Debug, Default, PartialEq, Eq)] @@ -38,6 +66,16 @@ mod error { reason: Reason, start: Option, } + + impl Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if let Some(token) = self.start { + write!(f, "{}:{}: ", token.line(), token.col())?; + } + write!(f, "{}", self.reason) + } + } + macro error_impl($($fn:ident$(($($p:ident: $t:ty),*))?: $reason:expr),*$(,)?) {$( /// Creates an [Error] with this [Reason]: #[doc = concat!("[`", stringify!($reason), "`]")] @@ -102,9 +140,14 @@ impl<'t> Parser<'t> { pub fn new(tokens: Vec, text: &'t str) -> Self { Self { tokens, text, panic_stack: vec![], curr: 0 } } + /// Parse the [start of an AST](Start) + pub fn parse(&mut self) -> PResult { + self.consume_comments(); + Ok(Start(self.expr()?)) + } /// Consumes any consecutive comments fn consume_comments(&mut self) -> &mut Self { - while let Some(Type::Comment) = self.peek().map(|t| t.ty()) { + while let Ok(Type::Comment) = self.peek().map(|t| t.ty()) { self.curr += 1; } self @@ -117,8 +160,8 @@ impl<'t> Parser<'t> { self } /// Peek at the current token - pub fn peek(&self) -> Option<&Token> { - self.tokens.get(self.curr) + pub fn peek(&self) -> PResult<&Token> { + self.tokens.get(self.curr).ok_or(Error::end_of_file()) } /// Records the current position on the panic stack pub fn mark(&mut self) -> &mut Self { @@ -138,11 +181,6 @@ impl<'t> Parser<'t> { } out } - /// Parse the [start of an AST](Start) - pub fn parse(&mut self) -> PResult { - self.consume_comments(); - Ok(Start(self.expr()?)) - } } /// Helpers impl<'t> Parser<'t> { @@ -187,23 +225,19 @@ macro ptodo($self:expr $(, $t:expr)*) { $($t;)* Err(ptodo_err!($self)) } -fn check_eof(t: Option<&Token>) -> PResult<&Token> { - t.ok_or(Error::end_of_file()) -} /// # Terminals and Pseudo-Terminals impl<'t> Parser<'t> { pub fn identifier(&mut self) -> PResult { - let range = self + let token = *self .matches(Type::Identifier) - .map_err(|e| Error::not_identifier().maybe_token(e.start()))? - .range(); - Ok(Identifier(self.consume().text[range].into())) + .map_err(|e| Error::not_identifier().maybe_token(e.start()))?; + Ok(Identifier(self.consume().text[&token].into())) } pub fn literal(&mut self) -> PResult { use literal::Literal::*; use Keyword::{False, True}; - let tok = check_eof(self.peek())?; + let tok = self.peek()?; match tok.ty() { Type::Float => self.float().map(Float), Type::Integer => self.int::<10>().map(Int), @@ -216,28 +250,29 @@ impl<'t> Parser<'t> { pub fn float(&mut self) -> PResult { ptodo!(self) } - pub fn int(&mut self) -> PResult { - #[cfg(debug_assertions)] - eprintln!("/* TODO: parse integer literals from other bases */"); + pub fn int(&mut self) -> PResult { let token = *self.matches(Type::Integer)?; - self.consume().text[token.range()] - .parse() + u128::from_str_radix(&self.consume().text[&token], BASE) .map_err(|_| Error::not_int().token(token)) } pub fn string(&mut self) -> PResult { - let range = self.matches(Type::String)?.range(); - Ok(self.consume().text[range].into()) + let range = self + .matches(Type::String) + .map_err(|e| e.reason(NotString))? + .range(); + Ok(self.consume().text[range].chars().unescape().collect()) } pub fn char(&mut self) -> PResult { let token = *self.matches(Type::Character)?; self.consume().text[&token] .chars() + .unescape() .next() .ok_or(Error::not_char().token(token)) } pub fn bool(&mut self) -> PResult { use Keyword::{False, True}; - let token = check_eof(self.peek())?; + let token = self.peek()?; let out = match token.ty() { Type::Keyword(False) => false, Type::Keyword(True) => true, @@ -251,10 +286,10 @@ impl<'t> Parser<'t> { impl<'t> Parser<'t> { pub fn expr(&mut self) -> PResult { use expression::Expr; - self.ignore().map(Expr::Ignore) + Ok(Expr { ignore: self.ignore()? }) } pub fn if_not_expr(&mut self, matches: Type) -> PResult> { - if check_eof(self.peek())?.ty() == matches { + if self.peek()?.ty() == matches { Ok(None) } else { Some(self.expr()).transpose() @@ -265,14 +300,15 @@ impl<'t> Parser<'t> { .map(|e| expression::Block { expr: e.map(Box::new) }) } pub fn group(&mut self) -> PResult { - let t = check_eof(self.consume_type(Type::LParen)?.peek())?; + use expression::Group; + let t = self.consume_type(Type::LParen)?.peek()?; match t.ty() { Type::RParen => { self.consume(); - Ok(expression::Group { expr: None }) + Ok(Group::Empty) } _ => { - let out = self.expr().map(|expr| expression::Group {expr: Some(expr.into())}); + let out = self.expr().map(|expr| Group::Expr(expr.into())); self.consume_type(Type::RParen)?; out } @@ -335,7 +371,7 @@ impl<'t> Parser<'t> { } macro operator_impl($($(#[$m:meta])*$f:ident: $Ret:ty),*$(,)*) {$( $(#[$m])* pub fn $f(&mut self) -> Option<$Ret> { - let out: Option<$Ret> = self.peek()?.ty().into(); + let out: Option<$Ret> = self.peek().ok()?.ty().into(); if out.is_some() { self.consume(); } out } @@ -359,7 +395,7 @@ impl<'t> Parser<'t> { pub fn flow(&mut self) -> PResult { use control::Flow; use Keyword::{Break, Continue, For, If, Return, While}; - let token = check_eof(self.peek())?; + let token = self.peek()?; match token.ty() { Type::Keyword(While) => self.parse_while().map(Flow::While), Type::Keyword(For) => self.parse_for().map(Flow::For), diff --git a/libconlang/src/pretty_printer.rs b/libconlang/src/pretty_printer.rs index c603a95..be4510d 100644 --- a/libconlang/src/pretty_printer.rs +++ b/libconlang/src/pretty_printer.rs @@ -233,10 +233,7 @@ impl Visitor> for Printer { fn visit_group(&mut self, expr: &expression::Group) -> IOResult<()> { self.put('(')?.space()?; - match &expr.expr { - Some(expr) => expr.walk(self), - None => ().walk(self), - }?; + expr.walk(self)?; self.space()?.put(')').map(drop) } }