commit d1d8c45bdbe6a372b04d82b0b46ae5ec7b9190c9 Author: John Date: Sat Apr 6 05:22:21 2024 -0500 Implement a dead simple pseudo-EBNF to Pest translator, which works on Conlang's EBNF diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..1c90227 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,16 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "grammatical" +version = "0.1.0" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..90a72de --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "grammatical" +version = "0.1.0" +edition = "2021" + +[dependencies] +unicode-ident = "1.0.12" diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..d31c42b --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,252 @@ +use std::{iter::Peekable, str::CharIndices}; + +use unicode_ident::*; + +/// A single parsed grammar rule: optional leading comment, name, and body. +/// +/// Rule = ident '=' Either? ';' ; +#[derive(Debug, Default)] +pub struct Rule<'a> { + /// Comment text that `Parser::rule` collected before the rule name, if any. + pub comment: Option<&'a str>, + /// Identifier on the left-hand side of '='. + pub name: &'a str, + /// Right-hand side of the rule. + pub body: RuleKind<'a>, +} + +/// Node of a rule body; each variant is annotated with the EBNF production it represents. +#[derive(Debug, Default)] +pub enum RuleKind<'a> { + /// Either = Follow ('|' Follow)* ; + Either(Vec>), + /// Follow = (Any | Many | Maybe | Not)+ ; + Follow(Vec>), + /// Any = Not '*' ; + Any(Box>), + /// Many = Not '+' ; + Many(Box>), + /// Maybe = Not '?' ; + Maybe(Box>), + /// Not = '!'? 
Prime ; + Not(Box>), + /// Group = '(' Either ')' ; + Group(Box>), + /// ident = XID_START XID_CONTINUE* ; + Ident(&'a str), + /// chr = "'" (!"'" ANY)* "'" ; + Chr(&'a str), + /// str = '"' (!'"' ANY)* '"' ; + Str(&'a str), + /// Nothing + #[default] + Empty, +} + +/// Renders a rule body: alternatives joined by " | ", sequences by " ~ ", quoted text via `{:?}`, and `Empty` as "(!ANY)?". +impl std::fmt::Display for RuleKind<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + RuleKind::Either(r) => r.iter().enumerate().try_for_each(|(i, r)| { + if i != 0 { + write!(f, " | ")?; + } + write!(f, "{r}") + }), + RuleKind::Follow(r) => r.iter().enumerate().try_for_each(|(i, r)| { + if i != 0 { + write!(f, " ~ ")?; + } + write!(f, "{r}") + }), + RuleKind::Any(r) => write!(f, "{r}*"), + RuleKind::Many(r) => write!(f, "{r}+"), + RuleKind::Maybe(r) => write!(f, "{r}?"), + RuleKind::Not(r) => write!(f, "!{r}"), + RuleKind::Group(r) => write!(f, "({r})"), + RuleKind::Ident(r) => write!(f, "{r}"), + RuleKind::Chr(r) => write!(f, "{r:?}"), + RuleKind::Str(r) => write!(f, "{r:?}"), + RuleKind::Empty => write!(f, "(!ANY)?"), + } + } +} +/// Renders the rule as `name = { body }`; the stored comment is not written. +impl std::fmt::Display for Rule<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let Self { + comment: _, + name, + body, + } = self; + // if let Some(comment) = comment { + // writeln!(f, "/* {comment} */")?; + // } + write!(f, "{name} = {{ {body} }}") + } +} + +/// Character-level cursor over the grammar text; the parse methods slice `head..tail` out of it. +pub struct Parser<'a> { + /// Full input being parsed. + text: &'a str, + /// Iterator over `(byte index, char)` pairs of `text`. + chars: Peekable>, + /// Byte offset where the current fragment starts (set by `start`). + head: usize, + /// Byte offset where the current fragment ends (set by `take`). + /// NOTE(review): `take` stores `i + 1`, which is past the consumed character only when that character is one byte long; a multi-byte character would leave `tail` inside it, so `fragment` could slice off a char boundary -- confirm, and consider `i + c.len_utf8()`. + tail: usize, +} + +impl<'a> Parser<'a> { + /// Creates a parser positioned at the start of `text`. + pub fn new(text: &'a str) -> Self { + Self { + text, + chars: text.char_indices().peekable(), + head: 0, + tail: 0, + } + } + /// Skips whitespace, then marks the current position as the start of a new fragment. + pub fn start(&mut self) -> &mut Self { + self.space(); + self.head = self.tail; + self + } + /// Returns the slice of the input between `head` and `tail`. + pub fn fragment(&self) -> &'a str { + let &Self { + text, head, tail, .. 
+ } = self; + &text[head..tail] + } + pub fn peek(&mut self) -> Option { + self.chars.peek().map(|(_, c)| *c) + } + pub fn take(&mut self) -> Option<(usize, char)> { + let out = self.chars.next(); + self.tail = match out { + Some((i, _)) => i + 1, + None => self.text.len(), + }; + out + } + pub fn take_one(&mut self, f: fn(char) -> bool) -> Option<&mut Self> { + self.chars.peek().filter(|(_, c)| f(*c)).is_some().then(|| { + self.take(); + self + }) + } + pub fn take_many(&mut self, f: fn(char) -> bool) -> &mut Self { + while self.take_one(f).is_some() {} + self + } + pub fn space(&mut self) -> &mut Self { + self.take_many(char::is_whitespace) + } +} + +impl<'a> Parser<'a> { + pub fn rule(&mut self) -> Option> { + let out = Rule { + comment: self.comment(), + name: self.ident()?, + body: { + self.space() + .take_one(|c| '=' == c)? + .either() + .unwrap_or_default() + }, + }; + if self.space().take_one(|c| ';' == c).is_none() { + panic!("Rule should end in ';': {}..{}", self.head, self.tail) + } + Some(out) + } + pub fn either(&mut self) -> Option> { + let mut out = vec![self.follow()?]; + while self.space().take_one(|c| '|' == c).is_some() { + out.push(self.follow()?) + } + match out.len() { + 1 => out.pop(), + _ => Some(RuleKind::Either(out)), + } + } + pub fn follow(&mut self) -> Option> { + let mut out = vec![]; + while let Some(rule) = self.repeat() { + out.push(rule) + } + match out.len() { + 1 => out.pop(), + _ => Some(RuleKind::Follow(out)), + } + } + pub fn repeat(&mut self) -> Option> { + let out = self.not()?; + let out = match self.space().peek() { + Some('*') => RuleKind::Any(out.into()), + Some('+') => RuleKind::Many(out.into()), + Some('?') => RuleKind::Maybe(out.into()), + _ => return Some(out), + }; + self.take(); + Some(out) + } + pub fn not(&mut self) -> Option> { + match self.space().take_one(|c| '!' 
== c) { + Some(_) => Some(RuleKind::Not(self.prime()?.into())), + _ => self.prime(), + } + } + pub fn prime(&mut self) -> Option> { + Some(match self.space().peek()? { + '(' => return self.group(), + '"' => RuleKind::Str(self.str()?), + '\'' => RuleKind::Chr(self.chr()?), + _ => RuleKind::Ident(self.ident()?), + }) + } + pub fn group(&mut self) -> Option> { + self.take_one(|c| '(' == c)?; + let out = self.either()?; + if self.take_one(|c| ')' == c).is_none() { + panic!("Groups should have terminating ')': {}", self.tail) + } + Some(RuleKind::Group(out.into())) + } + pub fn ident(&mut self) -> Option<&'a str> { + self.start().take_one(is_xid_start)?; + self.take_many(is_xid_continue); + Some(self.fragment()) + } + pub fn chr(&mut self) -> Option<&'a str> { + self.space().take_one(|c| '\'' == c)?; + self.start().take_many(|c| '\'' != c); + let out = self.fragment(); + if self.take_one(|c| '\'' == c).is_none() { + panic!("chr should have terminating '\'': {}", self.tail) + } + Some(out) + } + pub fn str(&mut self) -> Option<&'a str> { + self.space().take_one(|c| '\"' == c)?; + self.start().take_many(|c| '\"' != c); + let out = self.fragment(); + if self.take_one(|c| '\"' == c).is_none() { + panic!("str should have terminating '\"': {}", self.tail) + } + Some(out) + } + + pub fn comment(&mut self) -> Option<&'a str> { + let start = self.tail; + while self.space().take_one(|c| '(' == c).is_some() { + self.take_one(|c| '*' == c)?; + while let Some(c) = self.peek() { + match c { + '*' => { + self.take_one(|c| '*' == c)?; + if self.take_one(|c| ')' == c).is_some() { + break; + } + } + '(' => { + self.comment(); + } + _ => { + self.take(); + } + } + } + } + let out = &self.text[start..self.tail]; + (out.len() > 1).then_some(out) + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..3609a92 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,22 @@ +use std::error::Error; + +use grammatical::*; + +fn main() -> Result<(), Box> { + for file in 
std::env::args().skip(1) { + let file = std::fs::read_to_string(file)?; + let mut p = Parser::new(&file); + while let Some(rule) = p.rule() { + println!("{rule}"); + } + } + + for line in std::io::stdin().lines() { + let line = line?; + let mut p = Parser::new(&line); + while let Some(rule) = p.rule() { + println!("{} = {{ {} }}", rule.name, rule.body); + } + } + Ok(()) +} diff --git a/test.grammatical b/test.grammatical new file mode 100644 index 0000000..206ac69 --- /dev/null +++ b/test.grammatical @@ -0,0 +1,18 @@ +(* Grammatical EBNF *) +Ruleset = (COMMENT? Rule)* EOI ; +Rule = ident '=' Either? ';' ; +Either = Follow ('|' Follow)* ; +Follow = (Any | Many | Maybe | Not)+ ; +Any = Not '*' ; +Many = Not '+' ; +Maybe = Not '?' ; +Not = '!'? Prime ; +Prime = Group | chr | str | ident ; +Group = '(' Either ')' ; + +ident = XID_START XID_CONTINUE* ; +str = '"' (!'"' ANY)* '"' ; +chr = "'" (!"'" ANY)* "'" ; + +WHITESPACE = WHITE_SPACE ; +COMMENT = "(*" (COMMENT | !"*)" ANY)* "*)" ;