lexer: Move module into file
This commit is contained in:
		
							
								
								
									
										540
									
								
								libconlang/src/lexer.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										540
									
								
								libconlang/src/lexer.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,540 @@ | ||||
| //! Converts a text file into tokens | ||||
| use crate::token::{Token, Type}; | ||||
| use lerox::Combinator; | ||||
|  | ||||
| pub struct IntoIter<'t> { | ||||
|     lexer: Lexer<'t>, | ||||
| } | ||||
| impl<'t> Iterator for IntoIter<'t> { | ||||
|     type Item = Token; | ||||
|     fn next(&mut self) -> Option<Self::Item> { | ||||
|         self.lexer.any() | ||||
|     } | ||||
| } | ||||
| impl<'t> IntoIterator for Lexer<'t> { | ||||
|     type Item = Token; | ||||
|     type IntoIter = IntoIter<'t>; | ||||
|     fn into_iter(self) -> Self::IntoIter { | ||||
|         IntoIter { lexer: self } | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Clone, Debug)] | ||||
| pub struct Lexer<'t> { | ||||
|     text: &'t str, | ||||
|     cursor: usize, | ||||
|     line: usize, | ||||
|     col: usize, | ||||
| } | ||||
| /// Implements the non-terminals of a language | ||||
| impl<'t> Lexer<'t> { | ||||
|     pub fn new(text: &'t str) -> Self { | ||||
|         Self { text, cursor: 0, line: 1, col: 1 } | ||||
|     } | ||||
|     /// Consumes the entire [`Lexer`], producing a [`Vec<Token>`] | ||||
|     /// and returning the original string | ||||
|     pub fn consume(self) -> (Vec<Token>, &'t str) { | ||||
|         let text = self.text; | ||||
|         (self.into_iter().collect(), text) | ||||
|     } | ||||
|     /// Counts some length | ||||
|     #[inline] | ||||
|     fn count_len(&mut self, len: usize) -> &mut Self { | ||||
|         self.cursor += len; | ||||
|         self.col += len; | ||||
|         self | ||||
|     } | ||||
|     /// Counts a line | ||||
|     #[inline] | ||||
|     fn count_line(&mut self, lines: usize) -> &mut Self { | ||||
|         self.line += lines; | ||||
|         self.col = 1; | ||||
|         self | ||||
|     } | ||||
|     /// Skips whitespace in the text | ||||
|     fn skip_whitespace(&mut self) { | ||||
|         self.count_len( | ||||
|             Rule::new(self.text()) | ||||
|                 .and_any(Rule::whitespace_not_newline) | ||||
|                 .end() | ||||
|                 .unwrap_or_default(), | ||||
|         ); | ||||
|         if Rule::new(self.text()).char('\n').end().is_some() { | ||||
|             // recurse until all newlines are skipped | ||||
|             self.count_len(1).count_line(1).skip_whitespace(); | ||||
|         } | ||||
|     } | ||||
|     /// Advances the cursor and produces a token from a provided [Rule] function | ||||
|     fn map_rule<F>(&mut self, rule: F, ty: Type) -> Option<Token> | ||||
|     where F: Fn(Rule) -> Rule { | ||||
|         self.skip_whitespace(); | ||||
|         let (line, col, start) = (self.line, self.col, self.cursor); | ||||
|         self.count_len(Rule::new(self.text()).and(rule).end()?); | ||||
|         Some(Token::new(ty, start, self.cursor, line, col)) | ||||
|     } | ||||
|     /// Gets a slice of text beginning at the cursor | ||||
|     fn text(&self) -> &str { | ||||
|         &self.text[self.cursor..] | ||||
|     } | ||||
|     // classifies a single arbitrary token | ||||
|     /// Returns the result of the rule with the highest precedence, if any matches | ||||
|     pub fn any(&mut self) -> Option<Token> { | ||||
|         None.or_else(|| self.comment()) | ||||
|             .or_else(|| self.identifier()) | ||||
|             .or_else(|| self.literal()) | ||||
|             .or_else(|| self.delimiter()) | ||||
|             .or_else(|| self.punctuation()) | ||||
|             .or_else(|| self.invalid()) | ||||
|     } | ||||
|     /// Attempts to produce a [Type::String], [Type::Float], or [Type::Integer] | ||||
|     pub fn literal(&mut self) -> Option<Token> { | ||||
|         None.or_else(|| self.string()) | ||||
|             .or_else(|| self.character()) | ||||
|             .or_else(|| self.float()) | ||||
|             .or_else(|| self.integer()) | ||||
|     } | ||||
|     /// Evaluates delimiter rules | ||||
|     pub fn delimiter(&mut self) -> Option<Token> { | ||||
|         None.or_else(|| self.l_brack()) | ||||
|             .or_else(|| self.r_brack()) | ||||
|             .or_else(|| self.l_curly()) | ||||
|             .or_else(|| self.r_curly()) | ||||
|             .or_else(|| self.l_paren()) | ||||
|             .or_else(|| self.r_paren()) | ||||
|     } | ||||
|     /// Evaluates punctuation rules | ||||
|     pub fn punctuation(&mut self) -> Option<Token> { | ||||
|         None.or_else(|| self.amp_amp()) | ||||
|             .or_else(|| self.bar_bar()) | ||||
|             .or_else(|| self.not_not()) | ||||
|             .or_else(|| self.cat_ear()) | ||||
|             .or_else(|| self.eq_eq()) | ||||
|             .or_else(|| self.gt_eq()) | ||||
|             .or_else(|| self.lt_eq()) | ||||
|             .or_else(|| self.not_eq()) | ||||
|             .or_else(|| self.lsh_eq()) | ||||
|             .or_else(|| self.rsh_eq()) | ||||
|             .or_else(|| self.star_eq()) | ||||
|             .or_else(|| self.div_eq()) | ||||
|             .or_else(|| self.rem_eq()) | ||||
|             .or_else(|| self.add_eq()) | ||||
|             .or_else(|| self.sub_eq()) | ||||
|             .or_else(|| self.and_eq()) | ||||
|             .or_else(|| self.or_eq()) | ||||
|             .or_else(|| self.xor_eq()) | ||||
|             .or_else(|| self.lsh()) | ||||
|             .or_else(|| self.rsh()) | ||||
|             .or_else(|| self.arrow()) | ||||
|             .or_else(|| self.fatarrow()) | ||||
|             .or_else(|| self.semi()) | ||||
|             .or_else(|| self.dot()) | ||||
|             .or_else(|| self.star()) | ||||
|             .or_else(|| self.div()) | ||||
|             .or_else(|| self.plus()) | ||||
|             .or_else(|| self.sub()) | ||||
|             .or_else(|| self.rem()) | ||||
|             .or_else(|| self.bang()) | ||||
|             .or_else(|| self.eq()) | ||||
|             .or_else(|| self.lt()) | ||||
|             .or_else(|| self.gt()) | ||||
|             .or_else(|| self.amp()) | ||||
|             .or_else(|| self.bar()) | ||||
|             .or_else(|| self.xor()) | ||||
|             .or_else(|| self.hash()) | ||||
|             .or_else(|| self.at()) | ||||
|             .or_else(|| self.colon()) | ||||
|             .or_else(|| self.backslash()) | ||||
|             .or_else(|| self.question()) | ||||
|             .or_else(|| self.comma()) | ||||
|             .or_else(|| self.tilde()) | ||||
|             .or_else(|| self.grave()) | ||||
|     } | ||||
|     pub fn unary_op(&mut self) -> Option<Token> { | ||||
|         self.bang().or_else(|| self.sub()) | ||||
|     } | ||||
|     // functions for lexing individual tokens | ||||
|     pub fn invalid(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.invalid(), Type::Invalid) | ||||
|     } | ||||
|     // comments | ||||
|     pub fn comment(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.comment(), Type::Comment) | ||||
|     } | ||||
|     // identifiers | ||||
|     pub fn identifier(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.identifier(), Type::Identifier) | ||||
|             .map(|token| match self.text[token.range()].parse() { | ||||
|                 Ok(kw) => token.cast(Type::Keyword(kw)), | ||||
|                 Err(_) => token, | ||||
|             }) | ||||
|     } | ||||
|     // literals | ||||
|     pub fn integer(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.integer(), Type::Integer) | ||||
|     } | ||||
|     pub fn float(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.float(), Type::Float) | ||||
|     } | ||||
|     pub fn string(&mut self) -> Option<Token> { | ||||
|         // TODO: count lines and columns properly within string | ||||
|         self.map_rule(|r| r.string(), Type::String) | ||||
|             .map(|t| t.rebound(t.head + 1, t.tail - 1)) | ||||
|     } | ||||
|     pub fn character(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.character(), Type::Character) | ||||
|             .map(|t| t.rebound(t.head + 1, t.tail - 1)) | ||||
|     } | ||||
|     // delimiters | ||||
|     pub fn l_brack(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('['), Type::LBrack) | ||||
|     } | ||||
|     pub fn r_brack(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char(']'), Type::RBrack) | ||||
|     } | ||||
|     pub fn l_curly(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('{'), Type::LCurly) | ||||
|     } | ||||
|     pub fn r_curly(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('}'), Type::RCurly) | ||||
|     } | ||||
|     pub fn l_paren(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('('), Type::LParen) | ||||
|     } | ||||
|     pub fn r_paren(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char(')'), Type::RParen) | ||||
|     } | ||||
|     // compound punctuation | ||||
|     pub fn lsh(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.str("<<"), Type::Lsh) | ||||
|     } | ||||
|     pub fn rsh(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.str(">>"), Type::Rsh) | ||||
|     } | ||||
|     pub fn amp_amp(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.str("&&"), Type::AmpAmp) | ||||
|     } | ||||
|     pub fn bar_bar(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.str("||"), Type::BarBar) | ||||
|     } | ||||
|     pub fn not_not(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.str("!!"), Type::NotNot) | ||||
|     } | ||||
|     pub fn cat_ear(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.str("^^"), Type::CatEar) | ||||
|     } | ||||
|     pub fn eq_eq(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.str("=="), Type::EqEq) | ||||
|     } | ||||
|     pub fn gt_eq(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.str(">="), Type::GtEq) | ||||
|     } | ||||
|     pub fn lt_eq(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.str("<="), Type::LtEq) | ||||
|     } | ||||
|     pub fn not_eq(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.str("!="), Type::NotEq) | ||||
|     } | ||||
|     pub fn star_eq(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.str("*="), Type::StarEq) | ||||
|     } | ||||
|     pub fn div_eq(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.str("/="), Type::DivEq) | ||||
|     } | ||||
|     pub fn rem_eq(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.str("%="), Type::RemEq) | ||||
|     } | ||||
|     pub fn add_eq(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.str("+="), Type::AddEq) | ||||
|     } | ||||
|     pub fn sub_eq(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.str("-="), Type::SubEq) | ||||
|     } | ||||
|     pub fn and_eq(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.str("&="), Type::AndEq) | ||||
|     } | ||||
|     pub fn or_eq(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.str("|="), Type::OrEq) | ||||
|     } | ||||
|     pub fn xor_eq(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.str("^="), Type::XorEq) | ||||
|     } | ||||
|     pub fn lsh_eq(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.str("<<="), Type::LshEq) | ||||
|     } | ||||
|     pub fn rsh_eq(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.str(">>="), Type::RshEq) | ||||
|     } | ||||
|     pub fn arrow(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.str("->"), Type::Arrow) | ||||
|     } | ||||
|     pub fn fatarrow(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.str("=>"), Type::FatArrow) | ||||
|     } | ||||
|     // simple punctuation | ||||
|     pub fn semi(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char(';'), Type::Semi) | ||||
|     } | ||||
|     pub fn dot(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('.'), Type::Dot) | ||||
|     } | ||||
|     pub fn star(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('*'), Type::Star) | ||||
|     } | ||||
|     pub fn div(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('/'), Type::Div) | ||||
|     } | ||||
|     pub fn plus(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('+'), Type::Plus) | ||||
|     } | ||||
|     pub fn sub(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('-'), Type::Minus) | ||||
|     } | ||||
|     pub fn rem(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('%'), Type::Rem) | ||||
|     } | ||||
|     pub fn bang(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('!'), Type::Bang) | ||||
|     } | ||||
|     pub fn eq(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('='), Type::Eq) | ||||
|     } | ||||
|     pub fn lt(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('<'), Type::Lt) | ||||
|     } | ||||
|     pub fn gt(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('>'), Type::Gt) | ||||
|     } | ||||
|     pub fn amp(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('&'), Type::Amp) | ||||
|     } | ||||
|     pub fn bar(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('|'), Type::Bar) | ||||
|     } | ||||
|     pub fn xor(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('^'), Type::Xor) | ||||
|     } | ||||
|     pub fn hash(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('#'), Type::Hash) | ||||
|     } | ||||
|     pub fn at(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('@'), Type::At) | ||||
|     } | ||||
|     pub fn colon(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char(':'), Type::Colon) | ||||
|     } | ||||
|     pub fn question(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('?'), Type::Question) | ||||
|     } | ||||
|     pub fn comma(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char(','), Type::Comma) | ||||
|     } | ||||
|     pub fn tilde(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('~'), Type::Tilde) | ||||
|     } | ||||
|     pub fn grave(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('`'), Type::Grave) | ||||
|     } | ||||
|     pub fn backslash(&mut self) -> Option<Token> { | ||||
|         self.map_rule(|r| r.char('\\'), Type::Backslash) | ||||
|     } | ||||
| } | ||||
|  | ||||
// TODO: use real, functional parser-combinators here to produce tokens
/// A lexer [Rule] matches patterns in text in a declarative manner
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Rule<'t> {
    /// The text which has not been matched yet
    text: &'t str,
    /// Number of bytes matched so far
    taken: usize,
    /// Cleared as soon as a match attempt fails
    is_alright: bool,
}
impl<'t> Rule<'t> {
    /// Starts a rule at the beginning of `text`, with nothing matched and no failure recorded
    pub fn new(text: &'t str) -> Self {
        Self { is_alright: true, taken: 0, text }
    }
    /// Finishes the rule, giving the number of bytes matched, or `None` if the rule failed
    pub fn end(self) -> Option<usize> {
        if self.is_alright {
            Some(self.taken)
        } else {
            None
        }
    }
    /// Gets the text which the rule has not matched yet
    pub fn remaining(&self) -> &str {
        let Self { text, .. } = self;
        text
    }
}
|  | ||||
| impl<'t> Rule<'t> { | ||||
|     /// Matches any sequence of non-whitespace characters | ||||
|     pub fn invalid(self) -> Self { | ||||
|         self.and_many(Self::not_whitespace) | ||||
|     } | ||||
|     /// Matches a block, line, or shebang comment | ||||
|     pub fn comment(self) -> Self { | ||||
|         self.and_either(Self::line_comment, Self::block_comment) | ||||
|     } | ||||
|     /// Matches a line or shebang comment | ||||
|     fn line_comment(self) -> Self { | ||||
|         // line_comment := ("//" | "#!/") (!newline)* | ||||
|         self.str("//") | ||||
|             .or(|r| r.str("#!/")) | ||||
|             .and_any(|r| r.not_char('\n')) | ||||
|     } | ||||
|     /// Matches a block comment | ||||
|     fn block_comment(self) -> Self { | ||||
|         // block_comment := "/*" (block_comment | all_but("*/"))* "*/" | ||||
|         self.str("/*") | ||||
|             .and_any(|r| r.and_either(|f| f.block_comment(), |g| g.not_str("*/"))) | ||||
|             .str("*/") | ||||
|     } | ||||
|     /// Matches a Rust-style identifier | ||||
|     pub fn identifier(self) -> Self { | ||||
|         // identifier := ('_' | XID_START) ~ XID_CONTINUE* | ||||
|         self.char('_') | ||||
|             .or(Rule::xid_start) | ||||
|             .and_any(Rule::xid_continue) | ||||
|     } | ||||
|     /// Matches a Rust-style base-prefixed int literal | ||||
|     fn integer_kind(self, prefix: &str, digit: impl Fn(Self) -> Self) -> Self { | ||||
|         // int_kind<Prefix, Digit> := Prefix '_'* Digit (Digit | '_')* | ||||
|         self.str(prefix) | ||||
|             .and_any(|r| r.char('_')) | ||||
|             .and(&digit) | ||||
|             .and_any(|r| r.and(&digit).or(|r| r.char('_'))) | ||||
|     } | ||||
|     /// Matches a Rust-style integer literal | ||||
|     pub fn integer(self) -> Self { | ||||
|         // integer = (int_kind<0d, dec_digit> | int_kind<0x, hex_digit> | ||||
|         //           | int_kind<0o, oct_digit> | int_kind<0b, bin_digit> | dec_digit (dec_digit | '_')*) | ||||
|         self.and_one_of(&[ | ||||
|             &|rule| rule.integer_kind("0d", Rule::dec_digit), | ||||
|             &|rule| rule.integer_kind("0x", Rule::hex_digit), | ||||
|             &|rule| rule.integer_kind("0o", Rule::oct_digit), | ||||
|             &|rule| rule.integer_kind("0b", Rule::bin_digit), | ||||
|             &|rule| { | ||||
|                 rule.dec_digit() | ||||
|                     .and_any(|r| r.dec_digit().or(|r| r.char('_'))) | ||||
|             }, | ||||
|         ]) | ||||
|     } | ||||
|     /// Matches a float literal | ||||
|     // TODO: exponent form | ||||
|     pub fn float(self) -> Self { | ||||
|         self.and_any(Rule::dec_digit) | ||||
|             .char('.') | ||||
|             .and_many(Rule::dec_digit) | ||||
|     } | ||||
|     /// Matches one apostrophe-delimited char literal | ||||
|     pub fn character(self) -> Self { | ||||
|         self.char('\'').character_continue().char('\'') | ||||
|     } | ||||
|     pub fn character_continue(self) -> Self { | ||||
|         self.and(|rule| rule.string_escape().or(|rule| rule.not_char('\''))) | ||||
|     } | ||||
|     /// Matches one quote-delimited string literal | ||||
|     pub fn string(self) -> Self { | ||||
|         self.char('"').and_any(Rule::string_continue).char('"') | ||||
|     } | ||||
|     /// Matches one string escape sequence or non-`"` characcter | ||||
|     pub fn string_continue(self) -> Self { | ||||
|         self.and(Rule::string_escape).or(|rule| rule.not_char('"')) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'t> Rule<'t> { | ||||
|     /// Matches a char lexicographically between start and end | ||||
|     pub fn char_between(self, start: char, end: char) -> Self { | ||||
|         self.char_fn(|c| start <= c && c <= end) | ||||
|     } | ||||
|     /// Matches a single char | ||||
|     pub fn char(self, c: char) -> Self { | ||||
|         self.has(|rule| rule.text.starts_with(c), 1) | ||||
|     } | ||||
|     /// Matches the entirety of a string slice | ||||
|     pub fn str(self, s: &str) -> Self { | ||||
|         self.has(|rule| rule.text.starts_with(s), s.len()) | ||||
|     } | ||||
|     /// Matches a char based on the output of a function | ||||
|     pub fn char_fn(self, f: impl Fn(char) -> bool) -> Self { | ||||
|         self.and(|rule| match rule.text.strip_prefix(&f) { | ||||
|             Some(text) => Self { text, taken: rule.taken + next_utf8(rule.text, 1), ..rule }, | ||||
|             None => Self { is_alright: false, ..rule }, | ||||
|         }) | ||||
|     } | ||||
|     /// Matches a single char except c | ||||
|     pub fn not_char(self, c: char) -> Self { | ||||
|         self.has(|rule| !rule.text.starts_with(c), 1) | ||||
|     } | ||||
|     /// Matches a single char unless the text starts with s | ||||
|     pub fn not_str(self, s: &str) -> Self { | ||||
|         self.has(|rule| !rule.text.starts_with(s), 1) | ||||
|     } | ||||
|     // commonly used character classes | ||||
|     /// Matches one of any character | ||||
|     pub fn any(self) -> Self { | ||||
|         self.has(|_| true, 1) | ||||
|     } | ||||
|     /// Matches one whitespace | ||||
|     pub fn whitespace(self) -> Self { | ||||
|         self.char_fn(|c| c.is_whitespace()) | ||||
|     } | ||||
|     /// Matches one whitespace, except `'\n'` | ||||
|     pub fn whitespace_not_newline(self) -> Self { | ||||
|         self.char_fn(|c| '\n' != c && c.is_whitespace()) | ||||
|     } | ||||
|     /// Matches anything but whitespace | ||||
|     pub fn not_whitespace(self) -> Self { | ||||
|         self.char_fn(|c| !c.is_whitespace()) | ||||
|     } | ||||
|     /// Matches one XID_START | ||||
|     pub fn xid_start(self) -> Self { | ||||
|         use unicode_xid::UnicodeXID; | ||||
|         self.char_fn(UnicodeXID::is_xid_start) | ||||
|     } | ||||
|     /// Matches one XID_CONTINUE | ||||
|     pub fn xid_continue(self) -> Self { | ||||
|         use unicode_xid::UnicodeXID; | ||||
|         self.char_fn(UnicodeXID::is_xid_continue) | ||||
|     } | ||||
|     /// Matches one hexadecimal digit | ||||
|     pub fn hex_digit(self) -> Self { | ||||
|         self.char_fn(|c| c.is_ascii_hexdigit()) | ||||
|     } | ||||
|     /// Matches one decimal digit | ||||
|     pub fn dec_digit(self) -> Self { | ||||
|         self.char_fn(|c| c.is_ascii_digit()) | ||||
|     } | ||||
|     /// Matches one octal digit | ||||
|     pub fn oct_digit(self) -> Self { | ||||
|         self.char_between('0', '7') | ||||
|     } | ||||
|     /// Matches one binary digit | ||||
|     pub fn bin_digit(self) -> Self { | ||||
|         self.char_between('0', '1') | ||||
|     } | ||||
|     /// Matches any string escape "\." | ||||
|     pub fn string_escape(self) -> Self { | ||||
|         self.char('\\').and(Rule::any) | ||||
|     } | ||||
|     /// Performs a consuming condition assertion on the input | ||||
|     fn has(self, condition: impl Fn(&Self) -> bool, len: usize) -> Self { | ||||
|         let len = next_utf8(self.text, len); | ||||
|         self.and(|rule| match condition(&rule) && !rule.text.is_empty() { | ||||
|             true => Self { text: &rule.text[len..], taken: rule.taken + len, ..rule }, | ||||
|             false => Self { is_alright: false, ..rule }, | ||||
|         }) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'t> lerox::Combinator for Rule<'t> { | ||||
|     fn is_alright(&self) -> bool { | ||||
|         self.is_alright | ||||
|     } | ||||
|     fn into_alright(self) -> Self { | ||||
|         Self { is_alright: true, ..self } | ||||
|     } | ||||
| } | ||||
|  | ||||
/// Rounds `index` up to the nearest char boundary in `text`
///
/// An `index` beyond the end of `text` is clamped to `text.len()`.
fn next_utf8(text: &str, index: usize) -> usize {
    let end = text.len();
    // `end` is always a char boundary, so the search cannot come up empty
    (index.min(end)..=end)
        .find(|&at| text.is_char_boundary(at))
        .unwrap_or(end)
}
| @@ -5,548 +5,7 @@ pub mod token; | ||||
|  | ||||
| pub mod ast; | ||||
|  | ||||
| pub mod lexer { | ||||
|     //! Converts a text file into tokens | ||||
|     use crate::token::{Token, Type}; | ||||
|     use lerox::Combinator; | ||||
|  | ||||
|     pub struct IntoIter<'t> { | ||||
|         lexer: Lexer<'t>, | ||||
|     } | ||||
|     impl<'t> Iterator for IntoIter<'t> { | ||||
|         type Item = Token; | ||||
|         fn next(&mut self) -> Option<Self::Item> { | ||||
|             self.lexer.any() | ||||
|         } | ||||
|     } | ||||
|     impl<'t> IntoIterator for Lexer<'t> { | ||||
|         type Item = Token; | ||||
|         type IntoIter = IntoIter<'t>; | ||||
|         fn into_iter(self) -> Self::IntoIter { | ||||
|             IntoIter { lexer: self } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     #[derive(Clone, Debug)] | ||||
|     pub struct Lexer<'t> { | ||||
|         text: &'t str, | ||||
|         cursor: usize, | ||||
|         line: usize, | ||||
|         col: usize, | ||||
|     } | ||||
|     /// Implements the non-terminals of a language | ||||
|     impl<'t> Lexer<'t> { | ||||
|         pub fn new(text: &'t str) -> Self { | ||||
|             Self { text, cursor: 0, line: 1, col: 1 } | ||||
|         } | ||||
|         /// Consumes the entire [`Lexer`], producing a [`Vec<Token>`] | ||||
|         /// and returning the original string | ||||
|         pub fn consume(self) -> (Vec<Token>, &'t str) { | ||||
|             let text = self.text; | ||||
|             (self.into_iter().collect(), text) | ||||
|         } | ||||
|         /// Counts some length | ||||
|         #[inline] | ||||
|         fn count_len(&mut self, len: usize) -> &mut Self { | ||||
|             self.cursor += len; | ||||
|             self.col += len; | ||||
|             self | ||||
|         } | ||||
|         /// Counts a line | ||||
|         #[inline] | ||||
|         fn count_line(&mut self, lines: usize) -> &mut Self { | ||||
|             self.line += lines; | ||||
|             self.col = 1; | ||||
|             self | ||||
|         } | ||||
|         /// Skips whitespace in the text | ||||
|         fn skip_whitespace(&mut self) { | ||||
|             self.count_len( | ||||
|                 Rule::new(self.text()) | ||||
|                     .and_any(Rule::whitespace_not_newline) | ||||
|                     .end() | ||||
|                     .unwrap_or_default(), | ||||
|             ); | ||||
|             if Rule::new(self.text()).char('\n').end().is_some() { | ||||
|                 // recurse until all newlines are skipped | ||||
|                 self.count_len(1).count_line(1).skip_whitespace(); | ||||
|             } | ||||
|         } | ||||
|         /// Advances the cursor and produces a token from a provided [Rule] function | ||||
|         fn map_rule<F>(&mut self, rule: F, ty: Type) -> Option<Token> | ||||
|         where F: Fn(Rule) -> Rule { | ||||
|             self.skip_whitespace(); | ||||
|             let (line, col, start) = (self.line, self.col, self.cursor); | ||||
|             self.count_len(Rule::new(self.text()).and(rule).end()?); | ||||
|             Some(Token::new(ty, start, self.cursor, line, col)) | ||||
|         } | ||||
|         /// Gets a slice of text beginning at the cursor | ||||
|         fn text(&self) -> &str { | ||||
|             &self.text[self.cursor..] | ||||
|         } | ||||
|         // classifies a single arbitrary token | ||||
|         /// Returns the result of the rule with the highest precedence, if any matches | ||||
|         pub fn any(&mut self) -> Option<Token> { | ||||
|             None.or_else(|| self.comment()) | ||||
|                 .or_else(|| self.identifier()) | ||||
|                 .or_else(|| self.literal()) | ||||
|                 .or_else(|| self.delimiter()) | ||||
|                 .or_else(|| self.punctuation()) | ||||
|                 .or_else(|| self.invalid()) | ||||
|         } | ||||
|         /// Attempts to produce a [Type::String], [Type::Float], or [Type::Integer] | ||||
|         pub fn literal(&mut self) -> Option<Token> { | ||||
|             None.or_else(|| self.string()) | ||||
|                 .or_else(|| self.character()) | ||||
|                 .or_else(|| self.float()) | ||||
|                 .or_else(|| self.integer()) | ||||
|         } | ||||
|         /// Evaluates delimiter rules | ||||
|         pub fn delimiter(&mut self) -> Option<Token> { | ||||
|             None.or_else(|| self.l_brack()) | ||||
|                 .or_else(|| self.r_brack()) | ||||
|                 .or_else(|| self.l_curly()) | ||||
|                 .or_else(|| self.r_curly()) | ||||
|                 .or_else(|| self.l_paren()) | ||||
|                 .or_else(|| self.r_paren()) | ||||
|         } | ||||
|         /// Evaluates punctuation rules | ||||
|         pub fn punctuation(&mut self) -> Option<Token> { | ||||
|             None.or_else(|| self.amp_amp()) | ||||
|                 .or_else(|| self.bar_bar()) | ||||
|                 .or_else(|| self.not_not()) | ||||
|                 .or_else(|| self.cat_ear()) | ||||
|                 .or_else(|| self.eq_eq()) | ||||
|                 .or_else(|| self.gt_eq()) | ||||
|                 .or_else(|| self.lt_eq()) | ||||
|                 .or_else(|| self.not_eq()) | ||||
|                 .or_else(|| self.lsh_eq()) | ||||
|                 .or_else(|| self.rsh_eq()) | ||||
|                 .or_else(|| self.star_eq()) | ||||
|                 .or_else(|| self.div_eq()) | ||||
|                 .or_else(|| self.rem_eq()) | ||||
|                 .or_else(|| self.add_eq()) | ||||
|                 .or_else(|| self.sub_eq()) | ||||
|                 .or_else(|| self.and_eq()) | ||||
|                 .or_else(|| self.or_eq()) | ||||
|                 .or_else(|| self.xor_eq()) | ||||
|                 .or_else(|| self.lsh()) | ||||
|                 .or_else(|| self.rsh()) | ||||
|                 .or_else(|| self.arrow()) | ||||
|                 .or_else(|| self.fatarrow()) | ||||
|                 .or_else(|| self.semi()) | ||||
|                 .or_else(|| self.dot()) | ||||
|                 .or_else(|| self.star()) | ||||
|                 .or_else(|| self.div()) | ||||
|                 .or_else(|| self.plus()) | ||||
|                 .or_else(|| self.sub()) | ||||
|                 .or_else(|| self.rem()) | ||||
|                 .or_else(|| self.bang()) | ||||
|                 .or_else(|| self.eq()) | ||||
|                 .or_else(|| self.lt()) | ||||
|                 .or_else(|| self.gt()) | ||||
|                 .or_else(|| self.amp()) | ||||
|                 .or_else(|| self.bar()) | ||||
|                 .or_else(|| self.xor()) | ||||
|                 .or_else(|| self.hash()) | ||||
|                 .or_else(|| self.at()) | ||||
|                 .or_else(|| self.colon()) | ||||
|                 .or_else(|| self.backslash()) | ||||
|                 .or_else(|| self.question()) | ||||
|                 .or_else(|| self.comma()) | ||||
|                 .or_else(|| self.tilde()) | ||||
|                 .or_else(|| self.grave()) | ||||
|         } | ||||
|         pub fn unary_op(&mut self) -> Option<Token> { | ||||
|             self.bang().or_else(|| self.sub()) | ||||
|         } | ||||
|         // functions for lexing individual tokens | ||||
|         pub fn invalid(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.invalid(), Type::Invalid) | ||||
|         } | ||||
|         // comments | ||||
|         pub fn comment(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.comment(), Type::Comment) | ||||
|         } | ||||
|         // identifiers | ||||
|         pub fn identifier(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.identifier(), Type::Identifier) | ||||
|                 .map(|token| match self.text[token.range()].parse() { | ||||
|                     Ok(kw) => token.cast(Type::Keyword(kw)), | ||||
|                     Err(_) => token, | ||||
|                 }) | ||||
|         } | ||||
|         // literals | ||||
|         pub fn integer(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.integer(), Type::Integer) | ||||
|         } | ||||
|         pub fn float(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.float(), Type::Float) | ||||
|         } | ||||
|         pub fn string(&mut self) -> Option<Token> { | ||||
|             // TODO: count lines and columns properly within string | ||||
|             self.map_rule(|r| r.string(), Type::String) | ||||
|                 .map(|t| t.rebound(t.head + 1, t.tail - 1)) | ||||
|         } | ||||
|         pub fn character(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.character(), Type::Character) | ||||
|                 .map(|t| t.rebound(t.head + 1, t.tail - 1)) | ||||
|         } | ||||
|         // delimiters | ||||
|         pub fn l_brack(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('['), Type::LBrack) | ||||
|         } | ||||
|         pub fn r_brack(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char(']'), Type::RBrack) | ||||
|         } | ||||
|         pub fn l_curly(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('{'), Type::LCurly) | ||||
|         } | ||||
|         pub fn r_curly(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('}'), Type::RCurly) | ||||
|         } | ||||
|         pub fn l_paren(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('('), Type::LParen) | ||||
|         } | ||||
|         pub fn r_paren(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char(')'), Type::RParen) | ||||
|         } | ||||
|         // compound punctuation | ||||
|         pub fn lsh(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.str("<<"), Type::Lsh) | ||||
|         } | ||||
|         pub fn rsh(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.str(">>"), Type::Rsh) | ||||
|         } | ||||
|         pub fn amp_amp(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.str("&&"), Type::AmpAmp) | ||||
|         } | ||||
|         pub fn bar_bar(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.str("||"), Type::BarBar) | ||||
|         } | ||||
|         pub fn not_not(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.str("!!"), Type::NotNot) | ||||
|         } | ||||
|         pub fn cat_ear(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.str("^^"), Type::CatEar) | ||||
|         } | ||||
|         pub fn eq_eq(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.str("=="), Type::EqEq) | ||||
|         } | ||||
|         pub fn gt_eq(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.str(">="), Type::GtEq) | ||||
|         } | ||||
|         pub fn lt_eq(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.str("<="), Type::LtEq) | ||||
|         } | ||||
|         pub fn not_eq(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.str("!="), Type::NotEq) | ||||
|         } | ||||
|         pub fn star_eq(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.str("*="), Type::StarEq) | ||||
|         } | ||||
|         pub fn div_eq(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.str("/="), Type::DivEq) | ||||
|         } | ||||
|         pub fn rem_eq(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.str("%="), Type::RemEq) | ||||
|         } | ||||
|         pub fn add_eq(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.str("+="), Type::AddEq) | ||||
|         } | ||||
|         pub fn sub_eq(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.str("-="), Type::SubEq) | ||||
|         } | ||||
|         pub fn and_eq(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.str("&="), Type::AndEq) | ||||
|         } | ||||
|         pub fn or_eq(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.str("|="), Type::OrEq) | ||||
|         } | ||||
|         pub fn xor_eq(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.str("^="), Type::XorEq) | ||||
|         } | ||||
|         pub fn lsh_eq(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.str("<<="), Type::LshEq) | ||||
|         } | ||||
|         pub fn rsh_eq(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.str(">>="), Type::RshEq) | ||||
|         } | ||||
|         pub fn arrow(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.str("->"), Type::Arrow) | ||||
|         } | ||||
|         pub fn fatarrow(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.str("=>"), Type::FatArrow) | ||||
|         } | ||||
|         // simple punctuation | ||||
|         pub fn semi(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char(';'), Type::Semi) | ||||
|         } | ||||
|         pub fn dot(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('.'), Type::Dot) | ||||
|         } | ||||
|         pub fn star(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('*'), Type::Star) | ||||
|         } | ||||
|         pub fn div(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('/'), Type::Div) | ||||
|         } | ||||
|         pub fn plus(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('+'), Type::Plus) | ||||
|         } | ||||
|         pub fn sub(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('-'), Type::Minus) | ||||
|         } | ||||
|         pub fn rem(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('%'), Type::Rem) | ||||
|         } | ||||
|         pub fn bang(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('!'), Type::Bang) | ||||
|         } | ||||
|         pub fn eq(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('='), Type::Eq) | ||||
|         } | ||||
|         pub fn lt(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('<'), Type::Lt) | ||||
|         } | ||||
|         pub fn gt(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('>'), Type::Gt) | ||||
|         } | ||||
|         pub fn amp(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('&'), Type::Amp) | ||||
|         } | ||||
|         pub fn bar(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('|'), Type::Bar) | ||||
|         } | ||||
|         pub fn xor(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('^'), Type::Xor) | ||||
|         } | ||||
|         pub fn hash(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('#'), Type::Hash) | ||||
|         } | ||||
|         pub fn at(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('@'), Type::At) | ||||
|         } | ||||
|         pub fn colon(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char(':'), Type::Colon) | ||||
|         } | ||||
|         pub fn question(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('?'), Type::Question) | ||||
|         } | ||||
|         pub fn comma(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char(','), Type::Comma) | ||||
|         } | ||||
|         pub fn tilde(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('~'), Type::Tilde) | ||||
|         } | ||||
|         pub fn grave(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('`'), Type::Grave) | ||||
|         } | ||||
|         pub fn backslash(&mut self) -> Option<Token> { | ||||
|             self.map_rule(|r| r.char('\\'), Type::Backslash) | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // TODO: use real, functional parser-combinators here to produce tokens | ||||
|     /// A lexer [Rule] matches patterns in text in a declarative manner | ||||
|     #[derive(Clone, Debug, PartialEq, Eq)] | ||||
|     pub struct Rule<'t> { | ||||
|         text: &'t str, | ||||
|         taken: usize, | ||||
|         is_alright: bool, | ||||
|     } | ||||
|     impl<'t> Rule<'t> { | ||||
|         pub fn new(text: &'t str) -> Self { | ||||
|             Self { text, taken: 0, is_alright: true } | ||||
|         } | ||||
|         pub fn end(self) -> Option<usize> { | ||||
|             self.is_alright.then_some(self.taken) | ||||
|         } | ||||
|         pub fn remaining(&self) -> &str { | ||||
|             self.text | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     impl<'t> Rule<'t> { | ||||
|         /// Matches any sequence of non-whitespace characters | ||||
|         pub fn invalid(self) -> Self { | ||||
|             self.and_many(Self::not_whitespace) | ||||
|         } | ||||
|         /// Matches a block, line, or shebang comment | ||||
|         pub fn comment(self) -> Self { | ||||
|             self.and_either(Self::line_comment, Self::block_comment) | ||||
|         } | ||||
|         /// Matches a line or shebang comment | ||||
|         fn line_comment(self) -> Self { | ||||
|             // line_comment := ("//" | "#!/") (!newline)* | ||||
|             self.str("//") | ||||
|                 .or(|r| r.str("#!/")) | ||||
|                 .and_any(|r| r.not_char('\n')) | ||||
|         } | ||||
|         /// Matches a block comment | ||||
|         fn block_comment(self) -> Self { | ||||
|             // block_comment := "/*" (block_comment | all_but("*/"))* "*/" | ||||
|             self.str("/*") | ||||
|                 .and_any(|r| r.and_either(|f| f.block_comment(), |g| g.not_str("*/"))) | ||||
|                 .str("*/") | ||||
|         } | ||||
|         /// Matches a Rust-style identifier | ||||
|         pub fn identifier(self) -> Self { | ||||
|             // identifier := ('_' | XID_START) ~ XID_CONTINUE* | ||||
|             self.char('_') | ||||
|                 .or(Rule::xid_start) | ||||
|                 .and_any(Rule::xid_continue) | ||||
|         } | ||||
|         /// Matches a Rust-style base-prefixed int literal | ||||
|         fn integer_kind(self, prefix: &str, digit: impl Fn(Self) -> Self) -> Self { | ||||
|             // int_kind<Prefix, Digit> := Prefix '_'* Digit (Digit | '_')* | ||||
|             self.str(prefix) | ||||
|                 .and_any(|r| r.char('_')) | ||||
|                 .and(&digit) | ||||
|                 .and_any(|r| r.and(&digit).or(|r| r.char('_'))) | ||||
|         } | ||||
|         /// Matches a Rust-style integer literal | ||||
|         pub fn integer(self) -> Self { | ||||
|             // integer = (int_kind<0d, dec_digit> | int_kind<0x, hex_digit> | ||||
|             //           | int_kind<0o, oct_digit> | int_kind<0b, bin_digit> | dec_digit (dec_digit | '_')*) | ||||
|             self.and_one_of(&[ | ||||
|                 &|rule| rule.integer_kind("0d", Rule::dec_digit), | ||||
|                 &|rule| rule.integer_kind("0x", Rule::hex_digit), | ||||
|                 &|rule| rule.integer_kind("0o", Rule::oct_digit), | ||||
|                 &|rule| rule.integer_kind("0b", Rule::bin_digit), | ||||
|                 &|rule| { | ||||
|                     rule.dec_digit() | ||||
|                         .and_any(|r| r.dec_digit().or(|r| r.char('_'))) | ||||
|                 }, | ||||
|             ]) | ||||
|         } | ||||
|         /// Matches a float literal | ||||
|         // TODO: exponent form | ||||
|         pub fn float(self) -> Self { | ||||
|             self.and_any(Rule::dec_digit) | ||||
|                 .char('.') | ||||
|                 .and_many(Rule::dec_digit) | ||||
|         } | ||||
|         /// Matches one apostrophe-delimited char literal | ||||
|         pub fn character(self) -> Self { | ||||
|             self.char('\'').character_continue().char('\'') | ||||
|         } | ||||
|         pub fn character_continue(self) -> Self { | ||||
|             self.and(|rule| rule.string_escape().or(|rule| rule.not_char('\''))) | ||||
|         } | ||||
|         /// Matches one quote-delimited string literal | ||||
|         pub fn string(self) -> Self { | ||||
|             self.char('"').and_any(Rule::string_continue).char('"') | ||||
|         } | ||||
|         /// Matches one string escape sequence or non-`"` characcter | ||||
|         pub fn string_continue(self) -> Self { | ||||
|             self.and(Rule::string_escape).or(|rule| rule.not_char('"')) | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     impl<'t> Rule<'t> { | ||||
|         /// Matches a char lexicographically between start and end | ||||
|         pub fn char_between(self, start: char, end: char) -> Self { | ||||
|             self.char_fn(|c| start <= c && c <= end) | ||||
|         } | ||||
|         /// Matches a single char | ||||
|         pub fn char(self, c: char) -> Self { | ||||
|             self.has(|rule| rule.text.starts_with(c), 1) | ||||
|         } | ||||
|         /// Matches the entirety of a string slice | ||||
|         pub fn str(self, s: &str) -> Self { | ||||
|             self.has(|rule| rule.text.starts_with(s), s.len()) | ||||
|         } | ||||
|         /// Matches a char based on the output of a function | ||||
|         pub fn char_fn(self, f: impl Fn(char) -> bool) -> Self { | ||||
|             self.and(|rule| match rule.text.strip_prefix(&f) { | ||||
|                 Some(text) => Self { text, taken: rule.taken + next_utf8(rule.text, 1), ..rule }, | ||||
|                 None => Self { is_alright: false, ..rule }, | ||||
|             }) | ||||
|         } | ||||
|         /// Matches a single char except c | ||||
|         pub fn not_char(self, c: char) -> Self { | ||||
|             self.has(|rule| !rule.text.starts_with(c), 1) | ||||
|         } | ||||
|         /// Matches a single char unless the text starts with s | ||||
|         pub fn not_str(self, s: &str) -> Self { | ||||
|             self.has(|rule| !rule.text.starts_with(s), 1) | ||||
|         } | ||||
|         // commonly used character classes | ||||
|         /// Matches one of any character | ||||
|         pub fn any(self) -> Self { | ||||
|             self.has(|_| true, 1) | ||||
|         } | ||||
|         /// Matches one whitespace | ||||
|         pub fn whitespace(self) -> Self { | ||||
|             self.char_fn(|c| c.is_whitespace()) | ||||
|         } | ||||
|         /// Matches one whitespace, except `'\n'` | ||||
|         pub fn whitespace_not_newline(self) -> Self { | ||||
|             self.char_fn(|c| '\n' != c && c.is_whitespace()) | ||||
|         } | ||||
|         /// Matches anything but whitespace | ||||
|         pub fn not_whitespace(self) -> Self { | ||||
|             self.char_fn(|c| !c.is_whitespace()) | ||||
|         } | ||||
|         /// Matches one XID_START | ||||
|         pub fn xid_start(self) -> Self { | ||||
|             use unicode_xid::UnicodeXID; | ||||
|             self.char_fn(UnicodeXID::is_xid_start) | ||||
|         } | ||||
|         /// Matches one XID_CONTINUE | ||||
|         pub fn xid_continue(self) -> Self { | ||||
|             use unicode_xid::UnicodeXID; | ||||
|             self.char_fn(UnicodeXID::is_xid_continue) | ||||
|         } | ||||
|         /// Matches one hexadecimal digit | ||||
|         pub fn hex_digit(self) -> Self { | ||||
|             self.char_fn(|c| c.is_ascii_hexdigit()) | ||||
|         } | ||||
|         /// Matches one decimal digit | ||||
|         pub fn dec_digit(self) -> Self { | ||||
|             self.char_fn(|c| c.is_ascii_digit()) | ||||
|         } | ||||
|         /// Matches one octal digit | ||||
|         pub fn oct_digit(self) -> Self { | ||||
|             self.char_between('0', '7') | ||||
|         } | ||||
|         /// Matches one binary digit | ||||
|         pub fn bin_digit(self) -> Self { | ||||
|             self.char_between('0', '1') | ||||
|         } | ||||
|         /// Matches any string escape "\." | ||||
|         pub fn string_escape(self) -> Self { | ||||
|             self.char('\\').and(Rule::any) | ||||
|         } | ||||
|         /// Performs a consuming condition assertion on the input | ||||
|         fn has(self, condition: impl Fn(&Self) -> bool, len: usize) -> Self { | ||||
|             let len = next_utf8(self.text, len); | ||||
|             self.and(|rule| match condition(&rule) && !rule.text.is_empty() { | ||||
|                 true => Self { text: &rule.text[len..], taken: rule.taken + len, ..rule }, | ||||
|                 false => Self { is_alright: false, ..rule }, | ||||
|             }) | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     impl<'t> lerox::Combinator for Rule<'t> { | ||||
|         fn is_alright(&self) -> bool { | ||||
|             self.is_alright | ||||
|         } | ||||
|         fn into_alright(self) -> Self { | ||||
|             Self { is_alright: true, ..self } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Returns the index of the next unicode character, rounded up | ||||
|     fn next_utf8(text: &str, mut index: usize) -> usize { | ||||
|         index = index.min(text.len()); | ||||
|         while !text.is_char_boundary(index) { | ||||
|             index += 1 | ||||
|         } | ||||
|         index | ||||
|     } | ||||
| } | ||||
| pub mod lexer; | ||||
|  | ||||
| pub mod parser { | ||||
|     //! Parses [tokens](super::token) into an [AST](super::ast) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user