conlang: Rename literals; split, compose, and document Rules
- Renamed literal Types to reflect their literal nature
  - This allows for consistent naming across future non-literal Types
- Complicated lexer Rules have been split into composable sub-rules, and moved into the Rule struct.
  - This improves modularity, and allows sharing of sub-rules across rules.
- Documented each lexer rule with (at least) a one-line blurb describing its function
This commit is contained in:
		| @@ -1,18 +1,16 @@ | ||||
| //! This example grabs input from stdin, lexes it, and prints which lexer rules matched | ||||
| #![allow(unused_imports)] | ||||
| use conlang::lexer::Lexer; | ||||
| use std::{io::stdin, error::Error}; | ||||
| use std::{error::Error, io::stdin}; | ||||
|  | ||||
| fn main() -> Result<(), Box<dyn Error>>{ | ||||
| fn main() -> Result<(), Box<dyn Error>> { | ||||
|     // get input from stdin | ||||
|     for line in stdin().lines() { | ||||
|         let line = line?; | ||||
|         // lex the line | ||||
|         for func in [Lexer::line_comment, Lexer::block_comment, Lexer::shebang_comment, Lexer::identifier, Lexer::integer, Lexer::float, Lexer::string] { | ||||
|             if let Some(token) = func(&mut Lexer::new(&line)) { | ||||
|                 println!("{:?}: {}", token, &line[token.range()]) | ||||
|             } | ||||
|         let mut lexer = Lexer::new(&line); | ||||
|         while let Some(token) = lexer.any() { | ||||
|             println!("{:?}: {}", token, &line[token.range()]) | ||||
|         } | ||||
|     } | ||||
|     Ok(()) | ||||
| } | ||||
| } | ||||
|   | ||||
| @@ -9,10 +9,12 @@ pub mod token { | ||||
|     pub enum Type { | ||||
|         Comment, | ||||
|         Identifier, | ||||
|         Integer, | ||||
|         Float, | ||||
|         String, | ||||
|         // Literals | ||||
|         LitInteger, | ||||
|         LitFloat, | ||||
|         LitString, | ||||
|     } | ||||
|  | ||||
|     #[derive(Clone, Copy, Debug, PartialEq, Eq)] | ||||
|     pub struct Token { | ||||
|         ty: Type, | ||||
| @@ -59,104 +61,60 @@ pub mod lexer { | ||||
|         pub fn new(text: &'t str) -> Self { | ||||
|             Self { text, cursor: 0 } | ||||
|         } | ||||
|         /// Skips whitespace in the text | ||||
|         fn skip_whitespace(&mut self) { | ||||
|             if let Some(len) = Rule::new(self.text()).and_any(Rule::whitespace).end() { | ||||
|                 self.cursor += len | ||||
|             } | ||||
|         } | ||||
|         /// Advances the cursor and produces a token | ||||
|         fn produce_token(&mut self, ty: Type, len: usize) -> Option<Token> { | ||||
|             let start = self.cursor; | ||||
|             self.cursor += len; | ||||
|             Some(Token::new(ty, start, self.cursor)) | ||||
|         } | ||||
|         /// Gets a slice of text beginning at the cursor | ||||
|         fn text(&self) -> &str { | ||||
|             &self.text[self.cursor..] | ||||
|         } | ||||
|         // classifies a single arbitrary token | ||||
|         pub fn any(&mut self) -> Option<Token> { | ||||
|             None.or_else(|| self.comment()) | ||||
|                 .or_else(|| self.identifier()) | ||||
|                 .or_else(|| self.literal()) | ||||
|         } | ||||
|         pub fn literal(&mut self) -> Option<Token> { | ||||
|             None.or_else(|| self.lit_string()) | ||||
|                 .or_else(|| self.lit_float()) | ||||
|                 .or_else(|| self.lit_integer()) | ||||
|         } | ||||
|         // functions for lexing individual tokens | ||||
|         pub fn line_comment(&mut self) -> Option<Token> { | ||||
|             // line_comment := "//" ~ (^newline)* | ||||
|         // comments | ||||
|         pub fn comment(&mut self) -> Option<Token> { | ||||
|             self.skip_whitespace(); | ||||
|             self.produce_token( | ||||
|                 Type::Comment, | ||||
|                 Rule::new(self.text()) | ||||
|                     .str("//") | ||||
|                     .and_any(|rule| rule.not_char('\n')) | ||||
|                     .end()?, | ||||
|             ) | ||||
|         } | ||||
|         pub fn block_comment(&mut self) -> Option<Token> { | ||||
|             // block_comment := "/*" ~ (block_comment | all_but("*/"))* ~ "*/" | ||||
|             self.skip_whitespace(); | ||||
|             self.produce_token( | ||||
|                 Type::Comment, | ||||
|                 Rule::new(self.text()) | ||||
|                     .str("/*") | ||||
|                     .and_any(|rule| rule.not_str("*/")) | ||||
|                     .str("*/") | ||||
|                     .end()?, | ||||
|             ) | ||||
|         } | ||||
|         pub fn shebang_comment(&mut self) -> Option<Token> { | ||||
|             // shebang_comment := "#!/" ~ (^newline)* | ||||
|             self.skip_whitespace(); | ||||
|             self.produce_token( | ||||
|                 Type::Comment, | ||||
|                 Rule::new(self.text()) | ||||
|                     .str("#!/") | ||||
|                     .and_any(|rule| rule.not_char('\n')) | ||||
|                     .end()?, | ||||
|             ) | ||||
|             self.produce_token(Type::Comment, Rule::new(self.text()).comment().end()?) | ||||
|         } | ||||
|         // identifiers | ||||
|         pub fn identifier(&mut self) -> Option<Token> { | ||||
|             self.skip_whitespace(); | ||||
|             self.produce_token( | ||||
|                 Type::Identifier, | ||||
|                 Rule::new(self.text()) | ||||
|                     .char('_') | ||||
|                     .or(Rule::xid_start) | ||||
|                     .and_any(Rule::xid_continue) | ||||
|                     .end()?, | ||||
|             ) | ||||
|             self.produce_token(Type::Identifier, Rule::new(self.text()).identifier().end()?) | ||||
|         } | ||||
|         pub fn integer(&mut self) -> Option<Token> { | ||||
|         // literals | ||||
|         pub fn lit_integer(&mut self) -> Option<Token> { | ||||
|             self.skip_whitespace(); | ||||
|             self.produce_token( | ||||
|                 Type::Integer, | ||||
|                 Rule::new(self.text()) | ||||
|                     .and_one_of(&[ | ||||
|                         &|rule| rule.str("0x").and_any(Rule::hex_digit), | ||||
|                         &|rule| rule.str("0d").and_any(Rule::dec_digit), | ||||
|                         &|rule| rule.str("0o").and_any(Rule::oct_digit), | ||||
|                         &|rule| rule.str("0b").and_any(Rule::bin_digit), | ||||
|                         &|rule| rule.and_many(Rule::dec_digit), | ||||
|                     ]) | ||||
|                     .end()?, | ||||
|             ) | ||||
|             self.produce_token(Type::LitInteger, Rule::new(self.text()).integer().end()?) | ||||
|         } | ||||
|         pub fn float(&mut self) -> Option<Token> { | ||||
|         pub fn lit_float(&mut self) -> Option<Token> { | ||||
|             self.skip_whitespace(); | ||||
|             self.produce_token( | ||||
|                 Type::Float, | ||||
|                 Rule::new(self.text()) | ||||
|                     .and_any(Rule::dec_digit) | ||||
|                     .char('.') | ||||
|                     .and_many(Rule::dec_digit) | ||||
|                     .end()?, | ||||
|             ) | ||||
|             self.produce_token(Type::LitFloat, Rule::new(self.text()).float().end()?) | ||||
|         } | ||||
|         pub fn string(&mut self) -> Option<Token> { | ||||
|         pub fn lit_string(&mut self) -> Option<Token> { | ||||
|             self.skip_whitespace(); | ||||
|             self.produce_token( | ||||
|                 Type::String, | ||||
|                 Rule::new(self.text()) | ||||
|                     .char('"') | ||||
|                     .and_any(|rule| rule.and(Rule::string_escape).or(|rule| rule.not_char('"'))) | ||||
|                     .char('"') | ||||
|                     .end()?, | ||||
|             ) | ||||
|             self.produce_token(Type::LitString, Rule::new(self.text()).string().end()?) | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// A lexer [Rule] matches patterns in text in a declarative manner | ||||
|     #[derive(Clone, Debug, PartialEq, Eq)] | ||||
|     pub struct Rule<'t> { | ||||
|         text: &'t str, | ||||
| @@ -176,53 +134,135 @@ pub mod lexer { | ||||
|     } | ||||
|  | ||||
|     impl<'t> Rule<'t> { | ||||
|         /// Matches a block, line, or shebang comment | ||||
|         pub fn comment(self) -> Self { | ||||
|             self.and_either(Self::line_comment, Self::block_comment) | ||||
|         } | ||||
|         /// Matches a line or shebang comment | ||||
|         fn line_comment(self) -> Self { | ||||
|             // line_comment := ("//" | "#!/") (!newline)* | ||||
|             self.str("//") | ||||
|                 .or(|r| r.str("#!/")) | ||||
|                 .and_any(|r| r.not_char('\n')) | ||||
|         } | ||||
|         /// Matches a block comment | ||||
|         fn block_comment(self) -> Self { | ||||
|             // block_comment := "/*" (block_comment | all_but("*/"))* "*/" | ||||
|             self.str("/*") | ||||
|                 .and_any(|r| r.and_either(|f| f.block_comment(), |g| g.not_str("*/"))) | ||||
|                 .str("*/") | ||||
|         } | ||||
|         /// Matches a Rust-style identifier | ||||
|         pub fn identifier(self) -> Self { | ||||
|             // identifier := ('_' | XID_START) ~ XID_CONTINUE* | ||||
|             self.char('_') | ||||
|                 .or(Rule::xid_start) | ||||
|                 .and_any(Rule::xid_continue) | ||||
|         } | ||||
|         /// Matches a Rust-style base-prefixed int literal | ||||
|         fn int_literal_kind(self, prefix: &str, digit: impl Fn(Self) -> Self) -> Self { | ||||
|             // int_kind<Prefix, Digit> := Prefix '_'* Digit (Digit | '_')* | ||||
|             self.str(prefix) | ||||
|                 .and_any(|r| r.char('_')) | ||||
|                 .and(&digit) | ||||
|                 .and_any(|r| r.and(&digit).or(|r| r.char('_'))) | ||||
|         } | ||||
|         /// Matches a Rust-style integer literal | ||||
|         pub fn integer(self) -> Self { | ||||
|             // integer := (int_kind<0d, dec_digit> | int_kind<0x, hex_digit> | ||||
|             //           | int_kind<0o, oct_digit> | int_kind<0b, bin_digit> | dec_digit (dec_digit | '_')*) | ||||
|             self.and_one_of(&[ | ||||
|                 &|rule| rule.int_literal_kind("0d", Rule::dec_digit), | ||||
|                 &|rule| rule.int_literal_kind("0x", Rule::hex_digit), | ||||
|                 &|rule| rule.int_literal_kind("0o", Rule::oct_digit), | ||||
|                 &|rule| rule.int_literal_kind("0b", Rule::bin_digit), | ||||
|                 &|rule| { | ||||
|                     rule.dec_digit() | ||||
|                         .and_any(|r| r.dec_digit().or(|r| r.char('_'))) | ||||
|                 }, | ||||
|             ]) | ||||
|         } | ||||
|         /// Matches a float literal | ||||
|         // TODO: exponent form | ||||
|         pub fn float(self) -> Self { | ||||
|             self.and_any(Rule::dec_digit) | ||||
|                 .char('.') | ||||
|                 .and_many(Rule::dec_digit) | ||||
|         } | ||||
|         /// Matches one quote-delimited string literal | ||||
|         pub fn string(self) -> Self { | ||||
|             self.char('"').and_any(Rule::string_continue).char('"') | ||||
|         } | ||||
|         /// Matches one string escape sequence or non-`"` character | ||||
|         pub fn string_continue(self) -> Self { | ||||
|             self.and(Rule::string_escape).or(|rule| rule.not_char('"')) | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     impl<'t> Rule<'t> { | ||||
|         /// Matches a char lexicographically between start and end | ||||
|         pub fn char_between(self, start: char, end: char) -> Self { | ||||
|             self.char_fn(|c| start <= c && c <= end) | ||||
|         } | ||||
|         /// Matches a single char | ||||
|         pub fn char(self, c: char) -> Self { | ||||
|             self.has(|rule| rule.text.starts_with(c), 1) | ||||
|         } | ||||
|         /// Matches the entirety of a string slice | ||||
|         pub fn str(self, s: &str) -> Self { | ||||
|             self.has(|rule| rule.text.starts_with(s), s.len()) | ||||
|         } | ||||
|         /// Matches a char based on the output of a function | ||||
|         pub fn char_fn(self, f: impl Fn(char) -> bool) -> Self { | ||||
|             self.and(|rule| match rule.text.strip_prefix(&f) { | ||||
|                 Some(text) => Self { text, taken: rule.taken + next_utf8(rule.text, 1), ..rule }, | ||||
|                 None => Self { is_alright: false, ..rule }, | ||||
|             }) | ||||
|         } | ||||
|         /// Matches a single char except c | ||||
|         pub fn not_char(self, c: char) -> Self { | ||||
|             self.has(|rule| !rule.text.starts_with(c), 1) | ||||
|         } | ||||
|         /// Matches a single char unless the text starts with s | ||||
|         pub fn not_str(self, s: &str) -> Self { | ||||
|             self.has(|rule| !rule.text.starts_with(s), 1) | ||||
|         } | ||||
|         // commonly used character classes | ||||
|         /// Matches one of any character | ||||
|         pub fn any(self) -> Self { | ||||
|             self.has(|_| true, 1) | ||||
|         } | ||||
|         /// Matches one whitespace | ||||
|         pub fn whitespace(self) -> Self { | ||||
|             self.char_fn(|c| c.is_whitespace()) | ||||
|         } | ||||
|         /// Matches one XID_START | ||||
|         pub fn xid_start(self) -> Self { | ||||
|             use unicode_xid::UnicodeXID; | ||||
|             self.char_fn(UnicodeXID::is_xid_start) | ||||
|         } | ||||
|         /// Matches one XID_CONTINUE | ||||
|         pub fn xid_continue(self) -> Self { | ||||
|             use unicode_xid::UnicodeXID; | ||||
|             self.char_fn(UnicodeXID::is_xid_continue) | ||||
|         } | ||||
|         /// Matches one hexadecimal digit | ||||
|         pub fn hex_digit(self) -> Self { | ||||
|             self.char_fn(|c| c.is_ascii_hexdigit()) | ||||
|         } | ||||
|         /// Matches one decimal digit | ||||
|         pub fn dec_digit(self) -> Self { | ||||
|             self.char_fn(|c| c.is_ascii_digit()) | ||||
|         } | ||||
|         /// Matches one octal digit | ||||
|         pub fn oct_digit(self) -> Self { | ||||
|             self.char_between('0', '7') | ||||
|         } | ||||
|         /// Matches one binary digit | ||||
|         pub fn bin_digit(self) -> Self { | ||||
|             self.char_between('0', '1') | ||||
|         } | ||||
|         /// Matches any string escape "\." | ||||
|         pub fn string_escape(self) -> Self { | ||||
|             self.char('\\').and(Rule::any) | ||||
|         } | ||||
| @@ -281,6 +321,8 @@ mod tests { | ||||
|         // TODO | ||||
|     } | ||||
|     mod lexer { | ||||
|         use std::ops::Range; | ||||
|  | ||||
|         use crate::{ | ||||
|             lexer::*, | ||||
|             token::{Token, Type}, | ||||
| @@ -288,11 +330,13 @@ mod tests { | ||||
|  | ||||
|         fn assert_whole_input_is_token<'t, F>(input: &'t str, f: F, ty: Type) | ||||
|         where F: FnOnce(&mut Lexer<'t>) -> Option<Token> { | ||||
|             assert_has_type_and_len(input, f, ty, input.len()) | ||||
|             assert_has_type_and_range(input, f, ty, 0..input.len()) | ||||
|         } | ||||
|         fn assert_has_type_and_len<'t, F>(input: &'t str, f: F, ty: Type, len: usize) | ||||
|         fn assert_has_type_and_range<'t, F>(input: &'t str, f: F, ty: Type, range: Range<usize>) | ||||
|         where F: FnOnce(&mut Lexer<'t>) -> Option<Token> { | ||||
|             assert_eq!(Some(Token::new(ty, 0, len)), f(&mut Lexer::new(input)),) | ||||
|             let tok = f(&mut Lexer::new(input)).unwrap(); | ||||
|             assert_eq!(ty, tok.ty()); | ||||
|             assert_eq!(range, tok.range()); | ||||
|         } | ||||
|  | ||||
|         mod comment { | ||||
| @@ -300,42 +344,47 @@ mod tests { | ||||
|  | ||||
|             #[test] | ||||
|             fn line_comment() { | ||||
|                 assert_whole_input_is_token("// comment!", Lexer::comment, Type::Comment); | ||||
|             } | ||||
|             #[test] | ||||
|             #[should_panic] | ||||
|             fn not_line_comment() { | ||||
|                 assert_whole_input_is_token("fn main() {}", Lexer::comment, Type::Comment); | ||||
|             } | ||||
|             #[test] | ||||
|             fn block_comment() { | ||||
|                 assert_whole_input_is_token("/* comment! */", Lexer::comment, Type::Comment); | ||||
|             } | ||||
|             #[test] | ||||
|             fn nested_block_comment() { | ||||
|                 assert_whole_input_is_token( | ||||
|                     "// this is a comment", | ||||
|                     Lexer::line_comment, | ||||
|                     "/* a /* nested */ comment */", | ||||
|                     Lexer::comment, | ||||
|                     Type::Comment, | ||||
|                 ); | ||||
|             } | ||||
|             #[test] | ||||
|             #[should_panic] | ||||
|             fn not_line_comment() { | ||||
|                 assert_whole_input_is_token("fn main() {}", Lexer::line_comment, Type::Comment); | ||||
|             } | ||||
|             #[test] | ||||
|             fn block_comment() { | ||||
|             fn unclosed_nested_comment() { | ||||
|                 assert_whole_input_is_token( | ||||
|                     "/* this is a comment */", | ||||
|                     Lexer::block_comment, | ||||
|                     "/* improperly /* nested */ comment", | ||||
|                     Lexer::comment, | ||||
|                     Type::Comment, | ||||
|                 ); | ||||
|             } | ||||
|             #[test] | ||||
|             #[should_panic] | ||||
|             fn not_block_comment() { | ||||
|                 assert_whole_input_is_token("fn main() {}", Lexer::block_comment, Type::Comment); | ||||
|                 assert_whole_input_is_token("fn main() {}", Lexer::comment, Type::Comment); | ||||
|             } | ||||
|             #[test] | ||||
|             fn shebang_comment() { | ||||
|                 assert_whole_input_is_token( | ||||
|                     "#!/ this is a comment", | ||||
|                     Lexer::shebang_comment, | ||||
|                     Type::Comment, | ||||
|                 ); | ||||
|                 assert_whole_input_is_token("#!/ comment!", Lexer::comment, Type::Comment); | ||||
|             } | ||||
|             #[test] | ||||
|             #[should_panic] | ||||
|             fn not_shebang_comment() { | ||||
|                 assert_whole_input_is_token("fn main() {}", Lexer::shebang_comment, Type::Comment); | ||||
|                 assert_whole_input_is_token("fn main() {}", Lexer::comment, Type::Comment); | ||||
|             } | ||||
|         } | ||||
|         mod identifier { | ||||
| @@ -366,65 +415,70 @@ mod tests { | ||||
|             use super::*; | ||||
|             #[test] | ||||
|             fn bare() { | ||||
|                 assert_whole_input_is_token("10010110", Lexer::integer, Type::Integer); | ||||
|                 assert_whole_input_is_token("12345670", Lexer::integer, Type::Integer); | ||||
|                 assert_whole_input_is_token("1234567890", Lexer::integer, Type::Integer); | ||||
|                 assert_whole_input_is_token("10010110", Lexer::lit_integer, Type::LitInteger); | ||||
|                 assert_whole_input_is_token("12345670", Lexer::lit_integer, Type::LitInteger); | ||||
|                 assert_whole_input_is_token("1234567890", Lexer::lit_integer, Type::LitInteger); | ||||
|             } | ||||
|             #[test] | ||||
|             fn base16() { | ||||
|                 assert_has_type_and_len("0x1234", Lexer::integer, Type::Integer, 6); | ||||
|                 assert_has_type_and_len("0x1234 \"hello\"", Lexer::integer, Type::Integer, 6); | ||||
|                 assert_has_type_and_range("0x1234", Lexer::lit_integer, Type::LitInteger, 0..6); | ||||
|                 assert_has_type_and_range( | ||||
|                     "0x1234 \"hello\"", | ||||
|                     Lexer::lit_integer, | ||||
|                     Type::LitInteger, | ||||
|                     0..6, | ||||
|                 ); | ||||
|             } | ||||
|             #[test] | ||||
|             fn base10() { | ||||
|                 assert_whole_input_is_token("0d1234", Lexer::integer, Type::Integer); | ||||
|                 assert_whole_input_is_token("0d1234", Lexer::lit_integer, Type::LitInteger); | ||||
|             } | ||||
|             #[test] | ||||
|             fn base8() { | ||||
|                 assert_whole_input_is_token("0o1234", Lexer::integer, Type::Integer); | ||||
|                 assert_whole_input_is_token("0o1234", Lexer::lit_integer, Type::LitInteger); | ||||
|             } | ||||
|             #[test] | ||||
|             fn base2() { | ||||
|                 assert_whole_input_is_token("0b1010", Lexer::integer, Type::Integer); | ||||
|                 assert_whole_input_is_token("0b1010", Lexer::lit_integer, Type::LitInteger); | ||||
|             } | ||||
|         } | ||||
|         mod float { | ||||
|             use super::*; | ||||
|             #[test] | ||||
|             fn number_dot_number_is_float() { | ||||
|                 assert_whole_input_is_token("1.0", Lexer::float, Type::Float); | ||||
|                 assert_whole_input_is_token("1.0", Lexer::lit_float, Type::LitFloat); | ||||
|             } | ||||
|             #[test] | ||||
|             fn nothing_dot_number_is_float() { | ||||
|                 assert_whole_input_is_token(".0", Lexer::float, Type::Float); | ||||
|                 assert_whole_input_is_token(".0", Lexer::lit_float, Type::LitFloat); | ||||
|             } | ||||
|             #[test] | ||||
|             #[should_panic] | ||||
|             fn number_dot_nothing_is_not_float() { | ||||
|                 assert_whole_input_is_token("1.", Lexer::float, Type::Float); | ||||
|                 assert_whole_input_is_token("1.", Lexer::lit_float, Type::LitFloat); | ||||
|             } | ||||
|             #[test] | ||||
|             #[should_panic] | ||||
|             fn nothing_dot_nothing_is_not_float() { | ||||
|                 assert_whole_input_is_token(".", Lexer::float, Type::Float); | ||||
|                 assert_whole_input_is_token(".", Lexer::lit_float, Type::LitFloat); | ||||
|             } | ||||
|         } | ||||
|         mod string { | ||||
|             use super::*; | ||||
|             #[test] | ||||
|             fn empty_string() { | ||||
|                 assert_whole_input_is_token("\"\"", Lexer::string, Type::String); | ||||
|                 assert_whole_input_is_token("\"\"", Lexer::lit_string, Type::LitString); | ||||
|             } | ||||
|             #[test] | ||||
|             fn unicode_string() { | ||||
|                 assert_whole_input_is_token("\"I 💙 🦈!\"", Lexer::string, Type::String); | ||||
|                 assert_whole_input_is_token("\"I 💙 🦈!\"", Lexer::lit_string, Type::LitString); | ||||
|             } | ||||
|             #[test] | ||||
|             fn escape_string() { | ||||
|                 assert_whole_input_is_token( | ||||
|                     r#"" \"This is a quote\" ""#, | ||||
|                     Lexer::string, | ||||
|                     Type::String, | ||||
|                     Lexer::lit_string, | ||||
|                     Type::LitString, | ||||
|                 ); | ||||
|             } | ||||
|         } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user