From 097e2c4f11931e1d181d07b8379b2f8d48a8f72d Mon Sep 17 00:00:00 2001 From: John Date: Wed, 27 Sep 2023 18:13:01 -0500 Subject: [PATCH] conlang: Rename literals; split, compose, and document Rules - Renamed literal Types to reflect their literal nature - This allows for consistent naming across future non-literal Types - Complicated lexer Rules have been split into composable sub-rules, and moved into the Rule struct. - This improves modularity, and allows sharing of sub-rules across rules. - Documented each lexer rule with (at least) a one-line blurb describing its function --- .gitignore | 1 + dummy.cl | 7 +- libconlang/examples/identify_tokens.rs | 14 +- libconlang/src/lib.rs | 272 +++++++++++++++---------- 4 files changed, 176 insertions(+), 118 deletions(-) diff --git a/.gitignore b/.gitignore index 99e62e2..4dd298a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ **/Cargo.lock target +*.pest \ No newline at end of file diff --git a/dummy.cl b/dummy.cl index 3c5d37e..8bf31e1 100644 --- a/dummy.cl +++ b/dummy.cl @@ -1,4 +1,9 @@ #!/ this is a shebang comment! // This is an example Conlang file. -/* Conlang supports block comments! */ \ No newline at end of file +/* Conlang supports block comments! */ +ident // Identifier +.1 // literal float +0.1 // literal float +0x1234 // literal integer +"str" // literal string diff --git a/libconlang/examples/identify_tokens.rs b/libconlang/examples/identify_tokens.rs index c2ac4a8..5ff602b 100644 --- a/libconlang/examples/identify_tokens.rs +++ b/libconlang/examples/identify_tokens.rs @@ -1,18 +1,16 @@ //! 
This example grabs input from stdin, lexes it, and prints which lexer rules matched #![allow(unused_imports)] use conlang::lexer::Lexer; -use std::{io::stdin, error::Error}; +use std::{error::Error, io::stdin}; -fn main() -> Result<(), Box<dyn Error>>{ +fn main() -> Result<(), Box<dyn Error>> { // get input from stdin for line in stdin().lines() { let line = line?; - // lex the line - for func in [Lexer::line_comment, Lexer::block_comment, Lexer::shebang_comment, Lexer::identifier, Lexer::integer, Lexer::float, Lexer::string] { - if let Some(token) = func(&mut Lexer::new(&line)) { - println!("{:?}: {}", token, &line[token.range()]) - } + let mut lexer = Lexer::new(&line); + while let Some(token) = lexer.any() { + println!("{:?}: {}", token, &line[token.range()]) } } Ok(()) -} \ No newline at end of file +} diff --git a/libconlang/src/lib.rs b/libconlang/src/lib.rs index 91d2635..210171a 100644 --- a/libconlang/src/lib.rs +++ b/libconlang/src/lib.rs @@ -9,10 +9,12 @@ pub mod token { pub enum Type { Comment, Identifier, - Integer, - Float, - String, + // Literals + LitInteger, + LitFloat, + LitString, } + #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct Token { ty: Type, @@ -59,104 +61,60 @@ pub mod lexer { pub fn new(text: &'t str) -> Self { Self { text, cursor: 0 } } + /// Skips whitespace in the text fn skip_whitespace(&mut self) { if let Some(len) = Rule::new(self.text()).and_any(Rule::whitespace).end() { self.cursor += len } } + /// Advances the cursor and produces a token fn produce_token(&mut self, ty: Type, len: usize) -> Option<Token> { let start = self.cursor; self.cursor += len; Some(Token::new(ty, start, self.cursor)) } + /// Gets a slice of text beginning at the cursor fn text(&self) -> &str { &self.text[self.cursor..] 
} + // classifies a single arbitrary token + pub fn any(&mut self) -> Option { + None.or_else(|| self.comment()) + .or_else(|| self.identifier()) + .or_else(|| self.literal()) + } + pub fn literal(&mut self) -> Option { + None.or_else(|| self.lit_string()) + .or_else(|| self.lit_float()) + .or_else(|| self.lit_integer()) + } // functions for lexing individual tokens - pub fn line_comment(&mut self) -> Option { - // line_comment := "//" ~ (^newline)* + // comments + pub fn comment(&mut self) -> Option { self.skip_whitespace(); - self.produce_token( - Type::Comment, - Rule::new(self.text()) - .str("//") - .and_any(|rule| rule.not_char('\n')) - .end()?, - ) - } - pub fn block_comment(&mut self) -> Option { - // block_comment := "/*" ~ (block_comment | all_but("*/"))* ~ "*/" - self.skip_whitespace(); - self.produce_token( - Type::Comment, - Rule::new(self.text()) - .str("/*") - .and_any(|rule| rule.not_str("*/")) - .str("*/") - .end()?, - ) - } - pub fn shebang_comment(&mut self) -> Option { - // shebang_comment := "#!/" ~ (^newline)* - self.skip_whitespace(); - self.produce_token( - Type::Comment, - Rule::new(self.text()) - .str("#!/") - .and_any(|rule| rule.not_char('\n')) - .end()?, - ) + self.produce_token(Type::Comment, Rule::new(self.text()).comment().end()?) } + // identifiers pub fn identifier(&mut self) -> Option { self.skip_whitespace(); - self.produce_token( - Type::Identifier, - Rule::new(self.text()) - .char('_') - .or(Rule::xid_start) - .and_any(Rule::xid_continue) - .end()?, - ) + self.produce_token(Type::Identifier, Rule::new(self.text()).identifier().end()?) 
} - pub fn integer(&mut self) -> Option { + // literals + pub fn lit_integer(&mut self) -> Option { self.skip_whitespace(); - self.produce_token( - Type::Integer, - Rule::new(self.text()) - .and_one_of(&[ - &|rule| rule.str("0x").and_any(Rule::hex_digit), - &|rule| rule.str("0d").and_any(Rule::dec_digit), - &|rule| rule.str("0o").and_any(Rule::oct_digit), - &|rule| rule.str("0b").and_any(Rule::bin_digit), - &|rule| rule.and_many(Rule::dec_digit), - ]) - .end()?, - ) + self.produce_token(Type::LitInteger, Rule::new(self.text()).integer().end()?) } - pub fn float(&mut self) -> Option { + pub fn lit_float(&mut self) -> Option { self.skip_whitespace(); - self.produce_token( - Type::Float, - Rule::new(self.text()) - .and_any(Rule::dec_digit) - .char('.') - .and_many(Rule::dec_digit) - .end()?, - ) + self.produce_token(Type::LitFloat, Rule::new(self.text()).float().end()?) } - pub fn string(&mut self) -> Option { + pub fn lit_string(&mut self) -> Option { self.skip_whitespace(); - self.produce_token( - Type::String, - Rule::new(self.text()) - .char('"') - .and_any(|rule| rule.and(Rule::string_escape).or(|rule| rule.not_char('"'))) - .char('"') - .end()?, - ) + self.produce_token(Type::LitString, Rule::new(self.text()).string().end()?) 
} } + /// A lexer [Rule] matches patterns in text in a declarative manner #[derive(Clone, Debug, PartialEq, Eq)] pub struct Rule<'t> { text: &'t str, @@ -176,53 +134,135 @@ pub mod lexer { } impl<'t> Rule<'t> { + /// Matches a block, line, or shebang comment + pub fn comment(self) -> Self { + self.and_either(Self::line_comment, Self::block_comment) + } + /// Matches a line or shebang comment + fn line_comment(self) -> Self { + // line_comment := ("//" | "#!/") (!newline)* + self.str("//") + .or(|r| r.str("#!/")) + .and_any(|r| r.not_char('\n')) + } + /// Matches a block comment + fn block_comment(self) -> Self { + // block_comment := "/*" (block_comment | all_but("*/"))* "*/" + self.str("/*") + .and_any(|r| r.and_either(|f| f.block_comment(), |g| g.not_str("*/"))) + .str("*/") + } + /// Matches a Rust-style identifier + pub fn identifier(self) -> Self { + // identifier := ('_' | XID_START) ~ XID_CONTINUE* + self.char('_') + .or(Rule::xid_start) + .and_any(Rule::xid_continue) + } + /// Matches a Rust-style base-prefixed int literal + fn int_literal_kind(self, prefix: &str, digit: impl Fn(Self) -> Self) -> Self { + // int_kind := Prefix '_'* Digit (Digit | '_')* + self.str(prefix) + .and_any(|r| r.char('_')) + .and(&digit) + .and_any(|r| r.and(&digit).or(|r| r.char('_'))) + } + /// Matches a Rust-style integer literal + pub fn integer(self) -> Self { + // integer = (int_kind<0d, dec_digit> | int_kind<0x, hex_digit> + // | int_kind<0o, oct_digit> | int_kind<0b, bin_digit> | dec_digit (dec_digit | '_')*) + self.and_one_of(&[ + &|rule| rule.int_literal_kind("0d", Rule::dec_digit), + &|rule| rule.int_literal_kind("0x", Rule::hex_digit), + &|rule| rule.int_literal_kind("0o", Rule::oct_digit), + &|rule| rule.int_literal_kind("0b", Rule::bin_digit), + &|rule| { + rule.dec_digit() + .and_any(|r| r.dec_digit().or(|r| r.char('_'))) + }, + ]) + } + /// Matches a float literal + // TODO: exponent form + pub fn float(self) -> Self { + self.and_any(Rule::dec_digit) + .char('.') + 
.and_many(Rule::dec_digit) + } + /// Matches one quote-delimited string literal + pub fn string(self) -> Self { + self.char('"').and_any(Rule::string_continue).char('"') + } + /// Matches one string escape sequence or non-`"` character + pub fn string_continue(self) -> Self { + self.and(Rule::string_escape).or(|rule| rule.not_char('"')) + } + } + + impl<'t> Rule<'t> { + /// Matches a char lexicographically between start and end pub fn char_between(self, start: char, end: char) -> Self { self.char_fn(|c| start <= c && c <= end) } + /// Matches a single char pub fn char(self, c: char) -> Self { self.has(|rule| rule.text.starts_with(c), 1) } + /// Matches the entirety of a string slice pub fn str(self, s: &str) -> Self { self.has(|rule| rule.text.starts_with(s), s.len()) } + /// Matches a char based on the output of a function pub fn char_fn(self, f: impl Fn(char) -> bool) -> Self { self.and(|rule| match rule.text.strip_prefix(&f) { Some(text) => Self { text, taken: rule.taken + next_utf8(rule.text, 1), ..rule }, None => Self { is_alright: false, ..rule }, }) } + /// Matches a single char except c pub fn not_char(self, c: char) -> Self { self.has(|rule| !rule.text.starts_with(c), 1) } + /// Matches a single char unless the text starts with s pub fn not_str(self, s: &str) -> Self { self.has(|rule| !rule.text.starts_with(s), 1) } + // commonly used character classes + /// Matches one of any character pub fn any(self) -> Self { self.has(|_| true, 1) } + /// Matches one whitespace pub fn whitespace(self) -> Self { self.char_fn(|c| c.is_whitespace()) } + /// Matches one XID_START pub fn xid_start(self) -> Self { use unicode_xid::UnicodeXID; self.char_fn(UnicodeXID::is_xid_start) } + /// Matches one XID_CONTINUE pub fn xid_continue(self) -> Self { use unicode_xid::UnicodeXID; self.char_fn(UnicodeXID::is_xid_continue) } + /// Matches one hexadecimal digit pub fn hex_digit(self) -> Self { self.char_fn(|c| c.is_ascii_hexdigit()) } + /// Matches one decimal digit pub fn 
dec_digit(self) -> Self { self.char_fn(|c| c.is_ascii_digit()) } + /// Matches one octal digit pub fn oct_digit(self) -> Self { self.char_between('0', '7') } + /// Matches one binary digit pub fn bin_digit(self) -> Self { self.char_between('0', '1') } + /// Matches any string escape "\." pub fn string_escape(self) -> Self { self.char('\\').and(Rule::any) } @@ -281,6 +321,8 @@ mod tests { // TODO } mod lexer { + use std::ops::Range; + use crate::{ lexer::*, token::{Token, Type}, @@ -288,11 +330,13 @@ mod tests { fn assert_whole_input_is_token<'t, F>(input: &'t str, f: F, ty: Type) where F: FnOnce(&mut Lexer<'t>) -> Option { - assert_has_type_and_len(input, f, ty, input.len()) + assert_has_type_and_range(input, f, ty, 0..input.len()) } - fn assert_has_type_and_len<'t, F>(input: &'t str, f: F, ty: Type, len: usize) + fn assert_has_type_and_range<'t, F>(input: &'t str, f: F, ty: Type, range: Range) where F: FnOnce(&mut Lexer<'t>) -> Option { - assert_eq!(Some(Token::new(ty, 0, len)), f(&mut Lexer::new(input)),) + let tok = f(&mut Lexer::new(input)).unwrap(); + assert_eq!(ty, tok.ty()); + assert_eq!(range, tok.range()); } mod comment { @@ -300,42 +344,47 @@ mod tests { #[test] fn line_comment() { + assert_whole_input_is_token("// comment!", Lexer::comment, Type::Comment); + } + #[test] + #[should_panic] + fn not_line_comment() { + assert_whole_input_is_token("fn main() {}", Lexer::comment, Type::Comment); + } + #[test] + fn block_comment() { + assert_whole_input_is_token("/* comment! 
*/", Lexer::comment, Type::Comment); + } + #[test] + fn nested_block_comment() { assert_whole_input_is_token( - "// this is a comment", - Lexer::line_comment, + "/* a /* nested */ comment */", + Lexer::comment, Type::Comment, ); } #[test] #[should_panic] - fn not_line_comment() { - assert_whole_input_is_token("fn main() {}", Lexer::line_comment, Type::Comment); - } - #[test] - fn block_comment() { + fn unclosed_nested_comment() { assert_whole_input_is_token( - "/* this is a comment */", - Lexer::block_comment, + "/* improperly /* nested */ comment", + Lexer::comment, Type::Comment, ); } #[test] #[should_panic] fn not_block_comment() { - assert_whole_input_is_token("fn main() {}", Lexer::block_comment, Type::Comment); + assert_whole_input_is_token("fn main() {}", Lexer::comment, Type::Comment); } #[test] fn shebang_comment() { - assert_whole_input_is_token( - "#!/ this is a comment", - Lexer::shebang_comment, - Type::Comment, - ); + assert_whole_input_is_token("#!/ comment!", Lexer::comment, Type::Comment); } #[test] #[should_panic] fn not_shebang_comment() { - assert_whole_input_is_token("fn main() {}", Lexer::shebang_comment, Type::Comment); + assert_whole_input_is_token("fn main() {}", Lexer::comment, Type::Comment); } } mod identifier { @@ -366,65 +415,70 @@ mod tests { use super::*; #[test] fn bare() { - assert_whole_input_is_token("10010110", Lexer::integer, Type::Integer); - assert_whole_input_is_token("12345670", Lexer::integer, Type::Integer); - assert_whole_input_is_token("1234567890", Lexer::integer, Type::Integer); + assert_whole_input_is_token("10010110", Lexer::lit_integer, Type::LitInteger); + assert_whole_input_is_token("12345670", Lexer::lit_integer, Type::LitInteger); + assert_whole_input_is_token("1234567890", Lexer::lit_integer, Type::LitInteger); } #[test] fn base16() { - assert_has_type_and_len("0x1234", Lexer::integer, Type::Integer, 6); - assert_has_type_and_len("0x1234 \"hello\"", Lexer::integer, Type::Integer, 6); + 
assert_has_type_and_range("0x1234", Lexer::lit_integer, Type::LitInteger, 0..6); + assert_has_type_and_range( + "0x1234 \"hello\"", + Lexer::lit_integer, + Type::LitInteger, + 0..6, + ); } #[test] fn base10() { - assert_whole_input_is_token("0d1234", Lexer::integer, Type::Integer); + assert_whole_input_is_token("0d1234", Lexer::lit_integer, Type::LitInteger); } #[test] fn base8() { - assert_whole_input_is_token("0o1234", Lexer::integer, Type::Integer); + assert_whole_input_is_token("0o1234", Lexer::lit_integer, Type::LitInteger); } #[test] fn base2() { - assert_whole_input_is_token("0b1010", Lexer::integer, Type::Integer); + assert_whole_input_is_token("0b1010", Lexer::lit_integer, Type::LitInteger); } } mod float { use super::*; #[test] fn number_dot_number_is_float() { - assert_whole_input_is_token("1.0", Lexer::float, Type::Float); + assert_whole_input_is_token("1.0", Lexer::lit_float, Type::LitFloat); } #[test] fn nothing_dot_number_is_float() { - assert_whole_input_is_token(".0", Lexer::float, Type::Float); + assert_whole_input_is_token(".0", Lexer::lit_float, Type::LitFloat); } #[test] #[should_panic] fn number_dot_nothing_is_not_float() { - assert_whole_input_is_token("1.", Lexer::float, Type::Float); + assert_whole_input_is_token("1.", Lexer::lit_float, Type::LitFloat); } #[test] #[should_panic] fn nothing_dot_nothing_is_not_float() { - assert_whole_input_is_token(".", Lexer::float, Type::Float); + assert_whole_input_is_token(".", Lexer::lit_float, Type::LitFloat); } } mod string { use super::*; #[test] fn empty_string() { - assert_whole_input_is_token("\"\"", Lexer::string, Type::String); + assert_whole_input_is_token("\"\"", Lexer::lit_string, Type::LitString); } #[test] fn unicode_string() { - assert_whole_input_is_token("\"I 💙 🦈!\"", Lexer::string, Type::String); + assert_whole_input_is_token("\"I 💙 🦈!\"", Lexer::lit_string, Type::LitString); } #[test] fn escape_string() { assert_whole_input_is_token( r#"" \"This is a quote\" ""#, - Lexer::string, - 
Type::String, + Lexer::lit_string, + Type::LitString, ); } }