lexer: Tokenize identifiers (resolves #2).

This commit is contained in:
John 2023-09-25 14:22:27 -05:00
parent 443cd11803
commit 8f07b29ff3
2 changed files with 46 additions and 1 deletion

View File

@@ -8,4 +8,5 @@ license.workspace = true
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
lerox ={ path = "../lerox" }
lerox ={ path = "../lerox" }
unicode-xid = "0.2.4"

View File

@@ -8,6 +8,7 @@ pub mod token {
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Type {
/// A comment token.
Comment,
/// An identifier: `_` or an XID_Start character, then XID_Continue characters.
Identifier,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Token {
@@ -101,6 +102,17 @@ pub mod lexer {
.end()?,
)
}
/// Lexes a single identifier token from the current position.
///
/// An identifier is `_` or a Unicode XID_Start character, followed by
/// any number (possibly zero) of XID_Continue characters.
/// Returns `None` when the input does not begin with an identifier
/// (the `?` on `.end()` propagates the failed rule).
pub fn identifier(&mut self) -> Option<Token> {
// Leading whitespace is skipped and is not part of the token.
self.skip_whitespace();
self.produce_token(
Type::Identifier,
Rule::new(self.text())
// First char: `_` or XID_Start.
.char('_')
.or(Rule::xid_start)
// Remaining chars: zero or more XID_Continue.
.and_any(Rule::xid_continue)
.end()?,
)
}
}
#[derive(Clone, Debug, PartialEq, Eq)]
@@ -149,6 +161,14 @@ pub mod lexer {
/// Matches any run (possibly empty) of whitespace characters.
pub fn whitespace(self) -> Self {
self.and_any(|rule| rule.char_fn(|c| c.is_whitespace()))
}
/// Matches one character with the Unicode XID_Start property
/// (valid as the first character of an identifier).
pub fn xid_start(self) -> Self {
use unicode_xid::UnicodeXID;
self.char_fn(UnicodeXID::is_xid_start)
}
/// Matches one character with the Unicode XID_Continue property
/// (valid as a non-initial character of an identifier).
pub fn xid_continue(self) -> Self {
use unicode_xid::UnicodeXID;
self.char_fn(UnicodeXID::is_xid_continue)
}
fn has(self, condition: impl Fn(&Self) -> bool, len: usize) -> Self {
let len = next_utf8(self.text, len);
self.and(|rule| match condition(&rule) && !rule.text.is_empty() {
@@ -261,6 +281,30 @@ mod tests {
assert_whole_input_is_token("fn main() {}", Lexer::shebang_comment, Type::Comment);
}
}
// Tests for `Lexer::identifier`.
mod identifier {
use super::*;
#[test]
fn identifier() {
// ASCII identifiers: letters with underscores, underscore-led, and a lone `_`.
assert_whole_input_is_token(
"valid_identifier",
Lexer::identifier,
Type::Identifier,
);
assert_whole_input_is_token("_0", Lexer::identifier, Type::Identifier);
assert_whole_input_is_token("_", Lexer::identifier, Type::Identifier);
}
#[test]
fn unicode_identifier() {
// Non-ASCII XID_Start/XID_Continue characters are accepted.
assert_whole_input_is_token("ζ_ζζζ_ζζζ_ζζζ", Lexer::identifier, Type::Identifier);
assert_whole_input_is_token("_ζζζ_ζζζ_ζζζ_", Lexer::identifier, Type::Identifier);
}
#[test]
#[should_panic]
fn not_identifier() {
// A digit may not start an identifier, so the helper must panic here.
assert_whole_input_is_token("123456789", Lexer::identifier, Type::Identifier);
}
}
}
mod parser {
// TODO