From 8f07b29ff38362434b24ded9266fc94a748655b4 Mon Sep 17 00:00:00 2001
From: John <j@soft.fish>
Date: Mon, 25 Sep 2023 14:22:27 -0500
Subject: [PATCH] lexer: Tokenize identifiers (resolves #2.)

---
 libconlang/Cargo.toml |  3 ++-
 libconlang/src/lib.rs | 44 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)
diff --git a/libconlang/Cargo.toml b/libconlang/Cargo.toml
index 36dfd20..0ba9739 100644
--- a/libconlang/Cargo.toml
+++ b/libconlang/Cargo.toml
@@ -8,4 +8,5 @@ license.workspace = true
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-lerox ={ path = "../lerox" }
\ No newline at end of file
+lerox ={ path = "../lerox" }
+unicode-xid = "0.2.4"
diff --git a/libconlang/src/lib.rs b/libconlang/src/lib.rs
index ab65a78..642a305 100644
--- a/libconlang/src/lib.rs
+++ b/libconlang/src/lib.rs
@@ -8,6 +8,7 @@ pub mod token {
     #[derive(Clone, Copy, Debug, PartialEq, Eq)]
     pub enum Type {
         Comment,
+        Identifier, // token type passed to `produce_token` by `Lexer::identifier`
     }
     #[derive(Clone, Copy, Debug, PartialEq, Eq)]
     pub struct Token {
@@ -101,6 +102,17 @@ pub mod lexer {
                     .end()?,
             )
         }
+        pub fn identifier(&mut self) -> Option<Token> { // Tries to lex one `Type::Identifier` token from `self.text()`
+            self.skip_whitespace(); // runs before matching; presumably advances past leading whitespace (confirm in `skip_whitespace`)
+            self.produce_token(
+                Type::Identifier,
+                Rule::new(self.text())
+                    .char('_') // first character: a literal underscore...
+                    .or(Rule::xid_start) // ...or, as the alternative, a Unicode XID_Start character
+                    .and_any(Rule::xid_continue) // then XID_Continue characters; `and_any` presumably accepts zero of them, since "_" alone passes the tests
+                    .end()?, // `?` makes this function return early if `end()` does not produce a value
+            )
+        }
     }
 
     #[derive(Clone, Debug, PartialEq, Eq)]
@@ -149,6 +161,14 @@ pub mod lexer {
         pub fn whitespace(self) -> Self {
             self.and_any(|rule| rule.char_fn(|c| c.is_whitespace()))
         }
+        pub fn xid_start(self) -> Self { // Rule step: applies `char_fn` with the XID_Start predicate
+            use unicode_xid::UnicodeXID; // trait import scoped to this function; supplies `is_xid_start`
+            self.char_fn(UnicodeXID::is_xid_start)
+        }
+        pub fn xid_continue(self) -> Self { // Rule step: applies `char_fn` with the XID_Continue predicate
+            use unicode_xid::UnicodeXID; // trait import scoped to this function; supplies `is_xid_continue`
+            self.char_fn(UnicodeXID::is_xid_continue)
+        }
         fn has(self, condition: impl Fn(&Self) -> bool, len: usize) -> Self {
             let len = next_utf8(self.text, len);
             self.and(|rule| match condition(&rule) && !rule.text.is_empty() {
@@ -261,6 +281,30 @@ mod tests {
             assert_whole_input_is_token("fn main() {}", Lexer::shebang_comment, Type::Comment);
             }
         }
+        mod identifier { // tests for `Lexer::identifier`
+            use super::*;
+
+            #[test] // ASCII identifiers, including underscore-led and underscore-only names
+            fn identifier() {
+                assert_whole_input_is_token(
+                    "valid_identifier",
+                    Lexer::identifier,
+                    Type::Identifier,
+                );
+                assert_whole_input_is_token("_0", Lexer::identifier, Type::Identifier); // underscore followed by a digit
+                assert_whole_input_is_token("_", Lexer::identifier, Type::Identifier); // a lone underscore
+            }
+            #[test] // identifiers containing non-ASCII (Greek) letters
+            fn unicode_identifier() {
+                assert_whole_input_is_token("ζ_ζζζ_ζζζ_ζζζ", Lexer::identifier, Type::Identifier); // starts with a letter
+                assert_whole_input_is_token("_ζζζ_ζζζ_ζζζ_", Lexer::identifier, Type::Identifier); // starts and ends with '_'
+            }
+            #[test]
+            #[should_panic] // the assertion below is expected to panic: a digits-only input must not pass as an identifier
+            fn not_identifier() {
+                assert_whole_input_is_token("123456789", Lexer::identifier, Type::Identifier);
+            }
+        }
     }
     mod parser {
         // TODO