From 8f07b29ff38362434b24ded9266fc94a748655b4 Mon Sep 17 00:00:00 2001
From: John
Date: Mon, 25 Sep 2023 14:22:27 -0500
Subject: [PATCH] lexer: Tokenize identifiers (resolves #2.)

---
 libconlang/Cargo.toml |  3 ++-
 libconlang/src/lib.rs | 44 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/libconlang/Cargo.toml b/libconlang/Cargo.toml
index 36dfd20..0ba9739 100644
--- a/libconlang/Cargo.toml
+++ b/libconlang/Cargo.toml
@@ -8,4 +8,5 @@ license.workspace = true
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-lerox ={ path = "../lerox" }
\ No newline at end of file
+lerox ={ path = "../lerox" }
+unicode-xid = "0.2.4"
diff --git a/libconlang/src/lib.rs b/libconlang/src/lib.rs
index ab65a78..642a305 100644
--- a/libconlang/src/lib.rs
+++ b/libconlang/src/lib.rs
@@ -8,6 +8,7 @@ pub mod token {
     #[derive(Clone, Copy, Debug, PartialEq, Eq)]
     pub enum Type {
         Comment,
+        Identifier,
     }
     #[derive(Clone, Copy, Debug, PartialEq, Eq)]
     pub struct Token {
@@ -101,6 +102,17 @@ pub mod lexer {
                     .end()?,
             )
         }
+        pub fn identifier(&mut self) -> Option {
+            self.skip_whitespace();
+            self.produce_token(
+                Type::Identifier,
+                Rule::new(self.text())
+                    .char('_')
+                    .or(Rule::xid_start)
+                    .and_any(Rule::xid_continue)
+                    .end()?,
+            )
+        }
     }
 
     #[derive(Clone, Debug, PartialEq, Eq)]
@@ -149,6 +161,14 @@ pub mod lexer {
         pub fn whitespace(self) -> Self {
             self.and_any(|rule| rule.char_fn(|c| c.is_whitespace()))
         }
+        pub fn xid_start(self) -> Self {
+            use unicode_xid::UnicodeXID;
+            self.char_fn(UnicodeXID::is_xid_start)
+        }
+        pub fn xid_continue(self) -> Self {
+            use unicode_xid::UnicodeXID;
+            self.char_fn(UnicodeXID::is_xid_continue)
+        }
         fn has(self, condition: impl Fn(&Self) -> bool, len: usize) -> Self {
             let len = next_utf8(self.text, len);
             self.and(|rule| match condition(&rule) && !rule.text.is_empty() {
@@ -261,6 +281,30 @@ mod tests {
                 assert_whole_input_is_token("fn main() {}", Lexer::shebang_comment, Type::Comment);
             }
         }
+        mod identifier {
+            use super::*;
+
+            #[test]
+            fn identifier() {
+                assert_whole_input_is_token(
+                    "valid_identifier",
+                    Lexer::identifier,
+                    Type::Identifier,
+                );
+                assert_whole_input_is_token("_0", Lexer::identifier, Type::Identifier);
+                assert_whole_input_is_token("_", Lexer::identifier, Type::Identifier);
+            }
+            #[test]
+            fn unicode_identifier() {
+                assert_whole_input_is_token("ζ_ζζζ_ζζζ_ζζζ", Lexer::identifier, Type::Identifier);
+                assert_whole_input_is_token("_ζζζ_ζζζ_ζζζ_", Lexer::identifier, Type::Identifier);
+            }
+            #[test]
+            #[should_panic]
+            fn not_identifier() {
+                assert_whole_input_is_token("123456789", Lexer::identifier, Type::Identifier);
+            }
+        }
     }
     mod parser {
         // TODO