lexer: Tokenize identifiers (resolves #2.)

This commit is contained in:
John 2023-09-25 14:22:27 -05:00
parent 443cd11803
commit 8f07b29ff3
2 changed files with 46 additions and 1 deletions

View File

@ -9,3 +9,4 @@ license.workspace = true
[dependencies] [dependencies]
lerox ={ path = "../lerox" } lerox ={ path = "../lerox" }
unicode-xid = "0.2.4"

View File

@ -8,6 +8,7 @@ pub mod token {
#[derive(Clone, Copy, Debug, PartialEq, Eq)] #[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Type { pub enum Type {
Comment, Comment,
Identifier,
} }
#[derive(Clone, Copy, Debug, PartialEq, Eq)] #[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Token { pub struct Token {
@ -101,6 +102,17 @@ pub mod lexer {
.end()?, .end()?,
) )
} }
/// Attempts to lex an identifier at the current position.
///
/// `skip_whitespace` is called first. The rule then accepts either a literal
/// `_` or an `XID_Start` character, followed by any run of `XID_Continue`
/// characters. If the rule's `end()` yields `None`, this returns `None`
/// without calling `produce_token` (the whitespace skip has already run).
pub fn identifier(&mut self) -> Option<Token> {
    self.skip_whitespace();
    // Resolve the match up front so producing the token is a single, flat call.
    let matched = Rule::new(self.text())
        .char('_')
        .or(Rule::xid_start)
        .and_any(Rule::xid_continue)
        .end()?;
    self.produce_token(Type::Identifier, matched)
}
} }
#[derive(Clone, Debug, PartialEq, Eq)] #[derive(Clone, Debug, PartialEq, Eq)]
@ -149,6 +161,14 @@ pub mod lexer {
pub fn whitespace(self) -> Self { pub fn whitespace(self) -> Self {
self.and_any(|rule| rule.char_fn(|c| c.is_whitespace())) self.and_any(|rule| rule.char_fn(|c| c.is_whitespace()))
} }
pub fn xid_start(self) -> Self {
use unicode_xid::UnicodeXID;
self.char_fn(UnicodeXID::is_xid_start)
}
pub fn xid_continue(self) -> Self {
use unicode_xid::UnicodeXID;
self.char_fn(UnicodeXID::is_xid_continue)
}
fn has(self, condition: impl Fn(&Self) -> bool, len: usize) -> Self { fn has(self, condition: impl Fn(&Self) -> bool, len: usize) -> Self {
let len = next_utf8(self.text, len); let len = next_utf8(self.text, len);
self.and(|rule| match condition(&rule) && !rule.text.is_empty() { self.and(|rule| match condition(&rule) && !rule.text.is_empty() {
@ -261,6 +281,30 @@ mod tests {
assert_whole_input_is_token("fn main() {}", Lexer::shebang_comment, Type::Comment); assert_whole_input_is_token("fn main() {}", Lexer::shebang_comment, Type::Comment);
} }
} }
mod identifier {
    use super::*;

    /// ASCII identifiers, including a bare `_` and `_` followed by a digit,
    /// must each lex as one `Identifier` token spanning the whole input.
    #[test]
    fn identifier() {
        for input in ["valid_identifier", "_0", "_"] {
            assert_whole_input_is_token(input, Lexer::identifier, Type::Identifier);
        }
    }

    /// Non-ASCII letters are accepted both at the start and after a leading `_`.
    #[test]
    fn unicode_identifier() {
        for input in ["ζ_ζζζ_ζζζ_ζζζ", "_ζζζ_ζζζ_ζζζ_"] {
            assert_whole_input_is_token(input, Lexer::identifier, Type::Identifier);
        }
    }

    /// Input made only of digits is not an identifier, so the helper is
    /// expected to panic.
    #[test]
    #[should_panic]
    fn not_identifier() {
        let digits_only = "123456789";
        assert_whole_input_is_token(digits_only, Lexer::identifier, Type::Identifier);
    }
}
} }
mod parser { mod parser {
// TODO // TODO