lexer: Tokenize identifiers (resolves #2).
This commit is contained in:
parent
443cd11803
commit
8f07b29ff3
@ -8,4 +8,5 @@ license.workspace = true
|
|||||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
lerox ={ path = "../lerox" }
|
lerox ={ path = "../lerox" }
|
||||||
|
unicode-xid = "0.2.4"
|
||||||
|
@ -8,6 +8,7 @@ pub mod token {
|
|||||||
/// Token categories. The lexer methods visible in this file pass one of these
/// variants to `produce_token` to tag the token they build.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
|
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
|
||||||
pub enum Type {
|
pub enum Type {
|
||||||
// Used together with `Lexer::shebang_comment` in this file's tests.
Comment,
|
Comment,
|
||||||
|
// Tag passed to `produce_token` by `Lexer::identifier`.
Identifier,
|
||||||
}
|
}
|
||||||
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
|
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
|
||||||
pub struct Token {
|
pub struct Token {
|
||||||
@ -101,6 +102,17 @@ pub mod lexer {
|
|||||||
.end()?,
|
.end()?,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
/// Tries to lex an identifier from the lexer's remaining text.
///
/// Calls `skip_whitespace` first, then builds a `Rule` over `self.text()`:
/// a `_` character, or alternatively an XID-start character, followed by
/// XID-continue characters.
///
/// Returns `None` early when the rule's `end()` yields `None` (propagated by
/// `?`); otherwise returns whatever `produce_token` returns for
/// `Type::Identifier`.
pub fn identifier(&mut self) -> Option<Token> {
|
||||||
|
self.skip_whitespace();
|
||||||
|
self.produce_token(
|
||||||
|
Type::Identifier,
|
||||||
|
Rule::new(self.text())
|
||||||
|
// First character: a literal underscore...
.char('_')
|
||||||
|
// ...or (presumably as a fallback when `_` did not match; confirm
// against `Rule::or`) a character accepted by `Rule::xid_start`.
.or(Rule::xid_start)
|
||||||
|
// Remaining characters: repeated `Rule::xid_continue` matches.
// NOTE(review): the tests expect a lone `_` to lex as an identifier,
// so `and_any` presumably accepts zero repetitions; confirm.
.and_any(Rule::xid_continue)
|
||||||
|
// `?` turns a failed rule into an early `None` return.
.end()?,
|
||||||
|
)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||||
@ -149,6 +161,14 @@ pub mod lexer {
|
|||||||
/// Extends this rule with a repeated single-character step whose predicate
/// is `is_whitespace`, and returns the resulting rule.
///
/// NOTE(review): `and_any` presumably means "zero or more repetitions";
/// its definition is outside this view, so confirm.
pub fn whitespace(self) -> Self {
|
pub fn whitespace(self) -> Self {
|
||||||
self.and_any(|rule| rule.char_fn(|c| c.is_whitespace()))
|
self.and_any(|rule| rule.char_fn(|c| c.is_whitespace()))
|
||||||
}
|
}
|
||||||
|
/// Extends this rule with a single-character step whose predicate is
/// `UnicodeXID::is_xid_start` from the `unicode-xid` crate, and returns the
/// resulting rule.
///
/// Used by `Lexer::identifier` as the alternative to a leading `_`.
pub fn xid_start(self) -> Self {
|
||||||
|
// Function-scoped trait import; it brings `is_xid_start` into scope.
use unicode_xid::UnicodeXID;
|
||||||
|
self.char_fn(UnicodeXID::is_xid_start)
|
||||||
|
}
|
||||||
|
/// Extends this rule with a single-character step whose predicate is
/// `UnicodeXID::is_xid_continue` from the `unicode-xid` crate, and returns
/// the resulting rule.
///
/// Used by `Lexer::identifier` (through `and_any`) for the characters after
/// the first one.
pub fn xid_continue(self) -> Self {
|
||||||
|
// Function-scoped trait import; it brings `is_xid_continue` into scope.
use unicode_xid::UnicodeXID;
|
||||||
|
self.char_fn(UnicodeXID::is_xid_continue)
|
||||||
|
}
|
||||||
fn has(self, condition: impl Fn(&Self) -> bool, len: usize) -> Self {
|
fn has(self, condition: impl Fn(&Self) -> bool, len: usize) -> Self {
|
||||||
let len = next_utf8(self.text, len);
|
let len = next_utf8(self.text, len);
|
||||||
self.and(|rule| match condition(&rule) && !rule.text.is_empty() {
|
self.and(|rule| match condition(&rule) && !rule.text.is_empty() {
|
||||||
@ -261,6 +281,30 @@ mod tests {
|
|||||||
assert_whole_input_is_token("fn main() {}", Lexer::shebang_comment, Type::Comment);
|
assert_whole_input_is_token("fn main() {}", Lexer::shebang_comment, Type::Comment);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Tests for `Lexer::identifier`. Every case goes through the
// `assert_whole_input_is_token` helper, which is defined outside this view;
// judging by its name it presumably checks that the entire input lexes as a
// single token of the given type (confirm against the helper).
mod identifier {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
// ASCII identifiers: letters with underscores, `_` followed by a digit,
// and a lone `_`.
#[test]
|
||||||
|
fn identifier() {
|
||||||
|
assert_whole_input_is_token(
|
||||||
|
"valid_identifier",
|
||||||
|
Lexer::identifier,
|
||||||
|
Type::Identifier,
|
||||||
|
);
|
||||||
|
assert_whole_input_is_token("_0", Lexer::identifier, Type::Identifier);
|
||||||
|
assert_whole_input_is_token("_", Lexer::identifier, Type::Identifier);
|
||||||
|
}
|
||||||
|
// Non-ASCII identifiers: one starting with a Greek letter, one starting
// and ending with `_`.
#[test]
|
||||||
|
fn unicode_identifier() {
|
||||||
|
assert_whole_input_is_token("ζ_ζζζ_ζζζ_ζζζ", Lexer::identifier, Type::Identifier);
|
||||||
|
assert_whole_input_is_token("_ζζζ_ζζζ_ζζζ_", Lexer::identifier, Type::Identifier);
|
||||||
|
}
|
||||||
|
// Negative case: because of `#[should_panic]`, this test passes only if
// the helper panics for an all-digit input.
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn not_identifier() {
|
||||||
|
assert_whole_input_is_token("123456789", Lexer::identifier, Type::Identifier);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
mod parser {
|
mod parser {
|
||||||
// TODO
|
// TODO
|
||||||
|
Loading…
Reference in New Issue
Block a user