lexer: Refactor Rule and associated tests
- Lexer now ignores leading whitespace
- Rule now has shorter, clearer function names
- Tests for comment lexing are now consolidated into a module
- Tests using the assert_has_type_and_len wrapper can now specify an expected length
This commit is contained in:
parent
71053f1992
commit
443cd11803
@ -60,35 +60,44 @@ pub mod lexer {
|
|||||||
self.cursor += len;
|
self.cursor += len;
|
||||||
Some(Token::new(ty, start, self.cursor))
|
Some(Token::new(ty, start, self.cursor))
|
||||||
}
|
}
|
||||||
|
fn text(&self) -> &str {
|
||||||
|
&self.text[self.cursor..]
|
||||||
|
}
|
||||||
|
fn skip_whitespace(&mut self) {
|
||||||
|
self.cursor += Rule::new(self.text).whitespace().end().unwrap_or_default()
|
||||||
|
}
|
||||||
// functions for lexing individual tokens
|
// functions for lexing individual tokens
|
||||||
pub fn line_comment(&mut self) -> Option<Token> {
|
pub fn line_comment(&mut self) -> Option<Token> {
|
||||||
// line_comment := "//" ~ (^newline)*
|
// line_comment := "//" ~ (^newline)*
|
||||||
|
self.skip_whitespace();
|
||||||
self.produce_token(
|
self.produce_token(
|
||||||
Type::Comment,
|
Type::Comment,
|
||||||
Rule::new(self.text)
|
Rule::new(self.text())
|
||||||
.take_str("//")
|
.str("//")
|
||||||
.and_any(|rule| rule.take_except_char('\n'))
|
.and_any(|rule| rule.not_char('\n'))
|
||||||
.end()?,
|
.end()?,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
pub fn block_comment(&mut self) -> Option<Token> {
|
pub fn block_comment(&mut self) -> Option<Token> {
|
||||||
// block_comment := "/*" ~ (block_comment | all_but("*/"))* ~ "*/"
|
// block_comment := "/*" ~ (block_comment | all_but("*/"))* ~ "*/"
|
||||||
|
self.skip_whitespace();
|
||||||
self.produce_token(
|
self.produce_token(
|
||||||
Type::Comment,
|
Type::Comment,
|
||||||
Rule::new(self.text)
|
Rule::new(self.text())
|
||||||
.take_str("/*")
|
.str("/*")
|
||||||
.and_any(|rule| rule.take_except_str("*/"))
|
.and_any(|rule| rule.not_str("*/"))
|
||||||
.take_str("*/")
|
.str("*/")
|
||||||
.end()?,
|
.end()?,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
pub fn shebang_comment(&mut self) -> Option<Token> {
|
pub fn shebang_comment(&mut self) -> Option<Token> {
|
||||||
// shebang_comment := "#!/" ~ (^newline)*
|
// shebang_comment := "#!/" ~ (^newline)*
|
||||||
|
self.skip_whitespace();
|
||||||
self.produce_token(
|
self.produce_token(
|
||||||
Type::Comment,
|
Type::Comment,
|
||||||
Rule::new(self.text)
|
Rule::new(self.text())
|
||||||
.take_str("#!/")
|
.str("#!/")
|
||||||
.and_any(|rule| rule.take_except_char('\n'))
|
.and_any(|rule| rule.not_char('\n'))
|
||||||
.end()?,
|
.end()?,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@ -107,28 +116,44 @@ pub mod lexer {
|
|||||||
pub fn end(self) -> Option<usize> {
|
pub fn end(self) -> Option<usize> {
|
||||||
self.is_alright.then_some(self.taken)
|
self.is_alright.then_some(self.taken)
|
||||||
}
|
}
|
||||||
|
pub fn remaining(&self) -> &str {
|
||||||
|
self.text
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'t> Rule<'t> {
|
impl<'t> Rule<'t> {
|
||||||
pub fn take_char(self, c: char) -> Self {
|
pub fn char_between(self, start: char, end: char) -> Self {
|
||||||
self.take(|this| this.text.starts_with(c), 1)
|
self.char_fn(|c| start <= c && c <= end)
|
||||||
}
|
}
|
||||||
pub fn take_except_char(self, c: char) -> Self {
|
pub fn char(self, c: char) -> Self {
|
||||||
self.take(|this| !this.text.starts_with(c), 1)
|
self.has(|rule| rule.text.starts_with(c), 1)
|
||||||
}
|
}
|
||||||
pub fn take_str(self, s: &str) -> Self {
|
pub fn str(self, s: &str) -> Self {
|
||||||
self.take(|this| this.text.starts_with(s), s.len())
|
self.has(|rule| rule.text.starts_with(s), s.len())
|
||||||
}
|
}
|
||||||
pub fn take_except_str(self, s: &str) -> Self {
|
pub fn char_fn(self, f: impl Fn(char) -> bool) -> Self {
|
||||||
self.take(|this| !this.text.starts_with(s), 1)
|
self.and(|rule| match rule.text.strip_prefix(&f) {
|
||||||
|
Some(text) => Self { text, taken: rule.taken + next_utf8(rule.text, 1), ..rule },
|
||||||
|
None => Self { is_alright: false, ..rule },
|
||||||
|
})
|
||||||
}
|
}
|
||||||
pub fn take_any(self) -> Self {
|
pub fn not_char(self, c: char) -> Self {
|
||||||
self.take(|_| true, 1)
|
self.has(|rule| !rule.text.starts_with(c), 1)
|
||||||
}
|
}
|
||||||
fn take(self, condition: impl Fn(&Self) -> bool, len: usize) -> Self {
|
pub fn not_str(self, s: &str) -> Self {
|
||||||
self.and(|this| match condition(&this) && !this.text.is_empty() {
|
self.has(|rule| !rule.text.starts_with(s), 1)
|
||||||
true => Self { text: &this.text[len..], taken: this.taken + len, ..this },
|
}
|
||||||
false => Self { is_alright: false, ..this },
|
pub fn any(self) -> Self {
|
||||||
|
self.has(|_| true, 1)
|
||||||
|
}
|
||||||
|
pub fn whitespace(self) -> Self {
|
||||||
|
self.and_any(|rule| rule.char_fn(|c| c.is_whitespace()))
|
||||||
|
}
|
||||||
|
fn has(self, condition: impl Fn(&Self) -> bool, len: usize) -> Self {
|
||||||
|
let len = next_utf8(self.text, len);
|
||||||
|
self.and(|rule| match condition(&rule) && !rule.text.is_empty() {
|
||||||
|
true => Self { text: &rule.text[len..], taken: rule.taken + len, ..rule },
|
||||||
|
false => Self { is_alright: false, ..rule },
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -141,6 +166,15 @@ pub mod lexer {
|
|||||||
Self { is_alright: true, ..self }
|
Self { is_alright: true, ..self }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the index of the next unicode character, rounded up
|
||||||
|
fn next_utf8(text: &str, mut index: usize) -> usize {
|
||||||
|
index = index.min(text.len());
|
||||||
|
while !text.is_char_boundary(index) {
|
||||||
|
index += 1
|
||||||
|
}
|
||||||
|
index
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub mod parser {
|
pub mod parser {
|
||||||
@ -156,12 +190,12 @@ mod tests {
|
|||||||
mod token {
|
mod token {
|
||||||
use crate::token::*;
|
use crate::token::*;
|
||||||
#[test]
|
#[test]
|
||||||
fn token_type_is_stored() {
|
fn token_has_type() {
|
||||||
let t = Token::new(Type::Comment, 0, 10);
|
assert_eq!(Token::new(Type::Comment, 0, 10).ty(), Type::Comment);
|
||||||
assert_eq!(t.ty(), Type::Comment);
|
assert_eq!(Token::new(Type::Identifier, 0, 10).ty(), Type::Identifier);
|
||||||
}
|
}
|
||||||
#[test]
|
#[test]
|
||||||
fn token_range_is_stored() {
|
fn token_has_range() {
|
||||||
let t = Token::new(Type::Comment, 0, 10);
|
let t = Token::new(Type::Comment, 0, 10);
|
||||||
assert_eq!(t.range(), 0..10);
|
assert_eq!(t.range(), 0..10);
|
||||||
}
|
}
|
||||||
@ -175,16 +209,25 @@ mod tests {
|
|||||||
token::{Token, Type},
|
token::{Token, Type},
|
||||||
};
|
};
|
||||||
|
|
||||||
fn assert_whole_input_is_token<'t, F>(input: &'t str, operation: F, output_type: Type)
|
fn assert_whole_input_is_token<'t, F>(input: &'t str, f: F, ty: Type)
|
||||||
where F: FnOnce(&mut Lexer<'t>) -> Option<Token> {
|
where F: FnOnce(&mut Lexer<'t>) -> Option<Token> {
|
||||||
assert_eq!(
|
assert_has_type_and_len(input, f, ty, input.len())
|
||||||
operation(&mut Lexer::new(input)),
|
|
||||||
Some(Token::new(output_type, 0, input.len()))
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
fn assert_has_type_and_len<'t, F>(input: &'t str, f: F, ty: Type, len: usize)
|
||||||
|
where F: FnOnce(&mut Lexer<'t>) -> Option<Token> {
|
||||||
|
assert_eq!(Some(Token::new(ty, 0, len)), f(&mut Lexer::new(input)),)
|
||||||
|
}
|
||||||
|
|
||||||
|
mod comment {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn line_comment() {
|
fn line_comment() {
|
||||||
assert_whole_input_is_token("// this is a comment", Lexer::line_comment, Type::Comment);
|
assert_whole_input_is_token(
|
||||||
|
"// this is a comment",
|
||||||
|
Lexer::line_comment,
|
||||||
|
Type::Comment,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
#[test]
|
#[test]
|
||||||
#[should_panic]
|
#[should_panic]
|
||||||
@ -218,6 +261,7 @@ mod tests {
|
|||||||
assert_whole_input_is_token("fn main() {}", Lexer::shebang_comment, Type::Comment);
|
assert_whole_input_is_token("fn main() {}", Lexer::shebang_comment, Type::Comment);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
mod parser {
|
mod parser {
|
||||||
// TODO
|
// TODO
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user