token & lexer: add line & col to tokens

This commit is contained in:
John 2023-10-13 13:05:14 -05:00
parent 545483dae6
commit c15490847a

View File

@@ -83,22 +83,31 @@ pub mod token {
ty: Type, ty: Type,
head: usize, head: usize,
tail: usize, tail: usize,
line: usize,
col: usize,
} }
impl Token { impl Token {
pub fn new(ty: Type, head: usize, tail: usize) -> Self { pub fn new(ty: Type, head: usize, tail: usize, line: usize, col: usize) -> Self {
Self { ty, head, tail } Self { ty, head, tail, line, col }
}
pub fn line(&self) -> usize {
self.line
}
pub fn col(&self) -> usize {
self.col
} }
pub fn is_empty(&self) -> bool { pub fn is_empty(&self) -> bool {
self.tail == self.head self.tail == self.head
} }
/// Gets the length of the token, in bytes
pub fn len(&self) -> usize { pub fn len(&self) -> usize {
self.tail - self.head self.tail - self.head
} }
// Gets the [Type] of the token /// Gets the [Type] of the token
pub fn ty(&self) -> Type { pub fn ty(&self) -> Type {
self.ty self.ty
} }
// Gets the exclusive range of the token /// Gets the exclusive range of the token
pub fn range(&self) -> Range<usize> { pub fn range(&self) -> Range<usize> {
self.head..self.tail self.head..self.tail
} }
@@ -118,25 +127,48 @@ pub mod lexer {
pub struct Lexer<'t> { pub struct Lexer<'t> {
text: &'t str, text: &'t str,
cursor: usize, cursor: usize,
line: usize,
col: usize,
} }
/// Implements the non-terminals of a language /// Implements the non-terminals of a language
impl<'t> Lexer<'t> { impl<'t> Lexer<'t> {
pub fn new(text: &'t str) -> Self { pub fn new(text: &'t str) -> Self {
Self { text, cursor: 0 } Self { text, cursor: 0, line: 1, col: 1 }
}
/// Counts some length
#[inline]
fn count_len(&mut self, len: usize) -> &mut Self {
self.cursor += len;
self.col += len;
self
}
/// Counts a line
#[inline]
fn count_line(&mut self, lines: usize) -> &mut Self {
self.line += lines;
self.col = 1;
self
} }
/// Skips whitespace in the text /// Skips whitespace in the text
fn skip_whitespace(&mut self) { fn skip_whitespace(&mut self) {
if let Some(len) = Rule::new(self.text()).and_any(Rule::whitespace).end() { self.count_len(
self.cursor += len Rule::new(self.text())
.and_any(Rule::whitespace_not_newline)
.end()
.unwrap_or_default(),
);
if Rule::new(self.text()).char('\n').end().is_some() {
// recurse until all newlines are skipped
self.count_len(1).count_line(1).skip_whitespace();
} }
} }
/// Advances the cursor and produces a token from a provided [Rule] function /// Advances the cursor and produces a token from a provided [Rule] function
fn map_rule<F>(&mut self, rule: F, ty: Type) -> Option<Token> fn map_rule<F>(&mut self, rule: F, ty: Type) -> Option<Token>
where F: Fn(Rule) -> Rule { where F: Fn(Rule) -> Rule {
self.skip_whitespace(); self.skip_whitespace();
let start = self.cursor; let (line, col, start) = (self.line, self.col, self.cursor);
self.cursor += Rule::new(self.text()).and(rule).end()?; self.count_len(Rule::new(self.text()).and(rule).end()?);
Some(Token::new(ty, start, self.cursor)) Some(Token::new(ty, start, self.cursor, line, col))
} }
/// Gets a slice of text beginning at the cursor /// Gets a slice of text beginning at the cursor
fn text(&self) -> &str { fn text(&self) -> &str {
@@ -553,6 +585,10 @@ pub mod lexer {
pub fn whitespace(self) -> Self { pub fn whitespace(self) -> Self {
self.char_fn(|c| c.is_whitespace()) self.char_fn(|c| c.is_whitespace())
} }
/// Matches one whitespace, except `'\n'`
pub fn whitespace_not_newline(self) -> Self {
self.char_fn(|c| '\n' != c && c.is_whitespace())
}
/// Matches anything but whitespace /// Matches anything but whitespace
pub fn not_whitespace(self) -> Self { pub fn not_whitespace(self) -> Self {
self.char_fn(|c| !c.is_whitespace()) self.char_fn(|c| !c.is_whitespace())
@@ -587,6 +623,7 @@ pub mod lexer {
pub fn string_escape(self) -> Self { pub fn string_escape(self) -> Self {
self.char('\\').and(Rule::any) self.char('\\').and(Rule::any)
} }
/// Performs a consuming condition assertion on the input
fn has(self, condition: impl Fn(&Self) -> bool, len: usize) -> Self { fn has(self, condition: impl Fn(&Self) -> bool, len: usize) -> Self {
let len = next_utf8(self.text, len); let len = next_utf8(self.text, len);
self.and(|rule| match condition(&rule) && !rule.text.is_empty() { self.and(|rule| match condition(&rule) && !rule.text.is_empty() {
@@ -629,12 +666,15 @@ mod tests {
use crate::token::*; use crate::token::*;
#[test] #[test]
fn token_has_type() { fn token_has_type() {
assert_eq!(Token::new(Type::Comment, 0, 10).ty(), Type::Comment); assert_eq!(Token::new(Type::Comment, 0, 10, 1, 1).ty(), Type::Comment);
assert_eq!(Token::new(Type::Identifier, 0, 10).ty(), Type::Identifier); assert_eq!(
Token::new(Type::Identifier, 0, 10, 1, 1).ty(),
Type::Identifier
);
} }
#[test] #[test]
fn token_has_range() { fn token_has_range() {
let t = Token::new(Type::Comment, 0, 10); let t = Token::new(Type::Comment, 0, 10, 1, 1);
assert_eq!(t.range(), 0..10); assert_eq!(t.range(), 0..10);
} }
} }
@@ -850,7 +890,7 @@ mod tests {
#[test] #[test]
fn escape_string() { fn escape_string() {
assert_whole_input_is_token( assert_whole_input_is_token(
r#"" \"This is a quote\" ""#, "\" \\\"This is a quote\\\" \"",
Lexer::lit_string, Lexer::lit_string,
Type::LitString, Type::LitString,
); );