From b5abd2bff10baece3a36dcb622ff4f5e21a0e51c Mon Sep 17 00:00:00 2001
From: John
Date: Sun, 22 Oct 2023 18:28:20 -0500
Subject: [PATCH] Lexer rewrite:

- Scan the input string *linearly*, without backtracking
- Peek at most one character (unicode code-point) ahead
- Store data (unescaped string literals and chars, identifiers, integers, floats) inside Token
- This unfortunately makes tokens non-Copy
- Refactor Parser to accommodate these changes
- On the bright side, Parser no longer needs a reference to the text!
- Write a new set of lexer tests
- TODO: write a new set of token tests using TokenData (a sketch follows the patch)

Every day, we get closer to parsing `dummy.cl`!
---
 libconlang/examples/identify_tokens.rs |  24 +-
 libconlang/src/lexer.rs                | 957 ++++++++++++-------------
 libconlang/src/parser.rs               | 131 ++--
 libconlang/src/tests.rs                | 588 ++++-----------
 libconlang/src/token.rs                |  79 +-
 5 files changed, 716 insertions(+), 1063 deletions(-)

diff --git a/libconlang/examples/identify_tokens.rs b/libconlang/examples/identify_tokens.rs
index dc68456..3c9c9d0 100644
--- a/libconlang/examples/identify_tokens.rs
+++ b/libconlang/examples/identify_tokens.rs
@@ -13,7 +13,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         take_stdin()?;
     } else {
         for path in conf.paths.iter().map(PathBuf::as_path) {
-            lex_tokens(&std::fs::read_to_string(path)?, Some(path));
+            lex_tokens(&std::fs::read_to_string(path)?, Some(path))?;
         }
     }
     Ok(())
@@ -32,29 +32,37 @@ impl Config {
 fn take_stdin() -> Result<(), Box<dyn std::error::Error>> {
     if stdin().is_terminal() {
         for line in stdin().lines() {
-            lex_tokens(&line?, None)
+            lex_tokens(&line?, None)?
         }
     } else {
-        lex_tokens(&std::io::read_to_string(stdin())?, None)
+        lex_tokens(&std::io::read_to_string(stdin())?, None)?
     }
     Ok(())
 }
 
-fn lex_tokens(file: &str, path: Option<&Path>) {
+fn lex_tokens(file: &str, path: Option<&Path>) -> Result<(), Box<dyn std::error::Error>> {
     for token in Lexer::new(file) {
+        let token = match token {
+            Ok(t) => t,
+            Err(e) => {
+                println!("{e:?}");
+                break;
+            },
+        };
         if let Some(path) = path {
             print!("{path:?}:")
         }
-        print_token(file, token);
+        print_token(token);
     }
+    Ok(())
 }
 
-fn print_token(line: &str, t: conlang::token::Token) {
+fn print_token(t: conlang::token::Token) {
     println!(
-        "{:02}:{:02}: {:#19} │{}│",
+        "{:02}:{:02}: {:#19} │{:?}│",
         t.line(),
         t.col(),
         t.ty(),
-        &line[t.range()]
+        t.data(),
     )
 }
diff --git a/libconlang/src/lexer.rs b/libconlang/src/lexer.rs
index a5f5002..59c226e 100644
--- a/libconlang/src/lexer.rs
+++ b/libconlang/src/lexer.rs
@@ -1,548 +1,477 @@
 //! Converts a text file into tokens
-use crate::token::{Token, Type};
-use lerox::Combinator;
+use crate::token::{Keyword, Token, TokenData, Type};
+use std::{
+    iter::Peekable,
+    str::{Chars, FromStr},
+};
+use unicode_xid::UnicodeXID;
 
-pub struct IntoIter<'t> {
-    lexer: Lexer<'t>,
-}
-impl<'t> Iterator for IntoIter<'t> {
-    type Item = Token;
-    fn next(&mut self) -> Option<Self::Item> {
-        self.lexer.any()
+pub mod lexer_iter {
+    use super::{
+        error::{LResult, Reason},
+        Lexer, Token,
+    };
+
+    /// Fallible iterator over a [Lexer], returning optional [LResult]s
+    pub struct LexerIter<'t> {
+        lexer: Lexer<'t>,
     }
-}
-impl<'t> IntoIterator for Lexer<'t> {
-    type Item = Token;
-    type IntoIter = IntoIter<'t>;
-    fn into_iter(self) -> Self::IntoIter {
-        IntoIter { lexer: self }
+    impl<'t> Iterator for LexerIter<'t> {
+        type Item = LResult<Token>;
+        fn next(&mut self) -> Option<Self::Item> {
+            match self.lexer.scan() {
+                Ok(v) => Some(Ok(v)),
+                Err(e) => {
+                    if e.reason == Reason::EndOfFile {
+                        None
+                    } else {
+                        Some(Err(e))
+                    }
+                }
+            }
+        }
+    }
+    impl<'t> IntoIterator for Lexer<'t> {
+        type Item = LResult<Token>;
+        type IntoIter = LexerIter<'t>;
+        fn into_iter(self) -> Self::IntoIter {
+            LexerIter { lexer: self }
+        }
     }
 }
 
 #[derive(Clone, Debug)]
 pub struct Lexer<'t> {
-    text: &'t str,
-    cursor: usize,
-    line: u32,
-    col: u32,
+    iter: Peekable<Chars<'t>>,
+    start: usize,
+    start_loc: (u32, u32),
+    current: usize,
+    current_loc: (u32, u32),
 }
-/// Implements the non-terminals of a language
+
 impl<'t> Lexer<'t> {
     pub fn new(text: &'t str) -> Self {
-        Self { text, cursor: 0, line: 1, col: 1 }
-    }
-    /// Consumes the entire [`Lexer`], producing a [`Vec<Token>`]
-    /// and returning the original string
-    pub fn consume(self) -> (Vec<Token>, &'t str) {
-        let text = self.text;
-        (self.into_iter().collect(), text)
-    }
-    /// Counts some length
-    #[inline]
-    fn count_len(&mut self, len: usize) -> &mut Self {
-        self.cursor += len;
-        self.col += len as u32;
-        self
-    }
-    /// Counts a line
-    #[inline]
-    fn count_line(&mut self, lines: u32) -> &mut Self {
-        self.line += lines;
-        self.col = 1;
-        self
-    }
-    /// Skips whitespace in the text
-    fn skip_whitespace(&mut self) {
-        self.count_len(
-            Rule::new(self.text())
-                .and_any(Rule::whitespace_not_newline)
-                .end()
-                .unwrap_or_default(),
-        );
-        if Rule::new(self.text()).char('\n').end().is_some() {
-            // recurse until all newlines are skipped
-            self.count_len(1).count_line(1).skip_whitespace();
+        Self {
+            iter: text.chars().peekable(),
+            start: 0,
+            start_loc: (1, 1),
+            current: 0,
+            current_loc: (1, 1),
         }
     }
-    /// Advances the cursor and produces a token from a provided [Rule] function
-    fn map_rule<F>(&mut self, rule: F, ty: Type) -> Option<Token>
-    where F: Fn(Rule) -> Rule {
-        self.skip_whitespace();
-        let (line, col, start) = (self.line, self.col, self.cursor);
-        self.count_len(Rule::new(self.text()).and(rule).end()?);
-        Some(Token::new(ty, start, self.cursor, line, col))
-    }
-    /// Gets a slice of text beginning at the cursor
-    fn text(&self) -> &str {
-        &self.text[self.cursor..]
- } - // classifies a single arbitrary token - /// Returns the result of the rule with the highest precedence, if any matches - pub fn any(&mut self) -> Option { - None.or_else(|| self.comment()) - .or_else(|| self.identifier()) - .or_else(|| self.literal()) - .or_else(|| self.delimiter()) - .or_else(|| self.punctuation()) - .or_else(|| self.invalid()) - } - /// Attempts to produce a [Type::String], [Type::Float], or [Type::Integer] - pub fn literal(&mut self) -> Option { - None.or_else(|| self.string()) - .or_else(|| self.character()) - .or_else(|| self.float()) - .or_else(|| self.integer()) - } - /// Evaluates delimiter rules - pub fn delimiter(&mut self) -> Option { - None.or_else(|| self.l_brack()) - .or_else(|| self.r_brack()) - .or_else(|| self.l_curly()) - .or_else(|| self.r_curly()) - .or_else(|| self.l_paren()) - .or_else(|| self.r_paren()) - } - /// Evaluates punctuation rules - pub fn punctuation(&mut self) -> Option { - None.or_else(|| self.amp_amp()) // && - .or_else(|| self.amp_eq()) // &= - .or_else(|| self.amp()) // & - .or_else(|| self.at()) // @ - .or_else(|| self.backslash()) // \ - .or_else(|| self.bang_bang()) // !! - .or_else(|| self.bang_eq()) // != - .or_else(|| self.bang()) // ! - .or_else(|| self.bar_bar()) // || - .or_else(|| self.bar_eq()) // |= - .or_else(|| self.bar()) // | - .or_else(|| self.colon()) // : - .or_else(|| self.comma()) // , - .or_else(|| self.dot_dot_eq()) // ..= - .or_else(|| self.dot_dot()) // .. - .or_else(|| self.dot()) // . - .or_else(|| self.eq_eq()) // == - .or_else(|| self.fatarrow()) // => - .or_else(|| self.eq()) // = - .or_else(|| self.grave()) // ` - .or_else(|| self.gt_eq()) // >= - .or_else(|| self.gt_gt_eq()) // >>= - .or_else(|| self.gt_gt()) // >> - .or_else(|| self.gt()) // > - .or_else(|| self.hash()) // # - .or_else(|| self.lt_eq()) // <= - .or_else(|| self.lt_lt_eq()) // <<= - .or_else(|| self.lt_lt()) // << - .or_else(|| self.lt()) // < - .or_else(|| self.minus_eq()) // -= - .or_else(|| self.arrow()) // -> - .or_else(|| self.minus()) // - - .or_else(|| self.plus_eq()) // += - .or_else(|| self.plus()) // + - .or_else(|| self.question()) // ? 
- .or_else(|| self.rem_eq()) // %= - .or_else(|| self.rem()) // % - .or_else(|| self.semi()) // ; - .or_else(|| self.slash_eq()) // /= - .or_else(|| self.slash()) // / - .or_else(|| self.star_eq()) // *= - .or_else(|| self.star()) // * - .or_else(|| self.tilde()) // ~ - .or_else(|| self.xor_eq()) // ^= - .or_else(|| self.xor_xor()) // ^^ - .or_else(|| self.xor()) // ^ - } - pub fn unary_op(&mut self) -> Option { - self.bang().or_else(|| self.minus()) - } - // functions for lexing individual tokens - pub fn invalid(&mut self) -> Option { - self.map_rule(|r| r.invalid(), Type::Invalid) - } - // comments - pub fn comment(&mut self) -> Option { - self.map_rule(|r| r.comment(), Type::Comment) - } - // identifiers - pub fn identifier(&mut self) -> Option { - self.map_rule(|r| r.identifier(), Type::Identifier) - .map(|token| match self.text[token.range()].parse() { - Ok(kw) => token.cast(Type::Keyword(kw)), - Err(_) => token, - }) - } - // literals - pub fn integer(&mut self) -> Option { - self.map_rule(|r| r.integer(), Type::Integer) - } - pub fn float(&mut self) -> Option { - self.map_rule(|r| r.float(), Type::Float) - } - pub fn string(&mut self) -> Option { - // TODO: count lines and columns properly within string - self.map_rule(|r| r.string(), Type::String) - .map(|t| t.rebound(t.head + 1, t.tail - 1)) - } - pub fn character(&mut self) -> Option { - self.map_rule(|r| r.character(), Type::Character) - .map(|t| t.rebound(t.head + 1, t.tail - 1)) - } - // delimiters - pub fn l_brack(&mut self) -> Option { - self.map_rule(|r| r.char('['), Type::LBrack) - } - pub fn r_brack(&mut self) -> Option { - self.map_rule(|r| r.char(']'), Type::RBrack) - } - pub fn l_curly(&mut self) -> Option { - self.map_rule(|r| r.char('{'), Type::LCurly) - } - pub fn r_curly(&mut self) -> Option { - self.map_rule(|r| r.char('}'), Type::RCurly) - } - pub fn l_paren(&mut self) -> Option { - self.map_rule(|r| r.char('('), Type::LParen) - } - pub fn r_paren(&mut self) -> Option { - self.map_rule(|r| r.char(')'), Type::RParen) - } - // compound punctuation - pub fn lt_lt(&mut self) -> Option { - self.map_rule(|r| r.str("<<"), Type::LtLt) - } - pub fn gt_gt(&mut self) -> Option { - self.map_rule(|r| r.str(">>"), Type::GtGt) - } - pub fn amp_amp(&mut self) -> Option { - self.map_rule(|r| r.str("&&"), Type::AmpAmp) - } - pub fn bar_bar(&mut self) -> Option { - self.map_rule(|r| r.str("||"), Type::BarBar) - } - pub fn bang_bang(&mut self) -> Option { - self.map_rule(|r| r.str("!!"), Type::BangBang) - } - pub fn xor_xor(&mut self) -> Option { - self.map_rule(|r| r.str("^^"), Type::XorXor) - } - pub fn eq_eq(&mut self) -> Option { - self.map_rule(|r| r.str("=="), Type::EqEq) - } - pub fn gt_eq(&mut self) -> Option { - self.map_rule(|r| r.str(">="), Type::GtEq) - } - pub fn lt_eq(&mut self) -> Option { - self.map_rule(|r| r.str("<="), Type::LtEq) - } - pub fn bang_eq(&mut self) -> Option { - self.map_rule(|r| r.str("!="), Type::BangEq) - } - pub fn star_eq(&mut self) -> Option { - self.map_rule(|r| r.str("*="), Type::StarEq) - } - pub fn slash_eq(&mut self) -> Option { - self.map_rule(|r| r.str("/="), Type::SlashEq) - } - pub fn rem_eq(&mut self) -> Option { - self.map_rule(|r| r.str("%="), Type::RemEq) - } - pub fn plus_eq(&mut self) -> Option { - self.map_rule(|r| r.str("+="), Type::PlusEq) - } - pub fn minus_eq(&mut self) -> Option { - self.map_rule(|r| r.str("-="), Type::MinusEq) - } - pub fn amp_eq(&mut self) -> Option { - self.map_rule(|r| r.str("&="), Type::AmpEq) - } - pub fn bar_eq(&mut self) -> Option { - 
self.map_rule(|r| r.str("|="), Type::BarEq) - } - pub fn xor_eq(&mut self) -> Option { - self.map_rule(|r| r.str("^="), Type::XorEq) - } - pub fn lt_lt_eq(&mut self) -> Option { - self.map_rule(|r| r.str("<<="), Type::LtLtEq) - } - pub fn gt_gt_eq(&mut self) -> Option { - self.map_rule(|r| r.str(">>="), Type::GtGtEq) - } - pub fn dot_dot_eq(&mut self) -> Option { - self.map_rule(|r| r.str("..="), Type::DotDotEq) - } - pub fn dot_dot(&mut self) -> Option { - self.map_rule(|r| r.str(".."), Type::DotDot) - } - pub fn arrow(&mut self) -> Option { - self.map_rule(|r| r.str("->"), Type::Arrow) - } - pub fn fatarrow(&mut self) -> Option { - self.map_rule(|r| r.str("=>"), Type::FatArrow) - } - // simple punctuation - pub fn semi(&mut self) -> Option { - self.map_rule(|r| r.char(';'), Type::Semi) - } - pub fn dot(&mut self) -> Option { - self.map_rule(|r| r.char('.'), Type::Dot) - } - pub fn star(&mut self) -> Option { - self.map_rule(|r| r.char('*'), Type::Star) - } - pub fn slash(&mut self) -> Option { - self.map_rule(|r| r.char('/'), Type::Slash) - } - pub fn plus(&mut self) -> Option { - self.map_rule(|r| r.char('+'), Type::Plus) - } - pub fn minus(&mut self) -> Option { - self.map_rule(|r| r.char('-'), Type::Minus) - } - pub fn rem(&mut self) -> Option { - self.map_rule(|r| r.char('%'), Type::Rem) - } - pub fn bang(&mut self) -> Option { - self.map_rule(|r| r.char('!'), Type::Bang) - } - pub fn eq(&mut self) -> Option { - self.map_rule(|r| r.char('='), Type::Eq) - } - pub fn lt(&mut self) -> Option { - self.map_rule(|r| r.char('<'), Type::Lt) - } - pub fn gt(&mut self) -> Option { - self.map_rule(|r| r.char('>'), Type::Gt) - } - pub fn amp(&mut self) -> Option { - self.map_rule(|r| r.char('&'), Type::Amp) - } - pub fn bar(&mut self) -> Option { - self.map_rule(|r| r.char('|'), Type::Bar) - } - pub fn xor(&mut self) -> Option { - self.map_rule(|r| r.char('^'), Type::Xor) - } - pub fn hash(&mut self) -> Option { - self.map_rule(|r| r.char('#'), Type::Hash) - } - pub fn at(&mut self) -> Option { - self.map_rule(|r| r.char('@'), Type::At) - } - pub fn colon(&mut self) -> Option { - self.map_rule(|r| r.char(':'), Type::Colon) - } - pub fn question(&mut self) -> Option { - self.map_rule(|r| r.char('?'), Type::Question) - } - pub fn comma(&mut self) -> Option { - self.map_rule(|r| r.char(','), Type::Comma) - } - pub fn tilde(&mut self) -> Option { - self.map_rule(|r| r.char('~'), Type::Tilde) - } - pub fn grave(&mut self) -> Option { - self.map_rule(|r| r.char('`'), Type::Grave) - } - pub fn backslash(&mut self) -> Option { - self.map_rule(|r| r.char('\\'), Type::Backslash) + pub fn scan(&mut self) -> LResult { + match self.skip_whitespace().peek()? { + '{' => self.consume()?.produce(Type::LCurly, ()), + '}' => self.consume()?.produce(Type::RCurly, ()), + '[' => self.consume()?.produce(Type::LBrack, ()), + ']' => self.consume()?.produce(Type::RBrack, ()), + '(' => self.consume()?.produce(Type::LParen, ()), + ')' => self.consume()?.produce(Type::RParen, ()), + '&' => self.consume()?.amp(), + '@' => self.consume()?.produce(Type::At, ()), + '\\' => self.consume()?.produce(Type::Backslash, ()), + '!' => self.consume()?.bang(), + '|' => self.consume()?.bar(), + ':' => self.consume()?.produce(Type::Colon, ()), + ',' => self.consume()?.produce(Type::Comma, ()), + '.' 
=> self.consume()?.dot(), + '=' => self.consume()?.equal(), + '`' => self.consume()?.produce(Type::Grave, ()), + '>' => self.consume()?.greater(), + '#' => self.consume()?.produce(Type::Hash, ()), + '<' => self.consume()?.less(), + '-' => self.consume()?.minus(), + '+' => self.consume()?.plus(), + '?' => self.consume()?.produce(Type::Question, ()), + '%' => self.consume()?.rem(), + ';' => self.consume()?.produce(Type::Semi, ()), + '/' => self.consume()?.slash(), + '*' => self.consume()?.star(), + '~' => self.consume()?.produce(Type::Tilde, ()), + '^' => self.consume()?.xor(), + '0' => self.consume()?.int_with_base(), + '1'..='9' => self.digits::<10>(), + '"' => self.consume()?.string(), + '\'' => self.consume()?.character(), + '_' => self.identifier(), + i if i.is_xid_start() => self.identifier(), + e => Err(Error::unexpected_char(e, self.line(), self.col())), + } + } + /// Gets the line of the next token + pub fn line(&self) -> u32 { + self.start_loc.0 + } + /// Gets the column of the next token + pub fn col(&self) -> u32 { + self.start_loc.1 + } + fn next(&mut self) -> LResult { + let out = self.peek(); + self.consume()?; + out + } + fn peek(&mut self) -> LResult { + self.iter + .peek() + .copied() + .ok_or(Error::end_of_file(self.line(), self.col())) + } + fn produce(&mut self, ty: Type, data: impl Into) -> LResult { + let loc = self.start_loc; + self.start_loc = self.current_loc; + self.start = self.current; + Ok(Token::new(ty, data, loc.0, loc.1)) + } + fn skip_whitespace(&mut self) -> &mut Self { + while let Ok(c) = self.peek() { + if !c.is_whitespace() { + break; + } + let _ = self.consume(); + } + self.start = self.current; + self.start_loc = self.current_loc; + self + } + fn consume(&mut self) -> LResult<&mut Self> { + self.current += 1; + match self.iter.next() { + Some('\n') => { + let (line, col) = &mut self.current_loc; + *line += 1; + *col = 1; + } + Some(_) => self.current_loc.1 += 1, + None => Err(Error::end_of_file(self.line(), self.col()))?, + } + Ok(self) } } - -// TODO: use real, functional parser-combinators here to produce tokens -/// A lexer [Rule] matches patterns in text in a declarative manner -#[derive(Clone, Debug, PartialEq, Eq)] -pub struct Rule<'t> { - text: &'t str, - taken: usize, - is_alright: bool, -} -impl<'t> Rule<'t> { - pub fn new(text: &'t str) -> Self { - Self { text, taken: 0, is_alright: true } +/// Digraphs and trigraphs +impl<'t> Lexer<'t> { + fn amp(&mut self) -> LResult { + match self.peek() { + Ok('&') => self.consume()?.produce(Type::AmpAmp, ()), + Ok('=') => self.consume()?.produce(Type::AmpEq, ()), + _ => self.produce(Type::Amp, ()), + } } - pub fn end(self) -> Option { - self.is_alright.then_some(self.taken) + fn bang(&mut self) -> LResult { + match self.peek() { + Ok('!') => self.consume()?.produce(Type::BangBang, ()), + Ok('=') => self.consume()?.produce(Type::BangEq, ()), + _ => self.produce(Type::Bang, ()), + } } - pub fn remaining(&self) -> &str { - self.text + fn bar(&mut self) -> LResult { + match self.peek() { + Ok('|') => self.consume()?.produce(Type::BarBar, ()), + Ok('=') => self.consume()?.produce(Type::BarEq, ()), + _ => self.produce(Type::Bar, ()), + } + } + fn dot(&mut self) -> LResult { + match self.peek() { + Ok('.') => { + if let Ok('=') = self.consume()?.peek() { + self.consume()?.produce(Type::DotDotEq, ()) + } else { + self.produce(Type::DotDot, ()) + } + } + _ => self.produce(Type::Dot, ()), + } + } + fn equal(&mut self) -> LResult { + match self.peek() { + Ok('=') => self.consume()?.produce(Type::EqEq, ()), + 
Ok('>') => self.consume()?.produce(Type::FatArrow, ()), + _ => self.produce(Type::Eq, ()), + } + } + fn greater(&mut self) -> LResult { + match self.peek() { + Ok('=') => self.consume()?.produce(Type::GtEq, ()), + Ok('>') => { + if let Ok('=') = self.consume()?.peek() { + self.consume()?.produce(Type::GtGtEq, ()) + } else { + self.produce(Type::GtGt, ()) + } + } + _ => self.produce(Type::Gt, ()), + } + } + fn less(&mut self) -> LResult { + match self.peek() { + Ok('=') => self.consume()?.produce(Type::LtEq, ()), + Ok('<') => { + if let Ok('=') = self.consume()?.peek() { + self.consume()?.produce(Type::LtLtEq, ()) + } else { + self.produce(Type::LtLt, ()) + } + } + _ => self.produce(Type::Lt, ()), + } + } + fn minus(&mut self) -> LResult { + match self.peek() { + Ok('=') => self.consume()?.produce(Type::MinusEq, ()), + Ok('>') => self.consume()?.produce(Type::Arrow, ()), + _ => self.produce(Type::Minus, ()), + } + } + fn plus(&mut self) -> LResult { + match self.peek() { + Ok('=') => self.consume()?.produce(Type::PlusEq, ()), + _ => self.produce(Type::Plus, ()), + } + } + fn rem(&mut self) -> LResult { + match self.peek() { + Ok('=') => self.consume()?.produce(Type::RemEq, ()), + _ => self.produce(Type::Rem, ()), + } + } + fn slash(&mut self) -> LResult { + match self.peek() { + Ok('=') => self.consume()?.produce(Type::SlashEq, ()), + Ok('/') => self.consume()?.line_comment(), + Ok('*') => self.consume()?.block_comment(), + _ => self.produce(Type::Slash, ()), + } + } + fn star(&mut self) -> LResult { + match self.peek() { + Ok('=') => self.consume()?.produce(Type::StarEq, ()), + _ => self.produce(Type::Star, ()), + } + } + fn xor(&mut self) -> LResult { + match self.peek() { + Ok('=') => self.consume()?.produce(Type::XorEq, ()), + Ok('^') => self.consume()?.produce(Type::XorXor, ()), + _ => self.produce(Type::Xor, ()), + } } } - -impl<'t> Rule<'t> { - /// Matches any sequence of non-whitespace characters - pub fn invalid(self) -> Self { - self.and_many(Self::not_whitespace) +/// Comments +impl<'t> Lexer<'t> { + fn line_comment(&mut self) -> LResult { + while Ok('\n') != self.peek() { + self.consume()?; + } + self.produce(Type::Comment, ()) } - /// Matches a block, line, or shebang comment - pub fn comment(self) -> Self { - self.and_either(Self::line_comment, Self::block_comment) - } - /// Matches a line or shebang comment - fn line_comment(self) -> Self { - // line_comment := ("//" | "#!/") (!newline)* - self.str("//") - .or(|r| r.str("#!/")) - .and_any(|r| r.not_char('\n')) - } - /// Matches a block comment - fn block_comment(self) -> Self { - // block_comment := "/*" (block_comment | all_but("*/"))* "*/" - self.str("/*") - .and_any(|r| r.and_either(|f| f.block_comment(), |g| g.not_str("*/"))) - .str("*/") - } - /// Matches a Rust-style identifier - pub fn identifier(self) -> Self { - // identifier := ('_' | XID_START) ~ XID_CONTINUE* - self.char('_') - .or(Rule::xid_start) - .and_any(Rule::xid_continue) - } - /// Matches a Rust-style base-prefixed int literal - fn integer_kind(self, prefix: &str, digit: impl Fn(Self) -> Self) -> Self { - // int_kind := Prefix '_'* Digit (Digit | '_')* - self.str(prefix) - .and_any(|r| r.char('_')) - .and(&digit) - .and_any(|r| r.and(&digit).or(|r| r.char('_'))) - } - /// Matches a Rust-style integer literal - pub fn integer(self) -> Self { - // integer = (int_kind<0d, dec_digit> | int_kind<0x, hex_digit> - // | int_kind<0o, oct_digit> | int_kind<0b, bin_digit> | dec_digit (dec_digit | '_')*) - self.and_one_of(&[ - &|rule| rule.integer_kind("0d", 
Rule::dec_digit), - &|rule| rule.integer_kind("0x", Rule::hex_digit), - &|rule| rule.integer_kind("0o", Rule::oct_digit), - &|rule| rule.integer_kind("0b", Rule::bin_digit), - &|rule| { - rule.dec_digit() - .and_any(|r| r.dec_digit().or(|r| r.char('_'))) - }, - ]) - } - /// Matches a float literal - // TODO: exponent form - pub fn float(self) -> Self { - self.and_any(Rule::dec_digit) - .char('.') - .and_many(Rule::dec_digit) - } - /// Matches one apostrophe-delimited char literal - pub fn character(self) -> Self { - self.char('\'').character_continue().char('\'') - } - pub fn character_continue(self) -> Self { - self.and(|rule| rule.string_escape().or(|rule| rule.not_char('\''))) - } - /// Matches one quote-delimited string literal - pub fn string(self) -> Self { - self.char('"').and_any(Rule::string_continue).char('"') - } - /// Matches one string escape sequence or non-`"` characcter - pub fn string_continue(self) -> Self { - self.and(Rule::string_escape).or(|rule| rule.not_char('"')) + fn block_comment(&mut self) -> LResult { + while let Ok(c) = self.next() { + if '*' == c && Ok('/') == self.next() { + break; + } + } + self.produce(Type::Comment, ()) } } - -impl<'t> Rule<'t> { - /// Matches a char lexicographically between start and end - pub fn char_between(self, start: char, end: char) -> Self { - self.char_fn(|c| start <= c && c <= end) +/// Identifiers +impl<'t> Lexer<'t> { + fn identifier(&mut self) -> LResult { + let mut out = String::from(self.xid_start()?); + while let Ok(c) = self.xid_continue() { + out.push(c) + } + if let Ok(keyword) = Keyword::from_str(&out) { + self.produce(Type::Keyword(keyword), ()) + } else { + self.produce(Type::Identifier, TokenData::Identifier(out.into())) + } } - /// Matches a single char - pub fn char(self, c: char) -> Self { - self.has(|rule| rule.text.starts_with(c), 1) + fn xid_start(&mut self) -> LResult { + match self.peek()? { + xid if xid == '_' || xid.is_xid_start() => { + self.consume()?; + Ok(xid) + } + bad => Err(Error::not_identifier(bad, self.line(), self.col())), + } } - /// Matches the entirety of a string slice - pub fn str(self, s: &str) -> Self { - self.has(|rule| rule.text.starts_with(s), s.len()) + fn xid_continue(&mut self) -> LResult { + match self.peek()? { + xid if xid.is_xid_continue() => { + self.consume()?; + Ok(xid) + } + bad => Err(Error::not_identifier(bad, self.line(), self.col())), + } } - /// Matches a char based on the output of a function - pub fn char_fn(self, f: impl Fn(char) -> bool) -> Self { - self.and(|rule| match rule.text.strip_prefix(&f) { - Some(text) => Self { text, taken: rule.taken + next_utf8(rule.text, 1), ..rule }, - None => Self { is_alright: false, ..rule }, +} +/// Integers +impl<'t> Lexer<'t> { + fn int_with_base(&mut self) -> LResult { + match self.peek() { + Ok('x') => self.consume()?.digits::<16>(), + Ok('d') => self.consume()?.digits::<10>(), + Ok('o') => self.consume()?.digits::<8>(), + Ok('b') => self.consume()?.digits::<2>(), + Ok('0'..='9') => self.digits::<10>(), + _ => self.produce(Type::Integer, 0), + } + } + fn digits(&mut self) -> LResult { + let mut value = self.digit::()? as u128; + while let Ok(true) = self.peek().as_ref().map(char::is_ascii_alphanumeric) { + value = value * B as u128 + self.digit::()? 
as u128; + } + self.produce(Type::Integer, value) + } + fn digit(&mut self) -> LResult { + let digit = self.peek()?; + self.consume()?; + digit + .to_digit(B) + .ok_or(Error::invalid_digit(digit, self.line(), self.col())) + } +} +/// Strings and characters +impl<'t> Lexer<'t> { + fn string(&mut self) -> LResult { + let mut value = String::new(); + while '"' + != self + .peek() + .map_err(|e| e.mask_reason(Reason::UnmatchedDelimiters('"')))? + { + value.push(self.unescape()?) + } + self.consume()?.produce(Type::String, value) + } + fn character(&mut self) -> LResult { + let out = self.unescape()?; + match self.peek()? { + '\'' => self.consume()?.produce(Type::Character, out), + _ => Err(Error::unmatched_delimiters('\'', self.line(), self.col())), + } + } + /// Unescape a single character + fn unescape(&mut self) -> LResult { + match self.next() { + Ok('\\') => (), + other => return other, + } + Ok(match self.next()? { + 'a' => '\x07', + 'b' => '\x08', + 'f' => '\x0c', + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + 'x' => self.hex_escape()?, + 'u' => self.unicode_escape()?, + '0' => '\0', + chr => chr, }) } - /// Matches a single char except c - pub fn not_char(self, c: char) -> Self { - self.has(|rule| !rule.text.starts_with(c), 1) + /// unescape a single 2-digit hex escape + fn hex_escape(&mut self) -> LResult { + let out = (self.digit::<16>()? << 4) + self.digit::<16>()?; + char::from_u32(out).ok_or(Error::bad_unicode(out, self.line(), self.col())) } - /// Matches a single char unless the text starts with s - pub fn not_str(self, s: &str) -> Self { - self.has(|rule| !rule.text.starts_with(s), 1) - } - // commonly used character classes - /// Matches one of any character - pub fn any(self) -> Self { - self.has(|_| true, 1) - } - /// Matches one whitespace - pub fn whitespace(self) -> Self { - self.char_fn(|c| c.is_whitespace()) - } - /// Matches one whitespace, except `'\n'` - pub fn whitespace_not_newline(self) -> Self { - self.char_fn(|c| '\n' != c && c.is_whitespace()) - } - /// Matches anything but whitespace - pub fn not_whitespace(self) -> Self { - self.char_fn(|c| !c.is_whitespace()) - } - /// Matches one XID_START - pub fn xid_start(self) -> Self { - use unicode_xid::UnicodeXID; - self.char_fn(UnicodeXID::is_xid_start) - } - /// Matches one XID_CONTINUE - pub fn xid_continue(self) -> Self { - use unicode_xid::UnicodeXID; - self.char_fn(UnicodeXID::is_xid_continue) - } - /// Matches one hexadecimal digit - pub fn hex_digit(self) -> Self { - self.char_fn(|c| c.is_ascii_hexdigit()) - } - /// Matches one decimal digit - pub fn dec_digit(self) -> Self { - self.char_fn(|c| c.is_ascii_digit()) - } - /// Matches one octal digit - pub fn oct_digit(self) -> Self { - self.char_between('0', '7') - } - /// Matches one binary digit - pub fn bin_digit(self) -> Self { - self.char_between('0', '1') - } - /// Matches any string escape "\." 
- pub fn string_escape(self) -> Self { - self.char('\\').and(Rule::any) - } - /// Performs a consuming condition assertion on the input - fn has(self, condition: impl Fn(&Self) -> bool, len: usize) -> Self { - let len = next_utf8(self.text, len); - self.and(|rule| match condition(&rule) && !rule.text.is_empty() { - true => Self { text: &rule.text[len..], taken: rule.taken + len, ..rule }, - false => Self { is_alright: false, ..rule }, - }) + /// unescape a single \u{} unicode escape + fn unicode_escape(&mut self) -> LResult { + let mut out = 0; + let Ok('{') = self.peek() else { + return Err(Error::invalid_escape('u', self.line(), self.col())); + }; + self.consume()?; + while let Ok(c) = self.peek() { + match c { + '}' => { + self.consume()?; + return char::from_u32(out).ok_or(Error::bad_unicode( + out, + self.line(), + self.col(), + )); + } + _ => out = (out << 4) + self.digit::<16>()?, + } + } + Err(Error::invalid_escape('u', self.line(), self.col())) } } -impl<'t> lerox::Combinator for Rule<'t> { - fn is_alright(&self) -> bool { - self.is_alright - } - fn into_alright(self) -> Self { - Self { is_alright: true, ..self } - } -} +use error::{Error, LResult, Reason}; +pub mod error { + use std::fmt::Display; -/// Returns the index of the next unicode character, rounded up -fn next_utf8(text: &str, mut index: usize) -> usize { - index = index.min(text.len()); - while !text.is_char_boundary(index) { - index += 1 + pub type LResult = Result; + #[derive(Clone, Debug, PartialEq, Eq)] + pub struct Error { + pub reason: Reason, + pub line: u32, + pub col: u32, + } + /// The reason for the [Error] + #[derive(Clone, Copy, Debug, PartialEq, Eq)] + pub enum Reason { + UnmatchedDelimiters(char), + UnexpectedChar(char), + NotIdentifier(char), + UnknownEscape(char), + InvalidEscape(char), + InvalidDigit(char), + UnknownBase(char), + BadUnicode(u32), + EndOfFile, + } + error_impl! { + unmatched_delimiters(c: char) => Reason::UnmatchedDelimiters(c), + unexpected_char(c: char) => Reason::UnexpectedChar(c), + not_identifier(c: char) => Reason::NotIdentifier(c), + unknown_escape(e: char) => Reason::UnknownEscape(e), + invalid_escape(e: char) => Reason::InvalidEscape(e), + invalid_digit(digit: char) => Reason::InvalidDigit(digit), + unknown_base(base: char) => Reason::UnknownBase(base), + bad_unicode(value: u32) => Reason::BadUnicode(value), + end_of_file => Reason::EndOfFile, + } + impl Error { + /// Changes the [Reason] of this error + pub(super) fn mask_reason(self, reason: Reason) -> Self { + Self { reason, ..self } + } + /// Gets the (line, col) where the error happened + pub fn location(&self) -> (u32, u32) { + (self.line, self.col) + } + } + macro error_impl ($($fn:ident$(( $($p:ident: $t:ty),* ))? => $reason:expr),*$(,)?) { + #[allow(dead_code)] + impl Error { + $(pub(super) fn $fn ($($($p: $t),*,)? line: u32, col: u32) -> Self { + Self { reason: $reason, line, col } + })* + } + } + impl Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}:{}: {}", self.line, self.col, self.reason) + } + } + impl Display for Reason { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Reason::UnmatchedDelimiters(c) => write! 
{f, "Unmatched `{c}` in input"}, + Reason::UnexpectedChar(c) => write!(f, "Character `{c}` not expected"), + Reason::NotIdentifier(c) => write!(f, "Character `{c}` not valid in identifiers"), + Reason::UnknownEscape(c) => write!(f, "`\\{c}` is not a known escape sequence"), + Reason::InvalidEscape(c) => write!(f, "Escape sequence `\\{c}`... is malformed"), + Reason::InvalidDigit(c) => write!(f, "`{c}` is not a valid digit"), + Reason::UnknownBase(c) => write!(f, "`0{c}`... is not a valid base"), + Reason::BadUnicode(c) => write!(f, "`{c}` is not a valid unicode code-point"), + Reason::EndOfFile => write!(f, "Reached end of input"), + } + } } - index } diff --git a/libconlang/src/parser.rs b/libconlang/src/parser.rs index 53d8f3e..e702543 100644 --- a/libconlang/src/parser.rs +++ b/libconlang/src/parser.rs @@ -1,12 +1,10 @@ //! Parses [tokens](super::token) into an [AST](super::ast) -use std::vec; use super::{ ast::preamble::*, lexer::Lexer, - token::{Keyword, Token, Type}, + token::{Keyword, Token, TokenData, Type}, }; -use constr::ConstrTools; use error::{Error, Reason::*, *}; pub mod error { @@ -16,6 +14,7 @@ pub mod error { #[derive(Clone, Debug, Default, PartialEq, Eq)] pub enum Reason { Expected(Type), + Unexpected(Type), NotIdentifier, NotOperator, NotLiteral, @@ -29,7 +28,6 @@ pub mod error { IntOverflow, NotBranch, IncompleteBranch, - AllElseFailed, EndOfFile, PanicStackUnderflow, #[default] @@ -41,6 +39,7 @@ pub mod error { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Expected(t) => write!(f, "Expected {t}"), + Self::Unexpected(t) => write!(f, "Unexpected {t} in bagging area"), Self::NotIdentifier => "Not an identifier".fmt(f), Self::NotOperator => "Not an operator".fmt(f), Self::NotLiteral => "Not a literal".fmt(f), @@ -54,7 +53,6 @@ pub mod error { Self::IntOverflow => "Integer too large".fmt(f), Self::IncompleteBranch => "Branch expression was incomplete".fmt(f), Self::NotBranch => "Expected branch expression".fmt(f), - Self::AllElseFailed => "Did not match any rule".fmt(f), Self::EndOfFile => "Got end of file".fmt(f), Self::PanicStackUnderflow => "Could not recover from panic".fmt(f), Self::Unspecified => { @@ -66,7 +64,7 @@ pub mod error { /// [Parser](super::Parser) [Result] pub type PResult = Result; - #[derive(Clone, Debug, Default, PartialEq, Eq)] + #[derive(Clone, Debug, Default, PartialEq)] pub struct Error { reason: Reason, start: Option, @@ -74,7 +72,7 @@ pub mod error { impl Display for Error { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if let Some(token) = self.start { + if let Some(token) = &self.start { write!(f, "{}:{}: ", token.line(), token.col())?; } write!(f, "{}", self.reason) @@ -95,14 +93,15 @@ pub mod error { pub fn maybe_token(self, start: Option) -> Self { Self { start, ..self } } - pub fn start(&self) -> Option { - self.start + pub fn start(&self) -> Option<&Token> { + self.start.as_ref() } pub fn reason(self, reason: Reason) -> Self { Self { reason, ..self } } error_impl! { expected(e: Type): Expected, + unexpected(e: Type): Unexpected, not_identifier: NotIdentifier, not_operator: NotOperator, not_literal: NotLiteral, @@ -115,7 +114,6 @@ pub mod error { not_int: NotInt, int_overflow: IntOverflow, not_branch: NotBranch, - all_else_failed: AllElseFailed, end_of_file: EndOfFile, panic_underflow: PanicStackUnderflow, unspecified: Unspecified, @@ -125,27 +123,32 @@ pub mod error { /// The Parser performs recursive descent on the AST's grammar /// using a provided [Lexer]. 
-pub struct Parser<'t> {
+pub struct Parser {
     tokens: Vec<Token>,
     panic_stack: Vec<usize>,
-    text: &'t str,
     curr: usize,
 }
-impl<'t> From<Lexer<'t>> for Parser<'t> {
+impl<'t> From<Lexer<'t>> for Parser {
     fn from(value: Lexer<'t>) -> Self {
-        let (tokens, text) = value.consume();
-        Self::new(tokens, text)
+        let mut tokens = vec![];
+        for result in value {
+            match result {
+                Ok(t) => tokens.push(t),
+                Err(e) => println!("{e}"),
+            }
+        }
+        Self::new(tokens)
     }
 }
-impl<'t> Parser<'t> {
-    /// Create a new [Parser] from a list of [Tokens][1]
-    /// and the [text](str) used to generate that list
-    /// (as [Tokens][1] do not store their strings)
-    ///
-    /// [1]: Token
-    pub fn new(tokens: Vec<Token>, text: &'t str) -> Self {
-        Self { tokens, text, panic_stack: vec![], curr: 0 }
+impl Parser {
+    /// Create a new [Parser] from a list of [Tokens](Token),
+    /// which now carry their own data
+    pub fn new(tokens: Vec<Token>) -> Self {
+        Self { tokens, panic_stack: vec![], curr: 0 }
     }
     /// Parse the [start of an AST](Start)
     pub fn parse(&mut self) -> PResult<Start> {
@@ -170,7 +173,7 @@ impl<'t> Parser<'t> {
     pub fn peek(&self) -> PResult<&Token> {
         self.tokens
             .get(self.curr)
-            .ok_or(Error::end_of_file().maybe_token(self.tokens.last().copied()))
+            .ok_or(Error::end_of_file().maybe_token(self.tokens.last().cloned()))
     }
     /// Records the current position on the panic stack
     pub fn mark(&mut self) -> &mut Self {
@@ -198,7 +201,7 @@ impl<'t> Parser<'t> {
     }
 }
 /// Helpers
-impl<'t> Parser<'t> {
+impl Parser {
     fn consume_type(&mut self, t: Type) -> PResult<&mut Self> {
         self.matches(t)?;
         Ok(self.consume())
@@ -207,17 +210,17 @@ impl<'t> Parser<'t> {
         if self.curr < self.tokens.len() {
             Ok(self)
         } else {
-            Err(Error::end_of_file().maybe_token(self.tokens.last().copied()))
+            Err(Error::end_of_file().maybe_token(self.tokens.last().cloned()))
         }
     }
     fn todo_error(&mut self, l: u32, c: u32, s: &str) -> Error {
         eprintln!("TODO: {s}:{l}:{c}");
-        Error::unspecified().token(*self.peek().unwrap())
+        Error::unspecified().token(self.peek().unwrap().clone())
     }
     fn matches(&mut self, e: Type) -> PResult<&Token> {
         let t = self.check_eof()?.peek().expect("self should not be eof");
         if t.ty() != e {
-            Err(Error::expected(e).token(*t))?
+            Err(Error::expected(e).token(t.clone()))?
} Ok(t) } @@ -250,51 +253,54 @@ macro ptodo($self:expr $(, $t:expr)*) { } /// # Terminals and Pseudo-Terminals -impl<'t> Parser<'t> { +impl Parser { fn identifier(&mut self) -> PResult { - let token = *self - .matches(Type::Identifier) - .map_err(|e| Error::not_identifier().maybe_token(e.start()))?; - Ok(Identifier(self.consume().text[&token].into())) + let out = match self.matches(Type::Identifier)?.data() { + TokenData::Identifier(id) => Identifier(id.to_string()), + _ => Err(Error::not_identifier())?, + }; + self.consume(); + Ok(out) } fn literal(&mut self) -> PResult { use literal::Literal::*; use Keyword::{False, True}; - let tok = self.peek()?; - match tok.ty() { + let token = self.peek()?; + match token.ty() { Type::Float => self.float().map(Float), Type::Integer => self.int().map(Int), Type::String => self.string().map(String), Type::Character => self.char().map(Char), Type::Keyword(True | False) => self.bool().map(Bool), - _ => Err(Error::not_literal().token(*tok)), + _ => Err(Error::not_literal().token(token.clone())), } } fn float(&mut self) -> PResult { ptodo!(self) } fn int(&mut self) -> PResult { - let token = *self.matches(Type::Integer)?; - self.consume().text[&token] - .chars() - .parse_int::() - .next() - .ok_or(Error::not_int().token(token)) + let out = match self.matches(Type::Integer)?.data() { + TokenData::Integer(i) => *i, + _ => Err(Error::not_int())?, + }; + self.consume(); + Ok(out) } fn string(&mut self) -> PResult { - let range = self - .matches(Type::String) - .map_err(|e| e.reason(NotString))? - .range(); - Ok(self.consume().text[range].chars().unescape().collect()) + let out = match self.matches(Type::String)?.data() { + TokenData::String(s) => s.clone(), + _ => Err(Error::not_string())?, + }; + self.consume(); + Ok(out) } fn char(&mut self) -> PResult { - let token = *self.matches(Type::Character)?; - self.consume().text[&token] - .chars() - .unescape() - .next() - .ok_or(Error::not_char().token(token)) + let out = match self.matches(Type::Character)?.data() { + TokenData::Character(c) => *c, + _ => Err(Error::not_char())?, + }; + self.consume(); + Ok(out) } fn bool(&mut self) -> PResult { use Keyword::{False, True}; @@ -302,14 +308,14 @@ impl<'t> Parser<'t> { let out = match token.ty() { Type::Keyword(False) => false, Type::Keyword(True) => true, - _ => Err(Error::not_bool().token(*token))?, + _ => Err(Error::not_bool().token(token.clone()))?, }; self.consume(); Ok(out) } } /// Expressions -impl<'t> Parser<'t> { +impl Parser { fn expr(&mut self) -> PResult { use expression::Expr; Ok(Expr { ignore: self.ignore()? }) @@ -335,7 +341,7 @@ impl<'t> Parser<'t> { } fn primary(&mut self) -> PResult { use expression::Primary; - let token = *self.peek()?; + let token = self.peek()?; match token.ty() { Type::Identifier => self.identifier().map(Primary::Identifier), Type::String @@ -346,7 +352,7 @@ impl<'t> Parser<'t> { Type::LCurly => self.block().map(Primary::Block), Type::LParen => self.group().map(Primary::Group), Type::Keyword(_) => self.flow().map(Primary::Branch), - _ => Err(Error::all_else_failed().token(token))?, + e => Err(Error::unexpected(e).token(token.clone()))?, } } } @@ -377,7 +383,7 @@ macro binary ($($f:ident = $a:ident, $b:ident);*$(;)?) {$( } )*} /// # [Arithmetic and Logical Subexpressions](math) -impl<'t> Parser<'t> { +impl Parser { binary! 
{ //name operands operators ignore = assign, ignore_op; @@ -400,18 +406,19 @@ impl<'t> Parser<'t> { } macro operator_impl ($($(#[$m:meta])* $f:ident : {$($type:pat => $op:ident),*$(,)?})*) { $($(#[$m])* fn $f(&mut self) -> PResult { + use operator::Binary; - let token = *self.peek()?; + let token = self.peek()?; let out = Ok(match token.ty() { $($type => Binary::$op,)* - _ => Err(Error::not_operator().token(token))?, + _ => Err(Error::not_operator().token(token.clone()))?, }); self.consume(); out })* } /// # [Operators](operator) -impl<'t> Parser<'t> { +impl Parser { operator_impl! { factor_op: { Type::Star => Mul, @@ -465,7 +472,7 @@ impl<'t> Parser<'t> { /// Parse a [unary operator](operator::Unary) fn unary_op(&mut self) -> PResult { use operator::Unary; - let token = *self.peek()?; + let token = self.peek()?; let out = Ok(match token.ty() { Type::AmpAmp => Unary::RefRef, Type::Amp => Unary::Ref, @@ -475,18 +482,18 @@ impl<'t> Parser<'t> { Type::At => Unary::At, Type::Hash => Unary::Hash, Type::Tilde => Unary::Tilde, - _ => Err(Error::not_operator().token(token))?, + _ => Err(Error::not_operator().token(token.clone()))?, }); self.consume(); out } } /// # [Control Flow](control) -impl<'t> Parser<'t> { +impl Parser { fn flow(&mut self) -> PResult { use control::Flow; use Keyword::{Break, Continue, For, If, Return, While}; - let token = *self.peek()?; + let token = self.peek()?; match token.ty() { Type::Keyword(While) => self.parse_while().map(Flow::While), Type::Keyword(For) => self.parse_for().map(Flow::For), @@ -494,9 +501,9 @@ impl<'t> Parser<'t> { Type::Keyword(Break) => self.parse_break().map(Flow::Break), Type::Keyword(Return) => self.parse_return().map(Flow::Return), Type::Keyword(Continue) => self.parse_continue().map(Flow::Continue), - _ => Err(Error::all_else_failed().token(token)), + e => Err(Error::unexpected(e).token(token.clone()))?, } - .map_err(|e| e.reason(IncompleteBranch).token(token)) + .map_err(|e| e.reason(IncompleteBranch)) } fn parse_if(&mut self) -> PResult { self.keyword(Keyword::If)?; diff --git a/libconlang/src/tests.rs b/libconlang/src/tests.rs index 3efcf3e..adf93c9 100644 --- a/libconlang/src/tests.rs +++ b/libconlang/src/tests.rs @@ -1,476 +1,180 @@ mod token { - use crate::token::*; - #[test] - fn token_has_type() { - assert_eq!(Token::new(Type::Comment, 0, 10, 1, 1).ty(), Type::Comment); - assert_eq!( - Token::new(Type::Identifier, 0, 10, 1, 1).ty(), - Type::Identifier - ); - } - #[test] - fn token_has_range() { - let t = Token::new(Type::Comment, 0, 10, 1, 1); - assert_eq!(t.range(), 0..10); - } + // TODO } mod ast { // TODO } mod lexer { - use std::ops::Range; - + #[allow(unused_imports)] use crate::{ - lexer::*, - token::{Token, Type}, + lexer::Lexer, + token::{Token, TokenData, Keyword, Type}, }; - fn assert_whole_input_is_token<'t, F>(input: &'t str, f: F, ty: Type) - where F: FnOnce(&mut Lexer<'t>) -> Option { - assert_has_type_and_range(input, f, ty, 0..input.len()) - } - fn assert_has_type_and_range<'t, F>(input: &'t str, f: F, ty: Type, range: Range) - where F: FnOnce(&mut Lexer<'t>) -> Option { - let tok = - f(&mut Lexer::new(input)).unwrap_or_else(|| panic!("Should be {ty:?}, {range:?}")); - assert_eq!(ty, tok.ty()); - assert_eq!(range, tok.range()); - } - - mod comment { - use super::*; - + macro test_lexer_output_type ($($f:ident {$($test:expr => $expect:expr),*$(,)?})*) {$( #[test] - fn line_comment() { - assert_whole_input_is_token("// comment!", Lexer::comment, Type::Comment); - } - #[test] - #[should_panic] - fn not_line_comment() { - 
assert_whole_input_is_token("fn main() {}", Lexer::comment, Type::Comment); - } - #[test] - fn block_comment() { - assert_whole_input_is_token("/* comment! */", Lexer::comment, Type::Comment); - } - #[test] - fn nested_block_comment() { - assert_whole_input_is_token( - "/* a /* nested */ comment */", - Lexer::comment, - Type::Comment, + fn $f() {$( + assert_eq!( + Lexer::new($test) + .into_iter() + .map(|t| t.unwrap().ty()) + .collect::>(), + dbg!($expect) ); - } + )*} + )*} + + macro test_lexer_data_type ($($f:ident {$($test:expr => $expect:expr),*$(,)?})*) {$( #[test] - #[should_panic] - fn unclosed_nested_comment() { - assert_whole_input_is_token( - "/* improperly /* nested */ comment", - Lexer::comment, - Type::Comment, + fn $f() {$( + assert_eq!( + Lexer::new($test) + .into_iter() + .map(|t| t.unwrap().into_data()) + .collect::>(), + dbg!($expect) ); - } - #[test] - #[should_panic] - fn not_block_comment() { - assert_whole_input_is_token("fn main() {}", Lexer::comment, Type::Comment); - } - #[test] - fn shebang_comment() { - assert_whole_input_is_token("#!/ comment!", Lexer::comment, Type::Comment); - } - #[test] - #[should_panic] - fn not_shebang_comment() { - assert_whole_input_is_token("fn main() {}", Lexer::comment, Type::Comment); - } - } - mod identifier { - use super::*; + )*} + )*} - #[test] - fn identifier() { - assert_whole_input_is_token("valid_identifier", Lexer::identifier, Type::Identifier); - assert_whole_input_is_token("_0", Lexer::identifier, Type::Identifier); - assert_whole_input_is_token("_", Lexer::identifier, Type::Identifier); - } - #[test] - fn unicode_identifier() { - assert_whole_input_is_token("ζ_ζζζ_ζζζ_ζζζ", Lexer::identifier, Type::Identifier); - assert_whole_input_is_token("_ζζζ_ζζζ_ζζζ_", Lexer::identifier, Type::Identifier); - } - #[test] - #[should_panic] - fn not_identifier() { - assert_whole_input_is_token("123456789", Lexer::identifier, Type::Identifier); - } + /// Convert an `[ expr, ... ]` into a `[ *, ... 
]` + macro td ($($id:expr),*) { + [$($id.into()),*] } - mod literal { - use super::*; - #[test] - fn literal_class() { - assert_whole_input_is_token("1_00000", Lexer::literal, Type::Integer); - assert_whole_input_is_token("1.00000", Lexer::literal, Type::Float); - assert_has_type_and_range("\"1.0\"", Lexer::literal, Type::String, 1..4); - assert_has_type_and_range("'\"'", Lexer::literal, Type::Character, 1..2); - } - mod integer { - use super::*; - #[test] - fn bare() { - assert_whole_input_is_token("10010110", Lexer::integer, Type::Integer); - assert_whole_input_is_token("12345670", Lexer::integer, Type::Integer); - assert_whole_input_is_token("1234567890", Lexer::integer, Type::Integer); - } - #[test] - fn base16() { - assert_has_type_and_range("0x1234", Lexer::integer, Type::Integer, 0..6); - assert_has_type_and_range("0x1234 \"hello\"", Lexer::integer, Type::Integer, 0..6); - } - #[test] - fn base10() { - assert_whole_input_is_token("0d1234", Lexer::integer, Type::Integer); - } - #[test] - fn base8() { - assert_whole_input_is_token("0o1234", Lexer::integer, Type::Integer); - } - #[test] - fn base2() { - assert_whole_input_is_token("0b1010", Lexer::integer, Type::Integer); - } - } - mod float { - use super::*; - #[test] - fn number_dot_number_is_float() { - assert_whole_input_is_token("1.0", Lexer::float, Type::Float); - } - #[test] - fn nothing_dot_number_is_float() { - assert_whole_input_is_token(".0", Lexer::float, Type::Float); - } - #[test] - #[should_panic] - fn number_dot_nothing_is_not_float() { - assert_whole_input_is_token("1.", Lexer::float, Type::Float); - } - #[test] - #[should_panic] - fn nothing_dot_nothing_is_not_float() { - assert_whole_input_is_token(".", Lexer::float, Type::Float); - } - } - mod string { - use super::*; - #[test] - fn empty_string() { - assert_has_type_and_range("\"\"", Lexer::string, Type::String, 1..1); - } - #[test] - fn unicode_string() { - assert_has_type_and_range("\"I 💙 🦈!\"", Lexer::string, Type::String, 1..13); - } - #[test] - fn escape_string() { - assert_has_type_and_range( - "\" \\\"This is a quote\\\" \"", - Lexer::string, - Type::String, - 1..22, - ); - } - } - mod char { - use super::*; - #[test] - fn plain_char() { - assert_has_type_and_range("'A'", Lexer::character, Type::Character, 1..2); - assert_has_type_and_range("'a'", Lexer::character, Type::Character, 1..2); - assert_has_type_and_range("'#'", Lexer::character, Type::Character, 1..2); - } - #[test] - fn unicode_char() { - assert_has_type_and_range("'ε'", Lexer::character, Type::Character, 1..3); - } - #[test] - fn escaped_char() { - assert_has_type_and_range("'\\n'", Lexer::character, Type::Character, 1..3); - } - #[test] - #[should_panic] - fn no_char() { - assert_has_type_and_range("''", Lexer::character, Type::Character, 1..1); - } - } - } - mod delimiter { - use super::*; - #[test] - fn delimiter_class() { - assert_whole_input_is_token("[", Lexer::delimiter, Type::LBrack); - assert_whole_input_is_token("]", Lexer::delimiter, Type::RBrack); - assert_whole_input_is_token("{", Lexer::delimiter, Type::LCurly); - assert_whole_input_is_token("}", Lexer::delimiter, Type::RCurly); - assert_whole_input_is_token("(", Lexer::delimiter, Type::LParen); - assert_whole_input_is_token(")", Lexer::delimiter, Type::RParen); - } - #[test] - fn l_brack() { - assert_whole_input_is_token("[", Lexer::l_brack, Type::LBrack); - } - #[test] - fn r_brack() { - assert_whole_input_is_token("]", Lexer::r_brack, Type::RBrack); - } - #[test] - fn l_curly() { - assert_whole_input_is_token("{", 
Lexer::l_curly, Type::LCurly); - } - #[test] - fn r_curly() { - assert_whole_input_is_token("}", Lexer::r_curly, Type::RCurly); - } - #[test] - fn l_paren() { - assert_whole_input_is_token("(", Lexer::l_paren, Type::LParen); + mod ident { + use super::*; + macro ident ($($id:literal),*) { + [$(TokenData::Identifier($id.into())),*] } - #[test] - fn r_paren() { - assert_whole_input_is_token(")", Lexer::r_paren, Type::RParen); + test_lexer_data_type! { + underscore { "_ _" => ident!["_", "_"] } + unicode { "_ε ε_" => ident!["_ε", "ε_"] } + many_underscore { "____________________________________" => + ident!["____________________________________"] } } } - mod punctuation { + mod keyword { use super::*; - mod compound { - use super::*; - #[test] - fn dot_dot() { - assert_whole_input_is_token("..", Lexer::dot_dot, Type::DotDot) + macro kw($($k:ident),*) { + [ $(Type::Keyword(Keyword::$k),)* ] + } + test_lexer_output_type! { + kw_break { "break break" => kw![Break, Break] } + kw_continue { "continue continue" => kw![Continue, Continue] } + kw_else { "else else" => kw![Else, Else] } + kw_false { "false false" => kw![False, False] } + kw_for { "for for" => kw![For, For] } + kw_fn { "fn fn" => kw![Fn, Fn] } + kw_if { "if if" => kw![If, If] } + kw_in { "in in" => kw![In, In] } + kw_let { "let let" => kw![Let, Let] } + kw_return { "return return" => kw![Return, Return] } + kw_true { "true true" => kw![True, True] } + kw_while { "while while" => kw![While, While] } + keywords { "break continue else false for fn if in let return true while" => + kw![Break, Continue, Else, False, For, Fn, If, In, Let, Return, True, While] } + } + } + mod integer { + use super::*; + test_lexer_data_type! { + hex { + "0x0 0x1 0x15 0x2100 0x8000" => + td![0x0, 0x1, 0x15, 0x2100, 0x8000] } - #[test] - fn dot_dot_eq() { - assert_whole_input_is_token("..=", Lexer::dot_dot_eq, Type::DotDotEq) + dec { + "0d0 0d1 0d21 0d8448 0d32768" => + td![0, 0x1, 0x15, 0x2100, 0x8000] } - #[test] - fn lt_lt() { - assert_whole_input_is_token("<<", Lexer::lt_lt, Type::LtLt) + oct { + "0o0 0o1 0o25 0o20400 0o100000" => + td![0x0, 0x1, 0x15, 0x2100, 0x8000] } - #[test] - fn gt_gt() { - assert_whole_input_is_token(">>", Lexer::gt_gt, Type::GtGt) + bin { + "0b0 0b1 0b10101 0b10000100000000 0b1000000000000000" => + td![0x0, 0x1, 0x15, 0x2100, 0x8000] } - #[test] - fn amp_amp() { - assert_whole_input_is_token("&&", Lexer::amp_amp, Type::AmpAmp) - } - #[test] - fn bar_bar() { - assert_whole_input_is_token("||", Lexer::bar_bar, Type::BarBar) - } - #[test] - fn bang_bang() { - assert_whole_input_is_token("!!", Lexer::bang_bang, Type::BangBang) - } - #[test] - fn xor_xor() { - assert_whole_input_is_token("^^", Lexer::xor_xor, Type::XorXor) - } - #[test] - fn eq_eq() { - assert_whole_input_is_token("==", Lexer::eq_eq, Type::EqEq) - } - #[test] - fn gt_eq() { - assert_whole_input_is_token(">=", Lexer::gt_eq, Type::GtEq) - } - #[test] - fn lt_eq() { - assert_whole_input_is_token("<=", Lexer::lt_eq, Type::LtEq) - } - #[test] - fn bang_eq() { - assert_whole_input_is_token("!=", Lexer::bang_eq, Type::BangEq) - } - #[test] - fn star_eq() { - assert_whole_input_is_token("*=", Lexer::star_eq, Type::StarEq) - } - #[test] - fn slash_eq() { - assert_whole_input_is_token("/=", Lexer::slash_eq, Type::SlashEq) - } - #[test] - fn plus_eq() { - assert_whole_input_is_token("+=", Lexer::plus_eq, Type::PlusEq) - } - #[test] - fn minus_eq() { - assert_whole_input_is_token("-=", Lexer::minus_eq, Type::MinusEq) - } - #[test] - fn amp_eq() { - assert_whole_input_is_token("&=", 
Lexer::amp_eq, Type::AmpEq) - } - #[test] - fn bar_eq() { - assert_whole_input_is_token("|=", Lexer::bar_eq, Type::BarEq) - } - #[test] - fn xor_eq() { - assert_whole_input_is_token("^=", Lexer::xor_eq, Type::XorEq) - } - #[test] - fn lt_lt_eq() { - assert_whole_input_is_token("<<=", Lexer::lt_lt_eq, Type::LtLtEq) - } - #[test] - fn gt_gt_eq() { - assert_whole_input_is_token(">>=", Lexer::gt_gt_eq, Type::GtGtEq) + baseless { + "0 1 21 8448 32768" => + td![0x0, 0x1, 0x15, 0x2100, 0x8000] } } - - mod simple { - use super::*; - #[test] - fn punctuation_class() { - // go from least to most specific - assert_whole_input_is_token(";", Lexer::punctuation, Type::Semi); - assert_whole_input_is_token(".", Lexer::punctuation, Type::Dot); - assert_whole_input_is_token("*", Lexer::punctuation, Type::Star); - assert_whole_input_is_token("/", Lexer::punctuation, Type::Slash); - assert_whole_input_is_token("+", Lexer::punctuation, Type::Plus); - assert_whole_input_is_token("-", Lexer::punctuation, Type::Minus); - assert_whole_input_is_token("%", Lexer::punctuation, Type::Rem); - assert_whole_input_is_token("!", Lexer::punctuation, Type::Bang); - assert_whole_input_is_token("=", Lexer::punctuation, Type::Eq); - assert_whole_input_is_token("<", Lexer::punctuation, Type::Lt); - assert_whole_input_is_token(">", Lexer::punctuation, Type::Gt); - assert_whole_input_is_token("&", Lexer::punctuation, Type::Amp); - assert_whole_input_is_token("|", Lexer::punctuation, Type::Bar); - assert_whole_input_is_token("^", Lexer::punctuation, Type::Xor); - assert_whole_input_is_token("#", Lexer::punctuation, Type::Hash); - assert_whole_input_is_token("@", Lexer::punctuation, Type::At); - assert_whole_input_is_token(":", Lexer::punctuation, Type::Colon); - assert_whole_input_is_token("?", Lexer::punctuation, Type::Question); - assert_whole_input_is_token(",", Lexer::punctuation, Type::Comma); - assert_whole_input_is_token("~", Lexer::punctuation, Type::Tilde); - assert_whole_input_is_token("`", Lexer::punctuation, Type::Grave); - assert_whole_input_is_token("\\", Lexer::punctuation, Type::Backslash); - assert_whole_input_is_token("<<", Lexer::punctuation, Type::LtLt); - assert_whole_input_is_token(">>", Lexer::punctuation, Type::GtGt); - assert_whole_input_is_token("&&", Lexer::punctuation, Type::AmpAmp); - assert_whole_input_is_token("||", Lexer::punctuation, Type::BarBar); - assert_whole_input_is_token("!!", Lexer::punctuation, Type::BangBang); - assert_whole_input_is_token("^^", Lexer::punctuation, Type::XorXor); - assert_whole_input_is_token("==", Lexer::punctuation, Type::EqEq); - assert_whole_input_is_token(">=", Lexer::punctuation, Type::GtEq); - assert_whole_input_is_token("<=", Lexer::punctuation, Type::LtEq); - assert_whole_input_is_token("!=", Lexer::punctuation, Type::BangEq); - assert_whole_input_is_token("*=", Lexer::punctuation, Type::StarEq); - assert_whole_input_is_token("/=", Lexer::punctuation, Type::SlashEq); - assert_whole_input_is_token("+=", Lexer::punctuation, Type::PlusEq); - assert_whole_input_is_token("-=", Lexer::punctuation, Type::MinusEq); - assert_whole_input_is_token("&=", Lexer::punctuation, Type::AmpEq); - assert_whole_input_is_token("|=", Lexer::punctuation, Type::BarEq); - assert_whole_input_is_token("^=", Lexer::punctuation, Type::XorEq); - assert_whole_input_is_token("..", Lexer::punctuation, Type::DotDot); - assert_whole_input_is_token("..=", Lexer::punctuation, Type::DotDotEq); - assert_whole_input_is_token("<<=", Lexer::punctuation, Type::LtLtEq); - assert_whole_input_is_token(">>=", 
Lexer::punctuation, Type::GtGtEq); + } + mod string { + use super::*; + test_lexer_data_type! { + empty_string { + "\"\"" => + td![String::from("")] } - // individual functions below - #[test] - fn semi() { - assert_whole_input_is_token(";", Lexer::semi, Type::Semi) + unicode_string { + "\"I 💙 🦈!\"" => + td![String::from("I 💙 🦈!")] } - #[test] - fn dot() { - assert_whole_input_is_token(".", Lexer::dot, Type::Dot) - } - #[test] - fn star() { - assert_whole_input_is_token("*", Lexer::star, Type::Star) - } - #[test] - fn slash() { - assert_whole_input_is_token("/", Lexer::slash, Type::Slash) - } - #[test] - fn plus() { - assert_whole_input_is_token("+", Lexer::plus, Type::Plus) - } - #[test] - fn minus() { - assert_whole_input_is_token("-", Lexer::minus, Type::Minus) - } - #[test] - fn rem() { - assert_whole_input_is_token("%", Lexer::rem, Type::Rem) - } - #[test] - fn bang() { - assert_whole_input_is_token("!", Lexer::bang, Type::Bang) - } - #[test] - fn eq() { - assert_whole_input_is_token("=", Lexer::eq, Type::Eq) - } - #[test] - fn lt() { - assert_whole_input_is_token("<", Lexer::lt, Type::Lt) - } - #[test] - fn gt() { - assert_whole_input_is_token(">", Lexer::gt, Type::Gt) - } - #[test] - fn amp() { - assert_whole_input_is_token("&", Lexer::amp, Type::Amp) - } - #[test] - fn bar() { - assert_whole_input_is_token("|", Lexer::bar, Type::Bar) - } - #[test] - fn xor() { - assert_whole_input_is_token("^", Lexer::xor, Type::Xor) - } - #[test] - fn hash() { - assert_whole_input_is_token("#", Lexer::hash, Type::Hash) - } - #[test] - fn at() { - assert_whole_input_is_token("@", Lexer::at, Type::At) - } - #[test] - fn colon() { - assert_whole_input_is_token(":", Lexer::colon, Type::Colon) - } - #[test] - fn backslash() { - assert_whole_input_is_token("\\", Lexer::backslash, Type::Backslash) - } - #[test] - fn question() { - assert_whole_input_is_token("?", Lexer::question, Type::Question) - } - #[test] - fn comma() { - assert_whole_input_is_token(",", Lexer::comma, Type::Comma) - } - #[test] - fn tilde() { - assert_whole_input_is_token("~", Lexer::tilde, Type::Tilde) - } - #[test] - fn grave() { - assert_whole_input_is_token("`", Lexer::grave, Type::Grave) + escape_string { + " \"This is a shark: \\u{1f988}\" " => + td![String::from("This is a shark: 🦈")] } } } + mod punct { + use super::*; + test_lexer_output_type! { + l_curly { "{ {" => [ Type::LCurly, Type::LCurly ] } + r_curly { "} }" => [ Type::RCurly, Type::RCurly ] } + l_brack { "[ [" => [ Type::LBrack, Type::LBrack ] } + r_brack { "] ]" => [ Type::RBrack, Type::RBrack ] } + l_paren { "( (" => [ Type::LParen, Type::LParen ] } + r_paren { ") )" => [ Type::RParen, Type::RParen ] } + amp { "& &" => [ Type::Amp, Type::Amp ] } + amp_amp { "&& &&" => [ Type::AmpAmp, Type::AmpAmp ] } + amp_eq { "&= &=" => [ Type::AmpEq, Type::AmpEq ] } + arrow { "-> ->" => [ Type::Arrow, Type::Arrow] } + at { "@ @" => [ Type::At, Type::At] } + backslash { "\\ \\" => [ Type::Backslash, Type::Backslash] } + bang { "! !" => [ Type::Bang, Type::Bang] } + bangbang { "!! !!" => [ Type::BangBang, Type::BangBang] } + bangeq { "!= !=" => [ Type::BangEq, Type::BangEq] } + bar { "| |" => [ Type::Bar, Type::Bar] } + barbar { "|| ||" => [ Type::BarBar, Type::BarBar] } + bareq { "|= |=" => [ Type::BarEq, Type::BarEq] } + colon { ": :" => [ Type::Colon, Type::Colon] } + comma { ", ," => [ Type::Comma, Type::Comma] } + dot { ". ." => [ Type::Dot, Type::Dot] } + dotdot { ".. .." 
=> [ Type::DotDot, Type::DotDot] } + dotdoteq { "..= ..=" => [ Type::DotDotEq, Type::DotDotEq] } + eq { "= =" => [ Type::Eq, Type::Eq] } + eqeq { "== ==" => [ Type::EqEq, Type::EqEq] } + fatarrow { "=> =>" => [ Type::FatArrow, Type::FatArrow] } + grave { "` `" => [ Type::Grave, Type::Grave] } + gt { "> >" => [ Type::Gt, Type::Gt] } + gteq { ">= >=" => [ Type::GtEq, Type::GtEq] } + gtgt { ">> >>" => [ Type::GtGt, Type::GtGt] } + gtgteq { ">>= >>=" => [ Type::GtGtEq, Type::GtGtEq] } + hash { "# #" => [ Type::Hash, Type::Hash] } + lt { "< <" => [ Type::Lt, Type::Lt] } + lteq { "<= <=" => [ Type::LtEq, Type::LtEq] } + ltlt { "<< <<" => [ Type::LtLt, Type::LtLt] } + ltlteq { "<<= <<=" => [ Type::LtLtEq, Type::LtLtEq] } + minus { "- -" => [ Type::Minus, Type::Minus] } + minuseq { "-= -=" => [ Type::MinusEq, Type::MinusEq] } + plus { "+ +" => [ Type::Plus, Type::Plus] } + pluseq { "+= +=" => [ Type::PlusEq, Type::PlusEq] } + question { "? ?" => [ Type::Question, Type::Question] } + rem { "% %" => [ Type::Rem, Type::Rem] } + remeq { "%= %=" => [ Type::RemEq, Type::RemEq] } + semi { "; ;" => [ Type::Semi, Type::Semi] } + slash { "/ /" => [ Type::Slash, Type::Slash] } + slasheq { "/= /=" => [ Type::SlashEq, Type::SlashEq] } + star { "* *" => [ Type::Star, Type::Star] } + stareq { "*= *=" => [ Type::StarEq, Type::StarEq] } + tilde { "~ ~" => [ Type::Tilde, Type::Tilde] } + xor { "^ ^" => [ Type::Xor, Type::Xor] } + xoreq { "^= ^=" => [ Type::XorEq, Type::XorEq] } + xorxor { "^^ ^^" => [ Type::XorXor, Type::XorXor] } + } + } } mod parser { // TODO diff --git a/libconlang/src/token.rs b/libconlang/src/token.rs index cb6db1c..2296066 100644 --- a/libconlang/src/token.rs +++ b/libconlang/src/token.rs @@ -1,5 +1,4 @@ //! Stores a component of a file as a type and span -use std::ops::Range; mod token_type; #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] @@ -88,54 +87,60 @@ pub enum Keyword { While, } -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +#[derive(Clone, Debug, PartialEq)] +pub enum TokenData { + Identifier(Box), + String(String), + Character(char), + Integer(u128), + Float(f64), + None, +} +from! { + value: &str => Self::Identifier(value.into()), + value: String => Self::String(value), + value: u128 => Self::Integer(value), + value: f64 => Self::Float(value), + value: char => Self::Character(value), + _v: () => Self::None, +} +macro from($($value:ident: $src:ty => $dst:expr),*$(,)?) { + $(impl From<$src> for TokenData { + fn from($value: $src) -> Self { $dst } + })* +} + +#[derive(Clone, Debug, PartialEq)] pub struct Token { ty: Type, - pub head: usize, - pub tail: usize, + data: TokenData, line: u32, col: u32, } impl Token { - pub fn new(ty: Type, head: usize, tail: usize, line: u32, col: u32) -> Self { - Self { ty, head, tail, line, col } + /// Creates a new [Token] out of a [Type], [TokenData], line, and column. 
+    pub fn new(ty: Type, data: impl Into<TokenData>, line: u32, col: u32) -> Self {
+        Self { ty, data: data.into(), line, col }
     }
-    /// Cast this [Token] to a new [Type]
+    /// Casts this token to a new [Type]
     pub fn cast(self, ty: Type) -> Self {
         Self { ty, ..self }
     }
-    /// Hack to work around the current [lexer's design limitations](crate::lexer)
-    pub fn rebound(self, head: usize, tail: usize) -> Self {
-        Self { head, tail, ..self }
-    }
-    /// Gets the line from this token
-    pub fn line(&self) -> u32 {
-        self.line
-    }
-    /// Gets the column from this token
-    pub fn col(&self) -> u32 {
-        self.col
-    }
-    pub fn is_empty(&self) -> bool {
-        self.tail == self.head
-    }
-    /// Gets the length of the token, in bytes
-    pub fn len(&self) -> usize {
-        self.tail - self.head
-    }
-    /// Gets the [Type] of the token
+    /// Gets the [Type] of this token
     pub fn ty(&self) -> Type {
         self.ty
     }
-    /// Gets the exclusive range of the token
-    pub fn range(&self) -> Range<usize> {
-        self.head..self.tail
-    }
-}
-
-impl std::ops::Index<&Token> for str {
-    type Output = str;
-    fn index(&self, index: &Token) -> &Self::Output {
-        &self[index.range()]
+    /// Gets the [TokenData] of this token
+    pub fn data(&self) -> &TokenData {
+        &self.data
+    }
+    /// Takes the [TokenData] out of this token
+    pub fn into_data(self) -> TokenData {
+        self.data
+    }
+    pub fn line(&self) -> u32 {
+        self.line
+    }
+    pub fn col(&self) -> u32 {
+        self.col
     }
 }
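
A quick usage sketch for reviewers (not part of the patch): this is how the
new fallible-iterator API is meant to read, using only the signatures
introduced above. The sample input and printed labels here are made up.

    use conlang::{lexer::Lexer, token::TokenData};

    fn main() {
        // The iterator yields LResult<Token>: it stops at end of input and
        // surfaces any other failure as an Err carrying its (line, col).
        for result in Lexer::new("let answer = 0x2A; // tada!") {
            match result {
                // Identifiers and literals now carry their unescaped data...
                Ok(token) => match token.data() {
                    TokenData::Identifier(name) => println!("ident: {name}"),
                    TokenData::Integer(value) => println!("int:   {value}"),
                    // ...while keywords and punctuation carry TokenData::None.
                    _ => println!("{:?}", token.ty()),
                },
                Err(e) => eprintln!("lex error: {e}"),
            }
        }
    }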
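
And a possible starting point for the token-test TODO above, assuming the
From impls land exactly as written (Token and TokenData both derive
PartialEq now, so they can be asserted on directly):

    use crate::token::{Token, TokenData, Type};

    #[test]
    fn token_data_from_impls() {
        // produce() can take raw values because of the From impls:
        assert_eq!(TokenData::from(42u128), TokenData::Integer(42));
        assert_eq!(TokenData::from('x'), TokenData::Character('x'));
        assert_eq!(TokenData::from(()), TokenData::None);
    }

    #[test]
    fn token_has_type_and_data() {
        // Token::new accepts anything that is Into<TokenData>
        let token = Token::new(Type::Integer, 42u128, 1, 1);
        assert_eq!(token.ty(), Type::Integer);
        assert_eq!(token.data(), &TokenData::Integer(42));
    }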