cl-lexer: Bring over new lexer
- Different flow, similar action
- Consolidated all the single-purpose di- and trigraph functions
- Gave the lexer explicit access to its entire string, which it can slice.
This commit is contained in:
parent
f0c871711c
commit
1fe796dda7
@ -5,7 +5,7 @@ use cl_structures::span::Loc;
|
|||||||
use cl_token::{TokenKind as Kind, *};
|
use cl_token::{TokenKind as Kind, *};
|
||||||
use std::{
|
use std::{
|
||||||
iter::Peekable,
|
iter::Peekable,
|
||||||
str::{Chars, FromStr},
|
str::{CharIndices, FromStr},
|
||||||
};
|
};
|
||||||
use unicode_ident::*;
|
use unicode_ident::*;
|
||||||
|
|
||||||
@ -76,409 +76,370 @@ pub mod lexer_iter {
|
|||||||
/// ```
|
/// ```
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub struct Lexer<'t> {
|
pub struct Lexer<'t> {
|
||||||
iter: Peekable<Chars<'t>>,
|
/// The source text
|
||||||
start: usize,
|
text: &'t str,
|
||||||
start_loc: (u32, u32),
|
/// A peekable iterator over the source text
|
||||||
current: usize,
|
iter: Peekable<CharIndices<'t>>,
|
||||||
current_loc: (u32, u32),
|
/// The end of the current token
|
||||||
|
head: usize,
|
||||||
|
/// The (line, col) end of the current token
|
||||||
|
head_loc: (u32, u32),
|
||||||
|
/// The start of the current token
|
||||||
|
tail: usize,
|
||||||
|
/// The (line, col) start of the current token
|
||||||
|
tail_loc: (u32, u32),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'t> Lexer<'t> {
|
impl<'t> Lexer<'t> {
|
||||||
/// Creates a new [Lexer] over a [str]
|
/// Creates a new [Lexer] over a [str]
|
||||||
pub fn new(text: &'t str) -> Self {
|
pub fn new(text: &'t str) -> Self {
|
||||||
Self {
|
Self {
|
||||||
iter: text.chars().peekable(),
|
text,
|
||||||
start: 0,
|
iter: text.char_indices().peekable(),
|
||||||
start_loc: (1, 1),
|
head: 0,
|
||||||
current: 0,
|
head_loc: (1, 1),
|
||||||
current_loc: (1, 1),
|
tail: 0,
|
||||||
}
|
tail_loc: (1, 1),
|
||||||
}
|
|
||||||
/// Scans through the text, searching for the next [Token]
|
|
||||||
pub fn scan(&mut self) -> LResult<Token> {
|
|
||||||
match self.skip_whitespace().peek()? {
|
|
||||||
'{' => self.consume()?.produce_op(Kind::LCurly),
|
|
||||||
'}' => self.consume()?.produce_op(Kind::RCurly),
|
|
||||||
'[' => self.consume()?.produce_op(Kind::LBrack),
|
|
||||||
']' => self.consume()?.produce_op(Kind::RBrack),
|
|
||||||
'(' => self.consume()?.produce_op(Kind::LParen),
|
|
||||||
')' => self.consume()?.produce_op(Kind::RParen),
|
|
||||||
'&' => self.consume()?.amp(),
|
|
||||||
'@' => self.consume()?.produce_op(Kind::At),
|
|
||||||
'\\' => self.consume()?.produce_op(Kind::Backslash),
|
|
||||||
'!' => self.consume()?.bang(),
|
|
||||||
'|' => self.consume()?.bar(),
|
|
||||||
':' => self.consume()?.colon(),
|
|
||||||
',' => self.consume()?.produce_op(Kind::Comma),
|
|
||||||
'.' => self.consume()?.dot(),
|
|
||||||
'=' => self.consume()?.equal(),
|
|
||||||
'`' => self.consume()?.produce_op(Kind::Grave),
|
|
||||||
'>' => self.consume()?.greater(),
|
|
||||||
'#' => self.consume()?.hash(),
|
|
||||||
'<' => self.consume()?.less(),
|
|
||||||
'-' => self.consume()?.minus(),
|
|
||||||
'+' => self.consume()?.plus(),
|
|
||||||
'?' => self.consume()?.produce_op(Kind::Question),
|
|
||||||
'%' => self.consume()?.rem(),
|
|
||||||
';' => self.consume()?.produce_op(Kind::Semi),
|
|
||||||
'/' => self.consume()?.slash(),
|
|
||||||
'*' => self.consume()?.star(),
|
|
||||||
'~' => self.consume()?.produce_op(Kind::Tilde),
|
|
||||||
'^' => self.consume()?.xor(),
|
|
||||||
'0' => self.consume()?.int_with_base(),
|
|
||||||
'1'..='9' => self.digits::<10>(),
|
|
||||||
'"' => self.consume()?.string(),
|
|
||||||
'\'' => self.consume()?.character(),
|
|
||||||
'_' => self.identifier(),
|
|
||||||
i if is_xid_start(i) => self.identifier(),
|
|
||||||
e => {
|
|
||||||
let err = Err(Error::unexpected_char(e, self.line(), self.col()));
|
|
||||||
let _ = self.consume();
|
|
||||||
err
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the current line
|
/// Returns the current line
|
||||||
pub fn line(&self) -> u32 {
|
pub fn line(&self) -> u32 {
|
||||||
self.start_loc.0
|
self.tail_loc.0
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the current column
|
/// Returns the current column
|
||||||
pub fn col(&self) -> u32 {
|
pub fn col(&self) -> u32 {
|
||||||
self.start_loc.1
|
self.tail_loc.1
|
||||||
}
|
}
|
||||||
fn next(&mut self) -> LResult<char> {
|
|
||||||
let out = self.peek();
|
/// Returns the current token's lexeme
|
||||||
self.consume()?;
|
fn lexeme(&mut self) -> &'t str {
|
||||||
out
|
&self.text[self.tail..self.head]
|
||||||
}
|
}
|
||||||
fn peek(&mut self) -> LResult<char> {
|
|
||||||
self.iter
|
/// Peeks the next character without advancing the lexer
|
||||||
.peek()
|
fn peek(&mut self) -> Option<char> {
|
||||||
.copied()
|
self.iter.peek().map(|(_, c)| *c)
|
||||||
.ok_or(Error::end_of_file(self.line(), self.col()))
|
|
||||||
}
|
}
|
||||||
fn produce(&mut self, kind: Kind, data: impl Into<TokenData>) -> LResult<Token> {
|
|
||||||
let loc = self.start_loc;
|
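/// Despite its name, advances the `head` (the end of the current token) to the index of the next unconsumed character (or the end of the text), updating `head_loc` to match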
|
||||||
self.start_loc = self.current_loc;
|
fn advance_tail(&mut self) {
|
||||||
self.start = self.current;
|
let (idx, c) = self.iter.peek().copied().unwrap_or((self.text.len(), '\0'));
|
||||||
Ok(Token::new(kind, data, loc.0, loc.1))
|
let (line, col) = &mut self.head_loc;
|
||||||
}
|
let diff = idx - self.head;
|
||||||
fn produce_op(&mut self, kind: Kind) -> LResult<Token> {
|
|
||||||
self.produce(kind, ())
|
self.head = idx;
|
||||||
}
|
match c {
|
||||||
fn skip_whitespace(&mut self) -> &mut Self {
|
'\n' => {
|
||||||
while let Ok(c) = self.peek() {
|
|
||||||
if !c.is_whitespace() {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
let _ = self.consume();
|
|
||||||
}
|
|
||||||
self.start = self.current;
|
|
||||||
self.start_loc = self.current_loc;
|
|
||||||
self
|
|
||||||
}
|
|
||||||
fn consume(&mut self) -> LResult<&mut Self> {
|
|
||||||
self.current += 1;
|
|
||||||
match self.iter.next() {
|
|
||||||
Some('\n') => {
|
|
||||||
let (line, col) = &mut self.current_loc;
|
|
||||||
*line += 1;
|
*line += 1;
|
||||||
*col = 1;
|
*col = 1;
|
||||||
}
|
}
|
||||||
Some(_) => self.current_loc.1 += 1,
|
_ => *col += diff as u32,
|
||||||
None => Err(Error::end_of_file(self.line(), self.col()))?,
|
|
||||||
}
|
|
||||||
Ok(self)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/// Digraphs and trigraphs
|
|
||||||
impl Lexer<'_> {
|
|
||||||
fn amp(&mut self) -> LResult<Token> {
|
|
||||||
match self.peek() {
|
|
||||||
Ok('&') => self.consume()?.produce_op(Kind::AmpAmp),
|
|
||||||
Ok('=') => self.consume()?.produce_op(Kind::AmpEq),
|
|
||||||
_ => self.produce_op(Kind::Amp),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn bang(&mut self) -> LResult<Token> {
|
|
||||||
match self.peek() {
|
|
||||||
Ok('!') => self.consume()?.produce_op(Kind::BangBang),
|
|
||||||
Ok('=') => self.consume()?.produce_op(Kind::BangEq),
|
|
||||||
_ => self.produce_op(Kind::Bang),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn bar(&mut self) -> LResult<Token> {
|
|
||||||
match self.peek() {
|
|
||||||
Ok('|') => self.consume()?.produce_op(Kind::BarBar),
|
|
||||||
Ok('=') => self.consume()?.produce_op(Kind::BarEq),
|
|
||||||
_ => self.produce_op(Kind::Bar),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn colon(&mut self) -> LResult<Token> {
|
|
||||||
match self.peek() {
|
|
||||||
Ok(':') => self.consume()?.produce_op(Kind::ColonColon),
|
|
||||||
_ => self.produce_op(Kind::Colon),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn dot(&mut self) -> LResult<Token> {
|
|
||||||
match self.peek() {
|
|
||||||
Ok('.') => {
|
|
||||||
if let Ok('=') = self.consume()?.peek() {
|
|
||||||
self.consume()?.produce_op(Kind::DotDotEq)
|
|
||||||
} else {
|
|
||||||
self.produce_op(Kind::DotDot)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => self.produce_op(Kind::Dot),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn equal(&mut self) -> LResult<Token> {
|
|
||||||
match self.peek() {
|
|
||||||
Ok('=') => self.consume()?.produce_op(Kind::EqEq),
|
|
||||||
Ok('>') => self.consume()?.produce_op(Kind::FatArrow),
|
|
||||||
_ => self.produce_op(Kind::Eq),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn greater(&mut self) -> LResult<Token> {
|
|
||||||
match self.peek() {
|
|
||||||
Ok('=') => self.consume()?.produce_op(Kind::GtEq),
|
|
||||||
Ok('>') => {
|
|
||||||
if let Ok('=') = self.consume()?.peek() {
|
|
||||||
self.consume()?.produce_op(Kind::GtGtEq)
|
|
||||||
} else {
|
|
||||||
self.produce_op(Kind::GtGt)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => self.produce_op(Kind::Gt),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn hash(&mut self) -> LResult<Token> {
|
|
||||||
match self.peek() {
|
|
||||||
Ok('!') => self.consume()?.hashbang(),
|
|
||||||
_ => self.produce_op(Kind::Hash),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn hashbang(&mut self) -> LResult<Token> {
|
|
||||||
match self.peek() {
|
|
||||||
Ok('/' | '\'') => self.line_comment(),
|
|
||||||
_ => self.produce_op(Kind::HashBang),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn less(&mut self) -> LResult<Token> {
|
|
||||||
match self.peek() {
|
|
||||||
Ok('=') => self.consume()?.produce_op(Kind::LtEq),
|
|
||||||
Ok('<') => {
|
|
||||||
if let Ok('=') = self.consume()?.peek() {
|
|
||||||
self.consume()?.produce_op(Kind::LtLtEq)
|
|
||||||
} else {
|
|
||||||
self.produce_op(Kind::LtLt)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => self.produce_op(Kind::Lt),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn minus(&mut self) -> LResult<Token> {
|
|
||||||
match self.peek() {
|
|
||||||
Ok('=') => self.consume()?.produce_op(Kind::MinusEq),
|
|
||||||
Ok('>') => self.consume()?.produce_op(Kind::Arrow),
|
|
||||||
_ => self.produce_op(Kind::Minus),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn plus(&mut self) -> LResult<Token> {
|
|
||||||
match self.peek() {
|
|
||||||
Ok('=') => self.consume()?.produce_op(Kind::PlusEq),
|
|
||||||
_ => self.produce_op(Kind::Plus),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn rem(&mut self) -> LResult<Token> {
|
|
||||||
match self.peek() {
|
|
||||||
Ok('=') => self.consume()?.produce_op(Kind::RemEq),
|
|
||||||
_ => self.produce_op(Kind::Rem),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn slash(&mut self) -> LResult<Token> {
|
|
||||||
match self.peek() {
|
|
||||||
Ok('=') => self.consume()?.produce_op(Kind::SlashEq),
|
|
||||||
Ok('/') => self.consume()?.line_comment(),
|
|
||||||
Ok('*') => self.consume()?.block_comment(),
|
|
||||||
_ => self.produce_op(Kind::Slash),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn star(&mut self) -> LResult<Token> {
|
|
||||||
match self.peek() {
|
|
||||||
Ok('=') => self.consume()?.produce_op(Kind::StarEq),
|
|
||||||
_ => self.produce_op(Kind::Star),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn xor(&mut self) -> LResult<Token> {
|
|
||||||
match self.peek() {
|
|
||||||
Ok('=') => self.consume()?.produce_op(Kind::XorEq),
|
|
||||||
Ok('^') => self.consume()?.produce_op(Kind::XorXor),
|
|
||||||
_ => self.produce_op(Kind::Xor),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Takes the last-peeked character, or the next character if none peeked.
|
||||||
|
pub fn take(&mut self) -> Option<char> {
|
||||||
|
let (_, c) = self.iter.next()?;
|
||||||
|
self.advance_tail();
|
||||||
|
Some(c)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Takes the next char if it matches the `expected` char
|
||||||
|
pub fn next_if(&mut self, expected: char) -> Option<char> {
|
||||||
|
let (_, c) = self.iter.next_if(|&(_, c)| c == expected)?;
|
||||||
|
self.advance_tail();
|
||||||
|
Some(c)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Consumes the last-peeked character (or the next character if none was peeked), moving the `head` (end of the current token) past it
|
||||||
|
pub fn consume(&mut self) -> &mut Self {
|
||||||
|
self.iter.next();
|
||||||
|
self.advance_tail();
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Produces an [Error] at the start of the current token
|
||||||
|
fn error(&self, reason: Reason) -> Error {
|
||||||
|
Error { reason, line: self.line(), col: self.col() }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Produces a token with the current [lexeme](Lexer::lexeme) as its data
|
||||||
|
fn produce(&mut self, kind: Kind) -> LResult<Token> {
|
||||||
|
let lexeme = self.lexeme().to_owned();
|
||||||
|
self.produce_with(kind, lexeme)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Produces a token with the provided `data`
|
||||||
|
fn produce_with(&mut self, kind: Kind, data: impl Into<TokenData>) -> LResult<Token> {
|
||||||
|
let loc = self.tail_loc;
|
||||||
|
self.tail_loc = self.head_loc;
|
||||||
|
self.tail = self.head;
|
||||||
|
Ok(Token::new(kind, data, loc.0, loc.1))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Produces a token with no `data`
|
||||||
|
fn produce_op(&mut self, kind: Kind) -> LResult<Token> {
|
||||||
|
self.produce_with(kind, ())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Consumes 0 or more whitespace
|
||||||
|
fn skip_whitespace(&mut self) -> &mut Self {
|
||||||
|
while self.peek().is_some_and(char::is_whitespace) {
|
||||||
|
let _ = self.consume();
|
||||||
|
}
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Starts a new token
|
||||||
|
fn start_token(&mut self) -> &mut Self {
|
||||||
|
self.tail_loc = self.head_loc;
|
||||||
|
self.tail = self.head;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Scans through the text, searching for the next [Token]
|
||||||
|
pub fn scan(&mut self) -> LResult<Token> {
|
||||||
|
use TokenKind::*;
|
||||||
|
// !"#%&'()*+,-./:;<=>?@[\\]^`{|}~
|
||||||
|
let tok = match self
|
||||||
|
.skip_whitespace()
|
||||||
|
.start_token()
|
||||||
|
.peek()
|
||||||
|
.ok_or_else(|| self.error(Reason::EndOfFile))?
|
||||||
|
{
|
||||||
|
'!' => Bang,
|
||||||
|
'"' => return self.string(),
|
||||||
|
'#' => Hash,
|
||||||
|
'%' => Rem,
|
||||||
|
'&' => Amp,
|
||||||
|
'\'' => return self.character(),
|
||||||
|
'(' => LParen,
|
||||||
|
')' => RParen,
|
||||||
|
'*' => Star,
|
||||||
|
'+' => Plus,
|
||||||
|
',' => Comma,
|
||||||
|
'-' => Minus,
|
||||||
|
'.' => Dot,
|
||||||
|
'/' => Slash,
|
||||||
|
'0' => TokenKind::Literal,
|
||||||
|
'1'..='9' => return self.digits::<10>(),
|
||||||
|
':' => Colon,
|
||||||
|
';' => Semi,
|
||||||
|
'<' => Lt,
|
||||||
|
'=' => Eq,
|
||||||
|
'>' => Gt,
|
||||||
|
'?' => Question,
|
||||||
|
'@' => At,
|
||||||
|
'[' => LBrack,
|
||||||
|
'\\' => Backslash,
|
||||||
|
']' => RBrack,
|
||||||
|
'^' => Xor,
|
||||||
|
'`' => Grave,
|
||||||
|
'{' => LCurly,
|
||||||
|
'|' => Bar,
|
||||||
|
'}' => RCurly,
|
||||||
|
'~' => Tilde,
|
||||||
|
'_' => return self.identifier(),
|
||||||
|
c if is_xid_start(c) => return self.identifier(),
|
||||||
|
e => {
|
||||||
|
let err = Err(self.error(Reason::UnexpectedChar(e)));
|
||||||
|
let _ = self.consume();
|
||||||
|
err?
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Handle digraphs
|
||||||
|
let tok = match (tok, self.consume().peek()) {
|
||||||
|
(Literal, Some('b')) => return self.consume().digits::<2>(),
|
||||||
|
(Literal, Some('d')) => return self.consume().digits::<10>(),
|
||||||
|
(Literal, Some('o')) => return self.consume().digits::<8>(),
|
||||||
|
(Literal, Some('x')) => return self.consume().digits::<16>(),
|
||||||
|
(Literal, Some('~')) => return self.consume().digits::<36>(),
|
||||||
|
(Literal, _) => return self.digits::<10>(),
|
||||||
|
(Amp, Some('&')) => AmpAmp,
|
||||||
|
(Amp, Some('=')) => AmpEq,
|
||||||
|
(Bang, Some('!')) => BangBang,
|
||||||
|
(Bang, Some('=')) => BangEq,
|
||||||
|
(Bar, Some('|')) => BarBar,
|
||||||
|
(Bar, Some('=')) => BarEq,
|
||||||
|
(Colon, Some(':')) => ColonColon,
|
||||||
|
(Dot, Some('.')) => DotDot,
|
||||||
|
(Eq, Some('=')) => EqEq,
|
||||||
|
(Eq, Some('>')) => FatArrow,
|
||||||
|
(Gt, Some('=')) => GtEq,
|
||||||
|
(Gt, Some('>')) => GtGt,
|
||||||
|
(Hash, Some('!')) => HashBang,
|
||||||
|
(Lt, Some('=')) => LtEq,
|
||||||
|
(Lt, Some('<')) => LtLt,
|
||||||
|
(Minus, Some('=')) => MinusEq,
|
||||||
|
(Minus, Some('>')) => Arrow,
|
||||||
|
(Plus, Some('=')) => PlusEq,
|
||||||
|
(Rem, Some('=')) => RemEq,
|
||||||
|
(Slash, Some('*')) => return self.block_comment()?.produce(Kind::Comment),
|
||||||
|
(Slash, Some('/')) => return self.line_comment(),
|
||||||
|
(Slash, Some('=')) => SlashEq,
|
||||||
|
(Star, Some('=')) => StarEq,
|
||||||
|
(Xor, Some('=')) => XorEq,
|
||||||
|
(Xor, Some('^')) => XorXor,
|
||||||
|
_ => return self.produce_op(tok),
|
||||||
|
};
|
||||||
|
|
||||||
|
// Handle trigraphs
|
||||||
|
let tok = match (tok, self.consume().peek()) {
|
||||||
|
(HashBang, Some('/')) => return self.line_comment(),
|
||||||
|
(DotDot, Some('=')) => DotDotEq,
|
||||||
|
(GtGt, Some('=')) => GtGtEq,
|
||||||
|
(LtLt, Some('=')) => LtLtEq,
|
||||||
|
_ => return self.produce_op(tok),
|
||||||
|
};
|
||||||
|
|
||||||
|
self.consume().produce_op(tok)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Comments
|
/// Comments
|
||||||
impl Lexer<'_> {
|
impl Lexer<'_> {
|
||||||
|
/// Consumes until the next newline '\n', producing a [Comment](Kind::Comment)
|
||||||
fn line_comment(&mut self) -> LResult<Token> {
|
fn line_comment(&mut self) -> LResult<Token> {
|
||||||
let mut comment = String::new();
|
while self.consume().peek().is_some_and(|c| c != '\n') {}
|
||||||
while Ok('\n') != self.peek() {
|
self.produce(Kind::Comment)
|
||||||
comment.push(self.next()?);
|
|
||||||
}
|
}
|
||||||
self.produce(Kind::Comment, comment)
|
|
||||||
|
/// Consumes nested block-comments. Does not produce by itself.
|
||||||
|
fn block_comment(&mut self) -> LResult<&mut Self> {
|
||||||
|
self.consume();
|
||||||
|
while let Some(c) = self.take() {
|
||||||
|
match (c, self.peek()) {
|
||||||
|
('/', Some('*')) => self.block_comment()?,
|
||||||
|
('*', Some('/')) => return Ok(self.consume()),
|
||||||
|
_ => continue,
|
||||||
|
};
|
||||||
}
|
}
|
||||||
fn block_comment(&mut self) -> LResult<Token> {
|
Err(self.error(Reason::UnmatchedDelimiters('/')))
|
||||||
let mut comment = String::new();
|
|
||||||
while let Ok(c) = self.next() {
|
|
||||||
if '*' == c && Ok('/') == self.peek() {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
comment.push(c);
|
|
||||||
}
|
|
||||||
self.consume()?.produce(Kind::Comment, comment)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Identifiers
|
/// Identifiers
|
||||||
impl Lexer<'_> {
|
impl Lexer<'_> {
|
||||||
|
/// Produces an [Identifier](Kind::Identifier) or keyword
|
||||||
fn identifier(&mut self) -> LResult<Token> {
|
fn identifier(&mut self) -> LResult<Token> {
|
||||||
let mut out = String::from(self.xid_start()?);
|
while self.consume().peek().is_some_and(is_xid_continue) {}
|
||||||
while let Ok(c) = self.xid_continue() {
|
if let Ok(keyword) = Kind::from_str(self.lexeme()) {
|
||||||
out.push(c)
|
self.produce_with(keyword, ())
|
||||||
}
|
|
||||||
if let Ok(keyword) = Kind::from_str(&out) {
|
|
||||||
self.produce(keyword, ())
|
|
||||||
} else {
|
} else {
|
||||||
self.produce(Kind::Identifier, TokenData::String(out))
|
self.produce(Kind::Identifier)
|
||||||
}
|
|
||||||
}
|
|
||||||
fn xid_start(&mut self) -> LResult<char> {
|
|
||||||
match self.peek()? {
|
|
||||||
xid if xid == '_' || is_xid_start(xid) => {
|
|
||||||
self.consume()?;
|
|
||||||
Ok(xid)
|
|
||||||
}
|
|
||||||
bad => Err(Error::not_identifier(bad, self.line(), self.col())),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn xid_continue(&mut self) -> LResult<char> {
|
|
||||||
match self.peek()? {
|
|
||||||
xid if is_xid_continue(xid) => {
|
|
||||||
self.consume()?;
|
|
||||||
Ok(xid)
|
|
||||||
}
|
|
||||||
bad => Err(Error::not_identifier(bad, self.line(), self.col())),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Integers
|
/// Integers
|
||||||
impl Lexer<'_> {
|
impl Lexer<'_> {
|
||||||
fn int_with_base(&mut self) -> LResult<Token> {
|
/// Produces a [Literal](Kind::Literal) with an integer or float value.
|
||||||
match self.peek() {
|
|
||||||
Ok('~') => self.consume()?.digits::<36>(),
|
|
||||||
Ok('x') => self.consume()?.digits::<16>(),
|
|
||||||
Ok('d') => self.consume()?.digits::<10>(),
|
|
||||||
Ok('o') => self.consume()?.digits::<8>(),
|
|
||||||
Ok('b') => self.consume()?.digits::<2>(),
|
|
||||||
Ok('0'..='9' | '.') => self.digits::<10>(),
|
|
||||||
_ => self.produce(Kind::Literal, 0),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn digits<const B: u32>(&mut self) -> LResult<Token> {
|
fn digits<const B: u32>(&mut self) -> LResult<Token> {
|
||||||
let mut value = 0;
|
let mut value = 0;
|
||||||
while let Ok(true) = self.peek().as_ref().map(char::is_ascii_alphanumeric) {
|
while let Some(true) = self.peek().as_ref().map(char::is_ascii_alphanumeric) {
|
||||||
value = value * B as u128 + self.digit::<B>()? as u128;
|
value = value * B as u128 + self.digit::<B>()? as u128;
|
||||||
}
|
}
|
||||||
// TODO: find a better way to handle floats in the tokenizer
|
// TODO: find a better way to handle floats in the tokenizer
|
||||||
match self.peek() {
|
match self.peek() {
|
||||||
Ok('.') => {
|
Some('.') => {
|
||||||
// FIXME: hack: 0.. is not [0.0, '.']
|
// FIXME: hack: 0.. is not [0.0, '.']
|
||||||
if let Ok('.') = self.clone().consume()?.next() {
|
if let Some('.') = self.clone().consume().take() {
|
||||||
return self.produce(Kind::Literal, value);
|
return self.produce_with(Kind::Literal, value);
|
||||||
}
|
}
|
||||||
let mut float = format!("{value}.");
|
let mut float = format!("{value}.");
|
||||||
self.consume()?;
|
self.consume();
|
||||||
while let Ok(true) = self.peek().as_ref().map(char::is_ascii_digit) {
|
while let Some(true) = self.peek().as_ref().map(char::is_ascii_digit) {
|
||||||
float.push(self.iter.next().unwrap_or_default());
|
float.push(self.iter.next().map(|(_, c)| c).unwrap_or_default());
|
||||||
}
|
}
|
||||||
let float = f64::from_str(&float).expect("must be parsable as float");
|
let float = f64::from_str(&float).expect("must be parsable as float");
|
||||||
self.produce(Kind::Literal, float)
|
self.produce_with(Kind::Literal, float)
|
||||||
}
|
}
|
||||||
_ => self.produce(Kind::Literal, value),
|
_ => self.produce_with(Kind::Literal, value),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Consumes a single digit of base [B](Lexer::digit)
|
||||||
fn digit<const B: u32>(&mut self) -> LResult<u32> {
|
fn digit<const B: u32>(&mut self) -> LResult<u32> {
|
||||||
let digit = self.peek()?;
|
let digit = self.take().ok_or_else(|| self.error(Reason::EndOfFile))?;
|
||||||
self.consume()?;
|
|
||||||
digit
|
digit
|
||||||
.to_digit(B)
|
.to_digit(B)
|
||||||
.ok_or(Error::invalid_digit(digit, self.line(), self.col()))
|
.ok_or_else(|| self.error(Reason::InvalidDigit(digit)))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Strings and characters
|
/// Strings and characters
|
||||||
impl Lexer<'_> {
|
impl Lexer<'_> {
|
||||||
fn string(&mut self) -> LResult<Token> {
|
/// Produces a [Literal](Kind::Literal) with a pre-escaped [String]
|
||||||
let mut value = String::new();
|
pub fn string(&mut self) -> Result<Token, Error> {
|
||||||
while '"'
|
let mut lexeme = String::new();
|
||||||
!= self
|
self.consume();
|
||||||
.peek()
|
loop {
|
||||||
.map_err(|e| e.mask_reason(Reason::UnmatchedDelimiters('"')))?
|
lexeme.push(match self.take() {
|
||||||
{
|
None => Err(self.error(Reason::UnmatchedDelimiters('"')))?,
|
||||||
value.push(self.unescape()?)
|
Some('\\') => self.unescape()?,
|
||||||
|
Some('"') => break,
|
||||||
|
Some(c) => c,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
self.consume()?.produce(Kind::Literal, value)
|
lexeme.shrink_to_fit();
|
||||||
|
self.produce_with(Kind::Literal, lexeme)
|
||||||
}
|
}
|
||||||
fn character(&mut self) -> LResult<Token> {
|
|
||||||
let out = self.unescape()?;
|
/// Produces a [Literal](Kind::Literal) with a pre-escaped [char]
|
||||||
match self.peek()? {
|
fn character(&mut self) -> Result<Token, Error> {
|
||||||
'\'' => self.consume()?.produce(Kind::Literal, out),
|
let c = match self.consume().take() {
|
||||||
_ => Err(Error::unmatched_delimiters('\'', self.line(), self.col())),
|
Some('\\') => self.unescape()?,
|
||||||
|
Some(c) => c,
|
||||||
|
None => '\0',
|
||||||
|
};
|
||||||
|
if self.take().is_some_and(|c| c == '\'') {
|
||||||
|
self.produce_with(Kind::Literal, c)
|
||||||
|
} else {
|
||||||
|
Err(self.error(Reason::UnmatchedDelimiters('\'')))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/// Unescape a single character
|
|
||||||
|
/// Unescapes a single character
|
||||||
|
#[rustfmt::skip]
|
||||||
fn unescape(&mut self) -> LResult<char> {
|
fn unescape(&mut self) -> LResult<char> {
|
||||||
match self.next() {
|
Ok(match self.take().ok_or_else(|| self.error(Reason::EndOfFile))? {
|
||||||
Ok('\\') => (),
|
' ' => '\u{a0}',
|
||||||
other => return other,
|
'0' => '\0',
|
||||||
}
|
|
||||||
Ok(match self.next()? {
|
|
||||||
'a' => '\x07',
|
'a' => '\x07',
|
||||||
'b' => '\x08',
|
'b' => '\x08',
|
||||||
|
'e' => '\x1b',
|
||||||
'f' => '\x0c',
|
'f' => '\x0c',
|
||||||
'n' => '\n',
|
'n' => '\n',
|
||||||
'r' => '\r',
|
'r' => '\r',
|
||||||
't' => '\t',
|
't' => '\t',
|
||||||
'x' => self.hex_escape()?,
|
|
||||||
'u' => self.unicode_escape()?,
|
'u' => self.unicode_escape()?,
|
||||||
'0' => '\0',
|
'x' => self.hex_escape()?,
|
||||||
chr => chr,
|
chr => chr,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
/// unescape a single 2-digit hex escape
|
/// Unescapes a single 2-digit hex escape
|
||||||
fn hex_escape(&mut self) -> LResult<char> {
|
fn hex_escape(&mut self) -> LResult<char> {
|
||||||
let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
|
let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
|
||||||
char::from_u32(out).ok_or(Error::bad_unicode(out, self.line(), self.col()))
|
char::from_u32(out).ok_or_else(|| self.error(Reason::BadUnicode(out)))
|
||||||
}
|
}
|
||||||
/// unescape a single \u{} unicode escape
|
|
||||||
fn unicode_escape(&mut self) -> LResult<char> {
|
/// Unescapes a single \u{} unicode escape
|
||||||
|
pub fn unicode_escape(&mut self) -> Result<char, Error> {
|
||||||
|
self.next_if('{')
|
||||||
|
.ok_or_else(|| self.error(Reason::InvalidEscape('u')))?;
|
||||||
let mut out = 0;
|
let mut out = 0;
|
||||||
let Ok('{') = self.peek() else {
|
while let Some(c) = self.take() {
|
||||||
return Err(Error::invalid_escape('u', self.line(), self.col()));
|
if c == '}' {
|
||||||
};
|
return char::from_u32(out).ok_or_else(|| self.error(Reason::BadUnicode(out)));
|
||||||
self.consume()?;
|
|
||||||
while let Ok(c) = self.peek() {
|
|
||||||
match c {
|
|
||||||
'}' => {
|
|
||||||
self.consume()?;
|
|
||||||
return char::from_u32(out).ok_or(Error::bad_unicode(
|
|
||||||
out,
|
|
||||||
self.line(),
|
|
||||||
self.col(),
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
_ => out = (out << 4) + self.digit::<16>()?,
|
out = out * 16
|
||||||
|
+ c.to_digit(16)
|
||||||
|
.ok_or_else(|| self.error(Reason::InvalidDigit(c)))?;
|
||||||
}
|
}
|
||||||
}
|
Err(self.error(Reason::UnmatchedDelimiters('}')))
|
||||||
Err(Error::invalid_escape('u', self.line(), self.col()))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -508,8 +469,6 @@ pub mod error {
|
|||||||
UnmatchedDelimiters(char),
|
UnmatchedDelimiters(char),
|
||||||
/// Found a character that doesn't belong to any [TokenKind](cl_token::TokenKind)
|
/// Found a character that doesn't belong to any [TokenKind](cl_token::TokenKind)
|
||||||
UnexpectedChar(char),
|
UnexpectedChar(char),
|
||||||
/// Found a character that's not valid in identifiers while looking for an identifier
|
|
||||||
NotIdentifier(char),
|
|
||||||
/// Found a character that's not valid in an escape sequence while looking for an escape
|
/// Found a character that's not valid in an escape sequence while looking for an escape
|
||||||
/// sequence
|
/// sequence
|
||||||
UnknownEscape(char),
|
UnknownEscape(char),
|
||||||
@ -517,30 +476,12 @@ pub mod error {
|
|||||||
InvalidEscape(char),
|
InvalidEscape(char),
|
||||||
/// Character is not a valid digit in the requested base
|
/// Character is not a valid digit in the requested base
|
||||||
InvalidDigit(char),
|
InvalidDigit(char),
|
||||||
/// Base conversion requested, but the base character was not in the set of known
|
|
||||||
/// characters
|
|
||||||
UnknownBase(char),
|
|
||||||
/// Unicode escape does not map to a valid unicode code-point
|
/// Unicode escape does not map to a valid unicode code-point
|
||||||
BadUnicode(u32),
|
BadUnicode(u32),
|
||||||
/// Reached end of input
|
/// Reached end of input
|
||||||
EndOfFile,
|
EndOfFile,
|
||||||
}
|
}
|
||||||
error_impl! {
|
|
||||||
unmatched_delimiters(c: char) => Reason::UnmatchedDelimiters(c),
|
|
||||||
unexpected_char(c: char) => Reason::UnexpectedChar(c),
|
|
||||||
not_identifier(c: char) => Reason::NotIdentifier(c),
|
|
||||||
unknown_escape(e: char) => Reason::UnknownEscape(e),
|
|
||||||
invalid_escape(e: char) => Reason::InvalidEscape(e),
|
|
||||||
invalid_digit(digit: char) => Reason::InvalidDigit(digit),
|
|
||||||
unknown_base(base: char) => Reason::UnknownBase(base),
|
|
||||||
bad_unicode(value: u32) => Reason::BadUnicode(value),
|
|
||||||
end_of_file => Reason::EndOfFile,
|
|
||||||
}
|
|
||||||
impl Error {
|
impl Error {
|
||||||
/// Changes the [Reason] of this error
|
|
||||||
pub(super) fn mask_reason(self, reason: Reason) -> Self {
|
|
||||||
Self { reason, ..self }
|
|
||||||
}
|
|
||||||
/// Returns the [Reason] for this error
|
/// Returns the [Reason] for this error
|
||||||
pub fn reason(&self) -> &Reason {
|
pub fn reason(&self) -> &Reason {
|
||||||
&self.reason
|
&self.reason
|
||||||
@ -550,14 +491,6 @@ pub mod error {
|
|||||||
(self.line, self.col)
|
(self.line, self.col)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
macro error_impl ($($fn:ident$(( $($p:ident: $t:ty),* ))? => $reason:expr),*$(,)?) {
|
|
||||||
#[allow(dead_code)]
|
|
||||||
impl Error {
|
|
||||||
$(pub(super) fn $fn ($($($p: $t),*,)? line: u32, col: u32) -> Self {
|
|
||||||
Self { reason: $reason, line, col }
|
|
||||||
})*
|
|
||||||
}
|
|
||||||
}
|
|
||||||
impl std::error::Error for Error {}
|
impl std::error::Error for Error {}
|
||||||
impl Display for Error {
|
impl Display for Error {
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
@ -567,14 +500,12 @@ pub mod error {
|
|||||||
impl Display for Reason {
|
impl Display for Reason {
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
match self {
|
match self {
|
||||||
Reason::UnmatchedDelimiters(c) => write! {f, "Unmatched `{c}` in input"},
|
Reason::UnmatchedDelimiters(c) => write! {f, "Unmatched `{c:?}` in input"},
|
||||||
Reason::UnexpectedChar(c) => write!(f, "Character `{c}` not expected"),
|
Reason::UnexpectedChar(c) => write!(f, "Character `{c:?}` not expected"),
|
||||||
Reason::NotIdentifier(c) => write!(f, "Character `{c}` not valid in identifiers"),
|
|
||||||
Reason::UnknownEscape(c) => write!(f, "`\\{c}` is not a known escape sequence"),
|
Reason::UnknownEscape(c) => write!(f, "`\\{c}` is not a known escape sequence"),
|
||||||
Reason::InvalidEscape(c) => write!(f, "Escape sequence `\\{c}`... is malformed"),
|
Reason::InvalidEscape(c) => write!(f, "Escape sequence `\\{c}`... is malformed"),
|
||||||
Reason::InvalidDigit(c) => write!(f, "`{c}` is not a valid digit"),
|
Reason::InvalidDigit(c) => write!(f, "`{c:?}` is not a valid digit"),
|
||||||
Reason::UnknownBase(c) => write!(f, "`0{c}`... is not a valid base"),
|
Reason::BadUnicode(c) => write!(f, "`\\u{{{c:x}}}` is not valid unicode"),
|
||||||
Reason::BadUnicode(c) => write!(f, "`{c}` is not a valid unicode code-point"),
|
|
||||||
Reason::EndOfFile => write!(f, "Reached end of input"),
|
Reason::EndOfFile => write!(f, "Reached end of input"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user