//! A lexer

use std::ops::Range;
#[allow(dead_code)]
use std::{iter::Peekable, str::CharIndices};

use unicode_ident::{is_xid_continue, is_xid_start};

use crate::{span::Span, token::*};

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct LexError {
    pub pos: Span,
    pub res: LexFailure,
}

impl std::error::Error for LexError {}

impl std::fmt::Display for LexError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { pos, res } = self;
        write!(f, "{pos}: {res}")
    }
}

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum LexFailure {
    /// Reached end of file
    EOF,
    UnexpectedEOF,
    Unexpected(char),
    UnterminatedBlockComment,
    UnterminatedCharacter,
    UnterminatedString,
    UnterminatedUnicodeEscape,
    InvalidUnicodeEscape(u32),
    InvalidDigitForBase(char, u32),
    IntegerOverflow,
}
use LexFailure::*;
pub use LexFailure::{EOF, UnexpectedEOF};

impl std::fmt::Display for LexFailure {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::EOF => "EOF".fmt(f),
            Self::UnexpectedEOF => "Unexpected EOF".fmt(f),
            Self::Unexpected(c) => write!(f, "Unexpected character {c:?}"),
            Self::UnterminatedBlockComment => "Unterminated Block Comment".fmt(f),
            Self::UnterminatedCharacter => "Unterminated Character".fmt(f),
            Self::UnterminatedString => "Unterminated String".fmt(f),
            Self::UnterminatedUnicodeEscape => "Unterminated Unicode Escape".fmt(f),
            Self::InvalidUnicodeEscape(hex) => {
                write!(f, "'\\u{{{hex:x}}}' is not a valid Unicode codepoint")
            }
            Self::InvalidDigitForBase(digit, base) => {
                write!(f, "Invalid digit {digit} for base {base}")
            }
            Self::IntegerOverflow => "Integer literal does not fit in 128 bits".fmt(f),
        }
    }
}

#[derive(Clone, Debug)]
pub struct Lexer<'t> {
    /// The source text
    text: &'t str,
    /// A peekable iterator over the source text
    iter: Peekable<CharIndices<'t>>,
    /// The start of the current token
    head: u32,
    /// The end of the current token
    tail: u32,
}

impl<'t> Lexer<'t> {
    /// Constructs a new Lexer from some text
    pub fn new(text: &'t str) -> Self {
        let iter = text.char_indices().peekable();
        Self { text, iter, head: 0, tail: 0 }
    }

    /// Peeks the next character without advancing the lexer
    pub fn peek(&mut self) -> Option<char> {
        self.iter.peek().map(|&(_, c)| c)
    }

    /// Advances the tail to the current character index
    fn advance_tail(&mut self) {
        match self.iter.peek() {
            Some(&(idx, _)) => self.tail = idx as u32,
            None => self.tail = self.text.len() as _,
        }
    }

    /// Takes the next character, advancing the tail
    fn take(&mut self) -> Option<char> {
        let (_, c) = self.iter.next()?;
        self.advance_tail();
        Some(c)
    }

    /// Takes the next character only if it equals `expected`
    fn next_if(&mut self, expected: char) -> Option<char> {
        let (_, c) = self.iter.next_if(|&(_, c)| c == expected)?;
        self.advance_tail();
        Some(c)
    }

    /// Consumes the last-peeked character, advancing the tail
    fn consume(&mut self) -> &mut Self {
        self.iter.next();
        self.advance_tail();
        self
    }

    /// Produces a [`LexError`] spanning the current token
    const fn error(&self, res: LexFailure) -> LexError {
        LexError { pos: Span(self.head, self.tail), res }
    }

    /// Gets the Lexer's current &[str] lexeme and [Span]
    fn as_str(&self) -> (&'t str, Span) {
        let span = Span(self.head, self.tail);
        (&self.text[Range::from(span)], span)
    }

    /// Produces a Token whose lexeme is the matched source text
    fn produce(&mut self, kind: TKind) -> Token {
        self.advance_tail();
        let (lexeme, span) = self.as_str();
        self.head = self.tail;
        Token { lexeme: Lexeme::String(lexeme.to_owned()), kind, span }
    }

    /// Produces a Token with an explicitly provided [Lexeme]
    fn produce_with_lexeme(&mut self, kind: TKind, lexeme: Lexeme) -> Token {
        self.advance_tail();
        let span = Span(self.head, self.tail);
        self.head = self.tail;
        Token { lexeme, kind, span }
    }
    /// Consumes zero or more whitespace characters
    fn skip_whitespace(&mut self) -> &mut Self {
        while self.peek().is_some_and(char::is_whitespace) {
            let _ = self.consume();
        }
        self
    }

    /// Marks the start of a new token
    const fn start_token(&mut self) -> &mut Self {
        self.head = self.tail;
        self
    }

    /// Scans forward until it finds the next Token in the input
    pub fn scan(&mut self) -> Result<Token, LexError> {
        use TKind::*;
        // !"#%&'()*+,-./:;<=>?@[\\]^`{|}~
        let tok = match self
            .skip_whitespace()
            .start_token()
            .peek()
            .ok_or_else(|| self.error(EOF))?
        {
            '!' => Bang,
            '"' => return self.string(),
            '#' => Hash,
            '%' => Rem,
            '&' => Amp,
            '\'' => return self.character(),
            '(' => LParen,
            ')' => RParen,
            '*' => Star,
            '+' => Plus,
            ',' => return self.consume().trailing(Comma),
            '-' => Minus,
            '.' => Dot,
            '/' => Slash,
            '0' => Integer,
            '1'..='9' => return self.digits::<10>(),
            ':' => Colon,
            ';' => Semi,
            '<' => Lt,
            '=' => Eq,
            '>' => Gt,
            '?' => Question,
            '@' => At,
            '[' => LBrack,
            '\\' => Backslash,
            ']' => RBrack,
            '^' => Xor,
            '`' => Grave,
            '{' => LCurly,
            '|' => Bar,
            '}' => RCurly,
            '~' => Tilde,
            '_' => return self.identifier(),
            c if is_xid_start(c) => return self.identifier(),
            c => Err(self.error(Unexpected(c)))?,
        };

        // Handle digraphs
        let tok = match (tok, self.consume().peek()) {
            (Integer, Some('b')) => return self.consume().digits::<2>(),
            (Integer, Some('d')) => return self.consume().digits::<10>(),
            (Integer, Some('o')) => return self.consume().digits::<8>(),
            (Integer, Some('x')) => return self.consume().digits::<16>(),
            (Integer, Some('~')) => return self.consume().digits::<36>(),
            (Integer, _) => return self.digits::<10>(),
            (Amp, Some('&')) => AmpAmp,
            (Amp, Some('=')) => AmpEq,
            (Bang, Some('!')) => BangBang,
            (Bang, Some('=')) => BangEq,
            (Bar, Some('|')) => BarBar,
            (Bar, Some('=')) => BarEq,
            (Colon, Some(':')) => ColonColon,
            (Dot, Some('.')) => DotDot,
            (Eq, Some('=')) => EqEq,
            (Eq, Some('>')) => FatArrow,
            (Gt, Some('=')) => GtEq,
            (Gt, Some('>')) => GtGt,
            (Hash, Some('!')) => HashBang,
            (Lt, Some('=')) => LtEq,
            (Lt, Some('<')) => LtLt,
            (Minus, Some('=')) => MinusEq,
            (Minus, Some('>')) => Arrow,
            (Plus, Some('=')) => PlusEq,
            (Rem, Some('=')) => RemEq,
            (Slash, Some('*')) => return Ok(self.block_comment()?.produce(Comment)),
            (Slash, Some('=')) => SlashEq,
            (Slash, Some('/')) => return self.line_comment(),
            (Star, Some('=')) => StarEq,
            (Xor, Some('=')) => XorEq,
            (Xor, Some('^')) => XorXor,
            _ => return Ok(self.produce(tok)),
        };

        // Handle trigraphs
        let tok = match (tok, self.consume().peek()) {
            (HashBang, Some('/')) => return self.line_comment(),
            (DotDot, Some('=')) => DotDotEq,
            (GtGt, Some('=')) => GtGtEq,
            (LtLt, Some('=')) => LtLtEq,
            _ => return Ok(self.produce(tok)),
        };

        Ok(self.consume().produce(tok))
    }

    /// Elides the trailing [Token] `kind` when it comes before a list terminator.
    pub fn trailing(&mut self, kind: TKind) -> Result<Token, LexError> {
        Ok(match self.skip_whitespace().peek() {
            // Some(')') => self.consume().produce(TKind::RParen), // maybe.
            Some(']') => self.consume().produce(TKind::RBrack),
            Some('}') => self.consume().produce(TKind::RCurly),
            _ => self.produce(kind),
        })
    }

    /// Consumes characters until the lexer reaches a newline `'\n'`
    pub fn line_comment(&mut self) -> Result<Token, LexError> {
        let kind = match self.consume().peek() {
            Some('!' | '/') => TKind::Doc,
            _ => TKind::Comment,
        };
        while self.consume().peek().is_some_and(|c| c != '\n') {}
        Ok(self.produce(kind))
    }

    /// Consumes characters until the lexer reaches the end of a *nested* block comment.
    /// This allows you to arbitrarily comment out code, even if that code already contains a
    /// block comment.
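    ///
    /// For example (a sketch, not compiled as a doctest), a nested comment should lex as a
    /// single [`TKind::Comment`] token:
    ///
    /// ```ignore
    /// let mut lexer = Lexer::new("/* outer /* inner */ still outer */");
    /// assert!(matches!(lexer.scan().unwrap().kind, TKind::Comment));
    /// ```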
    pub fn block_comment(&mut self) -> Result<&mut Self, LexError> {
        self.consume();
        while let Some(c) = self.take() {
            match (c, self.peek()) {
                // A nested block comment opens: recurse to find its matching terminator
                ('/', Some('*')) => self.block_comment()?,
                // This block comment closes
                ('*', Some('/')) => return Ok(self.consume()),
                _ => continue,
            };
        }
        Err(self.error(UnterminatedBlockComment))
    }

    /// Consumes characters until it reaches a character not in [`is_xid_continue`].
    ///
    /// Always consumes the first character.
    ///
    /// Maps the result to either a [`TKind::Identifier`] or a [`TKind`] keyword.
    pub fn identifier(&mut self) -> Result<Token, LexError> {
        while self.consume().peek().is_some_and(is_xid_continue) {}
        let (lexeme, _span) = self.as_str();
        let token = self.produce(TKind::Identifier);
        Ok(Token {
            kind: match lexeme {
                "as" => TKind::As,
                "break" => TKind::Break,
                "const" => TKind::Const,
                "continue" => TKind::Continue,
                "do" => TKind::Do,
                "else" => TKind::Else,
                "enum" => TKind::Enum,
                "false" => TKind::False,
                "fn" => TKind::Fn,
                "for" => TKind::For,
                "if" => TKind::If,
                "impl" => TKind::Impl,
                "in" => TKind::In,
                "let" => TKind::Let,
                "loop" => TKind::Loop,
                "macro" => TKind::Macro,
                "match" => TKind::Match,
                "mod" => TKind::Mod,
                "mut" => TKind::Mut,
                "pub" => TKind::Pub,
                "return" => TKind::Return,
                "static" => TKind::Static,
                "struct" => TKind::Struct,
                "then" => TKind::Do,
                "true" => TKind::True,
                "type" => TKind::Type,
                "use" => TKind::Use,
                "while" => TKind::While,
                _ => token.kind,
            },
            ..token
        })
    }

    /// Eagerly parses a character literal starting at the current lexer position.
    pub fn character(&mut self) -> Result<Token, LexError> {
        let c = match self.consume().take() {
            Some('\\') => self.escape()?,
            Some(c) => c,
            None => '\0',
        };
        if self.take().is_some_and(|c| c == '\'') {
            Ok(self.produce_with_lexeme(TKind::Character, Lexeme::Char(c)))
        } else {
            Err(self.error(UnterminatedCharacter))
        }
    }

    /// Eagerly parses a string literal starting at the current lexer position.
    pub fn string(&mut self) -> Result<Token, LexError> {
        let mut lexeme = String::new();
        self.consume();
        loop {
            lexeme.push(match self.take() {
                None => Err(self.error(UnterminatedString))?,
                Some('\\') => self.escape()?,
                Some('"') => break,
                Some(c) => c,
            });
        }
        lexeme.shrink_to_fit();
        Ok(self.produce_with_lexeme(TKind::String, Lexeme::String(lexeme)))
    }

    /// Parses a single escape sequence into its resulting char value.
    pub fn escape(&mut self) -> Result<char, LexError> {
        Ok(
            match self.take().ok_or_else(|| self.error(UnexpectedEOF))? {
                ' ' => '\u{a0}', // Non-breaking space
                '0' => '\0',     // C0 Null Character
                'a' => '\x07',   // C0 Bell (alert)
                'b' => '\x08',   // C0 Backspace
                'e' => '\x1b',   // C0 Escape
                'f' => '\x0c',   // Form Feed
                'n' => '\n',     // New Line
                'r' => '\r',     // Carriage Return
                't' => '\t',     // Tab
                'u' => self.unicode_escape()?,
                'x' => self.hex_escape()?,
                c => c,
            },
        )
    }

    /// Parses two hex digits and constructs a [char] out of them.
    pub fn hex_escape(&mut self) -> Result<char, LexError> {
        let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
        char::from_u32(out).ok_or_else(|| self.error(InvalidUnicodeEscape(out)))
    }

    /// Parses a sequence of `{}`-bracketed hex digits and constructs a [char] out of them.
    pub fn unicode_escape(&mut self) -> Result<char, LexError> {
        self.next_if('{')
            .ok_or_else(|| self.error(UnterminatedUnicodeEscape))?;
        let mut out = 0;
        while let Some(c) = self.take() {
            if c == '}' {
                return char::from_u32(out).ok_or_else(|| self.error(InvalidUnicodeEscape(out)));
            }
            out = out * 16
                + c.to_digit(16)
                    .ok_or_else(|| self.error(InvalidDigitForBase(c, 16)))?;
        }
        Err(self.error(UnterminatedUnicodeEscape))
    }

    /// Parses a sequence of digits (and underscores) in base `BASE`, where 2 <= `BASE` <= 36.
    ///
    /// If the sequence of digits does not fit in a [u128], an [`IntegerOverflow`] error is
    /// returned.
    pub fn digits<const BASE: u32>(&mut self) -> Result<Token, LexError> {
        let mut int: u128 = 0;
        while let Some(c) = self.peek() {
            int = match c.to_digit(BASE).ok_or(c) {
                // Underscores are digit separators and do not affect the value
                Err('_') => int,
                Ok(c) => int
                    .checked_mul(BASE as _)
                    .and_then(|int| int.checked_add(c as _))
                    .ok_or_else(|| self.error(IntegerOverflow))?,
                // Any other non-digit ends the literal
                _ => break,
            };
            self.consume();
        }
        Ok(self.produce_with_lexeme(TKind::Integer, Lexeme::Integer(int, BASE)))
    }

    /// Parses a single digit in base `BASE` as a u32, where 2 <= `BASE` <= 36.
    pub fn digit<const BASE: u32>(&mut self) -> Result<u32, LexError> {
        let digit = self.take().ok_or_else(|| self.error(UnexpectedEOF))?;
        if let Some(digit) = digit.to_digit(BASE) {
            Ok(digit)
        } else {
            Err(self.error(InvalidDigitForBase(digit, BASE)))
        }
    }
}
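
// A few illustrative tests sketching the behavior documented above. They assume the crate's
// `Token`, `TKind`, and `Lexeme` definitions match the way they are used in this module.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn hex_literal_parses_in_base_16() {
        // "0x2a" takes the `0x` digraph path in `scan` and should yield 42 in base 16.
        let mut lexer = Lexer::new("0x2a");
        let token = lexer.scan().expect("hex literal should lex");
        assert!(matches!(token.lexeme, Lexeme::Integer(42, 16)));
    }

    #[test]
    fn oversized_literal_reports_overflow() {
        // 2^128 does not fit in a u128, so `digits` should report IntegerOverflow
        // rather than wrapping.
        let mut lexer = Lexer::new("340282366920938463463374607431768211456");
        assert_eq!(lexer.scan().err().unwrap().res, LexFailure::IntegerOverflow);
    }

    #[test]
    fn trigraph_lexes_as_one_token() {
        // ">>=" should fold into a single GtGtEq token via the digraph/trigraph passes in `scan`.
        let mut lexer = Lexer::new(">>=");
        assert!(matches!(lexer.scan().unwrap().kind, TKind::GtGtEq));
    }
}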