Files
Doughlang/src/lexer.rs
2025-10-28 22:47:58 -04:00

429 lines
15 KiB
Rust

//! A lobster
use std::ops::Range;
#[allow(dead_code)]
use std::{iter::Peekable, str::CharIndices};
use unicode_ident::{is_xid_continue, is_xid_start};
use crate::{span::Span, token::*};
/// An error produced during lexing, pairing a failure reason with its location.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct LexError {
    /// The [Span] of source text where lexing failed
    pub pos: Span,
    /// The reason lexing failed
    pub res: LexFailure,
}
impl std::error::Error for LexError {}
impl std::fmt::Display for LexError {
    /// Formats as `<position>: <reason>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}: {}", self.pos, self.res)
    }
}
/// The reason a [`LexError`] occurred.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum LexFailure {
    /// Reached end of file
    EOF,
    /// Hit end of file in the middle of a token (escape sequence, etc.)
    UnexpectedEOF,
    /// A character that cannot begin any token
    Unexpected(char),
    UnterminatedBlockComment,
    UnterminatedCharacter,
    UnterminatedString,
    UnterminatedUnicodeEscape,
    /// The bracketed hex value is not a valid Unicode scalar value
    InvalidUnicodeEscape(u32),
    /// A digit character that is out of range for the given base
    InvalidDigitForBase(char, u32),
    /// Integer literal too large for a u128
    IntegerOverflow,
}
use LexFailure::*;
pub use LexFailure::{EOF, UnexpectedEOF};
impl std::fmt::Display for LexFailure {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::EOF => "EOF".fmt(f),
            Self::UnexpectedEOF => "Unexpected EOF".fmt(f),
            // `{c:?}` already prints its own single quotes; wrapping it in
            // literal quotes rendered as `Character ''x''`, so don't add any.
            Self::Unexpected(c) => write!(f, "Character {c:?}"),
            Self::UnterminatedBlockComment => "Unterminated Block Comment".fmt(f),
            Self::UnterminatedCharacter => "Unterminated Character".fmt(f),
            Self::UnterminatedString => "Unterminated String".fmt(f),
            Self::UnterminatedUnicodeEscape => "Unterminated Unicode Escape".fmt(f),
            Self::InvalidUnicodeEscape(hex) => {
                write!(f, "'\\u{{{hex:x}}}' is not a valid UTF-8 codepoint")
            }
            Self::InvalidDigitForBase(digit, base) => {
                write!(f, "Invalid digit {digit} for base {base}")
            }
            Self::IntegerOverflow => "Integer literal does not fit in 128 bits".fmt(f),
        }
    }
}
/// A lexer over borrowed source text.
#[derive(Clone, Debug)]
pub struct Lexer<'t> {
    /// The source text
    text: &'t str,
    /// A peekable iterator over the source text
    iter: Peekable<CharIndices<'t>>,
    /// The start of the current token (byte offset into `text`)
    head: u32,
    /// The end of the current token (byte offset into `text`)
    tail: u32,
}
impl<'t> Lexer<'t> {
/// Constructs a new Lexer positioned at the start of `text`
pub fn new(text: &'t str) -> Self {
    Self { text, iter: text.char_indices().peekable(), head: 0, tail: 0 }
}
/// Peeks the next character without advancing the lexer
pub fn peek(&mut self) -> Option<char> {
    match self.iter.peek() {
        Some(&(_, c)) => Some(c),
        None => None,
    }
}
/// Advances `tail` to the byte offset of the next unconsumed character,
/// or to the end of the text once the iterator is exhausted.
fn advance_tail(&mut self) {
    // NOTE(review): `as u32` silently truncates for sources > u32::MAX bytes.
    self.tail = match self.iter.peek() {
        Some(&(idx, _)) => idx as u32,
        None => self.text.len() as u32,
    };
}
/// Consumes and returns the next character, keeping `tail` in sync
fn take(&mut self) -> Option<char> {
    let c = self.iter.next()?.1;
    self.advance_tail();
    Some(c)
}
/// Consumes the next character only if it equals `expected`
fn next_if(&mut self, expected: char) -> Option<char> {
    match self.iter.next_if(|&(_, got)| got == expected) {
        Some((_, got)) => {
            self.advance_tail();
            Some(got)
        }
        None => None,
    }
}
/// Consumes the last-peeked character, advancing the tail.
/// Returns `self` so calls can be chained (e.g. `self.consume().peek()`).
fn consume(&mut self) -> &mut Self {
    let _ = self.iter.next();
    self.advance_tail();
    self
}
/// Produces a [`LexError`] spanning the current token (`head..tail`)
const fn error(&self, res: LexFailure) -> LexError {
    LexError { pos: Span(self.head, self.tail), res }
}
/// Gets the Lexer's current &[str] lexeme and [Span]
///
/// Slices `text` by the current `head..tail` range (via the Span → Range
/// conversion defined alongside [Span]).
fn as_str(&self) -> (&'t str, Span) {
    let span = Span(self.head, self.tail);
    (&self.text[Range::from(span)], span)
}
/// Produces a Token of the given `kind` whose lexeme is the raw source
/// text of the current `head..tail` range, then resets `head` so the
/// next token starts where this one ended.
fn produce(&mut self, kind: TKind) -> Token {
    self.advance_tail();
    let (lexeme, span) = self.as_str();
    self.head = self.tail;
    // Each token owns a copy of its source text.
    Token { lexeme: Lexeme::String(lexeme.to_owned()), kind, span }
}
/// Produces a Token with a caller-supplied [Lexeme] (e.g. a decoded string
/// or char value) instead of the raw source text, then resets `head`.
fn produce_with_lexeme(&mut self, kind: TKind, lexeme: Lexeme) -> Token {
    self.advance_tail();
    let span = Span(self.head, self.tail);
    self.head = self.tail;
    Token { lexeme, kind, span }
}
/// Consumes 0 or more whitespace characters
fn skip_whitespace(&mut self) -> &mut Self {
    while let Some(c) = self.peek() {
        if !c.is_whitespace() {
            break;
        }
        self.consume();
    }
    self
}
/// Marks the start of a new token: `head` catches up to `tail`
const fn start_token(&mut self) -> &mut Self {
    self.head = self.tail;
    self
}
/// Scans forward until it finds the next Token in the input
///
/// Single characters are classified first; two- and three-character
/// operators are then assembled by peeking at the following character(s).
pub fn scan(&mut self) -> Result<Token, LexError> {
    use TKind::*;
    // !"#%&'()*+,-./:;<=>?@[\\]^`{|}~
    // Classify the first non-whitespace character. Arms that `return` can
    // never start a multi-character operator; the rest fall through to the
    // digraph table below.
    let tok = match self
        .skip_whitespace()
        .start_token()
        .peek()
        .ok_or_else(|| self.error(EOF))?
    {
        '!' => Bang,
        '"' => return self.string(),
        '#' => Hash,
        '%' => Rem,
        '&' => Amp,
        '\'' => return self.character(),
        '(' => LParen,
        ')' => RParen,
        '*' => Star,
        '+' => Plus,
        ',' => return self.consume().trailing(Comma),
        '-' => Minus,
        '.' => Dot,
        '/' => Slash,
        '0' => Integer, // may carry a base prefix; resolved below
        '1'..='9' => return self.digits::<10>(),
        ':' => Colon,
        ';' => Semi,
        '<' => Lt,
        '=' => Eq,
        '>' => Gt,
        '?' => Question,
        '@' => At,
        '[' => LBrack,
        '\\' => Backslash,
        ']' => RBrack,
        '^' => Xor,
        '`' => Grave,
        '{' => LCurly,
        '|' => Bar,
        '}' => RCurly,
        '~' => Tilde,
        '_' => return self.identifier(),
        c if is_xid_start(c) => return self.identifier(),
        c => Err(self.error(Unexpected(c)))?,
    };
    // Handle digraphs
    // `consume` eats the first character; `peek` inspects the second.
    let tok = match (tok, self.consume().peek()) {
        // `0` base prefixes: 0b binary, 0d decimal, 0o octal,
        // 0x hexadecimal, 0~ base-36; anything else is a decimal literal.
        (Integer, Some('b')) => return self.consume().digits::<2>(),
        (Integer, Some('d')) => return self.consume().digits::<10>(),
        (Integer, Some('o')) => return self.consume().digits::<8>(),
        (Integer, Some('x')) => return self.consume().digits::<16>(),
        (Integer, Some('~')) => return self.consume().digits::<36>(),
        (Integer, _) => return self.digits::<10>(),
        (Amp, Some('&')) => AmpAmp,
        (Amp, Some('=')) => AmpEq,
        (Bang, Some('!')) => BangBang,
        (Bang, Some('=')) => BangEq,
        (Bar, Some('|')) => BarBar,
        (Bar, Some('=')) => BarEq,
        (Colon, Some(':')) => ColonColon,
        (Dot, Some('.')) => DotDot,
        (Eq, Some('=')) => EqEq,
        (Eq, Some('>')) => FatArrow,
        (Gt, Some('=')) => GtEq,
        (Gt, Some('>')) => GtGt,
        (Hash, Some('!')) => HashBang,
        (Lt, Some('=')) => LtEq,
        (Lt, Some('<')) => LtLt,
        (Minus, Some('=')) => MinusEq,
        (Minus, Some('>')) => Arrow,
        (Plus, Some('=')) => PlusEq,
        (Rem, Some('=')) => RemEq,
        (Slash, Some('*')) => return Ok(self.block_comment()?.produce(Comment)),
        (Slash, Some('=')) => SlashEq,
        (Slash, Some('/')) => return self.line_comment(),
        (Star, Some('=')) => StarEq,
        (Xor, Some('=')) => XorEq,
        (Xor, Some('^')) => XorXor,
        // Not a digraph: emit the single-character token as-is.
        _ => return Ok(self.produce(tok)),
    };
    // Handle trigraphs
    let tok = match (tok, self.consume().peek()) {
        // `#!/` introduces a (shebang-style) line comment.
        (HashBang, Some('/')) => return self.line_comment(),
        (DotDot, Some('=')) => DotDotEq,
        (GtGt, Some('=')) => GtGtEq,
        (LtLt, Some('=')) => LtLtEq,
        _ => return Ok(self.produce(tok)),
    };
    Ok(self.consume().produce(tok))
}
/// Elides the trailing [Token] `kind` when it comes before a list terminator.
///
/// If the next non-whitespace character is `]` or `}`, the terminator token is
/// produced instead of `kind` (its span then also covers the elided separator
/// and any intervening whitespace).
pub fn trailing(&mut self, kind: TKind) -> Result<Token, LexError> {
    Ok(match self.skip_whitespace().peek() {
        // Some(')') => self.consume().produce(TKind::RParen), // maybe.
        Some(']') => self.consume().produce(TKind::RBrack),
        Some('}') => self.consume().produce(TKind::RCurly),
        _ => self.produce(kind),
    })
}
/// Consumes characters until the lexer reaches a newline `'\n'`
///
/// A `!` or a third `/` immediately after the opener marks a doc comment.
pub fn line_comment(&mut self) -> Result<Token, LexError> {
    let kind = if matches!(self.consume().peek(), Some('!' | '/')) {
        TKind::Doc
    } else {
        TKind::Comment
    };
    loop {
        match self.consume().peek() {
            // Stop before the newline (it is not part of the comment) or at EOF.
            Some('\n') | None => break,
            Some(_) => {}
        }
    }
    Ok(self.produce(kind))
}
/// Consumes characters until the lexer reaches the end of a *nested* block comment.
/// This allows you to arbitrarily comment out code, even if that code has a block comment.
///
/// On entry the lexer is looking at the `*` of the opening `/*`; the leading
/// `consume` skips it. Each nested `/*` recurses, so every level must be
/// closed by its own `*/` before the outer comment can end.
pub fn block_comment(&mut self) -> Result<&mut Self, LexError> {
    self.consume();
    while let Some(c) = self.take() {
        match (c, self.peek()) {
            // Nested opener: recurse to require a matching closer.
            ('/', Some('*')) => self.block_comment()?,
            // Closer for this nesting level.
            ('*', Some('/')) => return Ok(self.consume()),
            _ => continue,
        };
    }
    // Ran out of input before the outermost `*/`.
    Err(self.error(UnterminatedBlockComment))
}
/// Consumes characters until it reaches a character not in [`is_xid_continue`].
///
/// Always consumes the first character.
///
/// Maps the result to either a [`TKind::Identifier`] or a [`TKind`] keyword.
pub fn identifier(&mut self) -> Result<Token, LexError> {
    while self.consume().peek().is_some_and(is_xid_continue) {}
    let (lexeme, _span) = self.as_str();
    // Decide the kind up front: keywords win, anything else stays an
    // identifier. Note that "then" is an alias for the `do` keyword.
    let kind = match lexeme {
        "as" => TKind::As,
        "break" => TKind::Break,
        "const" => TKind::Const,
        "continue" => TKind::Continue,
        "do" => TKind::Do,
        "else" => TKind::Else,
        "enum" => TKind::Enum,
        "false" => TKind::False,
        "fn" => TKind::Fn,
        "for" => TKind::For,
        "if" => TKind::If,
        "impl" => TKind::Impl,
        "in" => TKind::In,
        "let" => TKind::Let,
        "loop" => TKind::Loop,
        "macro" => TKind::Macro,
        "match" => TKind::Match,
        "mod" => TKind::Mod,
        "mut" => TKind::Mut,
        "pub" => TKind::Pub,
        "return" => TKind::Return,
        "static" => TKind::Static,
        "struct" => TKind::Struct,
        "then" => TKind::Do,
        "true" => TKind::True,
        "type" => TKind::Type,
        "use" => TKind::Use,
        "while" => TKind::While,
        _ => TKind::Identifier,
    };
    let token = self.produce(TKind::Identifier);
    Ok(Token { kind, ..token })
}
/// Eagerly parses a character literal starting at the current lexer position.
///
/// Expects to be looking at the opening `'`; a `\` introduces an escape.
pub fn character(&mut self) -> Result<Token, LexError> {
    let c = match self.consume().take() {
        Some('\\') => self.escape()?,
        Some(c) => c,
        // EOF right after the opening quote: use a placeholder. The closing-
        // quote check below then sees EOF too and reports the error, so this
        // '\0' can never leak into a produced token.
        None => '\0',
    };
    if self.take().is_some_and(|c| c == '\'') {
        Ok(self.produce_with_lexeme(TKind::Character, Lexeme::Char(c)))
    } else {
        Err(self.error(UnterminatedCharacter))
    }
}
// Eagerly parses a string literal starting at the current lexer position.
pub fn string(&mut self) -> Result<Token, LexError> {
let mut lexeme = String::new();
self.consume();
loop {
lexeme.push(match self.take() {
None => Err(self.error(UnterminatedString))?,
Some('\\') => self.escape()?,
Some('"') => break,
Some(c) => c,
});
}
lexeme.shrink_to_fit();
Ok(self.produce_with_lexeme(TKind::String, Lexeme::String(lexeme)))
}
/// Parses a single escape sequence into its resulting char value.
///
/// Expects to be positioned just past the backslash.
pub fn escape(&mut self) -> Result<char, LexError> {
    Ok(
        match self.take().ok_or_else(|| self.error(UnexpectedEOF))? {
            ' ' => '\u{a0}', // Non-breaking space
            '0' => '\0', // C0 Null
            'a' => '\x07', // C0 Bell (alert) — 0x07 is BEL, not ACK
            'b' => '\x08', // C0 Backspace
            'e' => '\x1b', // C0 Escape
            'f' => '\x0c', // Form Feed
            'n' => '\n', // New Line
            'r' => '\r', // Carriage Return
            't' => '\t', // Tab
            'u' => self.unicode_escape()?, // \u{…} bracketed hex
            'x' => self.hex_escape()?, // \xNN two hex digits
            // Any other escaped character stands for itself (e.g. \\ and \').
            c => c,
        },
    )
}
/// Parses two hex-digits and constructs a [char] out of them.
pub fn hex_escape(&mut self) -> Result<char, LexError> {
let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
char::from_u32(out).ok_or_else(|| self.error(InvalidUnicodeEscape(out)))
}
/// Parses a sequence of `{}`-bracketed hex-digits and constructs a [char] out of them.
///
/// # Errors
/// - `UnterminatedUnicodeEscape` when the `{` is missing or input ends before `}`.
/// - `InvalidDigitForBase` for a non-hex digit inside the braces.
/// - `InvalidUnicodeEscape` when the value is not a valid Unicode scalar value.
pub fn unicode_escape(&mut self) -> Result<char, LexError> {
    self.next_if('{')
        .ok_or_else(|| self.error(UnterminatedUnicodeEscape))?;
    let mut out: u32 = 0;
    while let Some(c) = self.take() {
        if c == '}' {
            return char::from_u32(out).ok_or_else(|| self.error(InvalidUnicodeEscape(out)));
        }
        let digit = c
            .to_digit(16)
            .ok_or_else(|| self.error(InvalidDigitForBase(c, 16)))?;
        // Saturate instead of overflowing: a long digit run made `out * 16`
        // panic in debug builds. A saturated value exceeds 0x10FFFF, so it
        // still fails `char::from_u32` above and reports InvalidUnicodeEscape.
        out = out.saturating_mul(16).saturating_add(digit);
    }
    Err(self.error(UnterminatedUnicodeEscape))
}
/// Parses a sequence of digits (and underscores) in base `BASE`, where 2 <= `BASE` <= 36.
///
/// If the sequence of digits exceeds the bounds of a [u128], lexing fails with
/// [`IntegerOverflow`] (the arithmetic is checked, not wrapping).
pub fn digits<const BASE: u32>(&mut self) -> Result<Token, LexError> {
    let mut int: u128 = 0;
    while let Some(c) = self.peek() {
        int = match c.to_digit(BASE).ok_or(c) {
            // Underscores are digit separators: consumed, value unchanged.
            Err('_') => int,
            Ok(c) => int
                .checked_mul(BASE as _)
                .and_then(|int| int.checked_add(c as _))
                .ok_or_else(|| self.error(IntegerOverflow))?,
            // First character that is neither a digit nor `_` ends the literal.
            _ => break,
        };
        self.consume();
    }
    Ok(self.produce_with_lexeme(TKind::Integer, Lexeme::Integer(int, BASE)))
}
/// Parses a single digit in base `BASE` as a u32, where 2 <= `BASE` <= 36.
pub fn digit<const BASE: u32>(&mut self) -> Result<u32, LexError> {
let digit = self.take().ok_or_else(|| self.error(UnexpectedEOF))?;
if let Some(digit) = digit.to_digit(BASE) {
Ok(digit)
} else {
Err(self.error(InvalidDigitForBase(digit, BASE)))
}
}
}