doughlang: Preserve errors through entire pipeline
lexer:
- Un-stringify errors
- Reserve more words
- Doc the comments

parser:
- MASSIVE changes to peek, peek_if, next_if, consume_if => expect.
- Keep track of when EOF is allowable
- TKind is stupidly cheap with >100 niches, so we can fit like 4 of them in a single ParseError lmao
- TODO: make sure EOF/UnexpectedEOF propagation is correct. It seems... Kinda Not correct.
- Add meta-expressions
This commit is contained in:
		
							
								
								
									
										119
									
								
								src/lexer.rs
									
									
									
									
									
								
							
							
						
						
									
										119
									
								
								src/lexer.rs
									
									
									
									
									
								
							@@ -8,9 +8,10 @@ use crate::{span::Span, token::*};
 | 
			
		||||
 | 
			
		||||
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
 | 
			
		||||
pub struct LexError {
 | 
			
		||||
    pub pos: u32,
 | 
			
		||||
    pub res: &'static str,
 | 
			
		||||
    pub pos: Span,
 | 
			
		||||
    pub res: LexFailure,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Empty impl: every `Error` method keeps its default behaviour.
impl std::error::Error for LexError {}
 | 
			
		||||
impl std::fmt::Display for LexError {
 | 
			
		||||
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 | 
			
		||||
@@ -19,6 +20,44 @@ impl std::fmt::Display for LexError {
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/// The reason the lexer could not produce a token.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum LexFailure {
    /// Reached end of file where the next token would have started
    EOF,
    /// The input ended while more characters were still required
    /// (inside an escape sequence or while reading a digit)
    UnexpectedEOF,
    /// A character that does not begin any token
    Unexpected(char),
    /// A block comment was opened but never closed
    UnterminatedBlockComment,
    /// A character literal was not followed by its closing `'`
    UnterminatedCharacter,
    /// The input ended before a string's closing `"`
    UnterminatedString,
    /// A `\u` escape was missing its opening `{`, or the input ended before `}`
    UnterminatedUnicodeEscape,
    /// An escape produced this value, which is not a valid [char]
    InvalidUnicodeEscape(u32),
    /// The character is not a digit in the given base
    InvalidDigitForBase(char, u32),
    /// An integer literal was too large to represent
    IntegerOverflow,
}
use LexFailure::*;
pub use LexFailure::{EOF, UnexpectedEOF};

impl std::fmt::Display for LexFailure {
    /// Writes a short human-readable description of the failure.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::EOF => "EOF".fmt(f),
            Self::UnexpectedEOF => "Unexpected EOF".fmt(f),
            // `{c:?}` already wraps the char in single quotes (and escapes
            // control characters), so no literal quotes are added around it.
            Self::Unexpected(c) => write!(f, "Character {c:?}"),
            Self::UnterminatedBlockComment => "Unterminated Block Comment".fmt(f),
            Self::UnterminatedCharacter => "Unterminated Character".fmt(f),
            Self::UnterminatedString => "Unterminated String".fmt(f),
            Self::UnterminatedUnicodeEscape => "Unterminated Unicode Escape".fmt(f),
            Self::InvalidUnicodeEscape(hex) => {
                write!(f, "'\\u{{{hex:x}}}' is not a valid UTF-8 codepoint")
            }
            Self::InvalidDigitForBase(digit, base) => {
                write!(f, "Invalid digit {digit} for base {base}")
            }
            Self::IntegerOverflow => "Integer literal does not fit in 128 bits".fmt(f),
        }
    }
}
 | 
			
		||||
 | 
			
		||||
#[derive(Clone, Debug)]
 | 
			
		||||
pub struct Lexer<'t> {
 | 
			
		||||
    /// The source text
 | 
			
		||||
@@ -72,8 +111,8 @@ impl<'t> Lexer<'t> {
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /// Produces a LexError at the start of the current token
 | 
			
		||||
    fn error(&self, res: &'static str) -> LexError {
 | 
			
		||||
        LexError { pos: self.head, res }
 | 
			
		||||
    fn error(&self, res: LexFailure) -> LexError {
 | 
			
		||||
        LexError { pos: Span(self.head, self.tail), res }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /// Gets the Lexer's current &[str] lexeme and [Span]
 | 
			
		||||
@@ -118,7 +157,7 @@ impl<'t> Lexer<'t> {
 | 
			
		||||
            .skip_whitespace()
 | 
			
		||||
            .start_token()
 | 
			
		||||
            .peek()
 | 
			
		||||
            .ok_or_else(|| self.error("EOF"))?
 | 
			
		||||
            .ok_or_else(|| self.error(EOF))?
 | 
			
		||||
        {
 | 
			
		||||
            '!' => Bang,
 | 
			
		||||
            '"' => return self.string(),
 | 
			
		||||
@@ -154,7 +193,7 @@ impl<'t> Lexer<'t> {
 | 
			
		||||
            '~' => Tilde,
 | 
			
		||||
            '_' => return self.identifier(),
 | 
			
		||||
            c if is_xid_start(c) => return self.identifier(),
 | 
			
		||||
            _ => Err(self.error("Invalid"))?,
 | 
			
		||||
            c => Err(self.error(Unexpected(c)))?,
 | 
			
		||||
        };
 | 
			
		||||
 | 
			
		||||
        // Handle digraphs
 | 
			
		||||
@@ -217,8 +256,12 @@ impl<'t> Lexer<'t> {
 | 
			
		||||
 | 
			
		||||
    /// Consumes characters until the lexer reaches a newline `'\n'`
 | 
			
		||||
    pub fn line_comment(&mut self) -> Result<Token, LexError> {
 | 
			
		||||
        let kind = match self.consume().peek() {
 | 
			
		||||
            Some('!' | '/') => TKind::Doc,
 | 
			
		||||
            _ => TKind::Comment,
 | 
			
		||||
        };
 | 
			
		||||
        while self.consume().peek().is_some_and(|c| c != '\n') {}
 | 
			
		||||
        Ok(self.produce(TKind::Comment))
 | 
			
		||||
        Ok(self.produce(kind))
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /// Consumes characters until the lexer reaches the end of a *nested* block comment.
 | 
			
		||||
@@ -232,7 +275,7 @@ impl<'t> Lexer<'t> {
 | 
			
		||||
                _ => continue,
 | 
			
		||||
            };
 | 
			
		||||
        }
 | 
			
		||||
        Err(self.error("Unterminated block comment"))
 | 
			
		||||
        Err(self.error(UnterminatedBlockComment))
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /// Consumes characters until it reaches a character not in [is_xid_continue].
 | 
			
		||||
@@ -257,6 +300,7 @@ impl<'t> Lexer<'t> {
 | 
			
		||||
                "fn" => TKind::Fn,
 | 
			
		||||
                "for" => TKind::For,
 | 
			
		||||
                "if" => TKind::If,
 | 
			
		||||
                "impl" => TKind::Impl,
 | 
			
		||||
                "in" => TKind::In,
 | 
			
		||||
                "let" => TKind::Let,
 | 
			
		||||
                "loop" => TKind::Loop,
 | 
			
		||||
@@ -266,6 +310,7 @@ impl<'t> Lexer<'t> {
 | 
			
		||||
                "or" => TKind::Or,
 | 
			
		||||
                "pub" => TKind::Public,
 | 
			
		||||
                "return" => TKind::Return,
 | 
			
		||||
                "static" => TKind::Const, // TODO: Static
 | 
			
		||||
                "struct" => TKind::Struct,
 | 
			
		||||
                "then" => TKind::Do,
 | 
			
		||||
                "true" => TKind::True,
 | 
			
		||||
@@ -286,7 +331,7 @@ impl<'t> Lexer<'t> {
 | 
			
		||||
        if self.take().is_some_and(|c| c == '\'') {
 | 
			
		||||
            Ok(self.produce_with_lexeme(TKind::Character, Lexeme::Char(c)))
 | 
			
		||||
        } else {
 | 
			
		||||
            Err(self.error("Unterminated character"))
 | 
			
		||||
            Err(self.error(UnterminatedCharacter))
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
@@ -296,7 +341,7 @@ impl<'t> Lexer<'t> {
 | 
			
		||||
        self.consume();
 | 
			
		||||
        loop {
 | 
			
		||||
            lexeme.push(match self.take() {
 | 
			
		||||
                None => Err(self.error("Unterminated string"))?,
 | 
			
		||||
                None => Err(self.error(UnterminatedString))?,
 | 
			
		||||
                Some('\\') => self.escape()?,
 | 
			
		||||
                Some('"') => break,
 | 
			
		||||
                Some(c) => c,
 | 
			
		||||
@@ -308,40 +353,44 @@ impl<'t> Lexer<'t> {
 | 
			
		||||
 | 
			
		||||
    /// Parses a single escape sequence into its resulting char value.
 | 
			
		||||
    pub fn escape(&mut self) -> Result<char, LexError> {
 | 
			
		||||
        Ok(match self.take().ok_or_else(|| self.error("EOF"))? {
 | 
			
		||||
            ' ' => '\u{a0}', // Non-breaking space
 | 
			
		||||
            '0' => '\0',     // C0 Null Character
 | 
			
		||||
            'a' => '\x07',   // C0 Acknowledge
 | 
			
		||||
            'b' => '\x08',   // C0 Bell
 | 
			
		||||
            'e' => '\x1b',   // C0 Escape
 | 
			
		||||
            'f' => '\x0c',   // Form Feed
 | 
			
		||||
            'n' => '\n',     // New Line
 | 
			
		||||
            'r' => '\r',     // Carriage Return
 | 
			
		||||
            't' => '\t',     // Tab
 | 
			
		||||
            'u' => self.unicode_escape()?,
 | 
			
		||||
            'x' => self.hex_escape()?,
 | 
			
		||||
            c => c,
 | 
			
		||||
        })
 | 
			
		||||
        Ok(
 | 
			
		||||
            match self.take().ok_or_else(|| self.error(UnexpectedEOF))? {
 | 
			
		||||
                ' ' => '\u{a0}', // Non-breaking space
 | 
			
		||||
                '0' => '\0',     // C0 Null Character
 | 
			
		||||
                'a' => '\x07',   // C0 Acknowledge
 | 
			
		||||
                'b' => '\x08',   // C0 Bell
 | 
			
		||||
                'e' => '\x1b',   // C0 Escape
 | 
			
		||||
                'f' => '\x0c',   // Form Feed
 | 
			
		||||
                'n' => '\n',     // New Line
 | 
			
		||||
                'r' => '\r',     // Carriage Return
 | 
			
		||||
                't' => '\t',     // Tab
 | 
			
		||||
                'u' => self.unicode_escape()?,
 | 
			
		||||
                'x' => self.hex_escape()?,
 | 
			
		||||
                c => c,
 | 
			
		||||
            },
 | 
			
		||||
        )
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /// Parses two hex-digits and constructs a [char] out of them.
 | 
			
		||||
    pub fn hex_escape(&mut self) -> Result<char, LexError> {
 | 
			
		||||
        let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
 | 
			
		||||
        char::from_u32(out).ok_or(self.error("Invalid digit"))
 | 
			
		||||
        char::from_u32(out).ok_or(self.error(InvalidUnicodeEscape(out)))
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /// Parses a sequence of `{}`-bracketed hex-digits and constructs a [char] out of them.
 | 
			
		||||
    pub fn unicode_escape(&mut self) -> Result<char, LexError> {
 | 
			
		||||
        self.next_if('{')
 | 
			
		||||
            .ok_or_else(|| self.error("No unicode escape opener"))?;
 | 
			
		||||
            .ok_or_else(|| self.error(UnterminatedUnicodeEscape))?;
 | 
			
		||||
        let mut out = 0;
 | 
			
		||||
        while let Some(c) = self.take() {
 | 
			
		||||
            if c == '}' {
 | 
			
		||||
                return char::from_u32(out).ok_or_else(|| self.error("Bad unicode value"));
 | 
			
		||||
                return char::from_u32(out).ok_or_else(|| self.error(InvalidUnicodeEscape(out)));
 | 
			
		||||
            }
 | 
			
		||||
            out = out * 16 + c.to_digit(16).ok_or_else(|| self.error("Invalid digit"))?;
 | 
			
		||||
            out = out * 16
 | 
			
		||||
                + c.to_digit(16)
 | 
			
		||||
                    .ok_or_else(|| self.error(InvalidDigitForBase(c, 16)))?;
 | 
			
		||||
        }
 | 
			
		||||
        Err(self.error("Unterminated unicode escape"))
 | 
			
		||||
        Err(self.error(UnterminatedUnicodeEscape))
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /// Parses a sequence of digits (and underscores) in base `BASE`, where 2 <= `BASE` <= 36.
 | 
			
		||||
@@ -353,7 +402,10 @@ impl<'t> Lexer<'t> {
 | 
			
		||||
        while let Some(c) = self.peek() {
 | 
			
		||||
            int = match c.to_digit(BASE).ok_or(c) {
 | 
			
		||||
                Err('_') => int,
 | 
			
		||||
                Ok(c) => int.wrapping_mul(BASE as _).wrapping_add(c as _),
 | 
			
		||||
                Ok(c) => int
 | 
			
		||||
                    .checked_mul(BASE as _)
 | 
			
		||||
                    .and_then(|int| int.checked_add(c as _))
 | 
			
		||||
                    .ok_or_else(|| self.error(IntegerOverflow))?,
 | 
			
		||||
                _ => break,
 | 
			
		||||
            };
 | 
			
		||||
            self.consume();
 | 
			
		||||
@@ -362,12 +414,13 @@ impl<'t> Lexer<'t> {
 | 
			
		||||
        Ok(self.produce_with_lexeme(TKind::Integer, Lexeme::Integer(int, BASE)))
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /// Parses a single digit in base `BASE` as a u32, where 2 <= `BASE` <= 36
 | 
			
		||||
    /// Parses a single digit in base `BASE` as a u32, where 2 <= `BASE` <= 36.
 | 
			
		||||
    pub fn digit<const BASE: u32>(&mut self) -> Result<u32, LexError> {
 | 
			
		||||
        if let Some(digit) = self.take().and_then(|c| c.to_digit(BASE)) {
 | 
			
		||||
        let digit = self.take().ok_or_else(|| self.error(UnexpectedEOF))?;
 | 
			
		||||
        if let Some(digit) = digit.to_digit(BASE) {
 | 
			
		||||
            Ok(digit)
 | 
			
		||||
        } else {
 | 
			
		||||
            Err(self.error("Invalid digit"))
 | 
			
		||||
            Err(self.error(InvalidDigitForBase(digit, BASE)))
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user