doughlang: Preserve errors through entire pipeline

lexer:
- Un-stringify errors
- Reserve more words
- Lex doc comments as a distinct token kind (TKind::Doc)

parser:
- MASSIVE changes to peek, peek_if, next_if, consume_if=>expect.
- Keep track of when EOF is allowable
- TKind is stupidly cheap with >100 niches, so we can fit like 4 of them in a single ParseError lmao
- TODO: make sure EOF/UnexpectedEOF propagation is correct. It currently seems to be not quite correct.
- Add meta-expressions
This commit is contained in:
2025-10-17 06:25:11 -04:00
parent c8f1f082c4
commit 6368e68941
6 changed files with 543 additions and 351 deletions

View File

@@ -8,9 +8,10 @@ use crate::{span::Span, token::*};
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct LexError {
pub pos: u32,
pub res: &'static str,
pub pos: Span,
pub res: LexFailure,
}
impl std::error::Error for LexError {}
impl std::fmt::Display for LexError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -19,6 +20,44 @@ impl std::fmt::Display for LexError {
}
}
/// The specific reason the lexer could not produce a token.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum LexFailure {
    /// Reached end of file
    EOF,
    /// The input ended partway through a token (for example inside an escape sequence).
    UnexpectedEOF,
    /// A character that cannot begin any token.
    Unexpected(char),
    /// A block comment was opened but never closed.
    UnterminatedBlockComment,
    /// A character literal is missing its closing `'`.
    UnterminatedCharacter,
    /// A string literal is missing its closing `"`.
    UnterminatedString,
    /// A `\u` escape is missing its opening `{` or its closing `}`.
    UnterminatedUnicodeEscape,
    /// The escaped value (payload) cannot be converted to a [char].
    InvalidUnicodeEscape(u32),
    /// The character (first field) is not a digit in the given base (second field).
    InvalidDigitForBase(char, u32),
    /// An integer literal overflowed the lexer's integer type.
    IntegerOverflow,
}

use LexFailure::*;
pub use LexFailure::{EOF, UnexpectedEOF};

impl std::fmt::Display for LexFailure {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::EOF => "EOF".fmt(f),
            Self::UnexpectedEOF => "Unexpected EOF".fmt(f),
            // `{c:?}` already renders the char quoted and escaped (e.g. `'a'`, `'\n'`),
            // so no additional quotes are written around it.
            Self::Unexpected(c) => write!(f, "Character {c:?}"),
            Self::UnterminatedBlockComment => "Unterminated Block Comment".fmt(f),
            Self::UnterminatedCharacter => "Unterminated Character".fmt(f),
            Self::UnterminatedString => "Unterminated String".fmt(f),
            Self::UnterminatedUnicodeEscape => "Unterminated Unicode Escape".fmt(f),
            // A `char` is a Unicode scalar value; UTF-8 is only an encoding of it.
            Self::InvalidUnicodeEscape(hex) => {
                write!(f, "'\\u{{{hex:x}}}' is not a valid Unicode scalar value")
            }
            Self::InvalidDigitForBase(digit, base) => {
                write!(f, "Invalid digit {digit} for base {base}")
            }
            Self::IntegerOverflow => "Integer literal does not fit in 128 bits".fmt(f),
        }
    }
}
#[derive(Clone, Debug)]
pub struct Lexer<'t> {
/// The source text
@@ -72,8 +111,8 @@ impl<'t> Lexer<'t> {
}
/// Produces a LexError at the start of the current token
fn error(&self, res: &'static str) -> LexError {
LexError { pos: self.head, res }
fn error(&self, res: LexFailure) -> LexError {
LexError { pos: Span(self.head, self.tail), res }
}
/// Gets the Lexer's current &[str] lexeme and [Span]
@@ -118,7 +157,7 @@ impl<'t> Lexer<'t> {
.skip_whitespace()
.start_token()
.peek()
.ok_or_else(|| self.error("EOF"))?
.ok_or_else(|| self.error(EOF))?
{
'!' => Bang,
'"' => return self.string(),
@@ -154,7 +193,7 @@ impl<'t> Lexer<'t> {
'~' => Tilde,
'_' => return self.identifier(),
c if is_xid_start(c) => return self.identifier(),
_ => Err(self.error("Invalid"))?,
c => Err(self.error(Unexpected(c)))?,
};
// Handle digraphs
@@ -217,8 +256,12 @@ impl<'t> Lexer<'t> {
/// Consumes characters until the lexer reaches a newline `'\n'`
pub fn line_comment(&mut self) -> Result<Token, LexError> {
let kind = match self.consume().peek() {
Some('!' | '/') => TKind::Doc,
_ => TKind::Comment,
};
while self.consume().peek().is_some_and(|c| c != '\n') {}
Ok(self.produce(TKind::Comment))
Ok(self.produce(kind))
}
/// Consumes characters until the lexer reaches the end of a *nested* block comment.
@@ -232,7 +275,7 @@ impl<'t> Lexer<'t> {
_ => continue,
};
}
Err(self.error("Unterminated block comment"))
Err(self.error(UnterminatedBlockComment))
}
/// Consumes characters until it reaches a character not in [is_xid_continue].
@@ -257,6 +300,7 @@ impl<'t> Lexer<'t> {
"fn" => TKind::Fn,
"for" => TKind::For,
"if" => TKind::If,
"impl" => TKind::Impl,
"in" => TKind::In,
"let" => TKind::Let,
"loop" => TKind::Loop,
@@ -266,6 +310,7 @@ impl<'t> Lexer<'t> {
"or" => TKind::Or,
"pub" => TKind::Public,
"return" => TKind::Return,
"static" => TKind::Const, // TODO: Static
"struct" => TKind::Struct,
"then" => TKind::Do,
"true" => TKind::True,
@@ -286,7 +331,7 @@ impl<'t> Lexer<'t> {
if self.take().is_some_and(|c| c == '\'') {
Ok(self.produce_with_lexeme(TKind::Character, Lexeme::Char(c)))
} else {
Err(self.error("Unterminated character"))
Err(self.error(UnterminatedCharacter))
}
}
@@ -296,7 +341,7 @@ impl<'t> Lexer<'t> {
self.consume();
loop {
lexeme.push(match self.take() {
None => Err(self.error("Unterminated string"))?,
None => Err(self.error(UnterminatedString))?,
Some('\\') => self.escape()?,
Some('"') => break,
Some(c) => c,
@@ -308,40 +353,44 @@ impl<'t> Lexer<'t> {
/// Parses a single escape sequence into its resulting char value.
///
/// Takes the character that selects the escape and maps it to the character it denotes;
/// `u` and `x` delegate to [Self::unicode_escape] and [Self::hex_escape] respectively.
/// NOTE(review): presumably the caller has already consumed the leading backslash — confirm
/// against call sites.
pub fn escape(&mut self) -> Result<char, LexError> {
Ok(match self.take().ok_or_else(|| self.error("EOF"))? {
' ' => '\u{a0}', // Non-breaking space
'0' => '\0', // C0 Null Character
'a' => '\x07', // C0 Bell (alert)
'b' => '\x08', // C0 Backspace
'e' => '\x1b', // C0 Escape
'f' => '\x0c', // Form Feed
'n' => '\n', // New Line
'r' => '\r', // Carriage Return
't' => '\t', // Tab
'u' => self.unicode_escape()?,
'x' => self.hex_escape()?,
c => c, // Any other character stands for itself
})
Ok(
match self.take().ok_or_else(|| self.error(UnexpectedEOF))? {
' ' => '\u{a0}', // Non-breaking space
'0' => '\0', // C0 Null Character
'a' => '\x07', // C0 Bell (alert)
'b' => '\x08', // C0 Backspace
'e' => '\x1b', // C0 Escape
'f' => '\x0c', // Form Feed
'n' => '\n', // New Line
'r' => '\r', // Carriage Return
't' => '\t', // Tab
'u' => self.unicode_escape()?,
'x' => self.hex_escape()?,
c => c, // Any other character stands for itself
},
)
}
/// Parses two hex-digits and constructs a [char] out of them.
pub fn hex_escape(&mut self) -> Result<char, LexError> {
let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
char::from_u32(out).ok_or(self.error("Invalid digit"))
char::from_u32(out).ok_or(self.error(InvalidUnicodeEscape(out)))
}
/// Parses a sequence of `{}`-bracketed hex-digits and constructs a [char] out of them.
pub fn unicode_escape(&mut self) -> Result<char, LexError> {
self.next_if('{')
.ok_or_else(|| self.error("No unicode escape opener"))?;
.ok_or_else(|| self.error(UnterminatedUnicodeEscape))?;
let mut out = 0;
while let Some(c) = self.take() {
if c == '}' {
return char::from_u32(out).ok_or_else(|| self.error("Bad unicode value"));
return char::from_u32(out).ok_or_else(|| self.error(InvalidUnicodeEscape(out)));
}
out = out * 16 + c.to_digit(16).ok_or_else(|| self.error("Invalid digit"))?;
out = out * 16
+ c.to_digit(16)
.ok_or_else(|| self.error(InvalidDigitForBase(c, 16)))?;
}
Err(self.error("Unterminated unicode escape"))
Err(self.error(UnterminatedUnicodeEscape))
}
/// Parses a sequence of digits (and underscores) in base `BASE`, where 2 <= `BASE` <= 36.
@@ -353,7 +402,10 @@ impl<'t> Lexer<'t> {
while let Some(c) = self.peek() {
int = match c.to_digit(BASE).ok_or(c) {
Err('_') => int,
Ok(c) => int.wrapping_mul(BASE as _).wrapping_add(c as _),
Ok(c) => int
.checked_mul(BASE as _)
.and_then(|int| int.checked_add(c as _))
.ok_or_else(|| self.error(IntegerOverflow))?,
_ => break,
};
self.consume();
@@ -362,12 +414,13 @@ impl<'t> Lexer<'t> {
Ok(self.produce_with_lexeme(TKind::Integer, Lexeme::Integer(int, BASE)))
}
/// Parses a single digit in base `BASE` as a u32, where 2 <= `BASE` <= 36
/// Parses a single digit in base `BASE` as a u32, where 2 <= `BASE` <= 36.
pub fn digit<const BASE: u32>(&mut self) -> Result<u32, LexError> {
if let Some(digit) = self.take().and_then(|c| c.to_digit(BASE)) {
let digit = self.take().ok_or_else(|| self.error(UnexpectedEOF))?;
if let Some(digit) = digit.to_digit(BASE) {
Ok(digit)
} else {
Err(self.error("Invalid digit"))
Err(self.error(InvalidDigitForBase(digit, BASE)))
}
}
}