lexer: Document stuff

This commit is contained in:
2025-10-16 23:08:16 -04:00
parent 70042f4ed0
commit c8f1f082c4

View File

@@ -43,6 +43,7 @@ impl<'t> Lexer<'t> {
self.iter.peek().map(|&(_, c)| c) self.iter.peek().map(|&(_, c)| c)
} }
/// Advances the tail to the current character index
fn advance_tail(&mut self) { fn advance_tail(&mut self) {
match self.iter.peek() { match self.iter.peek() {
Some(&(idx, _)) => self.tail = idx as u32, Some(&(idx, _)) => self.tail = idx as u32,
@@ -51,44 +52,45 @@ impl<'t> Lexer<'t> {
} }
/// Takes the last character /// Takes the last character
pub fn take(&mut self) -> Option<char> { fn take(&mut self) -> Option<char> {
let (_, c) = self.iter.next()?; let (_, c) = self.iter.next()?;
self.advance_tail(); self.advance_tail();
Some(c) Some(c)
} }
pub fn next_if(&mut self, expected: char) -> Option<char> { fn next_if(&mut self, expected: char) -> Option<char> {
let (_, c) = self.iter.next_if(|&(_, c)| c == expected)?; let (_, c) = self.iter.next_if(|&(_, c)| c == expected)?;
self.advance_tail(); self.advance_tail();
Some(c) Some(c)
} }
/// Consumes the last-peeked character, advancing the tail /// Consumes the last-peeked character, advancing the tail
pub fn consume(&mut self) -> &mut Self { fn consume(&mut self) -> &mut Self {
self.iter.next(); self.iter.next();
self.advance_tail(); self.advance_tail();
self self
} }
/// Produces a LexError at the start of the current token /// Produces a LexError at the start of the current token
pub fn error(&self, res: &'static str) -> LexError { fn error(&self, res: &'static str) -> LexError {
LexError { pos: self.head, res } LexError { pos: self.head, res }
} }
pub fn as_str(&self) -> (&'t str, Span) { /// Gets the Lexer's current &[str] lexeme and [Span]
fn as_str(&self) -> (&'t str, Span) {
let span = Span(self.head, self.tail); let span = Span(self.head, self.tail);
(&self.text[Range::from(span)], span) (&self.text[Range::from(span)], span)
} }
/// Produces a Token /// Produces a Token
pub fn produce(&mut self, kind: TKind) -> Token { fn produce(&mut self, kind: TKind) -> Token {
self.advance_tail(); self.advance_tail();
let (lexeme, span) = self.as_str(); let (lexeme, span) = self.as_str();
self.head = self.tail; self.head = self.tail;
Token { lexeme: Lexeme::String(lexeme.to_owned()), kind, span } Token { lexeme: Lexeme::String(lexeme.to_owned()), kind, span }
} }
pub fn produce_with_lexeme(&mut self, kind: TKind, lexeme: Lexeme) -> Token { fn produce_with_lexeme(&mut self, kind: TKind, lexeme: Lexeme) -> Token {
self.advance_tail(); self.advance_tail();
let span = Span(self.head, self.tail); let span = Span(self.head, self.tail);
self.head = self.tail; self.head = self.tail;
@@ -96,14 +98,14 @@ impl<'t> Lexer<'t> {
} }
/// Consumes 0 or more whitespace /// Consumes 0 or more whitespace
pub fn skip_whitespace(&mut self) -> &mut Self { fn skip_whitespace(&mut self) -> &mut Self {
while self.peek().is_some_and(char::is_whitespace) { while self.peek().is_some_and(char::is_whitespace) {
let _ = self.consume(); let _ = self.consume();
} }
self self
} }
pub fn start_token(&mut self) -> &mut Self { fn start_token(&mut self) -> &mut Self {
self.head = self.tail; self.head = self.tail;
self self
} }
@@ -203,6 +205,7 @@ impl<'t> Lexer<'t> {
Ok(self.consume().produce(tok)) Ok(self.consume().produce(tok))
} }
/// Elides the trailing [Token] `kind` when it comes before a list terminator.
pub fn trailing(&mut self, kind: TKind) -> Result<Token, LexError> { pub fn trailing(&mut self, kind: TKind) -> Result<Token, LexError> {
Ok(match self.skip_whitespace().peek() { Ok(match self.skip_whitespace().peek() {
// Some(')') => self.consume().produce(TKind::RParen), // maybe. // Some(')') => self.consume().produce(TKind::RParen), // maybe.
@@ -212,11 +215,14 @@ impl<'t> Lexer<'t> {
}) })
} }
/// Consumes characters until the lexer reaches a newline `'\n'`
pub fn line_comment(&mut self) -> Result<Token, LexError> { pub fn line_comment(&mut self) -> Result<Token, LexError> {
while self.consume().peek().is_some_and(|c| c != '\n') {} while self.consume().peek().is_some_and(|c| c != '\n') {}
Ok(self.produce(TKind::Comment)) Ok(self.produce(TKind::Comment))
} }
/// Consumes characters until the lexer reaches the end of a *nested* block comment.
/// This allows you to arbitrarily comment out code, even if that code has a block comment.
pub fn block_comment(&mut self) -> Result<&mut Self, LexError> { pub fn block_comment(&mut self) -> Result<&mut Self, LexError> {
self.consume(); self.consume();
while let Some(c) = self.take() { while let Some(c) = self.take() {
@@ -229,6 +235,11 @@ impl<'t> Lexer<'t> {
Err(self.error("Unterminated block comment")) Err(self.error("Unterminated block comment"))
} }
/// Consumes characters until it reaches a character not in [is_xid_continue].
///
/// Always consumes the first character.
///
/// Maps the result to either a [TKind::Identifier] or a [TKind] keyword.
pub fn identifier(&mut self) -> Result<Token, LexError> { pub fn identifier(&mut self) -> Result<Token, LexError> {
while self.consume().peek().is_some_and(is_xid_continue) {} while self.consume().peek().is_some_and(is_xid_continue) {}
let (lexeme, _span) = self.as_str(); let (lexeme, _span) = self.as_str();
@@ -265,6 +276,7 @@ impl<'t> Lexer<'t> {
}) })
} }
/// Eagerly parses a character literal starting at the current lexer position.
pub fn character(&mut self) -> Result<Token, LexError> { pub fn character(&mut self) -> Result<Token, LexError> {
let c = match self.consume().take() { let c = match self.consume().take() {
Some('\\') => self.escape()?, Some('\\') => self.escape()?,
@@ -278,6 +290,7 @@ impl<'t> Lexer<'t> {
} }
} }
/// Eagerly parses a string literal starting at the current lexer position.
pub fn string(&mut self) -> Result<Token, LexError> { pub fn string(&mut self) -> Result<Token, LexError> {
let mut lexeme = String::new(); let mut lexeme = String::new();
self.consume(); self.consume();
@@ -293,28 +306,31 @@ impl<'t> Lexer<'t> {
Ok(self.produce_with_lexeme(TKind::String, Lexeme::String(lexeme))) Ok(self.produce_with_lexeme(TKind::String, Lexeme::String(lexeme)))
} }
/// Parses a single escape sequence into its resulting char value.
pub fn escape(&mut self) -> Result<char, LexError> { pub fn escape(&mut self) -> Result<char, LexError> {
Ok(match self.take().ok_or_else(|| self.error("EOF"))? { Ok(match self.take().ok_or_else(|| self.error("EOF"))? {
' ' => '\u{a0}', ' ' => '\u{a0}', // Non-breaking space
'0' => '\0', '0' => '\0', // C0 Null Character
'a' => '\x07', 'a' => '\x07', // C0 Bell
'b' => '\x08', 'b' => '\x08', // C0 Backspace
'e' => '\x1b', 'e' => '\x1b', // C0 Escape
'f' => '\x0c', 'f' => '\x0c', // Form Feed
'n' => '\n', 'n' => '\n', // New Line
'r' => '\r', 'r' => '\r', // Carriage Return
't' => '\t', 't' => '\t', // Tab
'u' => self.unicode_escape()?, 'u' => self.unicode_escape()?,
'x' => self.hex_escape()?, 'x' => self.hex_escape()?,
c => c, c => c,
}) })
} }
/// Parses two hex-digits and constructs a [char] out of them.
pub fn hex_escape(&mut self) -> Result<char, LexError> { pub fn hex_escape(&mut self) -> Result<char, LexError> {
let out = (self.digit::<16>()? << 4) + self.digit::<16>()?; let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
char::from_u32(out).ok_or(self.error("Invalid digit")) char::from_u32(out).ok_or(self.error("Invalid digit"))
} }
/// Parses a sequence of `{}`-bracketed hex-digits and constructs a [char] out of them.
pub fn unicode_escape(&mut self) -> Result<char, LexError> { pub fn unicode_escape(&mut self) -> Result<char, LexError> {
self.next_if('{') self.next_if('{')
.ok_or_else(|| self.error("No unicode escape opener"))?; .ok_or_else(|| self.error("No unicode escape opener"))?;
@@ -328,6 +344,10 @@ impl<'t> Lexer<'t> {
Err(self.error("Unterminated unicode escape")) Err(self.error("Unterminated unicode escape"))
} }
/// Parses a sequence of digits (and underscores) in base `BASE`, where 2 <= `BASE` <= 36.
///
/// If the sequence of digits exceeds the bounds of a [u128], the resulting number will wrap
/// around 2^128.
pub fn digits<const BASE: u32>(&mut self) -> Result<Token, LexError> { pub fn digits<const BASE: u32>(&mut self) -> Result<Token, LexError> {
let mut int: u128 = 0; let mut int: u128 = 0;
while let Some(c) = self.peek() { while let Some(c) = self.peek() {
@@ -342,6 +362,7 @@ impl<'t> Lexer<'t> {
Ok(self.produce_with_lexeme(TKind::Integer, Lexeme::Integer(int, BASE))) Ok(self.produce_with_lexeme(TKind::Integer, Lexeme::Integer(int, BASE)))
} }
/// Parses a single digit in base `BASE` as a u32, where 2 <= `BASE` <= 36
pub fn digit<const BASE: u32>(&mut self) -> Result<u32, LexError> { pub fn digit<const BASE: u32>(&mut self) -> Result<u32, LexError> {
if let Some(digit) = self.take().and_then(|c| c.to_digit(BASE)) { if let Some(digit) = self.take().and_then(|c| c.to_digit(BASE)) {
Ok(digit) Ok(digit)