From c8f1f082c49fc680b3fe55872a3a131e84828a55 Mon Sep 17 00:00:00 2001 From: John Date: Thu, 16 Oct 2025 23:08:16 -0400 Subject: [PATCH] lexer: Document stuff --- src/lexer.rs | 57 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/src/lexer.rs b/src/lexer.rs index ee99f92..8c424c5 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -43,6 +43,7 @@ impl<'t> Lexer<'t> { self.iter.peek().map(|&(_, c)| c) } + /// Advances the tail to the current character index fn advance_tail(&mut self) { match self.iter.peek() { Some(&(idx, _)) => self.tail = idx as u32, @@ -51,44 +52,45 @@ impl<'t> Lexer<'t> { } /// Takes the last character - pub fn take(&mut self) -> Option { + fn take(&mut self) -> Option { let (_, c) = self.iter.next()?; self.advance_tail(); Some(c) } - pub fn next_if(&mut self, expected: char) -> Option { + fn next_if(&mut self, expected: char) -> Option { let (_, c) = self.iter.next_if(|&(_, c)| c == expected)?; self.advance_tail(); Some(c) } /// Consumes the last-peeked character, advancing the tail - pub fn consume(&mut self) -> &mut Self { + fn consume(&mut self) -> &mut Self { self.iter.next(); self.advance_tail(); self } /// Produces a LexError at the start of the current token - pub fn error(&self, res: &'static str) -> LexError { + fn error(&self, res: &'static str) -> LexError { LexError { pos: self.head, res } } - pub fn as_str(&self) -> (&'t str, Span) { + /// Gets the Lexer's current &[str] lexeme and [Span] + fn as_str(&self) -> (&'t str, Span) { let span = Span(self.head, self.tail); (&self.text[Range::from(span)], span) } /// Produces a Token - pub fn produce(&mut self, kind: TKind) -> Token { + fn produce(&mut self, kind: TKind) -> Token { self.advance_tail(); let (lexeme, span) = self.as_str(); self.head = self.tail; Token { lexeme: Lexeme::String(lexeme.to_owned()), kind, span } } - pub fn produce_with_lexeme(&mut self, kind: TKind, lexeme: Lexeme) -> Token { + fn 
produce_with_lexeme(&mut self, kind: TKind, lexeme: Lexeme) -> Token { self.advance_tail(); let span = Span(self.head, self.tail); self.head = self.tail; @@ -96,14 +98,14 @@ impl<'t> Lexer<'t> { } /// Consumes 0 or more whitespace - pub fn skip_whitespace(&mut self) -> &mut Self { + fn skip_whitespace(&mut self) -> &mut Self { while self.peek().is_some_and(char::is_whitespace) { let _ = self.consume(); } self } - pub fn start_token(&mut self) -> &mut Self { + fn start_token(&mut self) -> &mut Self { self.head = self.tail; self } @@ -203,6 +205,7 @@ impl<'t> Lexer<'t> { Ok(self.consume().produce(tok)) } + /// Elides the trailing [Token] `kind` when it comes before a list terminator. pub fn trailing(&mut self, kind: TKind) -> Result { Ok(match self.skip_whitespace().peek() { // Some(')') => self.consume().produce(TKind::RParen), // maybe. @@ -212,11 +215,14 @@ impl<'t> Lexer<'t> { }) } + /// Consumes characters until the lexer reaches a newline `'\n'` pub fn line_comment(&mut self) -> Result { while self.consume().peek().is_some_and(|c| c != '\n') {} Ok(self.produce(TKind::Comment)) } + /// Consumes characters until the lexer reaches the end of a *nested* block comment. + /// This allows you to arbitrarily comment out code, even if that code has a block comment. pub fn block_comment(&mut self) -> Result<&mut Self, LexError> { self.consume(); while let Some(c) = self.take() { @@ -229,6 +235,11 @@ impl<'t> Lexer<'t> { Err(self.error("Unterminated block comment")) } + /// Consumes characters until it reaches a character not in [is_xid_continue]. + /// + /// Always consumes the first character. + /// + /// Maps the result to either a [TKind::Identifier] or a [TKind] keyword. pub fn identifier(&mut self) -> Result { while self.consume().peek().is_some_and(is_xid_continue) {} let (lexeme, _span) = self.as_str(); @@ -265,6 +276,7 @@ impl<'t> Lexer<'t> { }) } + /// Eagerly parses a character literal starting at the current lexer position. 
pub fn character(&mut self) -> Result { let c = match self.consume().take() { Some('\\') => self.escape()?, @@ -278,6 +290,7 @@ impl<'t> Lexer<'t> { } } + /// Eagerly parses a string literal starting at the current lexer position. pub fn string(&mut self) -> Result { let mut lexeme = String::new(); self.consume(); @@ -293,28 +306,31 @@ impl<'t> Lexer<'t> { Ok(self.produce_with_lexeme(TKind::String, Lexeme::String(lexeme))) } + /// Parses a single escape sequence into its resulting char value. pub fn escape(&mut self) -> Result { Ok(match self.take().ok_or_else(|| self.error("EOF"))? { - ' ' => '\u{a0}', - '0' => '\0', - 'a' => '\x07', - 'b' => '\x08', - 'e' => '\x1b', - 'f' => '\x0c', - 'n' => '\n', - 'r' => '\r', - 't' => '\t', + ' ' => '\u{a0}', // Non-breaking space + '0' => '\0', // C0 Null Character + 'a' => '\x07', // C0 Bell + 'b' => '\x08', // C0 Backspace + 'e' => '\x1b', // C0 Escape + 'f' => '\x0c', // Form Feed + 'n' => '\n', // New Line + 'r' => '\r', // Carriage Return + 't' => '\t', // Tab 'u' => self.unicode_escape()?, 'x' => self.hex_escape()?, c => c, }) } + /// Parses two hex-digits and constructs a [char] out of them. pub fn hex_escape(&mut self) -> Result { let out = (self.digit::<16>()? << 4) + self.digit::<16>()?; char::from_u32(out).ok_or(self.error("Invalid digit")) } + /// Parses a sequence of `{}`-bracketed hex-digits and constructs a [char] out of them. pub fn unicode_escape(&mut self) -> Result { self.next_if('{') .ok_or_else(|| self.error("No unicode escape opener"))?; @@ -328,6 +344,10 @@ impl<'t> Lexer<'t> { Err(self.error("Unterminated unicode escape")) } + /// Parses a sequence of digits (and underscores) in base `BASE`, where 2 <= `BASE` <= 36. + /// + /// If the sequence of digits exceeds the bounds of a [u128], the resulting number will wrap + /// around 2^128. 
pub fn digits(&mut self) -> Result { let mut int: u128 = 0; while let Some(c) = self.peek() { @@ -342,6 +362,7 @@ impl<'t> Lexer<'t> { Ok(self.produce_with_lexeme(TKind::Integer, Lexeme::Integer(int, BASE))) } + /// Parses a single digit in base `BASE` as a u32, where 2 <= `BASE` <= 36 pub fn digit(&mut self) -> Result { if let Some(digit) = self.take().and_then(|c| c.to_digit(BASE)) { Ok(digit)