// Source: Conlang/compiler/cl-lexer/src/lib.rs (557 lines, 19 KiB, Rust)

//! Converts a text file into tokens
#![warn(clippy::all)]
#![feature(decl_macro)]
use cl_structures::span::Loc;
use cl_token::{TokenKind as Kind, *};
use std::{
iter::Peekable,
str::{Chars, FromStr},
};
use unicode_ident::*;
#[cfg(test)]
mod tests;
pub mod lexer_iter {
    //! Adapter that lets a [`Lexer`] be driven as an [`Iterator`] of [`LResult<Token>`]s
    use super::{
        error::{LResult, Reason},
        Lexer, Token,
    };

    /// Owning iterator over the tokens of a [`Lexer`].
    ///
    /// Each item is the outcome of one [`Lexer::scan`] call. Iteration stops
    /// (yields `None`) when the lexer reports [`Reason::EndOfFile`]; every other
    /// error is passed through as `Some(Err(_))`.
    pub struct LexerIter<'t> {
        lexer: Lexer<'t>,
    }

    impl<'t> Iterator for LexerIter<'t> {
        type Item = LResult<Token>;

        fn next(&mut self) -> Option<Self::Item> {
            let scanned = self.lexer.scan();
            match scanned {
                // End of input is the iterator's natural termination, not an item
                Err(e) if e.reason == Reason::EndOfFile => None,
                outcome => Some(outcome),
            }
        }
    }

    impl<'t> IntoIterator for Lexer<'t> {
        type Item = LResult<Token>;
        type IntoIter = LexerIter<'t>;

        /// Wraps the lexer, in whatever state it is currently in, into a [`LexerIter`]
        fn into_iter(self) -> Self::IntoIter {
            let lexer = self;
            LexerIter { lexer }
        }
    }
}
/// The Lexer iterates over the characters in a body of text, searching for [Tokens](Token).
///
/// A `Lexer` tracks two positions: where the token currently being scanned started,
/// and where the read head is now. Both are kept as a running count and as a
/// 1-based `(line, column)` pair.
///
/// # Examples
/// ```rust
/// # use cl_lexer::Lexer;
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Read in your code from somewhere
/// let some_code = "
/// fn main () {
/// // TODO: code goes here!
/// }
/// ";
/// // Create a lexer over your code
/// let mut lexer = Lexer::new(some_code);
/// // Scan for a single token
/// let first_token = lexer.scan()?;
/// println!("{first_token:?}");
/// // Loop over all the rest of the tokens
/// for token in lexer {
/// # let token: Result<_,()> = Ok(token?);
/// match token {
/// Ok(token) => println!("{token:?}"),
/// Err(e) => eprintln!("{e:?}"),
/// }
/// }
/// # Ok(()) }
/// ```
#[derive(Clone, Debug)]
pub struct Lexer<'t> {
/// Peekable character iterator over the source text
iter: Peekable<Chars<'t>>,
/// Value of `current` at the start of the token being scanned; reset whenever
/// whitespace is skipped or a token is produced
start: usize,
/// `(line, column)` at which the token being scanned starts; this is what
/// `line()` and `col()` report, and what error locations are taken from
start_loc: (u32, u32),
/// Running position of the read head, advanced by `consume` (counted per
/// `char`, not per byte)
current: usize,
/// `(line, column)` of the read head; a consumed `'\n'` moves it to column 1 of
/// the next line, any other character advances the column by one
current_loc: (u32, u32),
}
impl<'t> Lexer<'t> {
/// Creates a new [Lexer] over a [str]
pub fn new(text: &'t str) -> Self {
Self {
iter: text.chars().peekable(),
start: 0,
start_loc: (1, 1),
current: 0,
current_loc: (1, 1),
}
}
/// Scans through the text, searching for the next [Token]
pub fn scan(&mut self) -> LResult<Token> {
match self.skip_whitespace().peek()? {
'{' => self.consume()?.produce_op(Punct::LCurly),
'}' => self.consume()?.produce_op(Punct::RCurly),
'[' => self.consume()?.produce_op(Punct::LBrack),
']' => self.consume()?.produce_op(Punct::RBrack),
'(' => self.consume()?.produce_op(Punct::LParen),
')' => self.consume()?.produce_op(Punct::RParen),
'&' => self.consume()?.amp(),
'@' => self.consume()?.produce_op(Punct::At),
'\\' => self.consume()?.produce_op(Punct::Backslash),
'!' => self.consume()?.bang(),
'|' => self.consume()?.bar(),
':' => self.consume()?.colon(),
',' => self.consume()?.produce_op(Punct::Comma),
'.' => self.consume()?.dot(),
'=' => self.consume()?.equal(),
'`' => self.consume()?.produce_op(Punct::Grave),
'>' => self.consume()?.greater(),
'#' => self.consume()?.hash(),
'<' => self.consume()?.less(),
'-' => self.consume()?.minus(),
'+' => self.consume()?.plus(),
'?' => self.consume()?.produce_op(Punct::Question),
'%' => self.consume()?.rem(),
';' => self.consume()?.produce_op(Punct::Semi),
'/' => self.consume()?.slash(),
'*' => self.consume()?.star(),
'~' => self.consume()?.produce_op(Punct::Tilde),
'^' => self.consume()?.xor(),
'0' => self.consume()?.int_with_base(),
'1'..='9' => self.digits::<10>(),
'"' => self.consume()?.string(),
'\'' => self.consume()?.character(),
'_' => self.identifier(),
i if is_xid_start(i) => self.identifier(),
e => {
let err = Err(Error::unexpected_char(e, self.line(), self.col()));
let _ = self.consume();
err
}
}
}
/// Returns the current line
pub fn line(&self) -> u32 {
self.start_loc.0
}
/// Returns the current column
pub fn col(&self) -> u32 {
self.start_loc.1
}
fn next(&mut self) -> LResult<char> {
let out = self.peek();
self.consume()?;
out
}
fn peek(&mut self) -> LResult<char> {
self.iter
.peek()
.copied()
.ok_or(Error::end_of_file(self.line(), self.col()))
}
fn produce(&mut self, kind: TokenKind, data: impl Into<TokenData>) -> LResult<Token> {
let loc = self.start_loc;
self.start_loc = self.current_loc;
self.start = self.current;
Ok(Token::new(kind, data, loc.0, loc.1))
}
fn produce_op(&mut self, kind: Punct) -> LResult<Token> {
self.produce(TokenKind::Punct(kind), ())
}
fn skip_whitespace(&mut self) -> &mut Self {
while let Ok(c) = self.peek() {
if !c.is_whitespace() {
break;
}
let _ = self.consume();
}
self.start = self.current;
self.start_loc = self.current_loc;
self
}
fn consume(&mut self) -> LResult<&mut Self> {
self.current += 1;
match self.iter.next() {
Some('\n') => {
let (line, col) = &mut self.current_loc;
*line += 1;
*col = 1;
}
Some(_) => self.current_loc.1 += 1,
None => Err(Error::end_of_file(self.line(), self.col()))?,
}
Ok(self)
}
}
/// Digraphs and trigraphs
impl<'t> Lexer<'t> {
fn amp(&mut self) -> LResult<Token> {
match self.peek() {
Ok('&') => self.consume()?.produce_op(Punct::AmpAmp),
Ok('=') => self.consume()?.produce_op(Punct::AmpEq),
_ => self.produce_op(Punct::Amp),
}
}
fn bang(&mut self) -> LResult<Token> {
match self.peek() {
Ok('!') => self.consume()?.produce_op(Punct::BangBang),
Ok('=') => self.consume()?.produce_op(Punct::BangEq),
_ => self.produce_op(Punct::Bang),
}
}
fn bar(&mut self) -> LResult<Token> {
match self.peek() {
Ok('|') => self.consume()?.produce_op(Punct::BarBar),
Ok('=') => self.consume()?.produce_op(Punct::BarEq),
_ => self.produce_op(Punct::Bar),
}
}
fn colon(&mut self) -> LResult<Token> {
match self.peek() {
Ok(':') => self.consume()?.produce_op(Punct::ColonColon),
_ => self.produce_op(Punct::Colon),
}
}
fn dot(&mut self) -> LResult<Token> {
match self.peek() {
Ok('.') => {
if let Ok('=') = self.consume()?.peek() {
self.consume()?.produce_op(Punct::DotDotEq)
} else {
self.produce_op(Punct::DotDot)
}
}
_ => self.produce_op(Punct::Dot),
}
}
fn equal(&mut self) -> LResult<Token> {
match self.peek() {
Ok('=') => self.consume()?.produce_op(Punct::EqEq),
Ok('>') => self.consume()?.produce_op(Punct::FatArrow),
_ => self.produce_op(Punct::Eq),
}
}
fn greater(&mut self) -> LResult<Token> {
match self.peek() {
Ok('=') => self.consume()?.produce_op(Punct::GtEq),
Ok('>') => {
if let Ok('=') = self.consume()?.peek() {
self.consume()?.produce_op(Punct::GtGtEq)
} else {
self.produce_op(Punct::GtGt)
}
}
_ => self.produce_op(Punct::Gt),
}
}
fn hash(&mut self) -> LResult<Token> {
match self.peek() {
Ok('!') => self.consume()?.produce_op(Punct::HashBang),
_ => self.produce_op(Punct::Hash),
}
}
fn less(&mut self) -> LResult<Token> {
match self.peek() {
Ok('=') => self.consume()?.produce_op(Punct::LtEq),
Ok('<') => {
if let Ok('=') = self.consume()?.peek() {
self.consume()?.produce_op(Punct::LtLtEq)
} else {
self.produce_op(Punct::LtLt)
}
}
_ => self.produce_op(Punct::Lt),
}
}
fn minus(&mut self) -> LResult<Token> {
match self.peek() {
Ok('=') => self.consume()?.produce_op(Punct::MinusEq),
Ok('>') => self.consume()?.produce_op(Punct::Arrow),
_ => self.produce_op(Punct::Minus),
}
}
fn plus(&mut self) -> LResult<Token> {
match self.peek() {
Ok('=') => self.consume()?.produce_op(Punct::PlusEq),
_ => self.produce_op(Punct::Plus),
}
}
fn rem(&mut self) -> LResult<Token> {
match self.peek() {
Ok('=') => self.consume()?.produce_op(Punct::RemEq),
_ => self.produce_op(Punct::Rem),
}
}
fn slash(&mut self) -> LResult<Token> {
match self.peek() {
Ok('=') => self.consume()?.produce_op(Punct::SlashEq),
Ok('/') => self.consume()?.line_comment(),
Ok('*') => self.consume()?.block_comment(),
_ => self.produce_op(Punct::Slash),
}
}
fn star(&mut self) -> LResult<Token> {
match self.peek() {
Ok('=') => self.consume()?.produce_op(Punct::StarEq),
_ => self.produce_op(Punct::Star),
}
}
fn xor(&mut self) -> LResult<Token> {
match self.peek() {
Ok('=') => self.consume()?.produce_op(Punct::XorEq),
Ok('^') => self.consume()?.produce_op(Punct::XorXor),
_ => self.produce_op(Punct::Xor),
}
}
}
/// Comments
///
/// Both methods are entered with the two-character comment opener (`//` or `/*`)
/// already consumed, and produce a single data-less comment token.
impl<'t> Lexer<'t> {
    /// Consumes the rest of the current line, stopping before the `'\n'`.
    ///
    /// End of input also terminates the comment, so a comment on the last line of
    /// a file without a trailing newline still yields a comment token instead of
    /// an end-of-file error.
    fn line_comment(&mut self) -> LResult<Token> {
        while let Ok(c) = self.peek() {
            if c == '\n' {
                break;
            }
            self.consume()?;
        }
        self.produce(Kind::Comment, ())
    }

    /// Consumes up to and including the closing `*/`.
    ///
    /// If the input ends before `*/` is found, everything consumed so far is
    /// still produced as a comment token (best-effort, no error is raised).
    fn block_comment(&mut self) -> LResult<Token> {
        while let Ok(c) = self.next() {
            // Only peek for the `/`: consuming it unconditionally would swallow
            // the second `*` of `**/` and miss the terminator
            if c == '*' && matches!(self.peek(), Ok('/')) {
                self.consume()?;
                break;
            }
        }
        self.produce(Kind::Comment, ())
    }
}
/// Identifiers
impl<'t> Lexer<'t> {
fn identifier(&mut self) -> LResult<Token> {
let mut out = String::from(self.xid_start()?);
while let Ok(c) = self.xid_continue() {
out.push(c)
}
if let Ok(keyword) = Kind::from_str(&out) {
self.produce(keyword, ())
} else {
self.produce(Kind::Identifier, TokenData::String(out))
}
}
fn xid_start(&mut self) -> LResult<char> {
match self.peek()? {
xid if xid == '_' || is_xid_start(xid) => {
self.consume()?;
Ok(xid)
}
bad => Err(Error::not_identifier(bad, self.line(), self.col())),
}
}
fn xid_continue(&mut self) -> LResult<char> {
match self.peek()? {
xid if is_xid_continue(xid) => {
self.consume()?;
Ok(xid)
}
bad => Err(Error::not_identifier(bad, self.line(), self.col())),
}
}
}
/// Integers
impl<'t> Lexer<'t> {
fn int_with_base(&mut self) -> LResult<Token> {
match self.peek() {
Ok('x') => self.consume()?.digits::<16>(),
Ok('d') => self.consume()?.digits::<10>(),
Ok('o') => self.consume()?.digits::<8>(),
Ok('b') => self.consume()?.digits::<2>(),
Ok('0'..='9') => self.digits::<10>(),
_ => self.produce(Kind::Literal, 0),
}
}
fn digits<const B: u32>(&mut self) -> LResult<Token> {
let mut value = self.digit::<B>()? as u128;
while let Ok(true) = self.peek().as_ref().map(char::is_ascii_alphanumeric) {
value = value * B as u128 + self.digit::<B>()? as u128;
}
self.produce(Kind::Literal, value)
}
fn digit<const B: u32>(&mut self) -> LResult<u32> {
let digit = self.peek()?;
self.consume()?;
digit
.to_digit(B)
.ok_or(Error::invalid_digit(digit, self.line(), self.col()))
}
}
/// Strings and characters
impl<'t> Lexer<'t> {
fn string(&mut self) -> LResult<Token> {
let mut value = String::new();
while '"'
!= self
.peek()
.map_err(|e| e.mask_reason(Reason::UnmatchedDelimiters('"')))?
{
value.push(self.unescape()?)
}
self.consume()?.produce(Kind::Literal, value)
}
fn character(&mut self) -> LResult<Token> {
let out = self.unescape()?;
match self.peek()? {
'\'' => self.consume()?.produce(Kind::Literal, out),
_ => Err(Error::unmatched_delimiters('\'', self.line(), self.col())),
}
}
/// Unescape a single character
fn unescape(&mut self) -> LResult<char> {
match self.next() {
Ok('\\') => (),
other => return other,
}
Ok(match self.next()? {
'a' => '\x07',
'b' => '\x08',
'f' => '\x0c',
'n' => '\n',
'r' => '\r',
't' => '\t',
'x' => self.hex_escape()?,
'u' => self.unicode_escape()?,
'0' => '\0',
chr => chr,
})
}
/// unescape a single 2-digit hex escape
fn hex_escape(&mut self) -> LResult<char> {
let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
char::from_u32(out).ok_or(Error::bad_unicode(out, self.line(), self.col()))
}
/// unescape a single \u{} unicode escape
fn unicode_escape(&mut self) -> LResult<char> {
let mut out = 0;
let Ok('{') = self.peek() else {
return Err(Error::invalid_escape('u', self.line(), self.col()));
};
self.consume()?;
while let Ok(c) = self.peek() {
match c {
'}' => {
self.consume()?;
return char::from_u32(out).ok_or(Error::bad_unicode(
out,
self.line(),
self.col(),
));
}
_ => out = (out << 4) + self.digit::<16>()?,
}
}
Err(Error::invalid_escape('u', self.line(), self.col()))
}
}
impl<'t> From<&Lexer<'t>> for Loc {
    /// Builds a [Loc] from the line and column reported by the lexer
    fn from(value: &Lexer<'t>) -> Self {
        let (line, col) = (value.line(), value.col());
        Loc(line, col)
    }
}
use error::{Error, LResult, Reason};
pub mod error {
//! [Error] type for the [Lexer](super::Lexer)
use std::fmt::Display;
/// Result type with [Err] = [Error]
pub type LResult<T> = Result<T, Error>;
/// An error raised while lexing: a [Reason] together with the line and column
/// it was reported at
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Error {
/// What went wrong
pub reason: Reason,
/// Line the error was reported at
pub line: u32,
/// Column the error was reported at
pub col: u32,
}
/// The reason for the [Error]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Reason {
/// Found an opening delimiter of type [char], but not the expected closing delimiter
UnmatchedDelimiters(char),
/// Found a character that doesn't belong to any [TokenKind](cl_token::TokenKind)
UnexpectedChar(char),
/// Found a character that's not valid in identifiers while looking for an identifier
NotIdentifier(char),
/// Found a character that's not valid in an escape sequence while looking for an escape
/// sequence
UnknownEscape(char),
/// Escape sequence contains invalid hexadecimal digit or unmatched braces
InvalidEscape(char),
/// Character is not a valid digit in the requested base
InvalidDigit(char),
/// Base conversion requested, but the base character was not in the set of known
/// characters
UnknownBase(char),
/// Unicode escape does not map to a valid unicode code-point
BadUnicode(u32),
/// Reached end of input
EndOfFile,
}
// One constructor per Reason variant. Each generated function takes the listed
// payload argument (if any) followed by `line` and `col`.
error_impl! {
unmatched_delimiters(c: char) => Reason::UnmatchedDelimiters(c),
unexpected_char(c: char) => Reason::UnexpectedChar(c),
not_identifier(c: char) => Reason::NotIdentifier(c),
unknown_escape(e: char) => Reason::UnknownEscape(e),
invalid_escape(e: char) => Reason::InvalidEscape(e),
invalid_digit(digit: char) => Reason::InvalidDigit(digit),
unknown_base(base: char) => Reason::UnknownBase(base),
bad_unicode(value: u32) => Reason::BadUnicode(value),
end_of_file => Reason::EndOfFile,
}
impl Error {
/// Changes the [Reason] of this error
///
/// The line and column are kept as they were.
pub(super) fn mask_reason(self, reason: Reason) -> Self {
Self { reason, ..self }
}
/// Returns the [Reason] for this error
pub fn reason(&self) -> &Reason {
&self.reason
}
/// Returns the (line, col) where the error happened
pub fn location(&self) -> (u32, u32) {
(self.line, self.col)
}
}
// Expands a list of `name(param: Type, ...) => reason` entries (the parameter
// list is optional) into `pub(super)` constructor functions on `Error`. Every
// constructor appends `line: u32, col: u32` to its parameters and stores them
// alongside the given reason. `dead_code` is allowed because not every
// generated constructor has to be called.
macro error_impl ($($fn:ident$(( $($p:ident: $t:ty),* ))? => $reason:expr),*$(,)?) {
#[allow(dead_code)]
impl Error {
$(pub(super) fn $fn ($($($p: $t),*,)? line: u32, col: u32) -> Self {
Self { reason: $reason, line, col }
})*
}
}
impl std::error::Error for Error {}
// Formats as `line:col: reason`
impl Display for Error {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}:{}: {}", self.line, self.col, self.reason)
}
}
// Human-readable message for each reason
impl Display for Reason {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Reason::UnmatchedDelimiters(c) => write! {f, "Unmatched `{c}` in input"},
Reason::UnexpectedChar(c) => write!(f, "Character `{c}` not expected"),
Reason::NotIdentifier(c) => write!(f, "Character `{c}` not valid in identifiers"),
Reason::UnknownEscape(c) => write!(f, "`\\{c}` is not a known escape sequence"),
Reason::InvalidEscape(c) => write!(f, "Escape sequence `\\{c}`... is malformed"),
Reason::InvalidDigit(c) => write!(f, "`{c}` is not a valid digit"),
Reason::UnknownBase(c) => write!(f, "`0{c}`... is not a valid base"),
Reason::BadUnicode(c) => write!(f, "`{c}` is not a valid unicode code-point"),
Reason::EndOfFile => write!(f, "Reached end of input"),
}
}
}
}