//! Converts a text file into tokens
|
|
#![warn(clippy::all)]
|
|
#![feature(decl_macro)]
|
|
use cl_structures::span::Loc;
|
|
use cl_token::{TokenKind as Kind, *};
|
|
use std::{
|
|
iter::Peekable,
|
|
str::{Chars, FromStr},
|
|
};
|
|
use unicode_ident::*;
|
|
|
|
#[cfg(test)]
|
|
mod tests;
|
|
|
|
pub mod lexer_iter {
|
|
//! Iterator over a [`Lexer`], returning [`LResult<Token>`]s
|
|
use super::{
|
|
error::{LResult, Reason},
|
|
Lexer, Token,
|
|
};
|
|
|
|
/// Iterator over a [`Lexer`], returning [`LResult<Token>`]s
|
|
pub struct LexerIter<'t> {
|
|
lexer: Lexer<'t>,
|
|
}
|
|
impl<'t> Iterator for LexerIter<'t> {
|
|
type Item = LResult<Token>;
|
|
fn next(&mut self) -> Option<Self::Item> {
|
|
match self.lexer.scan() {
|
|
Ok(v) => Some(Ok(v)),
|
|
Err(e) => {
|
|
if e.reason == Reason::EndOfFile {
|
|
None
|
|
} else {
|
|
Some(Err(e))
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
impl<'t> IntoIterator for Lexer<'t> {
|
|
type Item = LResult<Token>;
|
|
type IntoIter = LexerIter<'t>;
|
|
fn into_iter(self) -> Self::IntoIter {
|
|
LexerIter { lexer: self }
|
|
}
|
|
}
|
|
}
|
|
|
|
/// The Lexer iterates over the characters in a body of text, searching for [Tokens](Token).
///
/// # Examples
/// ```rust
/// # use cl_lexer::Lexer;
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Read in your code from somewhere
/// let some_code = "
///     fn main () {
///         // TODO: code goes here!
///     }
/// ";
/// // Create a lexer over your code
/// let mut lexer = Lexer::new(some_code);
/// // Scan for a single token
/// let first_token = lexer.scan()?;
/// println!("{first_token:?}");
/// // Loop over all the rest of the tokens
/// for token in lexer {
///     # let token: Result<_,()> = Ok(token?);
///     match token {
///         Ok(token) => println!("{token:?}"),
///         Err(e) => eprintln!("{e:?}"),
///     }
/// }
/// # Ok(()) }
/// ```
#[derive(Clone, Debug)]
pub struct Lexer<'t> {
    /// Remaining input, peekable one `char` ahead.
    iter: Peekable<Chars<'t>>,
    /// Character offset where the token currently being scanned began.
    start: usize,
    /// `(line, column)` of `start`; both counters are 1-based.
    start_loc: (u32, u32),
    /// Character offset of the next unconsumed character.
    current: usize,
    /// `(line, column)` of `current`; both counters are 1-based.
    current_loc: (u32, u32),
}
|
|
|
|
impl<'t> Lexer<'t> {
    /// Creates a new [Lexer] over a [str], positioned at line 1, column 1.
    pub fn new(text: &'t str) -> Self {
        Self {
            iter: text.chars().peekable(),
            start: 0,
            start_loc: (1, 1),
            current: 0,
            current_loc: (1, 1),
        }
    }

    /// Scans through the text, searching for the next [Token].
    ///
    /// Leading whitespace is skipped, then the token class is dispatched on
    /// the first character. Returns an [Error] with [Reason::EndOfFile](error::Reason::EndOfFile)
    /// once the input is exhausted.
    pub fn scan(&mut self) -> LResult<Token> {
        match self.skip_whitespace().peek()? {
            '{' => self.consume()?.produce_op(Punct::LCurly),
            '}' => self.consume()?.produce_op(Punct::RCurly),
            '[' => self.consume()?.produce_op(Punct::LBrack),
            ']' => self.consume()?.produce_op(Punct::RBrack),
            '(' => self.consume()?.produce_op(Punct::LParen),
            ')' => self.consume()?.produce_op(Punct::RParen),
            '&' => self.consume()?.amp(),
            '@' => self.consume()?.produce_op(Punct::At),
            '\\' => self.consume()?.produce_op(Punct::Backslash),
            '!' => self.consume()?.bang(),
            '|' => self.consume()?.bar(),
            ':' => self.consume()?.colon(),
            ',' => self.consume()?.produce_op(Punct::Comma),
            '.' => self.consume()?.dot(),
            '=' => self.consume()?.equal(),
            '`' => self.consume()?.produce_op(Punct::Grave),
            '>' => self.consume()?.greater(),
            '#' => self.consume()?.hash(),
            '<' => self.consume()?.less(),
            '-' => self.consume()?.minus(),
            '+' => self.consume()?.plus(),
            '?' => self.consume()?.produce_op(Punct::Question),
            '%' => self.consume()?.rem(),
            ';' => self.consume()?.produce_op(Punct::Semi),
            '/' => self.consume()?.slash(),
            '*' => self.consume()?.star(),
            '~' => self.consume()?.produce_op(Punct::Tilde),
            '^' => self.consume()?.xor(),
            // A leading `0` may introduce a base prefix (0x, 0d, 0o, 0b)
            '0' => self.consume()?.int_with_base(),
            '1'..='9' => self.digits::<10>(),
            '"' => self.consume()?.string(),
            '\'' => self.consume()?.character(),
            // `_` is a valid identifier start but not XID_Start
            '_' => self.identifier(),
            i if is_xid_start(i) => self.identifier(),
            e => {
                // Report the stray character, but consume it anyway so a
                // subsequent scan() can make progress.
                let err = Err(Error::unexpected_char(e, self.line(), self.col()));
                let _ = self.consume();
                err
            }
        }
    }

    /// Returns the line (1-based) where the token currently being scanned began.
    pub fn line(&self) -> u32 {
        self.start_loc.0
    }

    /// Returns the column (1-based) where the token currently being scanned began.
    pub fn col(&self) -> u32 {
        self.start_loc.1
    }

    /// Consumes and returns the next character, or an EndOfFile error.
    fn next(&mut self) -> LResult<char> {
        let out = self.peek();
        self.consume()?;
        out
    }

    /// Returns the next character without consuming it.
    fn peek(&mut self) -> LResult<char> {
        self.iter
            .peek()
            .copied()
            .ok_or(Error::end_of_file(self.line(), self.col()))
    }

    /// Emits a [Token] of the given `kind` located at the start of the
    /// in-progress token, then advances the start markers to the current
    /// position, beginning the next token.
    fn produce(&mut self, kind: TokenKind, data: impl Into<TokenData>) -> LResult<Token> {
        let loc = self.start_loc;
        self.start_loc = self.current_loc;
        self.start = self.current;
        Ok(Token::new(kind, data, loc.0, loc.1))
    }

    /// Shorthand for [Lexer::produce] with a [Punct] kind and no data.
    fn produce_op(&mut self, kind: Punct) -> LResult<Token> {
        self.produce(TokenKind::Punct(kind), ())
    }

    /// Skips any whitespace, then resynchronizes the start markers so the
    /// skipped characters are not attributed to the next token.
    fn skip_whitespace(&mut self) -> &mut Self {
        while let Ok(c) = self.peek() {
            if !c.is_whitespace() {
                break;
            }
            // EOF here is fine; peek() on the next scan will report it.
            let _ = self.consume();
        }
        self.start = self.current;
        self.start_loc = self.current_loc;
        self
    }

    /// Advances past one character, maintaining the offset and the
    /// (line, column) counters. A newline bumps the line and resets the column.
    fn consume(&mut self) -> LResult<&mut Self> {
        self.current += 1;
        match self.iter.next() {
            Some('\n') => {
                let (line, col) = &mut self.current_loc;
                *line += 1;
                *col = 1;
            }
            Some(_) => self.current_loc.1 += 1,
            None => Err(Error::end_of_file(self.line(), self.col()))?,
        }
        Ok(self)
    }
}
|
|
/// Digraphs and trigraphs
|
|
impl<'t> Lexer<'t> {
|
|
fn amp(&mut self) -> LResult<Token> {
|
|
match self.peek() {
|
|
Ok('&') => self.consume()?.produce_op(Punct::AmpAmp),
|
|
Ok('=') => self.consume()?.produce_op(Punct::AmpEq),
|
|
_ => self.produce_op(Punct::Amp),
|
|
}
|
|
}
|
|
fn bang(&mut self) -> LResult<Token> {
|
|
match self.peek() {
|
|
Ok('!') => self.consume()?.produce_op(Punct::BangBang),
|
|
Ok('=') => self.consume()?.produce_op(Punct::BangEq),
|
|
_ => self.produce_op(Punct::Bang),
|
|
}
|
|
}
|
|
fn bar(&mut self) -> LResult<Token> {
|
|
match self.peek() {
|
|
Ok('|') => self.consume()?.produce_op(Punct::BarBar),
|
|
Ok('=') => self.consume()?.produce_op(Punct::BarEq),
|
|
_ => self.produce_op(Punct::Bar),
|
|
}
|
|
}
|
|
fn colon(&mut self) -> LResult<Token> {
|
|
match self.peek() {
|
|
Ok(':') => self.consume()?.produce_op(Punct::ColonColon),
|
|
_ => self.produce_op(Punct::Colon),
|
|
}
|
|
}
|
|
fn dot(&mut self) -> LResult<Token> {
|
|
match self.peek() {
|
|
Ok('.') => {
|
|
if let Ok('=') = self.consume()?.peek() {
|
|
self.consume()?.produce_op(Punct::DotDotEq)
|
|
} else {
|
|
self.produce_op(Punct::DotDot)
|
|
}
|
|
}
|
|
_ => self.produce_op(Punct::Dot),
|
|
}
|
|
}
|
|
fn equal(&mut self) -> LResult<Token> {
|
|
match self.peek() {
|
|
Ok('=') => self.consume()?.produce_op(Punct::EqEq),
|
|
Ok('>') => self.consume()?.produce_op(Punct::FatArrow),
|
|
_ => self.produce_op(Punct::Eq),
|
|
}
|
|
}
|
|
fn greater(&mut self) -> LResult<Token> {
|
|
match self.peek() {
|
|
Ok('=') => self.consume()?.produce_op(Punct::GtEq),
|
|
Ok('>') => {
|
|
if let Ok('=') = self.consume()?.peek() {
|
|
self.consume()?.produce_op(Punct::GtGtEq)
|
|
} else {
|
|
self.produce_op(Punct::GtGt)
|
|
}
|
|
}
|
|
_ => self.produce_op(Punct::Gt),
|
|
}
|
|
}
|
|
fn hash(&mut self) -> LResult<Token> {
|
|
match self.peek() {
|
|
Ok('!') => self.consume()?.produce_op(Punct::HashBang),
|
|
_ => self.produce_op(Punct::Hash),
|
|
}
|
|
}
|
|
fn less(&mut self) -> LResult<Token> {
|
|
match self.peek() {
|
|
Ok('=') => self.consume()?.produce_op(Punct::LtEq),
|
|
Ok('<') => {
|
|
if let Ok('=') = self.consume()?.peek() {
|
|
self.consume()?.produce_op(Punct::LtLtEq)
|
|
} else {
|
|
self.produce_op(Punct::LtLt)
|
|
}
|
|
}
|
|
_ => self.produce_op(Punct::Lt),
|
|
}
|
|
}
|
|
fn minus(&mut self) -> LResult<Token> {
|
|
match self.peek() {
|
|
Ok('=') => self.consume()?.produce_op(Punct::MinusEq),
|
|
Ok('>') => self.consume()?.produce_op(Punct::Arrow),
|
|
_ => self.produce_op(Punct::Minus),
|
|
}
|
|
}
|
|
fn plus(&mut self) -> LResult<Token> {
|
|
match self.peek() {
|
|
Ok('=') => self.consume()?.produce_op(Punct::PlusEq),
|
|
_ => self.produce_op(Punct::Plus),
|
|
}
|
|
}
|
|
fn rem(&mut self) -> LResult<Token> {
|
|
match self.peek() {
|
|
Ok('=') => self.consume()?.produce_op(Punct::RemEq),
|
|
_ => self.produce_op(Punct::Rem),
|
|
}
|
|
}
|
|
fn slash(&mut self) -> LResult<Token> {
|
|
match self.peek() {
|
|
Ok('=') => self.consume()?.produce_op(Punct::SlashEq),
|
|
Ok('/') => self.consume()?.line_comment(),
|
|
Ok('*') => self.consume()?.block_comment(),
|
|
_ => self.produce_op(Punct::Slash),
|
|
}
|
|
}
|
|
fn star(&mut self) -> LResult<Token> {
|
|
match self.peek() {
|
|
Ok('=') => self.consume()?.produce_op(Punct::StarEq),
|
|
_ => self.produce_op(Punct::Star),
|
|
}
|
|
}
|
|
fn xor(&mut self) -> LResult<Token> {
|
|
match self.peek() {
|
|
Ok('=') => self.consume()?.produce_op(Punct::XorEq),
|
|
Ok('^') => self.consume()?.produce_op(Punct::XorXor),
|
|
_ => self.produce_op(Punct::Xor),
|
|
}
|
|
}
|
|
}
|
|
/// Comments
|
|
impl<'t> Lexer<'t> {
|
|
fn line_comment(&mut self) -> LResult<Token> {
|
|
while Ok('\n') != self.peek() {
|
|
self.consume()?;
|
|
}
|
|
self.produce(Kind::Comment, ())
|
|
}
|
|
fn block_comment(&mut self) -> LResult<Token> {
|
|
while let Ok(c) = self.next() {
|
|
if '*' == c && Ok('/') == self.next() {
|
|
break;
|
|
}
|
|
}
|
|
self.produce(Kind::Comment, ())
|
|
}
|
|
}
|
|
/// Identifiers
|
|
impl<'t> Lexer<'t> {
|
|
fn identifier(&mut self) -> LResult<Token> {
|
|
let mut out = String::from(self.xid_start()?);
|
|
while let Ok(c) = self.xid_continue() {
|
|
out.push(c)
|
|
}
|
|
if let Ok(keyword) = Kind::from_str(&out) {
|
|
self.produce(keyword, ())
|
|
} else {
|
|
self.produce(Kind::Identifier, TokenData::String(out))
|
|
}
|
|
}
|
|
fn xid_start(&mut self) -> LResult<char> {
|
|
match self.peek()? {
|
|
xid if xid == '_' || is_xid_start(xid) => {
|
|
self.consume()?;
|
|
Ok(xid)
|
|
}
|
|
bad => Err(Error::not_identifier(bad, self.line(), self.col())),
|
|
}
|
|
}
|
|
fn xid_continue(&mut self) -> LResult<char> {
|
|
match self.peek()? {
|
|
xid if is_xid_continue(xid) => {
|
|
self.consume()?;
|
|
Ok(xid)
|
|
}
|
|
bad => Err(Error::not_identifier(bad, self.line(), self.col())),
|
|
}
|
|
}
|
|
}
|
|
/// Integers
|
|
impl<'t> Lexer<'t> {
|
|
fn int_with_base(&mut self) -> LResult<Token> {
|
|
match self.peek() {
|
|
Ok('x') => self.consume()?.digits::<16>(),
|
|
Ok('d') => self.consume()?.digits::<10>(),
|
|
Ok('o') => self.consume()?.digits::<8>(),
|
|
Ok('b') => self.consume()?.digits::<2>(),
|
|
Ok('0'..='9') => self.digits::<10>(),
|
|
_ => self.produce(Kind::Literal, 0),
|
|
}
|
|
}
|
|
fn digits<const B: u32>(&mut self) -> LResult<Token> {
|
|
let mut value = self.digit::<B>()? as u128;
|
|
while let Ok(true) = self.peek().as_ref().map(char::is_ascii_alphanumeric) {
|
|
value = value * B as u128 + self.digit::<B>()? as u128;
|
|
}
|
|
self.produce(Kind::Literal, value)
|
|
}
|
|
fn digit<const B: u32>(&mut self) -> LResult<u32> {
|
|
let digit = self.peek()?;
|
|
self.consume()?;
|
|
digit
|
|
.to_digit(B)
|
|
.ok_or(Error::invalid_digit(digit, self.line(), self.col()))
|
|
}
|
|
}
|
|
/// Strings and characters
impl<'t> Lexer<'t> {
    /// Lexes a string literal; the opening `"` has already been consumed.
    ///
    /// Escape sequences are resolved through [Lexer::unescape]. End of input
    /// before the closing `"` is re-reported as an unmatched `"` rather than
    /// a bare EndOfFile.
    fn string(&mut self) -> LResult<Token> {
        let mut value = String::new();
        while '"'
            != self
                .peek()
                .map_err(|e| e.mask_reason(Reason::UnmatchedDelimiters('"')))?
        {
            value.push(self.unescape()?)
        }
        // Consume the closing quote and emit the accumulated text.
        self.consume()?.produce(Kind::Literal, value)
    }

    /// Lexes a character literal; the opening `'` has already been consumed.
    ///
    /// Exactly one (possibly escaped) character is read; anything other than
    /// a closing `'` afterward is an unmatched-delimiter error.
    fn character(&mut self) -> LResult<Token> {
        let out = self.unescape()?;
        match self.peek()? {
            '\'' => self.consume()?.produce(Kind::Literal, out),
            _ => Err(Error::unmatched_delimiters('\'', self.line(), self.col())),
        }
    }

    /// Unescape a single character.
    ///
    /// A character other than `\` stands for itself. After a `\`, the common
    /// C-style escapes are translated; unrecognized escapes pass the escaped
    /// character through unchanged.
    fn unescape(&mut self) -> LResult<char> {
        match self.next() {
            Ok('\\') => (),
            // Not an escape: return the character (or the error) as-is.
            other => return other,
        }
        Ok(match self.next()? {
            'a' => '\x07', // bell
            'b' => '\x08', // backspace
            'f' => '\x0c', // form feed
            'n' => '\n',
            'r' => '\r',
            't' => '\t',
            'x' => self.hex_escape()?,     // \xNN
            'u' => self.unicode_escape()?, // \u{N...}
            '0' => '\0',
            chr => chr,
        })
    }

    /// unescape a single 2-digit hex escape (`\xNN`)
    fn hex_escape(&mut self) -> LResult<char> {
        let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
        char::from_u32(out).ok_or(Error::bad_unicode(out, self.line(), self.col()))
    }

    /// unescape a single `\u{…}` unicode escape
    ///
    /// Requires braces; hex digits are folded until the closing `}`. A missing
    /// `{`, or end of input before `}`, is an invalid-escape error; a value
    /// that is not a valid code point is a bad-unicode error.
    fn unicode_escape(&mut self) -> LResult<char> {
        let mut out = 0;
        let Ok('{') = self.peek() else {
            return Err(Error::invalid_escape('u', self.line(), self.col()));
        };
        self.consume()?;
        while let Ok(c) = self.peek() {
            match c {
                '}' => {
                    self.consume()?;
                    return char::from_u32(out).ok_or(Error::bad_unicode(
                        out,
                        self.line(),
                        self.col(),
                    ));
                }
                _ => out = (out << 4) + self.digit::<16>()?,
            }
        }
        // Ran out of input before the closing `}`.
        Err(Error::invalid_escape('u', self.line(), self.col()))
    }
}
|
|
|
|
impl<'t> From<&Lexer<'t>> for Loc {
|
|
fn from(value: &Lexer<'t>) -> Self {
|
|
Loc(value.line(), value.col())
|
|
}
|
|
}
|
|
|
|
use error::{Error, LResult, Reason};
|
|
pub mod error {
    //! [Error] type for the [Lexer](super::Lexer)
    use std::fmt::Display;

    /// Result type with [Err] = [Error]
    pub type LResult<T> = Result<T, Error>;

    /// A lexing error: a [Reason] plus the (line, column) where it occurred.
    #[derive(Clone, Debug, PartialEq, Eq)]
    pub struct Error {
        /// What went wrong
        pub reason: Reason,
        /// Line number (1-based) where the error occurred
        pub line: u32,
        /// Column number (1-based) where the error occurred
        pub col: u32,
    }

    /// The reason for the [Error]
    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    pub enum Reason {
        /// Found an opening delimiter of type [char], but not the expected closing delimiter
        UnmatchedDelimiters(char),
        /// Found a character that doesn't belong to any [TokenKind](cl_token::TokenKind)
        UnexpectedChar(char),
        /// Found a character that's not valid in identifiers while looking for an identifier
        NotIdentifier(char),
        /// Found a character that's not valid in an escape sequence while looking for an escape
        /// sequence
        UnknownEscape(char),
        /// Escape sequence contains invalid hexadecimal digit or unmatched braces
        InvalidEscape(char),
        /// Character is not a valid digit in the requested base
        InvalidDigit(char),
        /// Base conversion requested, but the base character was not in the set of known
        /// characters
        UnknownBase(char),
        /// Unicode escape does not map to a valid unicode code-point
        BadUnicode(u32),
        /// Reached end of input
        EndOfFile,
    }

    // Generate one constructor per Reason variant (see `error_impl` below).
    error_impl! {
        unmatched_delimiters(c: char) => Reason::UnmatchedDelimiters(c),
        unexpected_char(c: char) => Reason::UnexpectedChar(c),
        not_identifier(c: char) => Reason::NotIdentifier(c),
        unknown_escape(e: char) => Reason::UnknownEscape(e),
        invalid_escape(e: char) => Reason::InvalidEscape(e),
        invalid_digit(digit: char) => Reason::InvalidDigit(digit),
        unknown_base(base: char) => Reason::UnknownBase(base),
        bad_unicode(value: u32) => Reason::BadUnicode(value),
        end_of_file => Reason::EndOfFile,
    }

    impl Error {
        /// Changes the [Reason] of this error, keeping its location.
        pub(super) fn mask_reason(self, reason: Reason) -> Self {
            Self { reason, ..self }
        }
        /// Returns the [Reason] for this error
        pub fn reason(&self) -> &Reason {
            &self.reason
        }
        /// Returns the (line, col) where the error happened
        pub fn location(&self) -> (u32, u32) {
            (self.line, self.col)
        }
    }

    /// Declarative (2.0-style) macro: for each `name(args..) => Reason` entry,
    /// emits a `pub(super) fn name(args.., line, col) -> Error` constructor.
    macro error_impl ($($fn:ident$(( $($p:ident: $t:ty),* ))? => $reason:expr),*$(,)?) {
        #[allow(dead_code)]
        impl Error {
            $(pub(super) fn $fn ($($($p: $t),*,)? line: u32, col: u32) -> Self {
                Self { reason: $reason, line, col }
            })*
        }
    }

    impl std::error::Error for Error {}

    impl Display for Error {
        /// Formats as `line:col: reason`.
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            write!(f, "{}:{}: {}", self.line, self.col, self.reason)
        }
    }

    impl Display for Reason {
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            match self {
                Reason::UnmatchedDelimiters(c) => write! {f, "Unmatched `{c}` in input"},
                Reason::UnexpectedChar(c) => write!(f, "Character `{c}` not expected"),
                Reason::NotIdentifier(c) => write!(f, "Character `{c}` not valid in identifiers"),
                Reason::UnknownEscape(c) => write!(f, "`\\{c}` is not a known escape sequence"),
                Reason::InvalidEscape(c) => write!(f, "Escape sequence `\\{c}`... is malformed"),
                Reason::InvalidDigit(c) => write!(f, "`{c}` is not a valid digit"),
                Reason::UnknownBase(c) => write!(f, "`0{c}`... is not a valid base"),
                Reason::BadUnicode(c) => write!(f, "`{c}` is not a valid unicode code-point"),
                Reason::EndOfFile => write!(f, "Reached end of input"),
            }
        }
    }
}
|