// msp430-repl/src/lexer.rs
// © 2023-2024 John Breaux
//! The [Lexer] turns a [sequence of characters](str) into a stream of
//! [lexically-tagged tokens](token)
pub mod token;
use self::token::{Special, TokenKind, *};
use crate::util::Span;
use std::{
iter::Peekable,
str::{CharIndices, FromStr},
};
use unicode_ident::*;
/// Radix used for numeric literals that carry no explicit base prefix (`0x`, `0d`, `0o`, `0b`).
const DEFAULT_BASE: u32 = 10;
/// Turns a [sequence of characters](str) into a stream of [lexically identified tokens](token).
///
/// # Examples
/// ```rust
/// # use libmsp430::lexer::{Lexer, token::*};
/// let text = "mov r14, r15";
/// let mut lexer = Lexer::new(text);
/// assert_eq!(lexer.scan().unwrap().kind, TokenKind::TwoArg(TwoArg::Mov));
/// assert_eq!(lexer.scan().unwrap().kind, TokenKind::Reg(Reg::R14));
/// assert_eq!(lexer.scan().unwrap().kind, TokenKind::Comma);
/// assert_eq!(lexer.scan().unwrap().kind, TokenKind::Reg(Reg::R15));
/// assert_eq!(lexer.scan().unwrap().kind, TokenKind::Eof);
/// ```
#[derive(Clone, Debug)]
pub struct Lexer<'t> {
/// Keeps track of the byte offset into the string
iter: Peekable<CharIndices<'t>>,
// The full source text; emitted lexemes are slices of this.
text: &'t str,
// Byte offset where the token currently being scanned begins.
start: usize,
// Byte offset one past the last consumed character (the exclusive end of the current lexeme).
index: usize,
}
impl<'t> Lexer<'t> {
/// Creates a new [Lexer] over some [text](str)
pub fn new(text: &'t str) -> Self {
Self { iter: text.char_indices().peekable(), text, start: 0, index: 0 }
}
/// Gets the current byte-position
pub fn location(&self) -> usize {
self.start
}
/// Internal: Emits a token with the provided [TokenKind], providing its extents.
fn emit(&mut self, kind: TokenKind) -> Option<Token<'t>> {
let out =
Some(Token::new(self.next_lexeme(), kind, Span { start: self.start, end: self.index }));
self.start = self.index;
out
}
fn next_lexeme(&self) -> &'t str {
&self.text[self.start..self.index]
}
fn repeat(&mut self, f: impl Fn(char) -> bool) -> &mut Self {
while let Some(&c) = self.peek() {
if !f(c) {
break;
}
self.next();
}
self
}
fn space(&mut self) -> Option<&mut Self> {
while self.peek()?.is_whitespace() && *self.peek()? != '\n' {
self.next();
}
self.start = self.index;
Some(self)
}
/// Consumes a [char] without checking, for ergonomic chaining
fn then(&mut self) -> &mut Self {
self.next();
self
}
fn peek(&mut self) -> Option<&char> {
self.iter.peek().map(|(_, c)| c)
}
fn next(&mut self) -> Option<char> {
let (index, c) = self.iter.next()?;
self.index = index + c.len_utf8();
Some(c)
}
/// Scans for the next [Token] in the stream
pub fn scan(&mut self) -> Option<Token<'t>> {
if self.space().is_none() {
return self.emit(TokenKind::Eof);
}
let Some(c) = self.peek() else {
return self.emit(TokenKind::Eof);
};
match c {
'\n' => self.then().emit(TokenKind::Newline),
'!' => self.then().emit(TokenKind::Bang),
'#' => self.then().emit(TokenKind::Hash),
'$' => self.then().emit(TokenKind::Dollar),
'%' => self.then().emit(TokenKind::Percent),
'&' => self.then().emit(TokenKind::Amp),
'\'' => self.then().char(),
'"' => self.then().string(),
'(' => self.then().emit(TokenKind::OpenParen),
')' => self.then().emit(TokenKind::CloseParen),
'*' => self.then().emit(TokenKind::Star),
'+' => self.then().emit(TokenKind::Plus),
',' => self.then().emit(TokenKind::Comma),
'-' => self.then().emit(TokenKind::Minus),
'.' => self.then().directive_or_bw(),
'/' => self.then().comment_or_slash(),
'0' => self.then().number_with_base(),
':' => self.then().emit(TokenKind::Colon),
';' => self.repeat(|c| c != '\n').emit(TokenKind::Comment),
'<' => self.then().less(),
'>' => self.then().greater(),
'@' => self.then().emit(TokenKind::At),
'[' => self.then().emit(TokenKind::OpenBrace),
']' => self.then().emit(TokenKind::CloseBrace),
'^' => self.then().emit(TokenKind::Caret),
'_' => self.then().identifier(),
'{' => self.then().emit(TokenKind::OpenCurly),
'|' => self.then().emit(TokenKind::Bar),
'}' => self.then().emit(TokenKind::CloseCurly),
c if c.is_numeric() => self.number::<DEFAULT_BASE>(),
&c if is_xid_start(c) => self.identifier(),
c => todo!("Unrecognized character: {c}"),
}
}
fn number_with_base(&mut self) -> Option<Token<'t>> {
match self.peek() {
Some('x') => self.then().number::<16>(),
Some('d') => self.then().number::<10>(),
Some('o') => self.then().number::<8>(),
Some('b') => self.then().number::<2>(),
Some(c) if c.is_ascii_digit() => self.number::<DEFAULT_BASE>(),
_ => self.emit(TokenKind::Number(0, 10)),
}
}
fn number<const B: u32>(&mut self) -> Option<Token<'t>> {
let mut num = self.digit::<B>()?;
while let Some(digit) = self.digit::<B>() {
num = num * B + digit;
}
if num > u16::MAX as u32 {
None
} else {
self.emit(TokenKind::Number(num as u16, B as u8))
}
}
fn digit<const B: u32>(&mut self) -> Option<u32> {
let digit = self.peek()?.to_digit(B)?;
self.then();
Some(digit)
}
fn comment_or_slash(&mut self) -> Option<Token<'t>> {
match self.peek() {
Some('/') => self.repeat(|c| c != '\n').emit(TokenKind::Comment),
_ => self.emit(TokenKind::Slash),
}
}
fn less(&mut self) -> Option<Token<'t>> {
match self.peek() {
Some('<') => self.then().emit(TokenKind::Lsh),
_ => todo!("less"),
}
}
fn greater(&mut self) -> Option<Token<'t>> {
match self.peek() {
Some('>') => self.then().emit(TokenKind::Rsh),
_ => todo!("greater"),
}
}
fn identifier(&mut self) -> Option<Token<'t>> {
while let Some(c) = self.then().peek() {
if !is_xid_continue(*c) {
break;
}
}
let lexeme = self.next_lexeme();
if let Ok(op) = Reg::from_str(lexeme) {
self.emit(TokenKind::Reg(op))
} else if let Ok(op) = NoEm::from_str(lexeme) {
self.emit(TokenKind::NoEm(op))
} else if let Ok(op) = OneEm::from_str(lexeme) {
self.emit(TokenKind::OneEm(op))
} else if let Ok(op) = Special::from_str(lexeme) {
self.emit(TokenKind::Special(op))
} else if let Ok(op) = OneArg::from_str(lexeme) {
self.emit(TokenKind::OneArg(op))
} else if let Ok(op) = TwoArg::from_str(lexeme) {
self.emit(TokenKind::TwoArg(op))
} else if let Ok(op) = Jump::from_str(lexeme) {
self.emit(TokenKind::Jump(op))
} else {
self.emit(TokenKind::Identifier)
}
}
fn directive_or_bw(&mut self) -> Option<Token<'t>> {
while let Some(c) = self.then().peek() {
if !is_xid_continue(*c) {
break;
}
}
match self.next_lexeme() {
".b" => self.emit(TokenKind::Byte),
".w" => self.emit(TokenKind::Word),
_ => self.emit(TokenKind::Directive),
}
}
/// Todo: Character unescaping in Lexer::string
fn string(&mut self) -> Option<Token<'t>> {
while '"' != self.next()? {}
self.emit(TokenKind::String)
}
fn char(&mut self) -> Option<Token<'t>> {
let out = self.unescape()?;
self.next().filter(|c| *c == '\'').and_then(|_| self.emit(TokenKind::Char(out)))
}
/// Unescape a single character
fn unescape(&mut self) -> Option<char> {
match self.next() {
Some('\\') => (),
other => return other,
}
Some(match self.next()? {
'a' => '\x07',
'b' => '\x08',
'f' => '\x0c',
'n' => '\n',
'r' => '\r',
't' => '\t',
'x' => self.hex_escape()?,
'u' => self.unicode_escape()?,
'0' => '\0',
chr => chr,
})
}
/// unescape a single 2-digit hex escape
fn hex_escape(&mut self) -> Option<char> {
let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
char::from_u32(out) //.ok_or(Error::bad_unicode(out, self.line(), self.col()))
}
/// unescape a single \u{} unicode escape
fn unicode_escape(&mut self) -> Option<char> {
let mut out = 0;
let Some('{') = self.peek() else {
return None; //Err(Error::invalid_escape('u', self.line(), self.col()));
};
self.then();
while let Some(c) = self.peek() {
match c {
'}' => {
self.then();
return char::from_u32(out); //.ok_or(Error::bad_unicode(out, self.line(), self.col()));
}
_ => out = (out << 4) + self.digit::<16>()?,
}
}
None //Err(Error::invalid_escape('u', self.line(), self.col()))
}
}
#[cfg(test)]
mod tests;