// msp430-repl/src/lexer.rs
// © 2023-2024 John Breaux
//! The [Lexer] turns a [sequence of characters](str) into a stream of
//! [lexically-tagged tokens](token)
pub mod token;
use self::token::{Special, TokenKind, *};
use crate::util::Span;
use std::{
iter::Peekable,
str::{CharIndices, FromStr},
};
use unicode_ident::*;
/// Radix used for numeric literals that carry no explicit base prefix (`0x`, `0d`, `0o`, `0b`).
const DEFAULT_BASE: u32 = 10;
/// Turns a [sequence of characters](str) into a stream of [lexically identified tokens](token).
///
/// # Examples
/// ```rust
/// # use libmsp430::lexer::{Lexer, token::*};
/// let text = "mov r14, r15";
/// let mut lexer = Lexer::new(text);
/// assert_eq!(lexer.scan().unwrap().kind, TokenKind::TwoArg(TwoArg::Mov));
/// assert_eq!(lexer.scan().unwrap().kind, TokenKind::Reg(Reg::R14));
/// assert_eq!(lexer.scan().unwrap().kind, TokenKind::Comma);
/// assert_eq!(lexer.scan().unwrap().kind, TokenKind::Reg(Reg::R15));
/// assert_eq!(lexer.scan().unwrap().kind, TokenKind::Eof);
/// ```
#[derive(Clone, Debug)]
pub struct Lexer<'t> {
/// Keeps track of the byte offset into the string
iter: Peekable<CharIndices<'t>>,
// The full source text; emitted lexemes are slices of this.
text: &'t str,
// Byte offset where the token currently being scanned begins.
start: usize,
// Byte offset one past the last consumed character (the exclusive end of the current lexeme).
index: usize,
}
impl<'t> Lexer<'t> {
/// Creates a new [Lexer] over some [text](str)
pub fn new(text: &'t str) -> Self {
Self { iter: text.char_indices().peekable(), text, start: 0, index: 0 }
}
/// Gets the current byte-position
pub fn location(&self) -> usize {
self.start
}
/// Internal: Emits a token with the provided [TokenKind], providing its extents.
fn emit(&mut self, kind: TokenKind) -> Option<Token<'t>> {
let out =
Some(Token::new(self.next_lexeme(), kind, Span { start: self.start, end: self.index }));
self.start = self.index;
out
}
fn next_lexeme(&self) -> &'t str {
&self.text[self.start..self.index]
}
fn repeat(&mut self, f: impl Fn(char) -> bool) -> &mut Self {
while let Some(&c) = self.peek() {
if !f(c) {
break;
}
self.next();
}
self
}
fn space(&mut self) -> Option<&mut Self> {
while self.peek()?.is_whitespace() && *self.peek()? != '\n' {
self.next();
}
self.start = self.index;
Some(self)
}
/// Consumes a [char] without checking, for ergonomic chaining
fn then(&mut self) -> &mut Self {
self.next();
self
}
fn peek(&mut self) -> Option<&char> {
self.iter.peek().map(|(_, c)| c)
}
fn next(&mut self) -> Option<char> {
let (index, c) = self.iter.next()?;
self.index = index + c.len_utf8();
Some(c)
}
/// Scans for the next [Token] in the stream
pub fn scan(&mut self) -> Option<Token<'t>> {
if self.space().is_none() {
return self.emit(TokenKind::Eof);
}
let Some(c) = self.peek() else {
return self.emit(TokenKind::Eof);
};
match c {
'\n' => self.then().emit(TokenKind::Newline),
'!' => self.then().emit(TokenKind::Bang),
'#' => self.then().emit(TokenKind::Hash),
'$' => self.then().emit(TokenKind::Dollar),
'%' => self.then().emit(TokenKind::Percent),
'&' => self.then().emit(TokenKind::Amp),
'\'' => self.then().char(),
'"' => self.then().string(),
'(' => self.then().emit(TokenKind::OpenParen),
')' => self.then().emit(TokenKind::CloseParen),
'*' => self.then().emit(TokenKind::Star),
'+' => self.then().emit(TokenKind::Plus),
',' => self.then().emit(TokenKind::Comma),
'-' => self.then().emit(TokenKind::Minus),
'.' => self.then().directive_or_bw(),
'/' => self.then().comment_or_slash(),
'0' => self.then().number_with_base(),
':' => self.then().emit(TokenKind::Colon),
';' => self.repeat(|c| c != '\n').emit(TokenKind::Comment),
'<' => self.then().less(),
'>' => self.then().greater(),
'@' => self.then().emit(TokenKind::At),
'[' => self.then().emit(TokenKind::OpenBrace),
']' => self.then().emit(TokenKind::CloseBrace),
'^' => self.then().emit(TokenKind::Caret),
'_' => self.then().identifier(),
'{' => self.then().emit(TokenKind::OpenCurly),
'|' => self.then().emit(TokenKind::Bar),
'}' => self.then().emit(TokenKind::CloseCurly),
c if c.is_numeric() => self.number::<DEFAULT_BASE>(),
&c if is_xid_start(c) => self.identifier(),
c => todo!("Unrecognized character: {c}"),
}
}
fn number_with_base(&mut self) -> Option<Token<'t>> {
match self.peek() {
Some('x') => self.then().number::<16>(),
Some('d') => self.then().number::<10>(),
Some('o') => self.then().number::<8>(),
Some('b') => self.then().number::<2>(),
Some(c) if c.is_ascii_digit() => self.number::<DEFAULT_BASE>(),
_ => self.emit(TokenKind::Number(0, 10)),
}
}
fn number<const B: u32>(&mut self) -> Option<Token<'t>> {
let mut num = self.digit::<B>()?;
while let Some(digit) = self.digit::<B>() {
num = num * B + digit;
}
if num > u16::MAX as u32 {
None
} else {
self.emit(TokenKind::Number(num as u16, B as u8))
}
}
fn digit<const B: u32>(&mut self) -> Option<u32> {
let digit = self.peek()?.to_digit(B)?;
self.then();
Some(digit)
}
fn comment_or_slash(&mut self) -> Option<Token<'t>> {
match self.peek() {
Some('/') => self.repeat(|c| c != '\n').emit(TokenKind::Comment),
_ => self.emit(TokenKind::Slash),
}
}
fn less(&mut self) -> Option<Token<'t>> {
match self.peek() {
Some('<') => self.then().emit(TokenKind::Lsh),
_ => todo!("less"),
}
}
fn greater(&mut self) -> Option<Token<'t>> {
match self.peek() {
Some('>') => self.then().emit(TokenKind::Rsh),
_ => todo!("greater"),
}
}
fn identifier(&mut self) -> Option<Token<'t>> {
while let Some(c) = self.then().peek() {
if !is_xid_continue(*c) {
break;
}
}
let lexeme = self.next_lexeme();
if let Ok(op) = Reg::from_str(lexeme) {
self.emit(TokenKind::Reg(op))
} else if let Ok(op) = NoEm::from_str(lexeme) {
self.emit(TokenKind::NoEm(op))
} else if let Ok(op) = OneEm::from_str(lexeme) {
self.emit(TokenKind::OneEm(op))
} else if let Ok(op) = Special::from_str(lexeme) {
self.emit(TokenKind::Special(op))
} else if let Ok(op) = OneArg::from_str(lexeme) {
self.emit(TokenKind::OneArg(op))
} else if let Ok(op) = TwoArg::from_str(lexeme) {
self.emit(TokenKind::TwoArg(op))
} else if let Ok(op) = Jump::from_str(lexeme) {
self.emit(TokenKind::Jump(op))
} else {
self.emit(TokenKind::Identifier)
}
}
fn directive_or_bw(&mut self) -> Option<Token<'t>> {
while let Some(c) = self.then().peek() {
if !is_xid_continue(*c) {
break;
}
}
match self.next_lexeme() {
".b" => self.emit(TokenKind::Byte),
".w" => self.emit(TokenKind::Word),
_ => self.emit(TokenKind::Directive),
}
}
/// Todo: Character unescaping in Lexer::string
fn string(&mut self) -> Option<Token<'t>> {
while '"' != self.next()? {}
self.emit(TokenKind::String)
}
fn char(&mut self) -> Option<Token<'t>> {
let out = self.unescape()?;
self.next().filter(|c| *c == '\'').and_then(|_| self.emit(TokenKind::Char(out)))
}
/// Unescape a single character
fn unescape(&mut self) -> Option<char> {
match self.next() {
Some('\\') => (),
other => return other,
}
Some(match self.next()? {
'a' => '\x07',
'b' => '\x08',
'f' => '\x0c',
'n' => '\n',
'r' => '\r',
't' => '\t',
'x' => self.hex_escape()?,
'u' => self.unicode_escape()?,
'0' => '\0',
chr => chr,
})
}
/// unescape a single 2-digit hex escape
fn hex_escape(&mut self) -> Option<char> {
let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
char::from_u32(out) //.ok_or(Error::bad_unicode(out, self.line(), self.col()))
}
/// unescape a single \u{} unicode escape
fn unicode_escape(&mut self) -> Option<char> {
let mut out = 0;
let Some('{') = self.peek() else {
return None; //Err(Error::invalid_escape('u', self.line(), self.col()));
};
self.then();
while let Some(c) = self.peek() {
match c {
'}' => {
self.then();
return char::from_u32(out); //.ok_or(Error::bad_unicode(out, self.line(), self.col()));
}
_ => out = (out << 4) + self.digit::<16>()?,
}
}
None //Err(Error::invalid_escape('u', self.line(), self.col()))
}
}
#[cfg(test)]
mod tests;