269 lines
9.3 KiB
Rust
269 lines
9.3 KiB
Rust
// © 2023-2024 John Breaux
|
|
//! The [Lexer] turns a [sequence of characters](str) into a stream of
|
|
//! [lexically-tagged tokens](token)
|
|
|
|
pub mod token;
|
|
|
|
use self::token::{Special, TokenKind, *};
|
|
use crate::util::Span;
|
|
use std::{
|
|
iter::Peekable,
|
|
str::{CharIndices, FromStr},
|
|
};
|
|
use unicode_ident::*;
|
|
|
|
/// Radix used for numeric literals that carry no explicit base prefix.
const DEFAULT_BASE: u32 = 10;
|
|
|
|
/// A lexer over a borrowed [string slice](str), producing
/// [lexically identified tokens](token) one at a time.
///
/// # Examples
/// ```rust
/// # use libmsp430::lexer::{Lexer, token::*};
/// let text = "mov r14, r15";
/// let mut lexer = Lexer::new(text);
/// assert_eq!(lexer.scan().unwrap().kind, TokenKind::TwoArg(TwoArg::Mov));
/// assert_eq!(lexer.scan().unwrap().kind, TokenKind::Reg(Reg::R14));
/// assert_eq!(lexer.scan().unwrap().kind, TokenKind::Comma);
/// assert_eq!(lexer.scan().unwrap().kind, TokenKind::Reg(Reg::R15));
/// assert_eq!(lexer.scan().unwrap().kind, TokenKind::Eof);
/// ```
#[derive(Debug, Clone)]
pub struct Lexer<'t> {
    /// Peekable iterator over the `(byte offset, char)` pairs of `text`
    iter: Peekable<CharIndices<'t>>,
    /// The complete input; lexemes are sliced out of it
    text: &'t str,
    /// Byte offset at which the lexeme currently being scanned begins
    start: usize,
    /// Byte offset just past the most recently consumed character
    index: usize,
}
|
|
|
|
impl<'t> Lexer<'t> {
|
|
/// Creates a new [Lexer] over some [text](str)
|
|
pub fn new(text: &'t str) -> Self {
|
|
Self { iter: text.char_indices().peekable(), text, start: 0, index: 0 }
|
|
}
|
|
/// Gets the current byte-position
|
|
pub fn location(&self) -> usize {
|
|
self.start
|
|
}
|
|
/// Internal: Emits a token with the provided [TokenKind], providing its extents.
|
|
fn emit(&mut self, kind: TokenKind) -> Option<Token<'t>> {
|
|
let out =
|
|
Some(Token::new(self.next_lexeme(), kind, Span { start: self.start, end: self.index }));
|
|
self.start = self.index;
|
|
out
|
|
}
|
|
/// Internal: Borrows the pending lexeme, i.e. the text consumed since the last emit
fn next_lexeme(&self) -> &'t str {
    let (begin, end) = (self.start, self.index);
    &self.text[begin..end]
}
|
|
fn repeat(&mut self, f: impl Fn(char) -> bool) -> &mut Self {
|
|
while let Some(&c) = self.peek() {
|
|
if !f(c) {
|
|
break;
|
|
}
|
|
self.next();
|
|
}
|
|
self
|
|
}
|
|
/// Internal: Skips whitespace up to, but not including, the next `'\n'`,
/// and drops everything consumed so far from the pending lexeme.
///
/// Returns `None` when the end of input has been reached, `Some(self)`
/// otherwise. The lexeme start is reset in both cases, so a token emitted
/// at end of input does not cover trailing whitespace.
fn space(&mut self) -> Option<&mut Self> {
    while matches!(self.peek(), Some(&c) if c.is_whitespace() && c != '\n') {
        self.next();
    }
    // Reset before the end-of-input check: returning early without this would
    // leave the skipped whitespace inside the lexeme of the following token.
    self.start = self.index;
    if self.peek().is_none() {
        return None;
    }
    Some(self)
}
|
|
/// Consumes a [char] without checking, for ergonomic chaining
|
|
fn then(&mut self) -> &mut Self {
|
|
self.next();
|
|
self
|
|
}
|
|
fn peek(&mut self) -> Option<&char> {
|
|
self.iter.peek().map(|(_, c)| c)
|
|
}
|
|
fn next(&mut self) -> Option<char> {
|
|
let (index, c) = self.iter.next()?;
|
|
self.index = index + c.len_utf8();
|
|
Some(c)
|
|
}
|
|
|
|
/// Scans for the next [Token] in the stream
|
|
pub fn scan(&mut self) -> Option<Token<'t>> {
|
|
if self.space().is_none() {
|
|
return self.emit(TokenKind::Eof);
|
|
}
|
|
let Some(c) = self.peek() else {
|
|
return self.emit(TokenKind::Eof);
|
|
};
|
|
match c {
|
|
'\n' => self.then().emit(TokenKind::Newline),
|
|
'!' => self.then().emit(TokenKind::Bang),
|
|
'#' => self.then().emit(TokenKind::Hash),
|
|
'$' => self.then().emit(TokenKind::Dollar),
|
|
'%' => self.then().emit(TokenKind::Percent),
|
|
'&' => self.then().emit(TokenKind::Amp),
|
|
'\'' => self.then().char(),
|
|
'"' => self.then().string(),
|
|
'(' => self.then().emit(TokenKind::OpenParen),
|
|
')' => self.then().emit(TokenKind::CloseParen),
|
|
'*' => self.then().emit(TokenKind::Star),
|
|
'+' => self.then().emit(TokenKind::Plus),
|
|
',' => self.then().emit(TokenKind::Comma),
|
|
'-' => self.then().emit(TokenKind::Minus),
|
|
'.' => self.then().directive_or_bw(),
|
|
'/' => self.then().comment_or_slash(),
|
|
'0' => self.then().number_with_base(),
|
|
':' => self.then().emit(TokenKind::Colon),
|
|
';' => self.repeat(|c| c != '\n').emit(TokenKind::Comment),
|
|
'<' => self.then().less(),
|
|
'>' => self.then().greater(),
|
|
'@' => self.then().emit(TokenKind::At),
|
|
'[' => self.then().emit(TokenKind::OpenBrace),
|
|
']' => self.then().emit(TokenKind::CloseBrace),
|
|
'^' => self.then().emit(TokenKind::Caret),
|
|
'_' => self.then().identifier(),
|
|
'{' => self.then().emit(TokenKind::OpenCurly),
|
|
'|' => self.then().emit(TokenKind::Bar),
|
|
'}' => self.then().emit(TokenKind::CloseCurly),
|
|
c if c.is_numeric() => self.number::<DEFAULT_BASE>(),
|
|
&c if is_xid_start(c) => self.identifier(),
|
|
c => todo!("Unrecognized character: {c}"),
|
|
}
|
|
}
|
|
fn number_with_base(&mut self) -> Option<Token<'t>> {
|
|
match self.peek() {
|
|
Some('x') => self.then().number::<16>(),
|
|
Some('d') => self.then().number::<10>(),
|
|
Some('o') => self.then().number::<8>(),
|
|
Some('b') => self.then().number::<2>(),
|
|
Some(c) if c.is_ascii_digit() => self.number::<DEFAULT_BASE>(),
|
|
_ => self.emit(TokenKind::Number(0, 10)),
|
|
}
|
|
}
|
|
fn number<const B: u32>(&mut self) -> Option<Token<'t>> {
|
|
let mut num = self.digit::<B>()?;
|
|
while let Some(digit) = self.digit::<B>() {
|
|
num = num * B + digit;
|
|
}
|
|
if num > u16::MAX as u32 {
|
|
None
|
|
} else {
|
|
self.emit(TokenKind::Number(num as u16, B as u8))
|
|
}
|
|
}
|
|
fn digit<const B: u32>(&mut self) -> Option<u32> {
|
|
let digit = self.peek()?.to_digit(B)?;
|
|
self.then();
|
|
Some(digit)
|
|
}
|
|
|
|
fn comment_or_slash(&mut self) -> Option<Token<'t>> {
|
|
match self.peek() {
|
|
Some('/') => self.repeat(|c| c != '\n').emit(TokenKind::Comment),
|
|
_ => self.emit(TokenKind::Slash),
|
|
}
|
|
}
|
|
fn less(&mut self) -> Option<Token<'t>> {
|
|
match self.peek() {
|
|
Some('<') => self.then().emit(TokenKind::Lsh),
|
|
_ => todo!("less"),
|
|
}
|
|
}
|
|
fn greater(&mut self) -> Option<Token<'t>> {
|
|
match self.peek() {
|
|
Some('>') => self.then().emit(TokenKind::Rsh),
|
|
_ => todo!("greater"),
|
|
}
|
|
}
|
|
fn identifier(&mut self) -> Option<Token<'t>> {
|
|
while let Some(c) = self.then().peek() {
|
|
if !is_xid_continue(*c) {
|
|
break;
|
|
}
|
|
}
|
|
let lexeme = self.next_lexeme();
|
|
if let Ok(op) = Reg::from_str(lexeme) {
|
|
self.emit(TokenKind::Reg(op))
|
|
} else if let Ok(op) = NoEm::from_str(lexeme) {
|
|
self.emit(TokenKind::NoEm(op))
|
|
} else if let Ok(op) = OneEm::from_str(lexeme) {
|
|
self.emit(TokenKind::OneEm(op))
|
|
} else if let Ok(op) = Special::from_str(lexeme) {
|
|
self.emit(TokenKind::Special(op))
|
|
} else if let Ok(op) = OneArg::from_str(lexeme) {
|
|
self.emit(TokenKind::OneArg(op))
|
|
} else if let Ok(op) = TwoArg::from_str(lexeme) {
|
|
self.emit(TokenKind::TwoArg(op))
|
|
} else if let Ok(op) = Jump::from_str(lexeme) {
|
|
self.emit(TokenKind::Jump(op))
|
|
} else {
|
|
self.emit(TokenKind::Identifier)
|
|
}
|
|
}
|
|
fn directive_or_bw(&mut self) -> Option<Token<'t>> {
|
|
while let Some(c) = self.then().peek() {
|
|
if !is_xid_continue(*c) {
|
|
break;
|
|
}
|
|
}
|
|
match self.next_lexeme() {
|
|
".b" => self.emit(TokenKind::Byte),
|
|
".w" => self.emit(TokenKind::Word),
|
|
_ => self.emit(TokenKind::Directive),
|
|
}
|
|
}
|
|
|
|
/// Todo: Character unescaping in Lexer::string
|
|
fn string(&mut self) -> Option<Token<'t>> {
|
|
while '"' != self.next()? {}
|
|
self.emit(TokenKind::String)
|
|
}
|
|
fn char(&mut self) -> Option<Token<'t>> {
|
|
let out = self.unescape()?;
|
|
self.next().filter(|c| *c == '\'').and_then(|_| self.emit(TokenKind::Char(out)))
|
|
}
|
|
/// Unescape a single character
|
|
fn unescape(&mut self) -> Option<char> {
|
|
match self.next() {
|
|
Some('\\') => (),
|
|
other => return other,
|
|
}
|
|
Some(match self.next()? {
|
|
'a' => '\x07',
|
|
'b' => '\x08',
|
|
'f' => '\x0c',
|
|
'n' => '\n',
|
|
'r' => '\r',
|
|
't' => '\t',
|
|
'x' => self.hex_escape()?,
|
|
'u' => self.unicode_escape()?,
|
|
'0' => '\0',
|
|
chr => chr,
|
|
})
|
|
}
|
|
/// unescape a single 2-digit hex escape
|
|
fn hex_escape(&mut self) -> Option<char> {
|
|
let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
|
|
char::from_u32(out) //.ok_or(Error::bad_unicode(out, self.line(), self.col()))
|
|
}
|
|
/// unescape a single \u{} unicode escape
|
|
fn unicode_escape(&mut self) -> Option<char> {
|
|
let mut out = 0;
|
|
let Some('{') = self.peek() else {
|
|
return None; //Err(Error::invalid_escape('u', self.line(), self.col()));
|
|
};
|
|
self.then();
|
|
while let Some(c) = self.peek() {
|
|
match c {
|
|
'}' => {
|
|
self.then();
|
|
return char::from_u32(out); //.ok_or(Error::bad_unicode(out, self.line(), self.col()));
|
|
}
|
|
_ => out = (out << 4) + self.digit::<16>()?,
|
|
}
|
|
}
|
|
None //Err(Error::invalid_escape('u', self.line(), self.col()))
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests;
|