// © 2023-2024 John Breaux
//! The [Lexer] turns a [sequence of characters](str) into a stream of
//! [lexically-tagged tokens](token)

pub mod token;

use self::token::{Special, TokenKind, *};
use crate::util::Span;
use std::{
    iter::Peekable,
    str::{CharIndices, FromStr},
};
use unicode_ident::*;

const DEFAULT_BASE: u32 = 10;

/// Turns a [sequence of characters](str) into a stream of [lexically identified tokens](token).
///
/// # Examples
/// ```rust
/// # use libmsp430::lexer::{Lexer, token::*};
/// let text = "mov r14, r15";
/// let mut lexer = Lexer::new(text);
/// assert_eq!(lexer.scan().unwrap().kind, TokenKind::TwoArg(TwoArg::Mov));
/// assert_eq!(lexer.scan().unwrap().kind, TokenKind::Reg(Reg::R14));
/// assert_eq!(lexer.scan().unwrap().kind, TokenKind::Comma);
/// assert_eq!(lexer.scan().unwrap().kind, TokenKind::Reg(Reg::R15));
/// assert_eq!(lexer.scan().unwrap().kind, TokenKind::Eof);
/// ```
#[derive(Clone, Debug)]
pub struct Lexer<'t> {
    /// Keeps track of the byte offset into the string
    iter: Peekable<CharIndices<'t>>,
    text: &'t str,
    start: usize,
    index: usize,
}

impl<'t> Lexer<'t> {
    /// Creates a new [Lexer] over some [text](str)
    pub fn new(text: &'t str) -> Self {
        Self { iter: text.char_indices().peekable(), text, start: 0, index: 0 }
    }

    /// Gets the current byte-position
    pub fn location(&self) -> usize {
        self.start
    }

    /// Internal: Emits a token with the provided [TokenKind], recording its extents.
    fn emit(&mut self, kind: TokenKind) -> Option<Token<'t>> {
        let out = Some(Token::new(self.next_lexeme(), kind, Span { start: self.start, end: self.index }));
        self.start = self.index;
        out
    }

    fn next_lexeme(&self) -> &'t str {
        &self.text[self.start..self.index]
    }

    fn repeat(&mut self, f: impl Fn(char) -> bool) -> &mut Self {
        while let Some(&c) = self.peek() {
            if !f(c) {
                break;
            }
            self.next();
        }
        self
    }

    fn space(&mut self) -> Option<&mut Self> {
        while self.peek()?.is_whitespace() && *self.peek()? != '\n' {
            self.next();
        }
        self.start = self.index;
        Some(self)
    }

    /// Consumes a [char] without checking, for ergonomic chaining
    fn then(&mut self) -> &mut Self {
        self.next();
        self
    }

    fn peek(&mut self) -> Option<&char> {
        self.iter.peek().map(|(_, c)| c)
    }

    fn next(&mut self) -> Option<char> {
        let (index, c) = self.iter.next()?;
        self.index = index + c.len_utf8();
        Some(c)
    }

    /// Scans for the next [Token] in the stream
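    ///
    /// At the end of input, [Lexer::scan] keeps producing [TokenKind::Eof] tokens
    /// rather than returning [None]; [None] indicates a malformed token (e.g. an
    /// out-of-range number literal or an unterminated string).
    ///
    /// # Examples
    /// A small additional example, mirroring the crate path used in the [Lexer]
    /// docs above:
    /// ```rust
    /// # use libmsp430::lexer::{Lexer, token::*};
    /// let mut lexer = Lexer::new("#0x10 ; an immediate");
    /// assert_eq!(lexer.scan().unwrap().kind, TokenKind::Hash);
    /// assert_eq!(lexer.scan().unwrap().kind, TokenKind::Number(0x10, 16));
    /// assert_eq!(lexer.scan().unwrap().kind, TokenKind::Comment);
    /// assert_eq!(lexer.scan().unwrap().kind, TokenKind::Eof);
    /// assert_eq!(lexer.scan().unwrap().kind, TokenKind::Eof); // Eof repeats at end of input
    /// ```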
    pub fn scan(&mut self) -> Option<Token<'t>> {
        if self.space().is_none() {
            return self.emit(TokenKind::Eof);
        }
        let Some(c) = self.peek() else {
            return self.emit(TokenKind::Eof);
        };
        match c {
            '\n' => self.then().emit(TokenKind::Newline),
            '!' => self.then().emit(TokenKind::Bang),
            '#' => self.then().emit(TokenKind::Hash),
            '$' => self.then().emit(TokenKind::Dollar),
            '%' => self.then().emit(TokenKind::Percent),
            '&' => self.then().emit(TokenKind::Amp),
            '\'' => self.then().char(),
            '"' => self.then().string(),
            '(' => self.then().emit(TokenKind::OpenParen),
            ')' => self.then().emit(TokenKind::CloseParen),
            '*' => self.then().emit(TokenKind::Star),
            '+' => self.then().emit(TokenKind::Plus),
            ',' => self.then().emit(TokenKind::Comma),
            '-' => self.then().emit(TokenKind::Minus),
            '.' => self.then().directive_or_bw(),
            '/' => self.then().comment_or_slash(),
            '0' => self.then().number_with_base(),
            ':' => self.then().emit(TokenKind::Colon),
            ';' => self.repeat(|c| c != '\n').emit(TokenKind::Comment),
            '<' => self.then().less(),
            '>' => self.then().greater(),
            '@' => self.then().emit(TokenKind::At),
            '[' => self.then().emit(TokenKind::OpenBrace),
            ']' => self.then().emit(TokenKind::CloseBrace),
            '^' => self.then().emit(TokenKind::Caret),
            '_' => self.then().identifier(),
            '{' => self.then().emit(TokenKind::OpenCurly),
            '|' => self.then().emit(TokenKind::Bar),
            '}' => self.then().emit(TokenKind::CloseCurly),
            c if c.is_numeric() => self.number::<DEFAULT_BASE>(),
            &c if is_xid_start(c) => self.identifier(),
            c => todo!("Unrecognized character: {c}"),
        }
    }

    fn number_with_base(&mut self) -> Option<Token<'t>> {
        match self.peek() {
            Some('x') => self.then().number::<16>(),
            Some('d') => self.then().number::<10>(),
            Some('o') => self.then().number::<8>(),
            Some('b') => self.then().number::<2>(),
            Some(c) if c.is_ascii_digit() => self.number::<DEFAULT_BASE>(),
            _ => self.emit(TokenKind::Number(0, 10)),
        }
    }

    fn number<const B: u32>(&mut self) -> Option<Token<'t>> {
        let mut num = self.digit::<B>()?;
        while let Some(digit) = self.digit::<B>() {
            num = num * B + digit;
        }
        if num > u16::MAX as u32 {
            None
        } else {
            self.emit(TokenKind::Number(num as u16, B as u8))
        }
    }

    fn digit<const B: u32>(&mut self) -> Option<u32> {
        let digit = self.peek()?.to_digit(B)?;
        self.then();
        Some(digit)
    }

    fn comment_or_slash(&mut self) -> Option<Token<'t>> {
        match self.peek() {
            Some('/') => self.repeat(|c| c != '\n').emit(TokenKind::Comment),
            _ => self.emit(TokenKind::Slash),
        }
    }

    fn less(&mut self) -> Option<Token<'t>> {
        match self.peek() {
            Some('<') => self.then().emit(TokenKind::Lsh),
            _ => todo!("less"),
        }
    }

    fn greater(&mut self) -> Option<Token<'t>> {
        match self.peek() {
            Some('>') => self.then().emit(TokenKind::Rsh),
            _ => todo!("greater"),
        }
    }

    fn identifier(&mut self) -> Option<Token<'t>> {
        while let Some(c) = self.then().peek() {
            if !is_xid_continue(*c) {
                break;
            }
        }
        let lexeme = self.next_lexeme();
        if let Ok(op) = Reg::from_str(lexeme) {
            self.emit(TokenKind::Reg(op))
        } else if let Ok(op) = NoEm::from_str(lexeme) {
            self.emit(TokenKind::NoEm(op))
        } else if let Ok(op) = OneEm::from_str(lexeme) {
            self.emit(TokenKind::OneEm(op))
        } else if let Ok(op) = Special::from_str(lexeme) {
            self.emit(TokenKind::Special(op))
        } else if let Ok(op) = OneArg::from_str(lexeme) {
            self.emit(TokenKind::OneArg(op))
        } else if let Ok(op) = TwoArg::from_str(lexeme) {
            self.emit(TokenKind::TwoArg(op))
        } else if let Ok(op) = Jump::from_str(lexeme) {
            self.emit(TokenKind::Jump(op))
        } else {
            self.emit(TokenKind::Identifier)
        }
    }

    fn directive_or_bw(&mut self) -> Option<Token<'t>> {
        while let Some(c) = self.then().peek() {
            if !is_xid_continue(*c) {
                break;
            }
        }
        match self.next_lexeme() {
            ".b" => self.emit(TokenKind::Byte),
            ".w" => self.emit(TokenKind::Word),
            _ => self.emit(TokenKind::Directive),
        }
    }

    /// Todo: Character unescaping in Lexer::string
    fn string(&mut self) -> Option<Token<'t>> {
        while '"' != self.next()? {}
        self.emit(TokenKind::String)
    }

    fn char(&mut self) -> Option<Token<'t>> {
        let out = self.unescape()?;
        self.next().filter(|c| *c == '\'').and_then(|_| self.emit(TokenKind::Char(out)))
    }

    /// Unescape a single character
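    ///
    /// Recognized escapes (mirroring the match below): `\a`, `\b`, `\f`, `\n`, `\r`,
    /// `\t`, `\0`, two-digit hex escapes (`\xNN`) via [Lexer::hex_escape], and
    /// bracketed unicode escapes (`\u{...}`) via [Lexer::unicode_escape]. Any other
    /// escaped character is passed through unchanged.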
    fn unescape(&mut self) -> Option<char> {
        match self.next() {
            Some('\\') => (),
            other => return other,
        }
        Some(match self.next()? {
            'a' => '\x07',
            'b' => '\x08',
            'f' => '\x0c',
            'n' => '\n',
            'r' => '\r',
            't' => '\t',
            'x' => self.hex_escape()?,
            'u' => self.unicode_escape()?,
            '0' => '\0',
            chr => chr,
        })
    }

    /// unescape a single 2-digit hex escape
    fn hex_escape(&mut self) -> Option<char> {
        let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
        char::from_u32(out)
        //.ok_or(Error::bad_unicode(out, self.line(), self.col()))
    }

    /// unescape a single \u{} unicode escape
    fn unicode_escape(&mut self) -> Option<char> {
        let mut out = 0;
        let Some('{') = self.peek() else {
            return None; //Err(Error::invalid_escape('u', self.line(), self.col()));
        };
        self.then();
        while let Some(c) = self.peek() {
            match c {
                '}' => {
                    self.then();
                    return char::from_u32(out);
                    //.ok_or(Error::bad_unicode(out, self.line(), self.col()));
                }
                _ => out = (out << 4) + self.digit::<16>()?,
            }
        }
        None
        //Err(Error::invalid_escape('u', self.line(), self.col()))
    }
}

#[cfg(test)]
mod tests;
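
// An illustrative smoke test for the scanner above, kept separate from the
// external `tests` module. It is a minimal sketch: every asserted token kind
// follows from `Lexer::scan` and `Lexer::number_with_base` as written above,
// and the opcode/register variants match the doc example on `Lexer`.
#[cfg(test)]
mod usage_sketch {
    use super::{token::*, Lexer};

    #[test]
    fn scans_an_instruction_with_a_binary_literal() {
        let mut lexer = Lexer::new("mov #0b1010, r15");
        let mut kinds = vec![];
        loop {
            // `scan` returns `None` only for malformed tokens, which this input avoids.
            let kind = lexer.scan().expect("input contains no malformed tokens").kind;
            let done = kind == TokenKind::Eof;
            kinds.push(kind);
            if done {
                break;
            }
        }
        assert_eq!(
            kinds,
            [
                TokenKind::TwoArg(TwoArg::Mov),
                TokenKind::Hash,
                TokenKind::Number(0b1010, 2),
                TokenKind::Comma,
                TokenKind::Reg(Reg::R15),
                TokenKind::Eof,
            ]
        );
    }
}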