// msp430-repl/src/lexer/token.rs
//
// 316 lines
// 10 KiB
// Rust

// © 2023 John Breaux
//! A [Token] is a [semantically tagged](Type) sequence of characters
use crate::Error;
use regex::Regex;
use std::{
fmt::{Debug, Display},
sync::OnceLock,
};
/// Implements regex matching functions on [`Token`] for each [`Type`],
/// and implements [`From<&str>`] for [`Token`]
///
/// The macro takes a lifetime, the token type, and a table of rules shaped like
/// `pub fn name(text: &str) -> Option<Self> { regex!(Variant = r"pattern") }`.
/// From that table it generates:
/// - one associated function per rule, which compiles the rule's pattern once
///   (cached in a function-local [`OnceLock`]) and returns the leftmost match as a
///   token tagged with the rule's variant;
/// - `expect`, which runs only the rule belonging to the requested variant;
/// - a `From<&str>` impl, which tries the rules in the order they were written,
///   so earlier rules take precedence over later ones.
macro_rules! regex_impl {
    (<$t:lifetime> $type:ty {$(
        $(#[$meta:meta])*
        pub fn $func:ident (text: &str) -> Option<Self> {
            regex!($out:path = $re:literal)
        }
    )*}) => {
        impl<$t> $type {
            /// Lexes a token only for the expected `variant`
            ///
            /// Warning: This bypasses precedence rules. Only use for specific patterns.
            ///
            /// # Errors
            /// Returns [`Error::UnexpectedToken`] when `text` does not match the rule for
            /// `expected`; the error carries the token that normal lexing finds instead.
            pub fn expect(text: &$t str, expected: Type) -> Result<Self, Error> {
                // The error is built lazily: re-lexing `text` through the full rule chain
                // and copying the result into an owned token is only needed on failure.
                match expected {$(
                    $out => Self::$func(text),
                )*}.ok_or_else(|| Error::UnexpectedToken {
                    expected,
                    got: Self::from(text).into(),
                })
            }
            $(
            $(#[$meta])*
            /// Tries to read [`
            #[doc = stringify!($out)]
            /// `] from `text`
            pub fn $func(text: &$t str) -> Option<Self> {
                // The pattern is a literal from the rule table, so a compile failure is a
                // programming error; it is compiled on first use and reused afterwards.
                static RE: OnceLock<Regex> = OnceLock::new();
                let lexeme = RE.get_or_init(|| Regex::new($re).unwrap())
                    .find(text)?.into();
                Some(Self { variant: $out, lexeme })
            })*
        }
        impl<$t> From<&$t str> for $type {
            /// Lexes the first token of `value`, trying each rule in table order.
            ///
            /// Panics (via `todo!`) if no rule matches at all.
            fn from (value: &$t str) -> Self {
                $(
                    if let Some(token) = Self::$func(value) {
                        token
                    } else
                )*
                {todo!("Unexpected input: {value:#?} (Tokenization failure)")}
            }
        }
    };
}
/// A [Token] is a [semantically tagged](Type) sequence of characters
///
/// A token borrows its text from the buffer it was lexed from (lifetime `'text`),
/// which is what makes it [Copy].
///
/// The derived comparisons look at `variant` first and `lexeme` second, following
/// field declaration order. The derived [Default] is a [Type::EndOfFile] token with
/// an empty lexeme.
#[derive(Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Token<'text> {
/// The type of this token
variant: Type,
/// The sub[str]ing corresponding to this token
lexeme: &'text str,
}
impl<'text> Token<'text> {
    /// The semantic [Type] tag attached to this [Token]
    pub fn variant(&self) -> Type {
        let Self { variant, .. } = *self;
        variant
    }

    /// The slice of the source text this [Token] was lexed from
    pub fn lexeme(&self) -> &'text str {
        let Self { lexeme, .. } = *self;
        lexeme
    }

    /// Converts the lexeme into an `F` through `F`'s [FromStr](std::str::FromStr)
    /// implementation, returning that implementation's error on failure
    pub fn parse<F>(&self) -> Result<F, <F as std::str::FromStr>::Err>
    where
        F: std::str::FromStr,
    {
        <F as std::str::FromStr>::from_str(self.lexeme)
    }

    /// Returns `true` when this [Token] is tagged with the `expected` [Type]
    pub fn is_variant(&self, expected: Type) -> bool {
        expected == self.variant
    }

    /// Length of [Self::lexeme], counted in bytes (not characters)
    pub fn len(&self) -> usize {
        self.lexeme().len()
    }

    /// Returns `true` when [Self::lexeme] is zero bytes long
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }
}
impl<'text> Debug for Token<'text> {
    /// Renders the token as a two-element debug list: `[variant, lexeme]`
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut list = f.debug_list();
        list.entry(&self.variant);
        list.entry(&self.lexeme);
        list.finish()
    }
}
impl<'text> Display for Token<'text> {
    /// Writes the token for human readers.
    ///
    /// Newline and end-of-file tokens print only their [Type]; every other token
    /// prints its [Type] followed by the lexeme in double quotes.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let v = self.variant;
        if matches!(v, Type::Endl | Type::EndOfFile) {
            // Delegating directly hands the caller's formatter (and its flags) to `Type`.
            return Display::fmt(&v, f);
        }
        write!(f, "{v} \"{}\"", self.lexeme)
    }
}
/// A [token Type](Type) is a semantic tag for a sequence of characters
///
/// The derived ordering follows the declaration order of the variants below. That
/// order is independent of lexing precedence, which is decided by the order of the
/// rules passed to `regex_impl!`.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Type {
/// Contiguous whitespace, excluding newline
Space,
/// A newline, together with any non-newline whitespace that follows it
Endl,
/// A comment running to the end of the line
Comment,
/// Label definition marker (`:`)
Label,
/// Instruction mnemonic
Insn,
/// Operand width is byte (`.b`)
ByteWidth,
/// Operand width is word (`.w`)
WordWidth,
/// Register mnemonic (e.g. `pc`, `r14`)
Register,
/// Marker for base-10
RadixMarkerDec,
/// Marker for base-16
RadixMarkerHex,
/// Marker for base-8
RadixMarkerOct,
/// Marker for base-2
RadixMarkerBin,
/// A run of hexadecimal digits.
///
/// NOTE(review): this was documented as "1-4 hexadigit numbers only", but the
/// lexer pattern in this file accepts one or more hex digits; if a 4-digit limit
/// exists it is enforced elsewhere - confirm.
Number,
/// Negative number marker
Minus,
/// Post-increment mode marker
Plus,
/// Open-Indexed-Mode marker
LParen,
/// Close-Indexed-Mode marker
RParen,
/// Indirect mode marker
Indirect,
/// Absolute address marker
Absolute,
/// Immediate value marker
Immediate,
/// Valid identifier. Identifiers must start with a Latin alphabetic character or underscore
Identifier,
/// Assembler directive
Directive,
/// Separator (comma)
Separator,
/// End of File marker; this is the [Default] variant
#[default]
EndOfFile,
/// Invalid token: text that matched no other rule
Invalid,
}
// Rule table for the lexer. `regex_impl!` turns every entry into an associated
// function on `Token` and chains them, in this order, inside `From<&str>`.
// The order of the entries is therefore the lexer's precedence: do not reorder.
// `(?-u:\b)` is an ASCII-only word boundary; `[\s--\n]` is whitespace minus newline.
regex_impl! {<'text> Token<'text> {
/// Matches one or more whitespace characters other than newline.
pub fn expect_space(text: &str) -> Option<Self> {
regex!(Type::Space = r"^[\s--\n]+")
}
/// Matches a newline and any non-newline whitespace following it.
pub fn expect_endl(text: &str) -> Option<Self> {
regex!(Type::Endl = r"^\n[\s--\n]*")
}
/// Matches text opening with `;`, `//`, `<...>` or `{...}`, up to the end of the line.
pub fn expect_comment(text: &str) -> Option<Self> {
regex!(Type::Comment = r"^(;|//|<.*>|\{.*\}).*")
}
/// Matches a single `:`.
pub fn expect_label(text: &str) -> Option<Self> {
regex!(Type::Label = r"^:")
}
/// Matches an instruction mnemonic, case-insensitively, ending at a word boundary.
pub fn expect_insn(text: &str) -> Option<Self> {
regex!(Type::Insn = r"(?i)^(adc|addc?|and|bi[cs]|bitb?|br|call|clr[cnz]?|cmp|dad[cd]|decd?|[de]int|incd?|inv|j([cz]|eq|ge|hs|lo?|mp|n[cez]?)|mov|[np]op|push|reti?|r[lr][ac]|sbc|set[cnz]|subc?|swpb|sxt|tst|xor)(?-u:\b)")
}
/// Matches the `.b` width suffix, case-insensitively.
// NOTE(review): there is no trailing word boundary and this rule precedes the
// directive rule, so `From<&str>` lexes the start of any directive beginning with
// `.b` as a byte-width token. Presumably directives are read via `expect` - confirm.
pub fn expect_byte_width(text: &str) -> Option<Self> {
regex!(Type::ByteWidth = r"(?i)^\.b")
}
/// Matches the `.w` width suffix, case-insensitively.
// NOTE(review): same precedence caveat as `.b` for directives beginning with `.w`.
pub fn expect_word_width(text: &str) -> Option<Self> {
regex!(Type::WordWidth = r"(?i)^\.w")
}
/// Matches `r` followed by decimal digits, or `pc`, `sp`, `sr`, `cg`, case-insensitively.
pub fn expect_register(text: &str) -> Option<Self> {
// The previous pattern only accepted r0-r15 and had no trailing word boundary:
//   r"(?i)^(r(1[0-5]|[0-9])|pc|s[pr]|cg)"
// The current one accepts any digit run after `r`; presumably the register
// number is range-checked later - TODO confirm.
regex!(Type::Register = r"(?i)^(r\d+|pc|s[pr]|cg)(?-u:\b)")
}
/// Matches the decimal radix marker `0d`, case-insensitively.
pub fn expect_radix_marker_dec(text: &str) -> Option<Self> {
regex!(Type::RadixMarkerDec = r"(?i)^0d")
}
/// Matches the hexadecimal radix marker: `0x` (case-insensitive) or a dollar sign.
pub fn expect_radix_marker_hex(text: &str) -> Option<Self> {
regex!(Type::RadixMarkerHex = r"(?i)^(0x|\$)")
}
/// Matches the octal radix marker `0o`, case-insensitively.
pub fn expect_radix_marker_oct(text: &str) -> Option<Self> {
regex!(Type::RadixMarkerOct = r"(?i)^0o")
}
/// Matches the binary radix marker `0b`, case-insensitively.
pub fn expect_radix_marker_bin(text: &str) -> Option<Self> {
regex!(Type::RadixMarkerBin = r"(?i)^0b")
}
/// Matches one or more hexadecimal digits ending at a word boundary.
// NOTE(review): the `+?` after `^` is an unescaped (lazy) repetition of the start
// anchor, not an optional plus sign. If a leading `+` was meant to be accepted it
// would need to be written `\+?` - confirm the intent before touching the pattern.
// NOTE(review): this rule precedes the identifier rule, so words made only of hex
// digits (e.g. `cafe`) lex as numbers through `From<&str>`.
pub fn expect_number(text: &str) -> Option<Self> {
regex!(Type::Number = r"^+?[[:xdigit:]]+(?-u:\b)")
}
/// Matches a single `-`.
pub fn expect_minus(text: &str) -> Option<Self> {
regex!(Type::Minus = r"^-")
}
/// Matches a single `+`.
pub fn expect_plus(text: &str) -> Option<Self> {
regex!(Type::Plus = r"^\+")
}
/// Matches a single `(`.
pub fn expect_l_paren(text: &str) -> Option<Self> {
regex!(Type::LParen = r"^\(")
}
/// Matches a single `)`.
pub fn expect_r_paren(text: &str) -> Option<Self> {
regex!(Type::RParen = r"^\)")
}
/// Matches a single `@`.
// NOTE(review): the function name is misspelt (`indrect`). It is part of the public
// API, so it is left unchanged here.
pub fn expect_indrect(text: &str) -> Option<Self> {
regex!(Type::Indirect = r"^@")
}
/// Matches a single `&`.
pub fn expect_absolute(text: &str) -> Option<Self> {
regex!(Type::Absolute = r"^&")
}
/// Matches a single `#`.
pub fn expect_immediate(text: &str) -> Option<Self> {
regex!(Type::Immediate = r"^#")
}
/// Matches a `.` followed by one or more non-whitespace characters.
pub fn expect_directive(text: &str) -> Option<Self> {
regex!(Type::Directive = r"^\.\S+")
}
/// Matches an ASCII letter or underscore followed by any number of word characters.
pub fn expect_identifier(text: &str) -> Option<Self> {
regex!(Type::Identifier = r"^[A-Za-z_]\w*")
}
/// Matches a single `,`.
pub fn expect_separator(text: &str) -> Option<Self> {
regex!(Type::Separator = r"^,")
}
/// Matches only the empty string.
pub fn expect_end_of_file(text: &str) -> Option<Self> {
regex!(Type::EndOfFile = r"^$")
}
/// Catch-all: matches everything up to the end of the current line.
/// As the last rule, it tags whatever no earlier rule accepted as invalid.
pub fn expect_anything(text: &str) -> Option<Self> {
regex!(Type::Invalid = r"^.*")
}
}}
impl Display for Type {
    /// Writes the human-readable name of this token type.
    ///
    /// The name is written through the `str` formatter, so width, fill and
    /// alignment flags supplied by the caller are honoured.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name: &str = match self {
            Self::Space => "space",
            Self::Endl => "newline",
            Self::Comment => "comment",
            Self::Label => "label definition",
            Self::Insn => "opcode",
            Self::ByteWidth => "byte-width",
            Self::WordWidth => "word-width",
            Self::Register => "register",
            Self::RadixMarkerDec => "decimal marker",
            Self::RadixMarkerHex => "hexadecimal marker",
            Self::RadixMarkerOct => "octal marker",
            Self::RadixMarkerBin => "binary marker",
            Self::Number => "number",
            Self::Minus => "minus sign",
            Self::Plus => "plus sign",
            Self::LParen => "left parenthesis",
            Self::RParen => "right parenthesis",
            Self::Indirect => "indirect",
            Self::Absolute => "absolute",
            Self::Immediate => "immediate",
            Self::Identifier => "identifier",
            Self::Directive => "directive",
            Self::Separator => "comma",
            Self::EndOfFile => "EOF",
            Self::Invalid => "invalid token",
        };
        Display::fmt(name, f)
    }
}
/// A [Token] which can outlive its parent buffer
///
/// It holds the same two pieces of information as a [Token], but stores the lexeme
/// in its own [String] instead of borrowing it. This file provides conversions in
/// both directions: `From<Token>` copies the text, `From<&OwnedToken>` borrows it back.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct OwnedToken {
/// The type of this token
variant: Type,
/// The sub[String] corresponding to this token
lexeme: String,
}
impl Display for OwnedToken {
    /// Displays this token exactly as the borrowed [Token] view of it would be displayed.
    ///
    /// The view is written through a fresh `{}` placeholder, so formatter flags given
    /// by the caller are not forwarded.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let borrowed = Token::from(self);
        write!(f, "{}", borrowed)
    }
}
impl<'t> From<&'t OwnedToken> for Token<'t> {
fn from(value: &'t OwnedToken) -> Self { Token { variant: value.variant, lexeme: &value.lexeme } }
}
impl From<Token<'_>> for OwnedToken {
fn from(value: Token<'_>) -> Self {
let Token { variant, lexeme } = value;
OwnedToken { variant, lexeme: lexeme.to_owned() }
}
}
/// [Types] are an owned list of [types](Type), with a custom [Display] implementation
///
/// The [Display] implementation in this file writes the list as an English
/// enumeration, separating items with `", "` and the final pair with `" or "`.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Types(Vec<Type>);
impl<T: AsRef<[Type]>> From<T> for Types {
    // TODO: Possibly bad. Check out in rust playground.
    // NOTE(review): the concern is presumably overlap with the standard reflexive
    // `From<T> for T`; that overlap would only arise if `Types` itself implemented
    // `AsRef<[Type]>`, which it does not in this file - confirm before adding one.
    /// Builds a [Types] list by copying the elements of anything viewable as a
    /// slice of [Type] (arrays, slices, vectors, ...).
    fn from(value: T) -> Self {
        let kinds: &[Type] = value.as_ref();
        Self(kinds.to_vec())
    }
}
impl Display for Types {
    /// Writes the list as an English enumeration: `a`, `a or b`, `a, b or c`, ...
    ///
    /// An empty list writes nothing. Every piece is written through [Display::fmt]
    /// with the caller's formatter, so formatter flags apply to each piece separately.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let count = self.0.len();
        for (idx, t) in self.0.iter().enumerate() {
            Display::fmt(t, f)?;
            // Number of items still to come. `idx < count` always holds inside the
            // loop, so this cannot underflow, unlike comparing against `len() - 2`,
            // which overflowed for a single-element list.
            match count - idx - 1 {
                0 => (),
                1 => Display::fmt(" or ", f)?,
                _ => Display::fmt(", ", f)?,
            }
        }
        Ok(())
    }
}