// © 2023 John Breaux //! A [Token] is a [semantically tagged](Type) sequence of characters use crate::Error; use regex::Regex; use std::{ fmt::{Debug, Display}, sync::OnceLock, }; /// Implements regex matching functions on [`Token`] for each [`Type`], /// and implements [`From<&str>`] for [`Token`] macro_rules! regex_impl { (<$t:lifetime> $type:ty {$( $(#[$meta:meta])* pub fn $func:ident (text: &str) -> Option { regex!($out:path = $re:literal) } )*}) => { impl<$t> $type { /// Lexes a token only for the expected `variant` /// /// Warning: This bypasses precedence rules. Only use for specific patterns. pub fn expect(text: &$t str, expected: Type) -> Result { match expected {$( $out => Self::$func(text), )*}.ok_or(Error::UnexpectedToken { expected, got: Self::from(text).into(), }) } $( $(#[$meta])* /// Tries to read [` #[doc = stringify!($out)] /// `] from `text` pub fn $func(text: &$t str) -> Option { static RE: OnceLock = OnceLock::new(); let lexeme = RE.get_or_init(|| Regex::new($re).unwrap()) .find(text)?.into(); Some(Self { variant: $out, lexeme }) })* } impl<$t> From<&$t str> for $type { fn from (value: &$t str) -> Self { $( if let Some(token) = Self::$func(value) { token } else )* {todo!("Unexpected input: {value:#?} (Tokenization failure)")} } } }; } /// A [Token] is a [semantically tagged](Type) sequence of characters #[derive(Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct Token<'text> { /// The type of this token variant: Type, /// The sub[str]ing corresponding to this token lexeme: &'text str, } impl<'text> Token<'text> { /// Returns the [Type] of this [Token] pub fn variant(&self) -> Type { self.variant } /// Returns the lexeme (originating string slice) of this token pub fn lexeme(&self) -> &'text str { self.lexeme } /// Parses this [Token] into another type pub fn parse(&self) -> Result::Err> where F: std::str::FromStr { self.lexeme.parse() } /// Returns whether the Lexeme is the expected [Type] pub fn is_variant(&self, expected: Type) -> bool { self.variant == expected } /// Returns the length of [Self::lexeme] in bytes. pub fn len(&self) -> usize { self.lexeme.len() } /// Returns `true` if [Self::lexeme] has a length of zero bytes. pub fn is_empty(&self) -> bool { self.lexeme.is_empty() } } impl<'text> Debug for Token<'text> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_list().entry(&self.variant).entry(&self.lexeme).finish() } } impl<'text> Display for Token<'text> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self.variant { Type::Endl | Type::EndOfFile => Display::fmt(&self.variant, f), v => write!(f, "{v} \"{}\"", self.lexeme), } } } /// A [token Type](Type) is a semantic tag for a sequence of characters #[derive(Clone, Copy, Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum Type { /// contiguous whitespace, excluding newline Space, /// newline and contiguous whitespace Endl, /// A line-comment Comment, /// Jump label *definition* Label, /// Instructions Insn, /// Operand width is byte ByteWidth, /// Operand width is word WordWidth, /// Register mnemonic (i.e. `pc`, `r14`) Register, /// Marker for base-10 RadixMarkerDec, /// Marker for base-16 RadixMarkerHex, /// Marker for base-8 RadixMarkerOct, /// Marker for base-2 RadixMarkerBin, /// 1-4 hexadigit numbers only Number, /// Negative number marker Minus, /// post-increment mode marker Plus, /// Open-Indexed-Mode marker LParen, /// Close-Indexed-Mode marker RParen, /// Indirect mode marker Indirect, /// absolute address marker Absolute, /// immediate value marker Immediate, /// Valid identifier. Identifiers must start with a Latin alphabetic character or underline Identifier, /// Assembler directive Directive, /// Separator (comma) Separator, /// End of File marker #[default] EndOfFile, /// Invalid token Invalid, } regex_impl! {<'text> Token<'text> { pub fn expect_space(text: &str) -> Option { regex!(Type::Space = r"^[\s--\n]+") } pub fn expect_endl(text: &str) -> Option { regex!(Type::Endl = r"^\n[\s--\n]*") } pub fn expect_comment(text: &str) -> Option { regex!(Type::Comment = r"^(;|//|<.*>|\{.*\}).*") } pub fn expect_label(text: &str) -> Option { regex!(Type::Label = r"^:") } pub fn expect_insn(text: &str) -> Option { regex!(Type::Insn = r"(?i)^(adc|addc?|and|bi[cs]|bitb?|br|call|clr[cnz]?|cmp|dad[cd]|decd?|[de]int|incd?|inv|j([cz]|eq|ge|hs|lo?|mp|n[cez]?)|mov|[np]op|push|reti?|r[lr][ac]|sbc|set[cnz]|subc?|swpb|sxt|tst|xor)(?-u:\b)") } pub fn expect_byte_width(text: &str) -> Option { regex!(Type::ByteWidth = r"(?i)^\.b") } pub fn expect_word_width(text: &str) -> Option { regex!(Type::WordWidth = r"(?i)^\.w") } pub fn expect_register(text: &str) -> Option { // old regex regex!(Type::Register = r"(?i)^(r(1[0-5]|[0-9])|pc|s[pr]|cg)") regex!(Type::Register = r"(?i)^(r\d+|pc|s[pr]|cg)(?-u:\b)") } pub fn expect_radix_marker_dec(text: &str) -> Option { regex!(Type::RadixMarkerDec = r"(?i)^0d") } pub fn expect_radix_marker_hex(text: &str) -> Option { regex!(Type::RadixMarkerHex = r"(?i)^(0x|\$)") } pub fn expect_radix_marker_oct(text: &str) -> Option { regex!(Type::RadixMarkerOct = r"(?i)^0o") } pub fn expect_radix_marker_bin(text: &str) -> Option { regex!(Type::RadixMarkerBin = r"(?i)^0b") } pub fn expect_number(text: &str) -> Option { regex!(Type::Number = r"^+?[[:xdigit:]]+(?-u:\b)") } pub fn expect_minus(text: &str) -> Option { regex!(Type::Minus = r"^-") } pub fn expect_plus(text: &str) -> Option { regex!(Type::Plus = r"^\+") } pub fn expect_l_paren(text: &str) -> Option { regex!(Type::LParen = r"^\(") } pub fn expect_r_paren(text: &str) -> Option { regex!(Type::RParen = r"^\)") } pub fn expect_indrect(text: &str) -> Option { regex!(Type::Indirect = r"^@") } pub fn expect_absolute(text: &str) -> Option { regex!(Type::Absolute = r"^&") } pub fn expect_immediate(text: &str) -> Option { regex!(Type::Immediate = r"^#") } pub fn expect_directive(text: &str) -> Option { regex!(Type::Directive = r"^\.\S+") } pub fn expect_identifier(text: &str) -> Option { regex!(Type::Identifier = r"^[A-Za-z_]\w*") } pub fn expect_separator(text: &str) -> Option { regex!(Type::Separator = r"^,") } pub fn expect_end_of_file(text: &str) -> Option { regex!(Type::EndOfFile = r"^$") } pub fn expect_anything(text: &str) -> Option { regex!(Type::Invalid = r"^.*") } }} impl Display for Type { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Space => Display::fmt("space", f), Self::Endl => Display::fmt("newline", f), Self::Comment => Display::fmt("comment", f), Self::Label => Display::fmt("label definition", f), Self::Insn => Display::fmt("opcode", f), Self::ByteWidth => Display::fmt("byte-width", f), Self::WordWidth => Display::fmt("word-width", f), Self::Register => Display::fmt("register", f), Self::RadixMarkerDec => Display::fmt("decimal marker", f), Self::RadixMarkerHex => Display::fmt("hexadecimal marker", f), Self::RadixMarkerOct => Display::fmt("octal marker", f), Self::RadixMarkerBin => Display::fmt("binary marker", f), Self::Number => Display::fmt("number", f), Self::Minus => Display::fmt("minus sign", f), Self::Plus => Display::fmt("plus sign", f), Self::LParen => Display::fmt("left parenthesis", f), Self::RParen => Display::fmt("right parenthesis", f), Self::Indirect => Display::fmt("indirect", f), Self::Absolute => Display::fmt("absolute", f), Self::Immediate => Display::fmt("immediate", f), Self::Identifier => Display::fmt("identifier", f), Self::Directive => Display::fmt("directive", f), Self::Separator => Display::fmt("comma", f), Self::EndOfFile => Display::fmt("EOF", f), Self::Invalid => Display::fmt("invalid token", f), } } } /// A [Token] which can outlive its parent buffer #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct OwnedToken { /// The type of this token variant: Type, /// The sub[String] corresponding to this token lexeme: String, } impl Display for OwnedToken { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", Token::from(self)) } } impl<'t> From<&'t OwnedToken> for Token<'t> { fn from(value: &'t OwnedToken) -> Self { Token { variant: value.variant, lexeme: &value.lexeme } } } impl From> for OwnedToken { fn from(value: Token<'_>) -> Self { let Token { variant, lexeme } = value; OwnedToken { variant, lexeme: lexeme.to_owned() } } } /// [Types] are an owned array of [types](Type), with a custom [Display] implementation #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct Types(Vec); impl> From for Types { // TODO: Possibly bad. Check out in rust playground. fn from(value: T) -> Self { Self(value.as_ref().to_owned()) } } impl Display for Types { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { for (idx, t) in self.0.iter().enumerate() { Display::fmt(t, f)?; match idx { i if i < self.0.len() - 2 => Display::fmt(", ", f)?, i if i < self.0.len() - 1 => Display::fmt(" or ", f)?, _ => (), } } Ok(()) } }