// © 2023 John Breaux
//! A [Token] is a [semantically tagged](Type) sequence of characters
|
|
|
|
use crate::Error;
|
|
use regex::Regex;
|
|
use std::{
|
|
fmt::{Debug, Display},
|
|
sync::OnceLock,
|
|
};
|
|
|
|
/// Implements regex matching functions on [`Token`] for each [`Type`],
/// and implements [`From<&str>`] for [`Token`]
///
/// The macro takes a lifetime, a type, and a list of rules of the form
/// `pub fn name(text: &str) -> Option<Self> { regex!(Variant = r"pattern") }`.
/// For every rule it generates:
/// - an associated function `name`, which compiles `pattern` once (cached in a
///   function-local [`OnceLock`]) and returns the first match in `text`, tagged with `Variant`;
/// - one arm of `expect`, mapping `Variant` to `name`;
/// - one link in the `From<&str>` chain, which tries the rules in declaration order.
///
/// # Panics
/// A generated function panics on first use if its pattern is not a valid regex.
/// `From<&str>` panics if no rule matches the input.
macro_rules! regex_impl {
    (<$t:lifetime> $type:ty {$(
        $(#[$meta:meta])*
        pub fn $func:ident (text: &str) -> Option<Self> {
            regex!($out:path = $re:literal)
        }
    )*}) => {
        impl<$t> $type {
            /// Lexes a token only for the expected `variant`
            ///
            /// Warning: This bypasses precedence rules. Only use for specific patterns.
            ///
            /// # Errors
            /// Returns [`Error::UnexpectedToken`] when the rule for `expected` does not match
            /// `text`. The error carries `expected` and the token that ordinary lexing
            /// (`From<&str>`) finds at the same position.
            pub fn expect(text: &$t str, expected: Type) -> Result<Self, Error> {
                match expected {$(
                    $out => Self::$func(text),
                )*}
                // Building the error re-lexes `text` through every rule in order, so it is
                // deferred until the expected rule has actually failed.
                .ok_or_else(|| Error::UnexpectedToken {
                    expected,
                    got: Self::from(text).into(),
                })
            }
            $(
                $(#[$meta])*
                /// Tries to read [`
                #[doc = stringify!($out)]
                /// `] from `text`
                pub fn $func(text: &$t str) -> Option<Self> {
                    // Compiled on first call, then shared by every later call.
                    static RE: OnceLock<Regex> = OnceLock::new();
                    let lexeme = RE.get_or_init(|| Regex::new($re).unwrap())
                        .find(text)?.into();
                    Some(Self { variant: $out, lexeme })
                })*
        }
        impl<$t> From<&$t str> for $type {
            /// Lexes the first token of `value`, trying each rule in declaration order
            fn from (value: &$t str) -> Self {
                $(
                    if let Some(token) = Self::$func(value) {
                        token
                    } else
                )*
                {todo!("Unexpected input: {value:#?} (Tokenization failure)")}
            }
        }
    };
}
|
|
|
|
/// A [Token] is a [semantically tagged](Type) sequence of characters
|
|
#[derive(Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
|
pub struct Token<'text> {
|
|
/// The type of this token
|
|
variant: Type,
|
|
/// The sub[str]ing corresponding to this token
|
|
lexeme: &'text str,
|
|
}
|
|
|
|
impl<'text> Token<'text> {
|
|
/// Returns the [Type] of this [Token]
|
|
pub fn variant(&self) -> Type { self.variant }
|
|
|
|
/// Returns the lexeme (originating string slice) of this token
|
|
pub fn lexeme(&self) -> &'text str { self.lexeme }
|
|
|
|
/// Parses this [Token] into another type
|
|
pub fn parse<F>(&self) -> Result<F, <F as std::str::FromStr>::Err>
|
|
where F: std::str::FromStr {
|
|
self.lexeme.parse()
|
|
}
|
|
/// Returns whether the Lexeme is the expected [Type]
|
|
pub fn is_variant(&self, expected: Type) -> bool { self.variant == expected }
|
|
|
|
/// Returns the length of [Self::lexeme] in bytes.
|
|
pub fn len(&self) -> usize { self.lexeme.len() }
|
|
|
|
/// Returns `true` if [Self::lexeme] has a length of zero bytes.
|
|
pub fn is_empty(&self) -> bool { self.lexeme.is_empty() }
|
|
}
|
|
|
|
impl<'text> Debug for Token<'text> {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
f.debug_list().entry(&self.variant).entry(&self.lexeme).finish()
|
|
}
|
|
}
|
|
|
|
impl<'text> Display for Token<'text> {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
match self.variant {
|
|
Type::Endl | Type::EndOfFile => Display::fmt(&self.variant, f),
|
|
v => write!(f, "{v} \"{}\"", self.lexeme),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// A [token Type](Type) is the semantic tag a lexer attaches to a sequence of characters
///
/// Variant order is significant: the derived ordering follows declaration order.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Type {
    /// A run of whitespace that contains no newline
    Space,
    /// A newline, plus any whitespace that follows it
    Endl,
    /// A comment running to the end of the line
    Comment,
    /// Marks the *definition* of a jump label
    Label,
    /// An instruction mnemonic
    Insn,
    /// Byte operand-width suffix
    ByteWidth,
    /// Word operand-width suffix
    WordWidth,
    /// A register mnemonic (e.g. `pc`, `r14`)
    Register,
    /// Prefix marking a base-10 number
    RadixMarkerDec,
    /// Prefix marking a base-16 number
    RadixMarkerHex,
    /// Prefix marking a base-8 number
    RadixMarkerOct,
    /// Prefix marking a base-2 number
    RadixMarkerBin,
    /// A number: one or more hexadecimal digits
    Number,
    /// Marks a negative number
    Minus,
    /// Marks post-increment mode
    Plus,
    /// Opens indexed mode
    LParen,
    /// Closes indexed mode
    RParen,
    /// Marks indirect mode
    Indirect,
    /// Marks an absolute address
    Absolute,
    /// Marks an immediate value
    Immediate,
    /// An identifier, beginning with a Latin letter or an underscore
    Identifier,
    /// An assembler directive
    Directive,
    /// A separator (comma)
    Separator,
    /// Marks the end of the file; this is the default [Type]
    #[default]
    EndOfFile,
    /// A token that is not valid
    Invalid,
}
|
|
|
|
// Lexer rules for `Token`.
//
// Each rule becomes an associated function of the same name. The generated `From<&str>`
// tries the rules from top to bottom and returns the first match, so the order below is
// the lexer's precedence order. Every pattern is anchored with `^`, so a rule only ever
// matches at the very start of the text it is given.
regex_impl! {<'text> Token<'text> {
    // One or more whitespace characters, none of them `\n`.
    pub fn expect_space(text: &str) -> Option<Self> {
        regex!(Type::Space = r"^[\s--\n]+")
    }
    // A `\n`, then any amount of non-newline whitespace.
    pub fn expect_endl(text: &str) -> Option<Self> {
        regex!(Type::Endl = r"^\n[\s--\n]*")
    }
    // Opens with `;`, `//`, `<...>` or `{...}` and runs through the rest of the line.
    pub fn expect_comment(text: &str) -> Option<Self> {
        regex!(Type::Comment = r"^(;|//|<.*>|\{.*\}).*")
    }
    // A single `:`.
    pub fn expect_label(text: &str) -> Option<Self> {
        regex!(Type::Label = r"^:")
    }
    // Case-insensitive instruction mnemonic, ending on a word boundary.
    pub fn expect_insn(text: &str) -> Option<Self> {
        regex!(Type::Insn = r"(?i)^(adc|addc?|and|bi[cs]|bitb?|br|call|clr[cnz]?|cmp|dad[cd]|decd?|[de]int|incd?|inv|j([cz]|eq|ge|hs|lo?|mp|n[cez]?)|mov|[np]op|push|reti?|r[lr][ac]|sbc|set[cnz]|subc?|swpb|sxt|tst|xor)(?-u:\b)")
    }
    // Case-insensitive `.b`.
    // NOTE(review): there is no trailing word boundary, and this rule is tried before
    // `expect_directive`, so text such as `.byte` lexes as `.b` here — confirm intended.
    pub fn expect_byte_width(text: &str) -> Option<Self> {
        regex!(Type::ByteWidth = r"(?i)^\.b")
    }
    // Case-insensitive `.w`.
    // NOTE(review): same precedence caveat as `.b` (e.g. `.word`) — confirm intended.
    pub fn expect_word_width(text: &str) -> Option<Self> {
        regex!(Type::WordWidth = r"(?i)^\.w")
    }
    // Case-insensitive `r<digits>`, `pc`, `sp`, `sr` or `cg`, ending on a word boundary.
    // The register number is not range-checked by this pattern.
    pub fn expect_register(text: &str) -> Option<Self> {
        // An earlier pattern limited the number to 0-15:
        // r"(?i)^(r(1[0-5]|[0-9])|pc|s[pr]|cg)"
        regex!(Type::Register = r"(?i)^(r\d+|pc|s[pr]|cg)(?-u:\b)")
    }
    // Case-insensitive `0d`.
    pub fn expect_radix_marker_dec(text: &str) -> Option<Self> {
        regex!(Type::RadixMarkerDec = r"(?i)^0d")
    }
    // Case-insensitive `0x`, or `$`.
    pub fn expect_radix_marker_hex(text: &str) -> Option<Self> {
        regex!(Type::RadixMarkerHex = r"(?i)^(0x|\$)")
    }
    // Case-insensitive `0o`.
    pub fn expect_radix_marker_oct(text: &str) -> Option<Self> {
        regex!(Type::RadixMarkerOct = r"(?i)^0o")
    }
    // Case-insensitive `0b`.
    pub fn expect_radix_marker_bin(text: &str) -> Option<Self> {
        regex!(Type::RadixMarkerBin = r"(?i)^0b")
    }
    // One or more hexadecimal digits, ending on a word boundary.
    // NOTE(review): in `^+?` the `+?` is a lazy repetition applied to the `^` anchor; it
    // does not match a literal `+` sign — confirm whether `^\+?` or plain `^` was meant.
    pub fn expect_number(text: &str) -> Option<Self> {
        regex!(Type::Number = r"^+?[[:xdigit:]]+(?-u:\b)")
    }
    // A single `-`.
    pub fn expect_minus(text: &str) -> Option<Self> {
        regex!(Type::Minus = r"^-")
    }
    // A single `+`.
    pub fn expect_plus(text: &str) -> Option<Self> {
        regex!(Type::Plus = r"^\+")
    }
    // A single `(`.
    pub fn expect_l_paren(text: &str) -> Option<Self> {
        regex!(Type::LParen = r"^\(")
    }
    // A single `)`.
    pub fn expect_r_paren(text: &str) -> Option<Self> {
        regex!(Type::RParen = r"^\)")
    }
    // A single `@`.
    // The misspelling ("indrect") is part of the public function name, so it stays.
    pub fn expect_indrect(text: &str) -> Option<Self> {
        regex!(Type::Indirect = r"^@")
    }
    // A single `&`.
    pub fn expect_absolute(text: &str) -> Option<Self> {
        regex!(Type::Absolute = r"^&")
    }
    // A single `#`.
    pub fn expect_immediate(text: &str) -> Option<Self> {
        regex!(Type::Immediate = r"^#")
    }
    // A `.` followed by one or more non-whitespace characters.
    pub fn expect_directive(text: &str) -> Option<Self> {
        regex!(Type::Directive = r"^\.\S+")
    }
    // An ASCII letter or `_`, then any number of word characters.
    pub fn expect_identifier(text: &str) -> Option<Self> {
        regex!(Type::Identifier = r"^[A-Za-z_]\w*")
    }
    // A single `,`.
    pub fn expect_separator(text: &str) -> Option<Self> {
        regex!(Type::Separator = r"^,")
    }
    // Matches only when `text` is empty.
    pub fn expect_end_of_file(text: &str) -> Option<Self> {
        regex!(Type::EndOfFile = r"^$")
    }
    // Catch-all, tried last: everything up to the next newline.
    pub fn expect_anything(text: &str) -> Option<Self> {
        regex!(Type::Invalid = r"^.*")
    }
}}
|
|
|
|
impl Display for Type {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
match self {
|
|
Self::Space => Display::fmt("space", f),
|
|
Self::Endl => Display::fmt("newline", f),
|
|
Self::Comment => Display::fmt("comment", f),
|
|
Self::Label => Display::fmt("label definition", f),
|
|
Self::Insn => Display::fmt("opcode", f),
|
|
Self::ByteWidth => Display::fmt("byte-width", f),
|
|
Self::WordWidth => Display::fmt("word-width", f),
|
|
Self::Register => Display::fmt("register", f),
|
|
Self::RadixMarkerDec => Display::fmt("decimal marker", f),
|
|
Self::RadixMarkerHex => Display::fmt("hexadecimal marker", f),
|
|
Self::RadixMarkerOct => Display::fmt("octal marker", f),
|
|
Self::RadixMarkerBin => Display::fmt("binary marker", f),
|
|
Self::Number => Display::fmt("number", f),
|
|
Self::Minus => Display::fmt("minus sign", f),
|
|
Self::Plus => Display::fmt("plus sign", f),
|
|
Self::LParen => Display::fmt("left parenthesis", f),
|
|
Self::RParen => Display::fmt("right parenthesis", f),
|
|
Self::Indirect => Display::fmt("indirect", f),
|
|
Self::Absolute => Display::fmt("absolute", f),
|
|
Self::Immediate => Display::fmt("immediate", f),
|
|
Self::Identifier => Display::fmt("identifier", f),
|
|
Self::Directive => Display::fmt("directive", f),
|
|
Self::Separator => Display::fmt("comma", f),
|
|
Self::EndOfFile => Display::fmt("EOF", f),
|
|
Self::Invalid => Display::fmt("invalid token", f),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// A [Token] which can outlive its parent buffer
|
|
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
|
pub struct OwnedToken {
|
|
/// The type of this token
|
|
variant: Type,
|
|
/// The sub[String] corresponding to this token
|
|
lexeme: String,
|
|
}
|
|
|
|
impl Display for OwnedToken {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", Token::from(self)) }
|
|
}
|
|
|
|
impl<'t> From<&'t OwnedToken> for Token<'t> {
|
|
fn from(value: &'t OwnedToken) -> Self { Token { variant: value.variant, lexeme: &value.lexeme } }
|
|
}
|
|
|
|
impl From<Token<'_>> for OwnedToken {
|
|
fn from(value: Token<'_>) -> Self {
|
|
let Token { variant, lexeme } = value;
|
|
OwnedToken { variant, lexeme: lexeme.to_owned() }
|
|
}
|
|
}
|
|
|
|
/// [Types] are an owned array of [types](Type), with a custom [Display] implementation
|
|
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
|
pub struct Types(Vec<Type>);
|
|
|
|
impl<T: AsRef<[Type]>> From<T> for Types {
|
|
// TODO: Possibly bad. Check out in rust playground.
|
|
fn from(value: T) -> Self { Self(value.as_ref().to_owned()) }
|
|
}
|
|
|
|
impl Display for Types {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
for (idx, t) in self.0.iter().enumerate() {
|
|
Display::fmt(t, f)?;
|
|
match idx {
|
|
i if i < self.0.len() - 2 => Display::fmt(", ", f)?,
|
|
i if i < self.0.len() - 1 => Display::fmt(" or ", f)?,
|
|
_ => (),
|
|
}
|
|
}
|
|
Ok(())
|
|
}
|
|
}
|