conlang: Tokenize punctuation (resolves #9)

Things left to consider:
- The token::Type enum is getting fairly large;
  breaking it up could incur substantial code bloat
- Compound operators might make more sense at the parser level
  - Compound-assign operators are ripe for syntactic desugaring,
    but there must be some reason it's done separately in other languages.
  - Operators like FatArrow may still make sense at the tokenizer level, regardless.
  - What is a lexer? A miserable pile of parsers!
- Operator overloading, or user-defined operators? Hmm...
This commit is contained in:
John 2023-09-28 00:11:31 -05:00
parent d3b1e65e35
commit f25685560a

View File

@ -29,6 +29,49 @@ pub mod token {
RBrack, RBrack,
LParen, LParen,
RParen, RParen,
// Compound punctuation
Lsh,
Rsh,
AndAnd,
OrOr,
NotNot,
CatEar,
EqEq,
NotEq,
StarEq,
DivEq,
AddEq,
SubEq,
AndEq,
OrEq,
XorEq,
LshEq,
RshEq,
Arrow,
FatArrow,
// Simple punctuation
Semi,
Dot,
Star,
Div,
Plus,
Minus,
Rem,
Bang,
Eq,
Lt,
Gt,
Amp,
Bar,
Xor,
Hash,
At,
Colon,
Backslash,
Question,
Comma,
Tilde,
Grave,
} }
#[derive(Clone, Copy, Debug, PartialEq, Eq)] #[derive(Clone, Copy, Debug, PartialEq, Eq)]
@ -131,6 +174,53 @@ pub mod lexer {
.or_else(|| self.l_paren()) .or_else(|| self.l_paren())
.or_else(|| self.r_paren()) .or_else(|| self.r_paren())
} }
/// Evaluates punctuation rules, returning the first one that yields a token.
///
/// Rules are tried in the order listed and the first success wins, so any
/// rule whose text is a prefix of another rule's text must be listed *after*
/// the longer one. That is why the three-character shift-assign operators
/// come before the two-character shifts, and every compound operator comes
/// before the single-character token it starts with. Listing `<<` ahead of
/// `<<=` would split `<<=` into `<<` followed by `=`.
///
/// Returns `None` when no punctuation rule produces a token.
pub fn punctuation(&mut self) -> Option<Token> {
    None
        // Three-character operators: must precede `<<` and `>>`.
        .or_else(|| self.lsh_eq())
        .or_else(|| self.rsh_eq())
        // Two-character operators: must precede their one-character prefixes.
        .or_else(|| self.lsh())
        .or_else(|| self.rsh())
        .or_else(|| self.and_and())
        .or_else(|| self.or_or())
        .or_else(|| self.not_not())
        .or_else(|| self.cat_ear())
        .or_else(|| self.eq_eq())
        .or_else(|| self.not_eq())
        .or_else(|| self.star_eq())
        .or_else(|| self.div_eq())
        .or_else(|| self.add_eq())
        .or_else(|| self.sub_eq())
        .or_else(|| self.and_eq())
        .or_else(|| self.or_eq())
        .or_else(|| self.xor_eq())
        .or_else(|| self.arrow())
        .or_else(|| self.fatarrow())
        // Single-character tokens.
        .or_else(|| self.semi())
        .or_else(|| self.dot())
        .or_else(|| self.star())
        .or_else(|| self.div())
        .or_else(|| self.plus())
        .or_else(|| self.sub())
        .or_else(|| self.rem())
        .or_else(|| self.bang())
        .or_else(|| self.eq())
        .or_else(|| self.lt())
        .or_else(|| self.gt())
        .or_else(|| self.amp())
        .or_else(|| self.bar())
        .or_else(|| self.xor())
        .or_else(|| self.hash())
        .or_else(|| self.at())
        .or_else(|| self.colon())
        .or_else(|| self.backslash())
        .or_else(|| self.question())
        .or_else(|| self.comma())
        .or_else(|| self.tilde())
        .or_else(|| self.grave())
}
/// Evaluates the unary-operator rules: `!` first, then `-`.
///
/// Returns the token from the first rule that succeeds, or `None` if
/// neither does.
pub fn unary_op(&mut self) -> Option<Token> {
    if let Some(token) = self.bang() {
        return Some(token);
    }
    self.sub()
}
// functions for lexing individual tokens // functions for lexing individual tokens
pub fn invalid(&mut self) -> Option<Token> { pub fn invalid(&mut self) -> Option<Token> {
self.map_rule(|r| r.invalid(), Type::Invalid) self.map_rule(|r| r.invalid(), Type::Invalid)
@ -194,6 +284,131 @@ pub mod lexer {
pub fn r_paren(&mut self) -> Option<Token> { pub fn r_paren(&mut self) -> Option<Token> {
self.map_rule(|r| r.char(')'), Type::RParen) self.map_rule(|r| r.char(')'), Type::RParen)
} }
// compound punctuation
/// Applies the `<<` rule, tagging a match as [`Type::Lsh`].
pub fn lsh(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.str("<<"), Type::Lsh)
}
/// Applies the `>>` rule, tagging a match as [`Type::Rsh`].
pub fn rsh(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.str(">>"), Type::Rsh)
}
/// Applies the `&&` rule, tagging a match as [`Type::AndAnd`].
pub fn and_and(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.str("&&"), Type::AndAnd)
}
/// Applies the `||` rule, tagging a match as [`Type::OrOr`].
pub fn or_or(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.str("||"), Type::OrOr)
}
/// Applies the `!!` rule, tagging a match as [`Type::NotNot`].
pub fn not_not(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.str("!!"), Type::NotNot)
}
/// Applies the `^^` rule, tagging a match as [`Type::CatEar`].
pub fn cat_ear(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.str("^^"), Type::CatEar)
}
/// Applies the `==` rule, tagging a match as [`Type::EqEq`].
pub fn eq_eq(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.str("=="), Type::EqEq)
}
/// Applies the `!=` rule, tagging a match as [`Type::NotEq`].
pub fn not_eq(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.str("!="), Type::NotEq)
}
/// Applies the `*=` rule, tagging a match as [`Type::StarEq`].
pub fn star_eq(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.str("*="), Type::StarEq)
}
/// Applies the `/=` rule, tagging a match as [`Type::DivEq`].
pub fn div_eq(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.str("/="), Type::DivEq)
}
/// Applies the `+=` rule, tagging a match as [`Type::AddEq`].
pub fn add_eq(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.str("+="), Type::AddEq)
}
/// Applies the `-=` rule, tagging a match as [`Type::SubEq`].
pub fn sub_eq(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.str("-="), Type::SubEq)
}
/// Applies the `&=` rule, tagging a match as [`Type::AndEq`].
pub fn and_eq(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.str("&="), Type::AndEq)
}
/// Applies the `|=` rule, tagging a match as [`Type::OrEq`].
pub fn or_eq(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.str("|="), Type::OrEq)
}
/// Applies the `^=` rule, tagging a match as [`Type::XorEq`].
pub fn xor_eq(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.str("^="), Type::XorEq)
}
/// Applies the `<<=` rule, tagging a match as [`Type::LshEq`].
pub fn lsh_eq(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.str("<<="), Type::LshEq)
}
/// Applies the `>>=` rule, tagging a match as [`Type::RshEq`].
pub fn rsh_eq(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.str(">>="), Type::RshEq)
}
/// Applies the `->` rule, tagging a match as [`Type::Arrow`].
pub fn arrow(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.str("->"), Type::Arrow)
}
/// Applies the `=>` rule, tagging a match as [`Type::FatArrow`].
pub fn fatarrow(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.str("=>"), Type::FatArrow)
}
// simple punctuation
/// Applies the `;` rule, tagging a match as [`Type::Semi`].
pub fn semi(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.char(';'), Type::Semi)
}
/// Applies the `.` rule, tagging a match as [`Type::Dot`].
pub fn dot(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.char('.'), Type::Dot)
}
/// Applies the `*` rule, tagging a match as [`Type::Star`].
pub fn star(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.char('*'), Type::Star)
}
/// Applies the `/` rule, tagging a match as [`Type::Div`].
pub fn div(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.char('/'), Type::Div)
}
/// Applies the `+` rule, tagging a match as [`Type::Plus`].
pub fn plus(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.char('+'), Type::Plus)
}
/// Applies the `-` rule, tagging a match as [`Type::Minus`].
///
/// Note that the method is named `sub` while the token type is `Minus`.
pub fn sub(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.char('-'), Type::Minus)
}
/// Applies the `%` rule, tagging a match as [`Type::Rem`].
pub fn rem(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.char('%'), Type::Rem)
}
/// Applies the `!` rule, tagging a match as [`Type::Bang`].
pub fn bang(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.char('!'), Type::Bang)
}
/// Applies the `=` rule, tagging a match as [`Type::Eq`].
pub fn eq(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.char('='), Type::Eq)
}
/// Applies the `<` rule, tagging a match as [`Type::Lt`].
pub fn lt(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.char('<'), Type::Lt)
}
/// Applies the `>` rule, tagging a match as [`Type::Gt`].
pub fn gt(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.char('>'), Type::Gt)
}
/// Applies the `&` rule, tagging a match as [`Type::Amp`].
pub fn amp(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.char('&'), Type::Amp)
}
/// Applies the `|` rule, tagging a match as [`Type::Bar`].
pub fn bar(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.char('|'), Type::Bar)
}
/// Applies the `^` rule, tagging a match as [`Type::Xor`].
pub fn xor(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.char('^'), Type::Xor)
}
/// Applies the `#` rule, tagging a match as [`Type::Hash`].
pub fn hash(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.char('#'), Type::Hash)
}
/// Applies the `@` rule, tagging a match as [`Type::At`].
pub fn at(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.char('@'), Type::At)
}
/// Applies the `:` rule, tagging a match as [`Type::Colon`].
pub fn colon(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.char(':'), Type::Colon)
}
/// Applies the `?` rule, tagging a match as [`Type::Question`].
pub fn question(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.char('?'), Type::Question)
}
/// Applies the `,` rule, tagging a match as [`Type::Comma`].
pub fn comma(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.char(','), Type::Comma)
}
/// Applies the `~` rule, tagging a match as [`Type::Tilde`].
pub fn tilde(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.char('~'), Type::Tilde)
}
/// Applies the backtick rule, tagging a match as [`Type::Grave`].
pub fn grave(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.char('`'), Type::Grave)
}
/// Applies the `\` rule, tagging a match as [`Type::Backslash`].
pub fn backslash(&mut self) -> Option<Token> {
    self.map_rule(|rule| rule.char('\\'), Type::Backslash)
}
} }
/// A lexer [Rule] matches patterns in text in a declarative manner /// A lexer [Rule] matches patterns in text in a declarative manner
@ -651,6 +866,199 @@ mod tests {
assert_whole_input_is_token(")", Lexer::r_paren, Type::RParen); assert_whole_input_is_token(")", Lexer::r_paren, Type::RParen);
} }
} }
mod punctuation {
use super::*;
mod compound {
    //! One test per multi-character punctuation rule: the whole input must
    //! lex as exactly one token of the expected type.
    use super::*;

    #[test]
    fn lsh() {
        assert_whole_input_is_token("<<", Lexer::lsh, Type::Lsh);
    }
    #[test]
    fn rsh() {
        assert_whole_input_is_token(">>", Lexer::rsh, Type::Rsh);
    }
    #[test]
    fn and_and() {
        assert_whole_input_is_token("&&", Lexer::and_and, Type::AndAnd);
    }
    #[test]
    fn or_or() {
        assert_whole_input_is_token("||", Lexer::or_or, Type::OrOr);
    }
    #[test]
    fn not_not() {
        assert_whole_input_is_token("!!", Lexer::not_not, Type::NotNot);
    }
    #[test]
    fn cat_ear() {
        assert_whole_input_is_token("^^", Lexer::cat_ear, Type::CatEar);
    }
    #[test]
    fn eq_eq() {
        assert_whole_input_is_token("==", Lexer::eq_eq, Type::EqEq);
    }
    #[test]
    fn not_eq() {
        assert_whole_input_is_token("!=", Lexer::not_eq, Type::NotEq);
    }
    #[test]
    fn star_eq() {
        assert_whole_input_is_token("*=", Lexer::star_eq, Type::StarEq);
    }
    #[test]
    fn div_eq() {
        assert_whole_input_is_token("/=", Lexer::div_eq, Type::DivEq);
    }
    #[test]
    fn add_eq() {
        assert_whole_input_is_token("+=", Lexer::add_eq, Type::AddEq);
    }
    #[test]
    fn sub_eq() {
        assert_whole_input_is_token("-=", Lexer::sub_eq, Type::SubEq);
    }
    #[test]
    fn and_eq() {
        assert_whole_input_is_token("&=", Lexer::and_eq, Type::AndEq);
    }
    #[test]
    fn or_eq() {
        assert_whole_input_is_token("|=", Lexer::or_eq, Type::OrEq);
    }
    #[test]
    fn xor_eq() {
        assert_whole_input_is_token("^=", Lexer::xor_eq, Type::XorEq);
    }
    #[test]
    fn lsh_eq() {
        assert_whole_input_is_token("<<=", Lexer::lsh_eq, Type::LshEq);
    }
    #[test]
    fn rsh_eq() {
        assert_whole_input_is_token(">>=", Lexer::rsh_eq, Type::RshEq);
    }
    // The two arrow rules previously had lexer methods but no tests.
    #[test]
    fn arrow() {
        assert_whole_input_is_token("->", Lexer::arrow, Type::Arrow);
    }
    #[test]
    fn fatarrow() {
        assert_whole_input_is_token("=>", Lexer::fatarrow, Type::FatArrow);
    }
}
mod simple {
    //! Tests for the single-character punctuation rules, both through the
    //! combined `Lexer::punctuation` entry point and one rule at a time.
    use super::*;

    #[test]
    fn punctuation_class() {
        // Each single-character token must be reachable through the combined
        // rule, not only through its own method.
        let cases = [
            (";", Type::Semi),
            (".", Type::Dot),
            ("*", Type::Star),
            ("/", Type::Div),
            ("+", Type::Plus),
            ("-", Type::Minus),
            ("%", Type::Rem),
            ("!", Type::Bang),
            ("=", Type::Eq),
            ("<", Type::Lt),
            (">", Type::Gt),
            ("&", Type::Amp),
            ("|", Type::Bar),
            ("^", Type::Xor),
            ("#", Type::Hash),
            ("@", Type::At),
            (":", Type::Colon),
            ("?", Type::Question),
            (",", Type::Comma),
            ("~", Type::Tilde),
            ("`", Type::Grave),
            ("\\", Type::Backslash),
        ];
        for (text, expected) in cases {
            assert_whole_input_is_token(text, Lexer::punctuation, expected);
        }
    }

    // individual functions below
    #[test]
    fn semi() {
        assert_whole_input_is_token(";", Lexer::semi, Type::Semi);
    }
    #[test]
    fn dot() {
        assert_whole_input_is_token(".", Lexer::dot, Type::Dot);
    }
    #[test]
    fn star() {
        assert_whole_input_is_token("*", Lexer::star, Type::Star);
    }
    #[test]
    fn div() {
        assert_whole_input_is_token("/", Lexer::div, Type::Div);
    }
    #[test]
    fn plus() {
        assert_whole_input_is_token("+", Lexer::plus, Type::Plus);
    }
    #[test]
    fn minus() {
        assert_whole_input_is_token("-", Lexer::sub, Type::Minus);
    }
    #[test]
    fn rem() {
        assert_whole_input_is_token("%", Lexer::rem, Type::Rem);
    }
    #[test]
    fn bang() {
        assert_whole_input_is_token("!", Lexer::bang, Type::Bang);
    }
    #[test]
    fn eq() {
        assert_whole_input_is_token("=", Lexer::eq, Type::Eq);
    }
    #[test]
    fn lt() {
        assert_whole_input_is_token("<", Lexer::lt, Type::Lt);
    }
    #[test]
    fn gt() {
        assert_whole_input_is_token(">", Lexer::gt, Type::Gt);
    }
    #[test]
    fn and() {
        assert_whole_input_is_token("&", Lexer::amp, Type::Amp);
    }
    #[test]
    fn or() {
        assert_whole_input_is_token("|", Lexer::bar, Type::Bar);
    }
    #[test]
    fn xor() {
        assert_whole_input_is_token("^", Lexer::xor, Type::Xor);
    }
    #[test]
    fn hash() {
        assert_whole_input_is_token("#", Lexer::hash, Type::Hash);
    }
    #[test]
    fn at() {
        assert_whole_input_is_token("@", Lexer::at, Type::At);
    }
    #[test]
    fn colon() {
        assert_whole_input_is_token(":", Lexer::colon, Type::Colon);
    }
    #[test]
    fn backslash() {
        assert_whole_input_is_token("\\", Lexer::backslash, Type::Backslash);
    }
    #[test]
    fn question() {
        assert_whole_input_is_token("?", Lexer::question, Type::Question);
    }
    #[test]
    fn comma() {
        assert_whole_input_is_token(",", Lexer::comma, Type::Comma);
    }
    #[test]
    fn tilde() {
        assert_whole_input_is_token("~", Lexer::tilde, Type::Tilde);
    }
    #[test]
    fn grave() {
        assert_whole_input_is_token("`", Lexer::grave, Type::Grave);
    }
}
}
} }
mod parser { mod parser {
// TODO // TODO