token/lexer: Keywords are identifiers. Add missing operators. Fix <<=/>>=.

This commit is contained in:
John 2023-10-16 22:25:48 -05:00
parent 9c993b31a1
commit 50bb03ae18
2 changed files with 139 additions and 143 deletions

View File

@ -8,20 +8,13 @@ pub mod token {
mod token_type; mod token_type;
#[derive(Clone, Copy, Debug, PartialEq, Eq)] #[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Type { pub enum Type {
// Invalid syntax
Invalid, Invalid,
// Any kind of comment
Comment, Comment,
// Any identifier
Identifier, Identifier,
// Keywords Keyword(Keyword),
KwBreak,
KwElse,
KwFalse,
KwFor,
KwFn,
KwIf,
KwIn,
KwLet,
KwTrue,
KwWhile,
// Literals // Literals
Integer, Integer,
Float, Float,
@ -37,14 +30,17 @@ pub mod token {
// Compound punctuation // Compound punctuation
Lsh, Lsh,
Rsh, Rsh,
AndAnd, AmpAmp,
OrOr, BarBar,
NotNot, NotNot,
CatEar, CatEar,
EqEq, EqEq,
GtEq,
LtEq,
NotEq, NotEq,
StarEq, StarEq,
DivEq, DivEq,
RemEq,
AddEq, AddEq,
SubEq, SubEq,
AndEq, AndEq,
@ -79,6 +75,43 @@ pub mod token {
Grave, Grave,
} }
/// A reserved word of the language.
///
/// Every variant has exactly one lowercase spelling; that spelling is the
/// only input the [`FromStr`](std::str::FromStr) implementation accepts for it.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Keyword {
    /// `break`
    Break,
    /// `continue`
    Continue,
    /// `else`
    Else,
    /// `false`
    False,
    /// `for`
    For,
    /// `fn`
    Fn,
    /// `if`
    If,
    /// `in`
    In,
    /// `let`
    Let,
    /// `return`
    Return,
    /// `true`
    True,
    /// `while`
    While,
}

impl std::str::FromStr for Keyword {
    /// Carries no detail: the input simply was not a keyword.
    type Err = ();

    /// Maps the exact, case-sensitive spelling of a keyword onto its variant.
    ///
    /// Anything else (including the empty string, or a keyword with
    /// surrounding whitespace) produces `Err(())`.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "break" => Ok(Self::Break),
            "continue" => Ok(Self::Continue),
            "else" => Ok(Self::Else),
            "false" => Ok(Self::False),
            "for" => Ok(Self::For),
            "fn" => Ok(Self::Fn),
            "if" => Ok(Self::If),
            "in" => Ok(Self::In),
            "let" => Ok(Self::Let),
            "return" => Ok(Self::Return),
            "true" => Ok(Self::True),
            "while" => Ok(Self::While),
            _ => Err(()),
        }
    }
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)] #[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Token { pub struct Token {
ty: Type, ty: Type,
@ -91,6 +124,12 @@ pub mod token {
pub fn new(ty: Type, head: usize, tail: usize, line: usize, col: usize) -> Self { pub fn new(ty: Type, head: usize, tail: usize, line: usize, col: usize) -> Self {
Self { ty, head, tail, line, col } Self { ty, head, tail, line, col }
} }
/// Returns this token with its type replaced by `ty`.
///
/// All other fields are carried over unchanged.
pub fn cast(mut self, ty: Type) -> Self {
    self.ty = ty;
    self
}
/// Returns this token with its `head` and `tail` replaced by the given values.
///
/// All other fields are carried over unchanged.
pub fn rebound(mut self, head: usize, tail: usize) -> Self {
    self.head = head;
    self.tail = tail;
    self
}
pub fn line(&self) -> usize { pub fn line(&self) -> usize {
self.line self.line
} }
@ -179,27 +218,14 @@ pub mod lexer {
/// Returns the result of the rule with the highest precedence, if any matches /// Returns the result of the rule with the highest precedence, if any matches
pub fn any(&mut self) -> Option<Token> { pub fn any(&mut self) -> Option<Token> {
None.or_else(|| self.comment()) None.or_else(|| self.comment())
.or_else(|| self.keyword())
.or_else(|| self.identifier()) .or_else(|| self.identifier())
.or_else(|| self.literal()) .or_else(|| self.literal())
.or_else(|| self.delimiter()) .or_else(|| self.delimiter())
.or_else(|| self.punctuation()) .or_else(|| self.punctuation())
.or_else(|| self.invalid()) .or_else(|| self.invalid())
} }
/// Attempts to produce a Keyword
pub fn keyword(&mut self) -> Option<Token> {
None.or_else(|| self.kw_break())
.or_else(|| self.kw_else())
.or_else(|| self.kw_false())
.or_else(|| self.kw_for())
.or_else(|| self.kw_fn())
.or_else(|| self.kw_if())
.or_else(|| self.kw_in())
.or_else(|| self.kw_let())
.or_else(|| self.kw_true())
.or_else(|| self.kw_while())
} }
/// Attempts to produce a [Type::LitString], [Type::LitFloat], or [Type::LitInteger] /// Attempts to produce a [Type::String], [Type::Float], or [Type::Integer]
pub fn literal(&mut self) -> Option<Token> { pub fn literal(&mut self) -> Option<Token> {
None.or_else(|| self.string()) None.or_else(|| self.string())
.or_else(|| self.character()) .or_else(|| self.character())
@ -217,23 +243,26 @@ pub mod lexer {
} }
/// Evaluates punctuation rules /// Evaluates punctuation rules
pub fn punctuation(&mut self) -> Option<Token> { pub fn punctuation(&mut self) -> Option<Token> {
None.or_else(|| self.lsh()) None.or_else(|| self.amp_amp())
.or_else(|| self.rsh()) .or_else(|| self.bar_bar())
.or_else(|| self.and_and())
.or_else(|| self.or_or())
.or_else(|| self.not_not()) .or_else(|| self.not_not())
.or_else(|| self.cat_ear()) .or_else(|| self.cat_ear())
.or_else(|| self.eq_eq()) .or_else(|| self.eq_eq())
.or_else(|| self.gt_eq())
.or_else(|| self.lt_eq())
.or_else(|| self.not_eq()) .or_else(|| self.not_eq())
.or_else(|| self.lsh_eq())
.or_else(|| self.rsh_eq())
.or_else(|| self.star_eq()) .or_else(|| self.star_eq())
.or_else(|| self.div_eq()) .or_else(|| self.div_eq())
.or_else(|| self.rem_eq())
.or_else(|| self.add_eq()) .or_else(|| self.add_eq())
.or_else(|| self.sub_eq()) .or_else(|| self.sub_eq())
.or_else(|| self.and_eq()) .or_else(|| self.and_eq())
.or_else(|| self.or_eq()) .or_else(|| self.or_eq())
.or_else(|| self.xor_eq()) .or_else(|| self.xor_eq())
.or_else(|| self.lsh_eq()) .or_else(|| self.lsh())
.or_else(|| self.rsh_eq()) .or_else(|| self.rsh())
.or_else(|| self.arrow()) .or_else(|| self.arrow())
.or_else(|| self.fatarrow()) .or_else(|| self.fatarrow())
.or_else(|| self.semi()) .or_else(|| self.semi())
@ -270,40 +299,13 @@ pub mod lexer {
pub fn comment(&mut self) -> Option<Token> { pub fn comment(&mut self) -> Option<Token> {
self.map_rule(|r| r.comment(), Type::Comment) self.map_rule(|r| r.comment(), Type::Comment)
} }
// keywords
pub fn kw_break(&mut self) -> Option<Token> {
self.map_rule(|r| r.str("break"), Type::KwBreak)
}
pub fn kw_else(&mut self) -> Option<Token> {
self.map_rule(|r| r.str("else"), Type::KwElse)
}
pub fn kw_false(&mut self) -> Option<Token> {
self.map_rule(|r| r.str("false"), Type::KwFalse)
}
pub fn kw_for(&mut self) -> Option<Token> {
self.map_rule(|r| r.str("for"), Type::KwFor)
}
pub fn kw_fn(&mut self) -> Option<Token> {
self.map_rule(|r| r.str("fn"), Type::KwFn)
}
pub fn kw_if(&mut self) -> Option<Token> {
self.map_rule(|r| r.str("if"), Type::KwIf)
}
pub fn kw_in(&mut self) -> Option<Token> {
self.map_rule(|r| r.str("in"), Type::KwIn)
}
pub fn kw_let(&mut self) -> Option<Token> {
self.map_rule(|r| r.str("let"), Type::KwLet)
}
pub fn kw_true(&mut self) -> Option<Token> {
self.map_rule(|r| r.str("true"), Type::KwTrue)
}
pub fn kw_while(&mut self) -> Option<Token> {
self.map_rule(|r| r.str("while"), Type::KwWhile)
}
// identifiers // identifiers
pub fn identifier(&mut self) -> Option<Token> { pub fn identifier(&mut self) -> Option<Token> {
self.map_rule(|r| r.identifier(), Type::Identifier) self.map_rule(|r| r.identifier(), Type::Identifier)
.map(|token| match self.text[token.range()].parse() {
Ok(kw) => token.cast(Type::Keyword(kw)),
Err(_) => token,
})
} }
// literals // literals
pub fn integer(&mut self) -> Option<Token> { pub fn integer(&mut self) -> Option<Token> {
@ -313,10 +315,13 @@ pub mod lexer {
self.map_rule(|r| r.float(), Type::Float) self.map_rule(|r| r.float(), Type::Float)
} }
pub fn string(&mut self) -> Option<Token> { pub fn string(&mut self) -> Option<Token> {
// TODO: count lines and columns properly within string
self.map_rule(|r| r.string(), Type::String) self.map_rule(|r| r.string(), Type::String)
.map(|t| t.rebound(t.head + 1, t.tail - 1))
} }
pub fn character(&mut self) -> Option<Token> { pub fn character(&mut self) -> Option<Token> {
self.map_rule(|r| r.character(), Type::Character) self.map_rule(|r| r.character(), Type::Character)
.map(|t| t.rebound(t.head + 1, t.tail - 1))
} }
// delimiters // delimiters
pub fn l_brack(&mut self) -> Option<Token> { pub fn l_brack(&mut self) -> Option<Token> {
@ -344,11 +349,11 @@ pub mod lexer {
pub fn rsh(&mut self) -> Option<Token> { pub fn rsh(&mut self) -> Option<Token> {
self.map_rule(|r| r.str(">>"), Type::Rsh) self.map_rule(|r| r.str(">>"), Type::Rsh)
} }
pub fn and_and(&mut self) -> Option<Token> { pub fn amp_amp(&mut self) -> Option<Token> {
self.map_rule(|r| r.str("&&"), Type::AndAnd) self.map_rule(|r| r.str("&&"), Type::AmpAmp)
} }
pub fn or_or(&mut self) -> Option<Token> { pub fn bar_bar(&mut self) -> Option<Token> {
self.map_rule(|r| r.str("||"), Type::OrOr) self.map_rule(|r| r.str("||"), Type::BarBar)
} }
pub fn not_not(&mut self) -> Option<Token> { pub fn not_not(&mut self) -> Option<Token> {
self.map_rule(|r| r.str("!!"), Type::NotNot) self.map_rule(|r| r.str("!!"), Type::NotNot)
@ -359,6 +364,12 @@ pub mod lexer {
pub fn eq_eq(&mut self) -> Option<Token> { pub fn eq_eq(&mut self) -> Option<Token> {
self.map_rule(|r| r.str("=="), Type::EqEq) self.map_rule(|r| r.str("=="), Type::EqEq)
} }
/// Tries the `>=` string rule through `map_rule`, labelling any token it yields as [Type::GtEq]
pub fn gt_eq(&mut self) -> Option<Token> {
self.map_rule(|r| r.str(">="), Type::GtEq)
}
/// Tries the `<=` string rule through `map_rule`, labelling any token it yields as [Type::LtEq]
pub fn lt_eq(&mut self) -> Option<Token> {
self.map_rule(|r| r.str("<="), Type::LtEq)
}
pub fn not_eq(&mut self) -> Option<Token> { pub fn not_eq(&mut self) -> Option<Token> {
self.map_rule(|r| r.str("!="), Type::NotEq) self.map_rule(|r| r.str("!="), Type::NotEq)
} }
@ -368,6 +379,9 @@ pub mod lexer {
pub fn div_eq(&mut self) -> Option<Token> { pub fn div_eq(&mut self) -> Option<Token> {
self.map_rule(|r| r.str("/="), Type::DivEq) self.map_rule(|r| r.str("/="), Type::DivEq)
} }
/// Tries the `%=` string rule through `map_rule`, labelling any token it yields as [Type::RemEq]
pub fn rem_eq(&mut self) -> Option<Token> {
self.map_rule(|r| r.str("%="), Type::RemEq)
}
pub fn add_eq(&mut self) -> Option<Token> { pub fn add_eq(&mut self) -> Option<Token> {
self.map_rule(|r| r.str("+="), Type::AddEq) self.map_rule(|r| r.str("+="), Type::AddEq)
} }
@ -464,6 +478,7 @@ pub mod lexer {
} }
} }
// TODO: use real, functional parser-combinators here to produce tokens
/// A lexer [Rule] matches patterns in text in a declarative manner /// A lexer [Rule] matches patterns in text in a declarative manner
#[derive(Clone, Debug, PartialEq, Eq)] #[derive(Clone, Debug, PartialEq, Eq)]
pub struct Rule<'t> { pub struct Rule<'t> {
@ -761,50 +776,6 @@ mod tests {
assert_whole_input_is_token("fn main() {}", Lexer::comment, Type::Comment); assert_whole_input_is_token("fn main() {}", Lexer::comment, Type::Comment);
} }
} }
mod keyword {
use super::*;
#[test]
fn kw_break() {
assert_whole_input_is_token("break", Lexer::kw_break, Type::KwBreak);
}
#[test]
fn kw_else() {
assert_whole_input_is_token("else", Lexer::kw_else, Type::KwElse);
assert_has_type_and_range(" else ", Lexer::kw_else, Type::KwElse, 2..6);
}
#[test]
fn kw_false() {
assert_whole_input_is_token("false", Lexer::kw_false, Type::KwFalse);
}
#[test]
fn kw_for() {
assert_whole_input_is_token("for", Lexer::kw_for, Type::KwFor);
}
#[test]
fn kw_fn() {
assert_whole_input_is_token("fn", Lexer::kw_fn, Type::KwFn);
}
#[test]
fn kw_if() {
assert_whole_input_is_token("if", Lexer::kw_if, Type::KwIf);
}
#[test]
fn kw_in() {
assert_whole_input_is_token("in", Lexer::kw_in, Type::KwIn);
}
#[test]
fn kw_let() {
assert_whole_input_is_token("let", Lexer::kw_let, Type::KwLet);
}
#[test]
fn kw_true() {
assert_whole_input_is_token("true", Lexer::kw_true, Type::KwTrue);
}
#[test]
fn kw_while() {
assert_whole_input_is_token("while", Lexer::kw_while, Type::KwWhile);
}
}
mod identifier { mod identifier {
use super::*; use super::*;
@ -835,8 +806,8 @@ mod tests {
fn literal_class() { fn literal_class() {
assert_whole_input_is_token("1_00000", Lexer::literal, Type::Integer); assert_whole_input_is_token("1_00000", Lexer::literal, Type::Integer);
assert_whole_input_is_token("1.00000", Lexer::literal, Type::Float); assert_whole_input_is_token("1.00000", Lexer::literal, Type::Float);
assert_whole_input_is_token("\"1.0\"", Lexer::literal, Type::String); assert_has_type_and_range("\"1.0\"", Lexer::literal, Type::String, 1..4);
assert_whole_input_is_token("'\"'", Lexer::literal, Type::Character); assert_has_type_and_range("'\"'", Lexer::literal, Type::Character, 1..2);
} }
mod integer { mod integer {
use super::*; use super::*;
@ -894,18 +865,19 @@ mod tests {
use super::*; use super::*;
#[test] #[test]
fn empty_string() { fn empty_string() {
assert_whole_input_is_token("\"\"", Lexer::string, Type::String); assert_has_type_and_range("\"\"", Lexer::string, Type::String, 1..1);
} }
#[test] #[test]
fn unicode_string() { fn unicode_string() {
assert_whole_input_is_token("\"I 💙 🦈!\"", Lexer::string, Type::String); assert_has_type_and_range("\"I 💙 🦈!\"", Lexer::string, Type::String, 1..13);
} }
#[test] #[test]
fn escape_string() { fn escape_string() {
assert_whole_input_is_token( assert_has_type_and_range(
"\" \\\"This is a quote\\\" \"", "\" \\\"This is a quote\\\" \"",
Lexer::string, Lexer::string,
Type::String, Type::String,
1..22
); );
} }
} }
@ -913,22 +885,22 @@ mod tests {
use super::*; use super::*;
#[test] #[test]
fn plain_char() { fn plain_char() {
assert_whole_input_is_token("'A'", Lexer::character, Type::Character); assert_has_type_and_range("'A'", Lexer::character, Type::Character, 1..2);
assert_whole_input_is_token("'a'", Lexer::character, Type::Character); assert_has_type_and_range("'a'", Lexer::character, Type::Character, 1..2);
assert_whole_input_is_token("'#'", Lexer::character, Type::Character); assert_has_type_and_range("'#'", Lexer::character, Type::Character, 1..2);
} }
#[test] #[test]
fn unicode_char() { fn unicode_char() {
assert_whole_input_is_token("'ε'", Lexer::character, Type::Character); assert_has_type_and_range("'ε'", Lexer::character, Type::Character, 1..3);
} }
#[test] #[test]
fn escaped_char() { fn escaped_char() {
assert_whole_input_is_token("'\\n'", Lexer::character, Type::Character); assert_has_type_and_range("'\\n'", Lexer::character, Type::Character, 1..3);
} }
#[test] #[test]
#[should_panic] #[should_panic]
fn no_char() { fn no_char() {
assert_whole_input_is_token("''", Lexer::character, Type::Character); assert_has_type_and_range("''", Lexer::character, Type::Character, 1..1);
} }
} }
} }
@ -983,12 +955,12 @@ mod tests {
assert_whole_input_is_token(">>", Lexer::rsh, Type::Rsh) assert_whole_input_is_token(">>", Lexer::rsh, Type::Rsh)
} }
#[test] #[test]
fn and_and() { fn amp_amp() {
assert_whole_input_is_token("&&", Lexer::and_and, Type::AndAnd) assert_whole_input_is_token("&&", Lexer::amp_amp, Type::AmpAmp)
} }
#[test] #[test]
fn or_or() { fn bar_bar() {
assert_whole_input_is_token("||", Lexer::or_or, Type::OrOr) assert_whole_input_is_token("||", Lexer::bar_bar, Type::BarBar)
} }
#[test] #[test]
fn not_not() { fn not_not() {
@ -1003,6 +975,14 @@ mod tests {
assert_whole_input_is_token("==", Lexer::eq_eq, Type::EqEq) assert_whole_input_is_token("==", Lexer::eq_eq, Type::EqEq)
} }
#[test] #[test]
// `>=` run through Lexer::gt_eq is expected to be reported as Type::GtEq.
// NOTE(review): the helper's name suggests it also checks that the token
// spans the entire input -- confirm against its definition.
fn gt_eq() {
assert_whole_input_is_token(">=", Lexer::gt_eq, Type::GtEq)
}
// `<=` run through Lexer::lt_eq is expected to be reported as Type::LtEq.
// NOTE(review): the helper's name suggests it also checks that the token
// spans the entire input -- confirm against its definition.
#[test]
fn lt_eq() {
assert_whole_input_is_token("<=", Lexer::lt_eq, Type::LtEq)
}
#[test]
fn not_eq() { fn not_eq() {
assert_whole_input_is_token("!=", Lexer::not_eq, Type::NotEq) assert_whole_input_is_token("!=", Lexer::not_eq, Type::NotEq)
} }

View File

@ -1,24 +1,18 @@
use super::Type; //! Trait impls and helper functions for [Type] and [Keyword]
use super::{Keyword, Type};
use std::fmt::Display; use std::fmt::Display;
impl Display for Type { impl Display for Type {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self { match self {
Type::Invalid => Display::fmt("invalid", f), Type::Invalid => Display::fmt("invalid", f),
Type::Comment => Display::fmt("comment", f), Type::Comment => Display::fmt("comment", f),
Type::Identifier => Display::fmt("identifier", f), Type::Identifier => Display::fmt("identifier", f),
Type::KwBreak => Display::fmt("break", f), Type::Keyword(k) => Display::fmt(k, f),
Type::KwElse => Display::fmt("else", f), Type::Integer => Display::fmt("integer literal", f),
Type::KwFalse => Display::fmt("false", f), Type::Float => Display::fmt("float literal", f),
Type::KwFor => Display::fmt("for", f), Type::String => Display::fmt("string literal", f),
Type::KwFn => Display::fmt("fn", f), Type::Character => Display::fmt("char literal", f),
Type::KwIf => Display::fmt("if", f),
Type::KwIn => Display::fmt("in", f),
Type::KwLet => Display::fmt("let", f),
Type::KwTrue => Display::fmt("true", f),
Type::KwWhile => Display::fmt("while", f),
Type::LitInteger => Display::fmt("integer literal", f),
Type::LitFloat => Display::fmt("float literal", f),
Type::LitString => Display::fmt("string literal", f),
Type::LCurly => Display::fmt("left curly", f), Type::LCurly => Display::fmt("left curly", f),
Type::RCurly => Display::fmt("right curly", f), Type::RCurly => Display::fmt("right curly", f),
Type::LBrack => Display::fmt("left brack", f), Type::LBrack => Display::fmt("left brack", f),
@ -27,14 +21,17 @@ impl Display for Type {
Type::RParen => Display::fmt("right paren", f), Type::RParen => Display::fmt("right paren", f),
Type::Lsh => Display::fmt("shift left", f), Type::Lsh => Display::fmt("shift left", f),
Type::Rsh => Display::fmt("shift right", f), Type::Rsh => Display::fmt("shift right", f),
Type::AndAnd => Display::fmt("logical and", f), Type::AmpAmp => Display::fmt("and-and", f),
Type::OrOr => Display::fmt("logical or", f), Type::BarBar => Display::fmt("or-or", f),
Type::NotNot => Display::fmt("not-not", f), Type::NotNot => Display::fmt("not-not", f),
Type::CatEar => Display::fmt("cat-ears", f), Type::CatEar => Display::fmt("cat-ears", f),
Type::EqEq => Display::fmt("equal to", f), Type::EqEq => Display::fmt("equal to", f),
Type::GtEq => Display::fmt("greater than or equal to", f),
Type::LtEq => Display::fmt("less than or equal to", f),
Type::NotEq => Display::fmt("not equal to", f), Type::NotEq => Display::fmt("not equal to", f),
Type::StarEq => Display::fmt("star-assign", f), Type::StarEq => Display::fmt("star-assign", f),
Type::DivEq => Display::fmt("div-assign", f), Type::DivEq => Display::fmt("div-assign", f),
Type::RemEq => Display::fmt("rem-assign", f),
Type::AddEq => Display::fmt("add-assign", f), Type::AddEq => Display::fmt("add-assign", f),
Type::SubEq => Display::fmt("sub-assign", f), Type::SubEq => Display::fmt("sub-assign", f),
Type::AndEq => Display::fmt("and-assign", f), Type::AndEq => Display::fmt("and-assign", f),
@ -69,3 +66,22 @@ impl Display for Type {
} }
} }
} }
impl Display for Keyword {
    /// Writes the keyword's source spelling (e.g. `Keyword::Break` as `break`).
    ///
    /// The spelling is formatted through `str`'s own `Display` impl, so the
    /// formatter's options are handled exactly as they would be for a plain
    /// string.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let spelling = match self {
            Self::Break => "break",
            Self::Continue => "continue",
            Self::Else => "else",
            Self::False => "false",
            Self::For => "for",
            Self::Fn => "fn",
            Self::If => "if",
            Self::In => "in",
            Self::Let => "let",
            Self::Return => "return",
            Self::True => "true",
            Self::While => "while",
        };
        Display::fmt(spelling, f)
    }
}