//! Conlang is an expression-based programming language
#![warn(clippy::all)]
pub mod token {
//! Stores a component of a file as a type and span
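//!
//! A minimal usage sketch (the crate name `conlang` is an assumption, so the
//! example is marked `ignore`):
//!
//! ```rust,ignore
//! use conlang::token::{Token, Type};
//!
//! let token = Token::new(Type::Identifier, 0, 5);
//! assert_eq!(token.len(), 5);
//! assert!(!token.is_empty());
//! ```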
use std::ops::Range;
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Type {
Comment,
Identifier,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Token {
ty: Type,
head: usize,
tail: usize,
}
impl Token {
pub fn new(ty: Type, head: usize, tail: usize) -> Self {
Self { ty, head, tail }
}
/// Returns `true` if the token spans zero bytes
pub fn is_empty(&self) -> bool {
self.tail == self.head
}
/// Returns the length of the token's span in bytes
pub fn len(&self) -> usize {
self.tail - self.head
}
/// Gets the [Type] of the token
pub fn ty(&self) -> Type {
self.ty
}
/// Gets the exclusive byte range of the token
pub fn range(&self) -> Range<usize> {
self.head..self.tail
}
}
}
pub mod ast {
//! Stores functions, data structure definitions, etc.
}
pub mod lexer {
//! Converts a text file into tokens
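//!
//! Each lexing method first skips leading whitespace, then attempts to match
//! a single token at the cursor. A sketch of the intended call pattern (the
//! crate name `conlang` is assumed, so the example is not compiled):
//!
//! ```rust,ignore
//! use conlang::{lexer::Lexer, token::Type};
//!
//! let mut lexer = Lexer::new("// hello");
//! let token = lexer.line_comment().expect("should lex a line comment");
//! assert_eq!(token.ty(), Type::Comment);
//! assert_eq!(token.range(), 0..8);
//! ```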
use crate::token::{Token, Type};
use lerox::Combinator;
#[allow(dead_code)]
pub struct Lexer<'t> {
text: &'t str,
cursor: usize,
}
/// Implements the lexical rules of a language
impl<'t> Lexer<'t> {
pub fn new(text: &'t str) -> Self {
Self { text, cursor: 0 }
}
fn produce_token(&mut self, ty: Type, len: usize) -> Option<Token> {
let start = self.cursor;
self.cursor += len;
Some(Token::new(ty, start, self.cursor))
}
fn text(&self) -> &str {
&self.text[self.cursor..]
}
fn skip_whitespace(&mut self) {
// Measure whitespace from the cursor onward, not from the start of the text
self.cursor += Rule::new(self.text()).whitespace().end().unwrap_or_default()
}
// functions for lexing individual tokens
pub fn line_comment(&mut self) -> Option<Token> {
// line_comment := "//" ~ (^newline)*
self.skip_whitespace();
self.produce_token(
Type::Comment,
Rule::new(self.text())
.str("//")
.and_any(|rule| rule.not_char('\n'))
.end()?,
)
}
pub fn block_comment(&mut self) -> Option<Token> {
// block_comment := "/*" ~ all_but("*/")* ~ "*/" (block comments do not nest)
self.skip_whitespace();
self.produce_token(
Type::Comment,
Rule::new(self.text())
.str("/*")
.and_any(|rule| rule.not_str("*/"))
.str("*/")
.end()?,
)
}
pub fn shebang_comment(&mut self) -> Option<Token> {
// shebang_comment := "#!/" ~ (^newline)*
self.skip_whitespace();
self.produce_token(
Type::Comment,
Rule::new(self.text())
.str("#!/")
.and_any(|rule| rule.not_char('\n'))
.end()?,
)
}
pub fn identifier(&mut self) -> Option<Token> {
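// identifier := ("_" | XID_start) ~ XID_continue*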
self.skip_whitespace();
self.produce_token(
Type::Identifier,
Rule::new(self.text())
.char('_')
.or(Rule::xid_start)
.and_any(Rule::xid_continue)
.end()?,
)
}
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Rule<'t> {
text: &'t str,
taken: usize,
is_alright: bool,
}
impl<'t> Rule<'t> {
pub fn new(text: &'t str) -> Self {
Self { text, taken: 0, is_alright: true }
}
pub fn end(self) -> Option<usize> {
self.is_alright.then_some(self.taken)
}
pub fn remaining(&self) -> &str {
self.text
}
}
impl<'t> Rule<'t> {
pub fn char_between(self, start: char, end: char) -> Self {
self.char_fn(|c| start <= c && c <= end)
}
pub fn char(self, c: char) -> Self {
self.has(|rule| rule.text.starts_with(c), 1)
}
pub fn str(self, s: &str) -> Self {
self.has(|rule| rule.text.starts_with(s), s.len())
}
pub fn char_fn(self, f: impl Fn(char) -> bool) -> Self {
self.and(|rule| match rule.text.strip_prefix(&f) {
Some(text) => Self { text, taken: rule.taken + next_utf8(rule.text, 1), ..rule },
None => Self { is_alright: false, ..rule },
})
}
/// Consumes one character if the text does not start with `c`
pub fn not_char(self, c: char) -> Self {
self.has(|rule| !rule.text.starts_with(c), 1)
}
/// Consumes one character (not `s.len()` bytes) if the text does not start with `s`
pub fn not_str(self, s: &str) -> Self {
self.has(|rule| !rule.text.starts_with(s), 1)
}
pub fn any(self) -> Self {
self.has(|_| true, 1)
}
pub fn whitespace(self) -> Self {
self.and_any(|rule| rule.char_fn(|c| c.is_whitespace()))
}
pub fn xid_start(self) -> Self {
use unicode_xid::UnicodeXID;
self.char_fn(UnicodeXID::is_xid_start)
}
pub fn xid_continue(self) -> Self {
use unicode_xid::UnicodeXID;
self.char_fn(UnicodeXID::is_xid_continue)
}
fn has(self, condition: impl Fn(&Self) -> bool, len: usize) -> Self {
let len = next_utf8(self.text, len);
self.and(|rule| match condition(&rule) && !rule.text.is_empty() {
true => Self { text: &rule.text[len..], taken: rule.taken + len, ..rule },
false => Self { is_alright: false, ..rule },
})
}
}
impl<'t> lerox::Combinator for Rule<'t> {
fn is_alright(&self) -> bool {
self.is_alright
}
fn into_alright(self) -> Self {
Self { is_alright: true, ..self }
}
}
/// Returns the smallest byte index at or after `index` that lies on a UTF-8
/// character boundary, clamped to `text.len()`
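/// (for example, `next_utf8("é!", 1)` is `2`, since byte 1 falls inside the
/// two-byte encoding of `é`)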
fn next_utf8(text: &str, mut index: usize) -> usize {
index = index.min(text.len());
while !text.is_char_boundary(index) {
index += 1
}
index
}
}
pub mod parser {
//! Parses tokens into an AST
}
pub mod interpreter {
//! Interprets an AST as a program
}
#[cfg(test)]
mod tests {
mod token {
use crate::token::*;
#[test]
fn token_has_type() {
assert_eq!(Token::new(Type::Comment, 0, 10).ty(), Type::Comment);
assert_eq!(Token::new(Type::Identifier, 0, 10).ty(), Type::Identifier);
}
#[test]
fn token_has_range() {
let t = Token::new(Type::Comment, 0, 10);
assert_eq!(t.range(), 0..10);
}
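#[test]
fn token_has_len_and_emptiness() {
// Not part of the original suite: covers len/is_empty, assuming
// spans are byte offsets as elsewhere in this file.
assert_eq!(Token::new(Type::Comment, 0, 10).len(), 10);
assert!(Token::new(Type::Comment, 4, 4).is_empty());
assert!(!Token::new(Type::Comment, 0, 10).is_empty());
}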
}
mod ast {
// TODO
}
mod lexer {
use crate::{
lexer::*,
token::{Token, Type},
};
fn assert_whole_input_is_token<'t, F>(input: &'t str, f: F, ty: Type)
where F: FnOnce(&mut Lexer<'t>) -> Option<Token> {
assert_has_type_and_len(input, f, ty, input.len())
}
fn assert_has_type_and_len<'t, F>(input: &'t str, f: F, ty: Type, len: usize)
where F: FnOnce(&mut Lexer<'t>) -> Option<Token> {
assert_eq!(Some(Token::new(ty, 0, len)), f(&mut Lexer::new(input)))
}
mod comment {
use super::*;
#[test]
fn line_comment() {
assert_whole_input_is_token(
"// this is a comment",
Lexer::line_comment,
Type::Comment,
);
}
#[test]
#[should_panic]
fn not_line_comment() {
assert_whole_input_is_token("fn main() {}", Lexer::line_comment, Type::Comment);
}
#[test]
fn block_comment() {
assert_whole_input_is_token(
"/* this is a comment */",
Lexer::block_comment,
Type::Comment,
);
}
#[test]
#[should_panic]
fn not_block_comment() {
assert_whole_input_is_token("fn main() {}", Lexer::block_comment, Type::Comment);
}
#[test]
fn shebang_comment() {
assert_whole_input_is_token(
"#!/ this is a comment",
Lexer::shebang_comment,
Type::Comment,
);
}
#[test]
#[should_panic]
fn not_shebang_comment() {
assert_whole_input_is_token("fn main() {}", Lexer::shebang_comment, Type::Comment);
}
}
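#[test]
fn skips_leading_whitespace() {
// Not part of the original suite: a sketch (assuming byte-offset spans)
// checking that whitespace before a token is skipped, not included in it.
assert_eq!(
Some(Token::new(Type::Comment, 2, 8)),
Lexer::new("  // foo").line_comment()
);
}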
mod identifier {
use super::*;
#[test]
fn identifier() {
assert_whole_input_is_token(
"valid_identifier",
Lexer::identifier,
Type::Identifier,
);
assert_whole_input_is_token("_0", Lexer::identifier, Type::Identifier);
assert_whole_input_is_token("_", Lexer::identifier, Type::Identifier);
}
#[test]
fn unicode_identifier() {
assert_whole_input_is_token("ζ_ζζζ_ζζζ_ζζζ", Lexer::identifier, Type::Identifier);
assert_whole_input_is_token("_ζζζ_ζζζ_ζζζ_", Lexer::identifier, Type::Identifier);
}
#[test]
#[should_panic]
fn not_identifier() {
assert_whole_input_is_token("123456789", Lexer::identifier, Type::Identifier);
}
}
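mod rule {
// Not part of the original suite: direct checks of the Rule combinator,
// relying on end() reporting the number of bytes consumed on success.
use crate::lexer::Rule;
#[test]
fn str_matches_prefix() {
assert_eq!(Rule::new("// abc").str("//").end(), Some(2));
}
#[test]
fn failed_rule_ends_none() {
assert_eq!(Rule::new("abc").str("//").end(), None);
}
}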
}
mod parser {
// TODO
}
mod interpreter {
// TODO
}
}