//! Conlang is an expression-based programming language
#![warn(clippy::all)]

pub mod token {
    //! Stores a component of a file as a type and span

    use std::ops::Range;

    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    pub enum Type {
        Comment,
        Identifier,
    }

    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    pub struct Token {
        ty: Type,
        head: usize,
        tail: usize,
    }

    impl Token {
        pub fn new(ty: Type, head: usize, tail: usize) -> Self {
            Self { ty, head, tail }
        }
        pub fn is_empty(&self) -> bool {
            self.tail == self.head
        }
        pub fn len(&self) -> usize {
            self.tail - self.head
        }
        /// Gets the [Type] of the token
        pub fn ty(&self) -> Type {
            self.ty
        }
        /// Gets the exclusive range of the token
        pub fn range(&self) -> Range<usize> {
            self.head..self.tail
        }
    }
}

pub mod ast {
    //! Stores functions, data structure definitions, etc.
}

pub mod lexer {
    //! Converts a text file into tokens

    use crate::token::{Token, Type};
    use lerox::Combinator;

    #[allow(dead_code)]
    pub struct Lexer<'t> {
        text: &'t str,
        cursor: usize,
    }

    /// Implements the non-terminals of a language
    impl<'t> Lexer<'t> {
        pub fn new(text: &'t str) -> Self {
            Self { text, cursor: 0 }
        }
        /// Wraps the `len` bytes at the cursor in a [Token], advancing the cursor
        fn produce_token(&mut self, ty: Type, len: usize) -> Option<Token> {
            let start = self.cursor;
            self.cursor += len;
            Some(Token::new(ty, start, self.cursor))
        }
        /// Gets the text remaining after the cursor
        fn text(&self) -> &str {
            &self.text[self.cursor..]
        }
        fn skip_whitespace(&mut self) {
            self.cursor += Rule::new(self.text()).whitespace().end().unwrap_or_default()
        }
        // functions for lexing individual tokens
        pub fn line_comment(&mut self) -> Option<Token> {
            // line_comment := "//" ~ (^newline)*
            self.skip_whitespace();
            self.produce_token(
                Type::Comment,
                Rule::new(self.text())
                    .str("//")
                    .and_any(|rule| rule.not_char('\n'))
                    .end()?,
            )
        }
        pub fn block_comment(&mut self) -> Option<Token> {
            // block_comment := "/*" ~ (block_comment | all_but("*/"))* ~ "*/"
            self.skip_whitespace();
            self.produce_token(
                Type::Comment,
                Rule::new(self.text())
                    .str("/*")
                    .and_any(|rule| rule.not_str("*/"))
                    .str("*/")
                    .end()?,
            )
        }
        pub fn shebang_comment(&mut self) -> Option<Token> {
            // shebang_comment := "#!/" ~ (^newline)*
            self.skip_whitespace();
            self.produce_token(
                Type::Comment,
                Rule::new(self.text())
                    .str("#!/")
                    .and_any(|rule| rule.not_char('\n'))
                    .end()?,
            )
        }
        pub fn identifier(&mut self) -> Option<Token> {
            // identifier := ("_" | xid_start) ~ xid_continue*
            self.skip_whitespace();
            self.produce_token(
                Type::Identifier,
                Rule::new(self.text())
                    .char('_')
                    .or(Rule::xid_start)
                    .and_any(Rule::xid_continue)
                    .end()?,
            )
        }
    }

    /// A prefix-matching combinator over a string slice, tracking the number of bytes taken
    #[derive(Clone, Debug, PartialEq, Eq)]
    pub struct Rule<'t> {
        text: &'t str,
        taken: usize,
        is_alright: bool,
    }

    impl<'t> Rule<'t> {
        pub fn new(text: &'t str) -> Self {
            Self { text, taken: 0, is_alright: true }
        }
        /// Returns the number of bytes matched, or [None] if any rule failed
        pub fn end(self) -> Option<usize> {
            self.is_alright.then_some(self.taken)
        }
        pub fn remaining(&self) -> &str {
            self.text
        }
    }

    impl<'t> Rule<'t> {
        pub fn char_between(self, start: char, end: char) -> Self {
            self.char_fn(|c| (start..=end).contains(&c))
        }
        pub fn char(self, c: char) -> Self {
            self.has(|rule| rule.text.starts_with(c), 1)
        }
        pub fn str(self, s: &str) -> Self {
            self.has(|rule| rule.text.starts_with(s), s.len())
        }
        pub fn char_fn(self, f: impl Fn(char) -> bool) -> Self {
            self.and(|rule| match rule.text.strip_prefix(&f) {
                Some(text) => Self { text, taken: rule.taken + next_utf8(rule.text, 1), ..rule },
                None => Self { is_alright: false, ..rule },
            })
        }
        pub fn not_char(self, c: char) -> Self {
            self.has(|rule| !rule.text.starts_with(c), 1)
        }
        pub fn not_str(self, s: &str) -> Self {
            self.has(|rule| !rule.text.starts_with(s), 1)
        }
        pub fn any(self) -> Self {
            self.has(|_| true, 1)
        }
        pub fn whitespace(self) -> Self {
            self.and_any(|rule| rule.char_fn(|c| c.is_whitespace()))
        }
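        /// Matches one Unicode XID_Start character (a character that may begin an
        /// identifier, per UAX #31).
        ///
        /// A minimal doctest sketch of the combinator style; it assumes this crate
        /// is named `conlang` (adjust the `use` path if not):
        ///
        /// ```
        /// use conlang::lexer::Rule;
        ///
        /// // 'f' may start an identifier, so one byte is taken.
        /// assert_eq!(Rule::new("foo").xid_start().end(), Some(1));
        /// // '1' may not, so the rule fails and `end` returns `None`.
        /// assert_eq!(Rule::new("1foo").xid_start().end(), None);
        /// ```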
        pub fn xid_start(self) -> Self {
            use unicode_xid::UnicodeXID;
            self.char_fn(UnicodeXID::is_xid_start)
        }
        /// Matches one Unicode XID_Continue character (a character that may continue an identifier)
        pub fn xid_continue(self) -> Self {
            use unicode_xid::UnicodeXID;
            self.char_fn(UnicodeXID::is_xid_continue)
        }
        /// Takes `len` bytes (rounded up to a char boundary) when `condition` holds
        fn has(self, condition: impl Fn(&Self) -> bool, len: usize) -> Self {
            let len = next_utf8(self.text, len);
            self.and(|rule| match condition(&rule) && !rule.text.is_empty() {
                true => Self { text: &rule.text[len..], taken: rule.taken + len, ..rule },
                false => Self { is_alright: false, ..rule },
            })
        }
    }

    impl<'t> lerox::Combinator for Rule<'t> {
        fn is_alright(&self) -> bool {
            self.is_alright
        }
        fn into_alright(self) -> Self {
            Self { is_alright: true, ..self }
        }
    }

    /// Returns the index of the next unicode character, rounded up
    fn next_utf8(text: &str, mut index: usize) -> usize {
        index = index.min(text.len());
        while !text.is_char_boundary(index) {
            index += 1
        }
        index
    }
}

pub mod parser {
    //! Parses tokens into an AST
}

pub mod interpreter {
    //! Interprets an AST as a program
}

#[cfg(test)]
mod tests {
    mod token {
        use crate::token::*;
        #[test]
        fn token_has_type() {
            assert_eq!(Token::new(Type::Comment, 0, 10).ty(), Type::Comment);
            assert_eq!(Token::new(Type::Identifier, 0, 10).ty(), Type::Identifier);
        }
        #[test]
        fn token_has_range() {
            let t = Token::new(Type::Comment, 0, 10);
            assert_eq!(t.range(), 0..10);
        }
    }
    mod ast {
        // TODO
    }
    mod lexer {
        use crate::{
            lexer::*,
            token::{Token, Type},
        };
        fn assert_whole_input_is_token<'t, F>(input: &'t str, f: F, ty: Type)
        where
            F: FnOnce(&mut Lexer<'t>) -> Option<Token>,
        {
            assert_has_type_and_len(input, f, ty, input.len())
        }
        fn assert_has_type_and_len<'t, F>(input: &'t str, f: F, ty: Type, len: usize)
        where
            F: FnOnce(&mut Lexer<'t>) -> Option<Token>,
        {
            assert_eq!(Some(Token::new(ty, 0, len)), f(&mut Lexer::new(input)))
        }
        mod comment {
            use super::*;
            #[test]
            fn line_comment() {
                assert_whole_input_is_token(
                    "// this is a comment",
                    Lexer::line_comment,
                    Type::Comment,
                );
            }
            #[test]
            #[should_panic]
            fn not_line_comment() {
                assert_whole_input_is_token("fn main() {}", Lexer::line_comment, Type::Comment);
            }
            #[test]
            fn block_comment() {
                assert_whole_input_is_token(
                    "/* this is a comment */",
                    Lexer::block_comment,
                    Type::Comment,
                );
            }
            #[test]
            #[should_panic]
            fn not_block_comment() {
                assert_whole_input_is_token("fn main() {}", Lexer::block_comment, Type::Comment);
            }
            #[test]
            fn shebang_comment() {
                assert_whole_input_is_token(
                    "#!/ this is a comment",
                    Lexer::shebang_comment,
                    Type::Comment,
                );
            }
            #[test]
            #[should_panic]
            fn not_shebang_comment() {
                assert_whole_input_is_token("fn main() {}", Lexer::shebang_comment, Type::Comment);
            }
        }
        mod identifier {
            use super::*;
            #[test]
            fn identifier() {
                assert_whole_input_is_token(
                    "valid_identifier",
                    Lexer::identifier,
                    Type::Identifier,
                );
                assert_whole_input_is_token("_0", Lexer::identifier, Type::Identifier);
                assert_whole_input_is_token("_", Lexer::identifier, Type::Identifier);
            }
            #[test]
            fn unicode_identifier() {
                assert_whole_input_is_token("ζ_ζζζ_ζζζ_ζζζ", Lexer::identifier, Type::Identifier);
                assert_whole_input_is_token("_ζζζ_ζζζ_ζζζ_", Lexer::identifier, Type::Identifier);
            }
            #[test]
            #[should_panic]
            fn not_identifier() {
                assert_whole_input_is_token("123456789", Lexer::identifier, Type::Identifier);
            }
        }
    }
    mod parser {
        // TODO
    }
    mod interpreter {
        // TODO
    }
}
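
// Sketch: a few extra sanity checks for the `Rule` combinators themselves,
// showing how the lexer's building blocks compose in isolation. The module
// name is ours, and the expected counts assume lerox's `and_any` is a
// zero-or-more repetition (as the `(^newline)*` grammar comments above
// suggest) and that `taken` counts bytes, not chars.
#[cfg(test)]
mod rule_tests {
    use crate::lexer::Rule;

    #[test]
    fn whitespace_matches_zero_or_more_bytes() {
        assert_eq!(Rule::new("  \t\nx").whitespace().end(), Some(4));
        // Zero whitespace is still a successful (empty) match.
        assert_eq!(Rule::new("x").whitespace().end(), Some(0));
    }

    #[test]
    fn str_and_char_consume_exact_prefixes() {
        assert_eq!(Rule::new("/*x*/").str("/*").end(), Some(2));
        assert_eq!(Rule::new("_rest").char('_').end(), Some(1));
        // A mismatched prefix poisons the whole rule.
        assert_eq!(Rule::new("abc").str("/*").end(), None);
    }

    #[test]
    fn taken_is_measured_in_bytes_not_chars() {
        // 'ζ' is a two-byte code point, so one xid_continue step takes 2.
        assert_eq!(Rule::new("ζx").xid_continue().end(), Some(2));
    }
}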