//! Conlang is an expression-based programming language
#![warn(clippy::all)]
pub mod token {
//! Stores a component of a file as a type and span
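//!
//! A minimal usage sketch (the crate name `conlang` is an assumption, so the
//! example is marked `ignore`):
//!
//! ```rust,ignore
//! use conlang::token::{Token, Type};
//!
//! let token = Token::new(Type::Identifier, 0, 5);
//! assert_eq!(token.len(), 5);
//! assert!(!token.is_empty());
//! ```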
use std::ops::Range;
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Type {
Comment,
Identifier,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Token {
ty: Type,
head: usize,
tail: usize,
}
impl Token {
pub fn new(ty: Type, head: usize, tail: usize) -> Self {
Self { ty, head, tail }
}
/// Returns `true` if the token spans zero bytes
pub fn is_empty(&self) -> bool {
self.tail == self.head
}
/// Returns the length of the token's span in bytes
pub fn len(&self) -> usize {
self.tail - self.head
}
/// Gets the [Type] of the token
pub fn ty(&self) -> Type {
self.ty
}
/// Gets the exclusive byte range of the token
pub fn range(&self) -> Range<usize> {
self.head..self.tail
}
}
}
pub mod ast {
//! Stores functions, data structure definitions, etc.
}
pub mod lexer {
//! Converts a text file into tokens
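//!
//! Each lexing method first skips leading whitespace, then attempts to match
//! a single token at the cursor. A sketch of the intended call pattern (the
//! crate name `conlang` is assumed, so the example is not compiled):
//!
//! ```rust,ignore
//! use conlang::{lexer::Lexer, token::Type};
//!
//! let mut lexer = Lexer::new("// hello");
//! let token = lexer.line_comment().expect("should lex a line comment");
//! assert_eq!(token.ty(), Type::Comment);
//! assert_eq!(token.range(), 0..8);
//! ```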
use crate::token::{Token, Type};
use lerox::Combinator;
#[allow(dead_code)]
pub struct Lexer<'t> {
text: &'t str,
cursor: usize,
}
/// Implements the lexical rules of a language
impl<'t> Lexer<'t> {
pub fn new(text: &'t str) -> Self {
Self { text, cursor: 0 }
}
fn produce_token(&mut self, ty: Type, len: usize) -> Option<Token> {
let start = self.cursor;
self.cursor += len;
Some(Token::new(ty, start, self.cursor))
}
fn text(&self) -> &str {
&self.text[self.cursor..]
}
fn skip_whitespace(&mut self) {
// Measure whitespace from the cursor onward, not from the start of the text
self.cursor += Rule::new(self.text()).whitespace().end().unwrap_or_default()
}
// functions for lexing individual tokens
pub fn line_comment(&mut self) -> Option<Token> {
// line_comment := "//" ~ (^newline)*
self.skip_whitespace();
self.produce_token(
Type::Comment,
Rule::new(self.text())
.str("//")
.and_any(|rule| rule.not_char('\n'))
.end()?,
)
}
pub fn block_comment(&mut self) -> Option<Token> {
// block_comment := "/*" ~ all_but("*/")* ~ "*/" (block comments do not nest)
self.skip_whitespace();
self.produce_token(
Type::Comment,
Rule::new(self.text())
.str("/*")
.and_any(|rule| rule.not_str("*/"))
.str("*/")
.end()?,
)
}
pub fn shebang_comment(&mut self) -> Option<Token> {
// shebang_comment := "#!/" ~ (^newline)*
self.skip_whitespace();
self.produce_token(
Type::Comment,
Rule::new(self.text())
.str("#!/")
.and_any(|rule| rule.not_char('\n'))
.end()?,
)
}
pub fn identifier(&mut self) -> Option<Token> {
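// identifier := ("_" | XID_start) ~ XID_continue*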
self.skip_whitespace();
self.produce_token(
Type::Identifier,
Rule::new(self.text())
.char('_')
.or(Rule::xid_start)
.and_any(Rule::xid_continue)
.end()?,
)
}
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Rule<'t> {
text: &'t str,
taken: usize,
is_alright: bool,
}
impl<'t> Rule<'t> {
pub fn new(text: &'t str) -> Self {
Self { text, taken: 0, is_alright: true }
}
pub fn end(self) -> Option<usize> {
self.is_alright.then_some(self.taken)
}
pub fn remaining(&self) -> &str {
self.text
}
}
impl<'t> Rule<'t> {
pub fn char_between(self, start: char, end: char) -> Self {
self.char_fn(|c| start <= c && c <= end)
}
pub fn char(self, c: char) -> Self {
self.has(|rule| rule.text.starts_with(c), 1)
}
pub fn str(self, s: &str) -> Self {
self.has(|rule| rule.text.starts_with(s), s.len())
}
pub fn char_fn(self, f: impl Fn(char) -> bool) -> Self {
self.and(|rule| match rule.text.strip_prefix(&f) {
Some(text) => Self { text, taken: rule.taken + next_utf8(rule.text, 1), ..rule },
None => Self { is_alright: false, ..rule },
})
}
/// Consumes one character if the text does not start with `c`
pub fn not_char(self, c: char) -> Self {
self.has(|rule| !rule.text.starts_with(c), 1)
}
/// Consumes one character (not `s.len()` bytes) if the text does not start with `s`
pub fn not_str(self, s: &str) -> Self {
self.has(|rule| !rule.text.starts_with(s), 1)
}
pub fn any(self) -> Self {
self.has(|_| true, 1)
}
pub fn whitespace(self) -> Self {
self.and_any(|rule| rule.char_fn(|c| c.is_whitespace()))
}
pub fn xid_start(self) -> Self {
use unicode_xid::UnicodeXID;
self.char_fn(UnicodeXID::is_xid_start)
}
pub fn xid_continue(self) -> Self {
use unicode_xid::UnicodeXID;
self.char_fn(UnicodeXID::is_xid_continue)
}
fn has(self, condition: impl Fn(&Self) -> bool, len: usize) -> Self {
let len = next_utf8(self.text, len);
self.and(|rule| match condition(&rule) && !rule.text.is_empty() {
true => Self { text: &rule.text[len..], taken: rule.taken + len, ..rule },
false => Self { is_alright: false, ..rule },
})
}
}
impl<'t> lerox::Combinator for Rule<'t> {
fn is_alright(&self) -> bool {
self.is_alright
}
fn into_alright(self) -> Self {
Self { is_alright: true, ..self }
}
}
/// Returns the smallest byte index at or after `index` that lies on a UTF-8
/// character boundary, clamped to `text.len()`
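/// (for example, `next_utf8("é!", 1)` is `2`, since byte 1 falls inside the
/// two-byte encoding of `é`)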
fn next_utf8(text: &str, mut index: usize) -> usize {
index = index.min(text.len());
while !text.is_char_boundary(index) {
index += 1
}
index
}
}
pub mod parser {
//! Parses tokens into an AST
}
pub mod interpreter {
//! Interprets an AST as a program
}
#[cfg(test)]
mod tests {
mod token {
use crate::token::*;
#[test]
fn token_has_type() {
assert_eq!(Token::new(Type::Comment, 0, 10).ty(), Type::Comment);
assert_eq!(Token::new(Type::Identifier, 0, 10).ty(), Type::Identifier);
}
#[test]
fn token_has_range() {
let t = Token::new(Type::Comment, 0, 10);
assert_eq!(t.range(), 0..10);
}
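#[test]
fn token_has_len_and_emptiness() {
// Not part of the original suite: covers len/is_empty, assuming
// spans are byte offsets as elsewhere in this file.
assert_eq!(Token::new(Type::Comment, 0, 10).len(), 10);
assert!(Token::new(Type::Comment, 4, 4).is_empty());
assert!(!Token::new(Type::Comment, 0, 10).is_empty());
}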
}
mod ast {
// TODO
}
mod lexer {
use crate::{
lexer::*,
token::{Token, Type},
};
fn assert_whole_input_is_token<'t, F>(input: &'t str, f: F, ty: Type)
where F: FnOnce(&mut Lexer<'t>) -> Option<Token> {
assert_has_type_and_len(input, f, ty, input.len())
}
fn assert_has_type_and_len<'t, F>(input: &'t str, f: F, ty: Type, len: usize)
where F: FnOnce(&mut Lexer<'t>) -> Option<Token> {
assert_eq!(Some(Token::new(ty, 0, len)), f(&mut Lexer::new(input)))
}
mod comment {
use super::*;
#[test]
fn line_comment() {
assert_whole_input_is_token(
"// this is a comment",
Lexer::line_comment,
Type::Comment,
);
}
#[test]
#[should_panic]
fn not_line_comment() {
assert_whole_input_is_token("fn main() {}", Lexer::line_comment, Type::Comment);
}
#[test]
fn block_comment() {
assert_whole_input_is_token(
"/* this is a comment */",
Lexer::block_comment,
Type::Comment,
);
}
#[test]
#[should_panic]
fn not_block_comment() {
assert_whole_input_is_token("fn main() {}", Lexer::block_comment, Type::Comment);
}
#[test]
fn shebang_comment() {
assert_whole_input_is_token(
"#!/ this is a comment",
Lexer::shebang_comment,
Type::Comment,
);
}
#[test]
#[should_panic]
fn not_shebang_comment() {
assert_whole_input_is_token("fn main() {}", Lexer::shebang_comment, Type::Comment);
}
}
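#[test]
fn skips_leading_whitespace() {
// Not part of the original suite: a sketch (assuming byte-offset spans)
// checking that whitespace before a token is skipped, not included in it.
assert_eq!(
Some(Token::new(Type::Comment, 2, 8)),
Lexer::new("  // foo").line_comment()
);
}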
mod identifier {
use super::*;
#[test]
fn identifier() {
assert_whole_input_is_token(
"valid_identifier",
Lexer::identifier,
Type::Identifier,
);
assert_whole_input_is_token("_0", Lexer::identifier, Type::Identifier);
assert_whole_input_is_token("_", Lexer::identifier, Type::Identifier);
}
#[test]
fn unicode_identifier() {
assert_whole_input_is_token("ζ_ζζζ_ζζζ_ζζζ", Lexer::identifier, Type::Identifier);
assert_whole_input_is_token("_ζζζ_ζζζ_ζζζ_", Lexer::identifier, Type::Identifier);
}
#[test]
#[should_panic]
fn not_identifier() {
assert_whole_input_is_token("123456789", Lexer::identifier, Type::Identifier);
}
}
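mod rule {
// Not part of the original suite: direct checks of the Rule combinator,
// relying on end() reporting the number of bytes consumed on success.
use crate::lexer::Rule;
#[test]
fn str_matches_prefix() {
assert_eq!(Rule::new("// abc").str("//").end(), Some(2));
}
#[test]
fn failed_rule_ends_none() {
assert_eq!(Rule::new("abc").str("//").end(), None);
}
}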
}
mod parser {
// TODO
}
mod interpreter {
// TODO
}
}