From 50b473cd55bf9dc15368f372930934ba5c386a75 Mon Sep 17 00:00:00 2001 From: John Date: Thu, 29 Feb 2024 20:58:50 -0600 Subject: [PATCH] cl-lexer: Move lexer into its own crate --- Cargo.toml | 1 + cl-interpret/Cargo.toml | 2 +- cl-interpret/src/tests.rs | 2 +- cl-lexer/Cargo.toml | 13 ++ .../src/lexer.rs => cl-lexer/src/lib.rs | 6 +- cl-lexer/src/tests.rs | 167 ++++++++++++++++++ cl-parser/Cargo.toml | 2 +- cl-parser/src/error.rs | 2 +- cl-parser/src/parser.rs | 8 +- cl-repl/Cargo.toml | 1 + cl-repl/examples/collect-identifiers.rs | 2 +- cl-repl/examples/identify_tokens.rs | 2 +- cl-repl/src/lib.rs | 9 +- libconlang/src/lib.rs | 2 - libconlang/src/tests.rs | 167 ------------------ 15 files changed, 198 insertions(+), 188 deletions(-) create mode 100644 cl-lexer/Cargo.toml rename libconlang/src/lexer.rs => cl-lexer/src/lib.rs (99%) create mode 100644 cl-lexer/src/tests.rs diff --git a/Cargo.toml b/Cargo.toml index 8cef1ed..59fe784 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ members = [ "cl-token", "cl-ast", "cl-parser", + "cl-lexer", ] resolver = "2" diff --git a/cl-interpret/Cargo.toml b/cl-interpret/Cargo.toml index f66c36f..2b4467b 100644 --- a/cl-interpret/Cargo.toml +++ b/cl-interpret/Cargo.toml @@ -13,5 +13,5 @@ cl-structures = { path = "../cl-structures" } [dev-dependencies] -conlang = { path = "../libconlang" } +cl-lexer = { path = "../cl-lexer" } cl-parser = { path = "../cl-parser" } diff --git a/cl-interpret/src/tests.rs b/cl-interpret/src/tests.rs index c0d6659..aff467e 100644 --- a/cl-interpret/src/tests.rs +++ b/cl-interpret/src/tests.rs @@ -2,7 +2,7 @@ use crate::{env::Environment, temp_type_impl::ConValue, Interpret}; use cl_ast::*; use cl_parser::Parser; -use conlang::lexer::Lexer; +use cl_lexer::Lexer; pub use macros::*; mod macros { diff --git a/cl-lexer/Cargo.toml b/cl-lexer/Cargo.toml new file mode 100644 index 0000000..8267f5f --- /dev/null +++ b/cl-lexer/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "cl-lexer" +repository.workspace = true +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true +publish.workspace = true + +[dependencies] +cl-token = { path = "../cl-token" } +cl-structures = { path = "../cl-structures" } +unicode-xid = "0.2.4" diff --git a/libconlang/src/lexer.rs b/cl-lexer/src/lib.rs similarity index 99% rename from libconlang/src/lexer.rs rename to cl-lexer/src/lib.rs index 5b947e0..ad3935a 100644 --- a/libconlang/src/lexer.rs +++ b/cl-lexer/src/lib.rs @@ -1,12 +1,16 @@ //! Converts a text file into tokens -use cl_token::*; +#![feature(decl_macro)] use cl_structures::span::Loc; +use cl_token::*; use std::{ iter::Peekable, str::{Chars, FromStr}, }; use unicode_xid::UnicodeXID; +#[cfg(test)] +mod tests; + pub mod lexer_iter { //! Iterator over a [`Lexer`], returning [`LResult`]s use super::{ diff --git a/cl-lexer/src/tests.rs b/cl-lexer/src/tests.rs new file mode 100644 index 0000000..7d3ec86 --- /dev/null +++ b/cl-lexer/src/tests.rs @@ -0,0 +1,167 @@ +use crate::Lexer; +use cl_token::*; + +macro test_lexer_output_type ($($f:ident {$($test:expr => $expect:expr),*$(,)?})*) {$( + #[test] + fn $f() {$( + assert_eq!( + Lexer::new($test) + .into_iter() + .map(|t| t.unwrap().ty()) + .collect::>(), + dbg!($expect) + ); + )*} +)*} + +macro test_lexer_data_type ($($f:ident {$($test:expr => $expect:expr),*$(,)?})*) {$( + #[test] + fn $f() {$( + assert_eq!( + Lexer::new($test) + .into_iter() + .map(|t| t.unwrap().into_data()) + .collect::>(), + dbg!($expect) + ); + )*} +)*} + +/// Convert an `[ expr, ... ]` into a `[ *, ... ]` +macro td ($($id:expr),*) { + [$($id.into()),*] +} + +mod ident { + use super::*; + macro ident ($($id:literal),*) { + [$(Data::Identifier($id.into())),*] + } + test_lexer_data_type! { + underscore { "_ _" => ident!["_", "_"] } + unicode { "_ε ε_" => ident!["_ε", "ε_"] } + many_underscore { "____________________________________" => + ident!["____________________________________"] } + } +} +mod keyword { + use super::*; + macro kw($($k:ident),*) { + [ $(Type::Keyword(Keyword::$k),)* ] + } + test_lexer_output_type! { + kw_break { "break break" => kw![Break, Break] } + kw_continue { "continue continue" => kw![Continue, Continue] } + kw_else { "else else" => kw![Else, Else] } + kw_false { "false false" => kw![False, False] } + kw_for { "for for" => kw![For, For] } + kw_fn { "fn fn" => kw![Fn, Fn] } + kw_if { "if if" => kw![If, If] } + kw_in { "in in" => kw![In, In] } + kw_let { "let let" => kw![Let, Let] } + kw_return { "return return" => kw![Return, Return] } + kw_true { "true true" => kw![True, True] } + kw_while { "while while" => kw![While, While] } + keywords { "break continue else false for fn if in let return true while" => + kw![Break, Continue, Else, False, For, Fn, If, In, Let, Return, True, While] } + } +} +mod integer { + use super::*; + test_lexer_data_type! { + hex { + "0x0 0x1 0x15 0x2100 0x8000" => + td![0x0, 0x1, 0x15, 0x2100, 0x8000] + } + dec { + "0d0 0d1 0d21 0d8448 0d32768" => + td![0, 0x1, 0x15, 0x2100, 0x8000] + } + oct { + "0o0 0o1 0o25 0o20400 0o100000" => + td![0x0, 0x1, 0x15, 0x2100, 0x8000] + } + bin { + "0b0 0b1 0b10101 0b10000100000000 0b1000000000000000" => + td![0x0, 0x1, 0x15, 0x2100, 0x8000] + } + baseless { + "0 1 21 8448 32768" => + td![0x0, 0x1, 0x15, 0x2100, 0x8000] + } + } +} +mod string { + use super::*; + test_lexer_data_type! { + empty_string { + "\"\"" => + td![String::from("")] + } + unicode_string { + "\"I 💙 🦈!\"" => + td![String::from("I 💙 🦈!")] + } + escape_string { + " \"This is a shark: \\u{1f988}\" " => + td![String::from("This is a shark: 🦈")] + } + } +} +mod punct { + use super::*; + test_lexer_output_type! { + l_curly { "{ {" => [ Type::LCurly, Type::LCurly ] } + r_curly { "} }" => [ Type::RCurly, Type::RCurly ] } + l_brack { "[ [" => [ Type::LBrack, Type::LBrack ] } + r_brack { "] ]" => [ Type::RBrack, Type::RBrack ] } + l_paren { "( (" => [ Type::LParen, Type::LParen ] } + r_paren { ") )" => [ Type::RParen, Type::RParen ] } + amp { "& &" => [ Type::Amp, Type::Amp ] } + amp_amp { "&& &&" => [ Type::AmpAmp, Type::AmpAmp ] } + amp_eq { "&= &=" => [ Type::AmpEq, Type::AmpEq ] } + arrow { "-> ->" => [ Type::Arrow, Type::Arrow] } + at { "@ @" => [ Type::At, Type::At] } + backslash { "\\ \\" => [ Type::Backslash, Type::Backslash] } + bang { "! !" => [ Type::Bang, Type::Bang] } + bangbang { "!! !!" => [ Type::BangBang, Type::BangBang] } + bangeq { "!= !=" => [ Type::BangEq, Type::BangEq] } + bar { "| |" => [ Type::Bar, Type::Bar] } + barbar { "|| ||" => [ Type::BarBar, Type::BarBar] } + bareq { "|= |=" => [ Type::BarEq, Type::BarEq] } + colon { ": :" => [ Type::Colon, Type::Colon] } + comma { ", ," => [ Type::Comma, Type::Comma] } + dot { ". ." => [ Type::Dot, Type::Dot] } + dotdot { ".. .." => [ Type::DotDot, Type::DotDot] } + dotdoteq { "..= ..=" => [ Type::DotDotEq, Type::DotDotEq] } + eq { "= =" => [ Type::Eq, Type::Eq] } + eqeq { "== ==" => [ Type::EqEq, Type::EqEq] } + fatarrow { "=> =>" => [ Type::FatArrow, Type::FatArrow] } + grave { "` `" => [ Type::Grave, Type::Grave] } + gt { "> >" => [ Type::Gt, Type::Gt] } + gteq { ">= >=" => [ Type::GtEq, Type::GtEq] } + gtgt { ">> >>" => [ Type::GtGt, Type::GtGt] } + gtgteq { ">>= >>=" => [ Type::GtGtEq, Type::GtGtEq] } + hash { "# #" => [ Type::Hash, Type::Hash] } + lt { "< <" => [ Type::Lt, Type::Lt] } + lteq { "<= <=" => [ Type::LtEq, Type::LtEq] } + ltlt { "<< <<" => [ Type::LtLt, Type::LtLt] } + ltlteq { "<<= <<=" => [ Type::LtLtEq, Type::LtLtEq] } + minus { "- -" => [ Type::Minus, Type::Minus] } + minuseq { "-= -=" => [ Type::MinusEq, Type::MinusEq] } + plus { "+ +" => [ Type::Plus, Type::Plus] } + pluseq { "+= +=" => [ Type::PlusEq, Type::PlusEq] } + question { "? ?" => [ Type::Question, Type::Question] } + rem { "% %" => [ Type::Rem, Type::Rem] } + remeq { "%= %=" => [ Type::RemEq, Type::RemEq] } + semi { "; ;" => [ Type::Semi, Type::Semi] } + slash { "/ /" => [ Type::Slash, Type::Slash] } + slasheq { "/= /=" => [ Type::SlashEq, Type::SlashEq] } + star { "* *" => [ Type::Star, Type::Star] } + stareq { "*= *=" => [ Type::StarEq, Type::StarEq] } + tilde { "~ ~" => [ Type::Tilde, Type::Tilde] } + xor { "^ ^" => [ Type::Xor, Type::Xor] } + xoreq { "^= ^=" => [ Type::XorEq, Type::XorEq] } + xorxor { "^^ ^^" => [ Type::XorXor, Type::XorXor] } + } +} diff --git a/cl-parser/Cargo.toml b/cl-parser/Cargo.toml index 5adb138..f08b0f3 100644 --- a/cl-parser/Cargo.toml +++ b/cl-parser/Cargo.toml @@ -9,6 +9,6 @@ publish.workspace = true [dependencies] cl-ast = { path = "../cl-ast" } +cl-lexer = { path = "../cl-lexer" } cl-token = { path = "../cl-token" } cl-structures = { path = "../cl-structures" } -conlang = { path = "../libconlang" } diff --git a/cl-parser/src/error.rs b/cl-parser/src/error.rs index 97eedfe..7db443f 100644 --- a/cl-parser/src/error.rs +++ b/cl-parser/src/error.rs @@ -1,6 +1,6 @@ use super::*; -use conlang::lexer::error::{Error as LexError, Reason}; +use cl_lexer::error::{Error as LexError, Reason}; use std::fmt::Display; pub type PResult = Result; diff --git a/cl-parser/src/parser.rs b/cl-parser/src/parser.rs index 6637385..9508598 100644 --- a/cl-parser/src/parser.rs +++ b/cl-parser/src/parser.rs @@ -5,7 +5,7 @@ use crate::error::{ PResult, Parsing, }; use cl_ast::*; -use conlang::lexer::Lexer; +use cl_lexer::Lexer; /// Parses a sequence of [Tokens](Token) into an [AST](cl_ast) pub struct Parser<'t> { @@ -905,11 +905,7 @@ impl<'t> Parser<'t> { /// [Block] = `{` [Stmt]* `}` pub fn block(&mut self) -> PResult { const PARSING: Parsing = Parsing::Block; - Ok( - Block { - stmts: delim(rep(Self::stmt, CURLIES.1, PARSING), CURLIES, PARSING)(self)?, - }, - ) + Ok(Block { stmts: delim(rep(Self::stmt, CURLIES.1, PARSING), CURLIES, PARSING)(self)? }) } } /// ## Control flow subexpressions diff --git a/cl-repl/Cargo.toml b/cl-repl/Cargo.toml index 89ca1fa..2da671f 100644 --- a/cl-repl/Cargo.toml +++ b/cl-repl/Cargo.toml @@ -12,6 +12,7 @@ publish.workspace = true [dependencies] conlang = { path = "../libconlang" } cl-ast = { path = "../cl-ast" } +cl-lexer = { path = "../cl-lexer" } cl-token = { path = "../cl-token" } cl-parser = { path = "../cl-parser" } cl-interpret = { path = "../cl-interpret" } diff --git a/cl-repl/examples/collect-identifiers.rs b/cl-repl/examples/collect-identifiers.rs index b73f754..4aef2d8 100644 --- a/cl-repl/examples/collect-identifiers.rs +++ b/cl-repl/examples/collect-identifiers.rs @@ -1,9 +1,9 @@ //! Collects identifiers into a list +use cl_lexer::Lexer; use cl_parser::Parser; use cl_repl::repline::Repline; use cl_structures::span::Loc; -use conlang::lexer::Lexer; use std::{ collections::HashMap, error::Error, diff --git a/cl-repl/examples/identify_tokens.rs b/cl-repl/examples/identify_tokens.rs index a7b255f..42cc710 100644 --- a/cl-repl/examples/identify_tokens.rs +++ b/cl-repl/examples/identify_tokens.rs @@ -1,7 +1,7 @@ //! This example grabs input from stdin, lexes it, and prints which lexer rules matched #![allow(unused_imports)] +use cl_lexer::Lexer; use cl_token::Token; -use conlang::lexer::Lexer; use std::{ error::Error, io::{stdin, IsTerminal, Read}, diff --git a/cl-repl/src/lib.rs b/cl-repl/src/lib.rs index 744155c..0270c6a 100644 --- a/cl-repl/src/lib.rs +++ b/cl-repl/src/lib.rs @@ -74,12 +74,9 @@ pub mod program { }; use cl_ast::{self as ast, ast_impl::format::Pretty}; + use cl_lexer::Lexer; use cl_parser::{error::PResult, Parser}; - use conlang::{ - // pretty_printer::{PrettyPrintable, Printer}, - lexer::Lexer, - resolver::{error::TyResult, Resolver}, - }; + use conlang::resolver::{error::TyResult, Resolver}; use std::{fmt::Display, io::Write}; pub struct Parsable; @@ -228,7 +225,7 @@ pub mod cli { match (repl, path) { (true, Some(path)) => { let prog = std::fs::read_to_string(path).unwrap(); - let code = cl_parser::Parser::new(conlang::lexer::Lexer::new(&prog)) + let code = cl_parser::Parser::new(cl_lexer::Lexer::new(&prog)) .file() .unwrap(); let mut env = cl_interpret::env::Environment::new(); diff --git a/libconlang/src/lib.rs b/libconlang/src/lib.rs index cc7fc9d..0fb29db 100644 --- a/libconlang/src/lib.rs +++ b/libconlang/src/lib.rs @@ -2,8 +2,6 @@ #![warn(clippy::all)] #![feature(decl_macro)] -pub mod lexer; - pub mod resolver; #[cfg(test)] diff --git a/libconlang/src/tests.rs b/libconlang/src/tests.rs index e45f773..9adda6a 100644 --- a/libconlang/src/tests.rs +++ b/libconlang/src/tests.rs @@ -5,173 +5,6 @@ mod ast { // TODO } mod lexer { - use crate::lexer::Lexer; - use cl_token::*; - - macro test_lexer_output_type ($($f:ident {$($test:expr => $expect:expr),*$(,)?})*) {$( - #[test] - fn $f() {$( - assert_eq!( - Lexer::new($test) - .into_iter() - .map(|t| t.unwrap().ty()) - .collect::>(), - dbg!($expect) - ); - )*} - )*} - - macro test_lexer_data_type ($($f:ident {$($test:expr => $expect:expr),*$(,)?})*) {$( - #[test] - fn $f() {$( - assert_eq!( - Lexer::new($test) - .into_iter() - .map(|t| t.unwrap().into_data()) - .collect::>(), - dbg!($expect) - ); - )*} - )*} - - /// Convert an `[ expr, ... ]` into a `[ *, ... ]` - macro td ($($id:expr),*) { - [$($id.into()),*] - } - - mod ident { - use super::*; - macro ident ($($id:literal),*) { - [$(Data::Identifier($id.into())),*] - } - test_lexer_data_type! { - underscore { "_ _" => ident!["_", "_"] } - unicode { "_ε ε_" => ident!["_ε", "ε_"] } - many_underscore { "____________________________________" => - ident!["____________________________________"] } - } - } - mod keyword { - use super::*; - macro kw($($k:ident),*) { - [ $(Type::Keyword(Keyword::$k),)* ] - } - test_lexer_output_type! { - kw_break { "break break" => kw![Break, Break] } - kw_continue { "continue continue" => kw![Continue, Continue] } - kw_else { "else else" => kw![Else, Else] } - kw_false { "false false" => kw![False, False] } - kw_for { "for for" => kw![For, For] } - kw_fn { "fn fn" => kw![Fn, Fn] } - kw_if { "if if" => kw![If, If] } - kw_in { "in in" => kw![In, In] } - kw_let { "let let" => kw![Let, Let] } - kw_return { "return return" => kw![Return, Return] } - kw_true { "true true" => kw![True, True] } - kw_while { "while while" => kw![While, While] } - keywords { "break continue else false for fn if in let return true while" => - kw![Break, Continue, Else, False, For, Fn, If, In, Let, Return, True, While] } - } - } - mod integer { - use super::*; - test_lexer_data_type! { - hex { - "0x0 0x1 0x15 0x2100 0x8000" => - td![0x0, 0x1, 0x15, 0x2100, 0x8000] - } - dec { - "0d0 0d1 0d21 0d8448 0d32768" => - td![0, 0x1, 0x15, 0x2100, 0x8000] - } - oct { - "0o0 0o1 0o25 0o20400 0o100000" => - td![0x0, 0x1, 0x15, 0x2100, 0x8000] - } - bin { - "0b0 0b1 0b10101 0b10000100000000 0b1000000000000000" => - td![0x0, 0x1, 0x15, 0x2100, 0x8000] - } - baseless { - "0 1 21 8448 32768" => - td![0x0, 0x1, 0x15, 0x2100, 0x8000] - } - } - } - mod string { - use super::*; - test_lexer_data_type! { - empty_string { - "\"\"" => - td![String::from("")] - } - unicode_string { - "\"I 💙 🦈!\"" => - td![String::from("I 💙 🦈!")] - } - escape_string { - " \"This is a shark: \\u{1f988}\" " => - td![String::from("This is a shark: 🦈")] - } - } - } - mod punct { - use super::*; - test_lexer_output_type! { - l_curly { "{ {" => [ Type::LCurly, Type::LCurly ] } - r_curly { "} }" => [ Type::RCurly, Type::RCurly ] } - l_brack { "[ [" => [ Type::LBrack, Type::LBrack ] } - r_brack { "] ]" => [ Type::RBrack, Type::RBrack ] } - l_paren { "( (" => [ Type::LParen, Type::LParen ] } - r_paren { ") )" => [ Type::RParen, Type::RParen ] } - amp { "& &" => [ Type::Amp, Type::Amp ] } - amp_amp { "&& &&" => [ Type::AmpAmp, Type::AmpAmp ] } - amp_eq { "&= &=" => [ Type::AmpEq, Type::AmpEq ] } - arrow { "-> ->" => [ Type::Arrow, Type::Arrow] } - at { "@ @" => [ Type::At, Type::At] } - backslash { "\\ \\" => [ Type::Backslash, Type::Backslash] } - bang { "! !" => [ Type::Bang, Type::Bang] } - bangbang { "!! !!" => [ Type::BangBang, Type::BangBang] } - bangeq { "!= !=" => [ Type::BangEq, Type::BangEq] } - bar { "| |" => [ Type::Bar, Type::Bar] } - barbar { "|| ||" => [ Type::BarBar, Type::BarBar] } - bareq { "|= |=" => [ Type::BarEq, Type::BarEq] } - colon { ": :" => [ Type::Colon, Type::Colon] } - comma { ", ," => [ Type::Comma, Type::Comma] } - dot { ". ." => [ Type::Dot, Type::Dot] } - dotdot { ".. .." => [ Type::DotDot, Type::DotDot] } - dotdoteq { "..= ..=" => [ Type::DotDotEq, Type::DotDotEq] } - eq { "= =" => [ Type::Eq, Type::Eq] } - eqeq { "== ==" => [ Type::EqEq, Type::EqEq] } - fatarrow { "=> =>" => [ Type::FatArrow, Type::FatArrow] } - grave { "` `" => [ Type::Grave, Type::Grave] } - gt { "> >" => [ Type::Gt, Type::Gt] } - gteq { ">= >=" => [ Type::GtEq, Type::GtEq] } - gtgt { ">> >>" => [ Type::GtGt, Type::GtGt] } - gtgteq { ">>= >>=" => [ Type::GtGtEq, Type::GtGtEq] } - hash { "# #" => [ Type::Hash, Type::Hash] } - lt { "< <" => [ Type::Lt, Type::Lt] } - lteq { "<= <=" => [ Type::LtEq, Type::LtEq] } - ltlt { "<< <<" => [ Type::LtLt, Type::LtLt] } - ltlteq { "<<= <<=" => [ Type::LtLtEq, Type::LtLtEq] } - minus { "- -" => [ Type::Minus, Type::Minus] } - minuseq { "-= -=" => [ Type::MinusEq, Type::MinusEq] } - plus { "+ +" => [ Type::Plus, Type::Plus] } - pluseq { "+= +=" => [ Type::PlusEq, Type::PlusEq] } - question { "? ?" => [ Type::Question, Type::Question] } - rem { "% %" => [ Type::Rem, Type::Rem] } - remeq { "%= %=" => [ Type::RemEq, Type::RemEq] } - semi { "; ;" => [ Type::Semi, Type::Semi] } - slash { "/ /" => [ Type::Slash, Type::Slash] } - slasheq { "/= /=" => [ Type::SlashEq, Type::SlashEq] } - star { "* *" => [ Type::Star, Type::Star] } - stareq { "*= *=" => [ Type::StarEq, Type::StarEq] } - tilde { "~ ~" => [ Type::Tilde, Type::Tilde] } - xor { "^ ^" => [ Type::Xor, Type::Xor] } - xoreq { "^= ^=" => [ Type::XorEq, Type::XorEq] } - xorxor { "^^ ^^" => [ Type::XorXor, Type::XorXor] } - } - } } mod parser { // TODO