cl-structures: Global (ew!) and local string interning!
- StringArena provides an arena for immutable strings, inspired by other string interners, and keeps track of the ends of every allocated string. Strings inserted into the arena are assigned a Symbol. - intern::Interner keeps track of the hashes of each inserted string, and provides deduplication for interned strings. This allows referential comparison between interned strings - global_intern::GlobalSym provides metered access to a Global Interner, and has a Display implementation which queries the Interner. The global interner is planned for use in cl-ast. TODO: the unstable raw_entry API is about to be removed from Rust. Maybe switch to hashbrown, or write my own hash table?
This commit is contained in:
		
							
								
								
									
										54
									
								
								compiler/cl-structures/src/arena.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								compiler/cl-structures/src/arena.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,54 @@ | ||||
| //! Simple, long-lived string buffer | ||||
|  | ||||
| use std::marker::PhantomData; | ||||
| use symbol::Symbol; | ||||
|  | ||||
| pub mod global_intern; | ||||
| pub mod intern; | ||||
| pub mod symbol; | ||||
|  | ||||
| /// Compactly stores a set of immutable strings, producing a [Symbol] for each one | ||||
| #[derive(Debug)] | ||||
| pub struct StringArena<T: Symbol> { | ||||
|     ends: Vec<usize>, | ||||
|     buf: String, | ||||
|     _t: PhantomData<fn(T)>, | ||||
| } | ||||
|  | ||||
| impl<T: Symbol> StringArena<T> { | ||||
|     pub fn new() -> Self { | ||||
|         Default::default() | ||||
|     } | ||||
|     /// # May panic | ||||
|     /// Panics if Symbol::from_usize would panic | ||||
|     fn next_key(&self) -> T { | ||||
|         Symbol::from_usize(self.ends.len()) | ||||
|     } | ||||
|  | ||||
|     fn get_span(&self, key: T) -> Option<(usize, usize)> { | ||||
|         let key = key.into_usize(); | ||||
|         Some((*self.ends.get(key - 1)?, *self.ends.get(key)?)) | ||||
|     } | ||||
|  | ||||
|     pub fn get(&self, key: T) -> Option<&str> { | ||||
|         let (start, end) = self.get_span(key)?; | ||||
|         // Safety: start and end offsets were created by push_string | ||||
|         Some(unsafe { self.buf.get_unchecked(start..end) }) | ||||
|     } | ||||
|  | ||||
|     pub fn push_string(&mut self, s: &str) -> T { | ||||
|         if self.ends.is_empty() { | ||||
|             self.ends.push(self.buf.len()) | ||||
|         } | ||||
|         let key = self.next_key(); | ||||
|         self.buf.push_str(s); | ||||
|         self.ends.push(self.buf.len()); | ||||
|         key | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<T: Symbol> Default for StringArena<T> { | ||||
|     fn default() -> Self { | ||||
|         Self { ends: Default::default(), buf: Default::default(), _t: PhantomData } | ||||
|     } | ||||
| } | ||||
							
								
								
									
										127
									
								
								compiler/cl-structures/src/arena/global_intern.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										127
									
								
								compiler/cl-structures/src/arena/global_intern.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,127 @@ | ||||
| //!  A global intern pool for strings, represented by the [GlobalSym] symbol | ||||
|  | ||||
| use super::{intern::Interner, symbol::Symbol}; | ||||
| use std::{ | ||||
|     fmt::Display, | ||||
|     num::NonZeroU32, | ||||
|     sync::{OnceLock, RwLock}, | ||||
| }; | ||||
|  | ||||
/// The process-wide intern pool behind [GlobalSym].
///
/// It starts out unset and is created on demand by the inserting entry points
/// ([get_or_insert] and `GlobalSym::from`); the read-only accessors never create it.
static GLOBAL_INTERNER: OnceLock<RwLock<Interner<GlobalSym>>> = OnceLock::new();
|  | ||||
| /// Gets the [GlobalSym] associated with this string, if there is one, or creates a new one | ||||
| /// | ||||
| /// # Blocks | ||||
| /// Locks the Global Interner for writing. If it is already locked,  | ||||
| /// # May Panic | ||||
| /// T | ||||
| pub fn get_or_insert(s: &str) -> GlobalSym { | ||||
|     GLOBAL_INTERNER | ||||
|         .get_or_init(Default::default) | ||||
|         .write() | ||||
|         .expect("global interner should not have been held by a panicked thread") | ||||
|         .get_or_insert(s) | ||||
| } | ||||
|  | ||||
| /// Gets the [GlobalSym] associated with this string, if there is one | ||||
| pub fn get(s: &str) -> Option<GlobalSym> { | ||||
|     GLOBAL_INTERNER.get()?.read().ok()?.get(s) | ||||
| } | ||||
|  | ||||
| /// Gets the [String] associated with this [GlobalSym], if there is one | ||||
| /// | ||||
| /// Returns none if the global symbol table is poisoned. | ||||
| pub fn get_string(sym: GlobalSym) -> Option<String> { | ||||
|     sym.try_into().ok() | ||||
| } | ||||
|  | ||||
| #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] | ||||
| pub struct GlobalSym(NonZeroU32); | ||||
|  | ||||
| impl GlobalSym { | ||||
|     /// Gets a [GlobalSym] associated with the given string, if one exists | ||||
|     pub fn try_from_str(value: &str) -> Option<Self> { | ||||
|         GLOBAL_INTERNER.get()?.read().ok()?.get(value) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl Display for GlobalSym { | ||||
|     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { | ||||
|         let Some(interner) = GLOBAL_INTERNER.get() else { | ||||
|             return write!(f, "[sym@{} (uninitialized)]", self.0); | ||||
|         }; | ||||
|         let Ok(interner) = interner.read() else { | ||||
|             return write!(f, "[sym@{} (poisoned)]", self.0); | ||||
|         }; | ||||
|         let Some(str) = interner.get_str(*self) else { | ||||
|             return write!(f, "[sym@{} (invalid)]", self.0); | ||||
|         }; | ||||
|         str.fmt(f) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl Symbol for GlobalSym { | ||||
|     const MAX: usize = u32::MAX as usize - 1; | ||||
|     fn try_from_usize(value: usize) -> Option<Self> { | ||||
|         Some(Self(NonZeroU32::try_from_usize(value)?)) | ||||
|     } | ||||
|     fn into_usize(self) -> usize { | ||||
|         self.0.into_usize() | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl From<&str> for GlobalSym { | ||||
|     /// Converts to this type from the input type. | ||||
|     /// | ||||
|     /// # Blocks | ||||
|     /// This conversion blocks if the Global Interner lock is held. | ||||
|     /// | ||||
|     /// # May Panic | ||||
|     /// Panics if the Global Interner's lock has been poisoned by a panic in another thread | ||||
|     fn from(value: &str) -> Self { | ||||
|         GLOBAL_INTERNER | ||||
|             .get_or_init(Default::default) | ||||
|             .write() | ||||
|             .expect("global interner should not be poisoned in another thread") | ||||
|             .get_or_insert(value) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl TryFrom<GlobalSym> for String { | ||||
|     type Error = GlobalSymError; | ||||
|  | ||||
|     fn try_from(value: GlobalSym) -> Result<Self, Self::Error> { | ||||
|         let Some(interner) = GLOBAL_INTERNER.get() else { | ||||
|             Err(GlobalSymError::Uninitialized)? | ||||
|         }; | ||||
|         let Ok(interner) = interner.write() else { | ||||
|             Err(GlobalSymError::Poisoned)? | ||||
|         }; | ||||
|         match interner.get_str(value) { | ||||
|             None => Err(GlobalSymError::Unseen(value)), | ||||
|             Some(string) => Ok(string.into()), | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Clone, Copy, Debug, PartialEq, Eq)] | ||||
| pub enum GlobalSymError { | ||||
|     Uninitialized, | ||||
|     Poisoned, | ||||
|     Unseen(GlobalSym), | ||||
| } | ||||
| impl std::error::Error for GlobalSymError {} | ||||
| impl Display for GlobalSymError { | ||||
|     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { | ||||
|         match self { | ||||
|             GlobalSymError::Uninitialized => "String pool was not initialized".fmt(f), | ||||
|             GlobalSymError::Poisoned => "String pool was held by panicking thread".fmt(f), | ||||
|             GlobalSymError::Unseen(sym) => { | ||||
|                 write!(f, "Symbol {sym:?} not present in String pool") | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod tests; | ||||
							
								
								
									
										28
									
								
								compiler/cl-structures/src/arena/global_intern/tests.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										28
									
								
								compiler/cl-structures/src/arena/global_intern/tests.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,28 @@ | ||||
| //! Tests for the global intern pool | ||||
| use super::*; | ||||
|  | ||||
/// Distinct strings get distinct symbols, and re-interning a string yields the same symbol.
#[test]
fn globalsym_from_returns_unique_value_for_unique_keys() {
    let bar: GlobalSym = "foo_bar".into();
    let baz: GlobalSym = "foo_baz".into();
    assert_ne!(bar, baz);
    assert_eq!(bar, GlobalSym::from("foo_bar"));
    assert_eq!(baz, GlobalSym::from("foo_baz"));
}
/// Looking up a string that was never interned must not produce a symbol.
// NOTE(review): the pool is a shared static, so other tests in this module may have
// initialized it already; this only holds as long as no test interns "" — confirm.
#[test]
fn get_returns_none_before_init() {
    match get("") {
        None => {}
        Some(value) => panic!("{value}"),
    }
}
/// A string that has been interned can afterwards be found with `get`.
#[test]
fn get_returns_some_when_key_exists() {
    let _ = GlobalSym::from("foo_bar");
    // The leftover `dbg!` wrapper was removed; the assertion itself is unchanged.
    assert!(get("foo_bar").is_some());
}
|  | ||||
/// `get` reports exactly the symbol that interning the same string produced.
#[test]
fn get_returns_the_same_thing_as_globalsym_from() {
    let interned: GlobalSym = "foo_bar".into();
    let looked_up = get("foo_bar");
    assert_eq!(Some(interned), looked_up);
}
							
								
								
									
										55
									
								
								compiler/cl-structures/src/arena/intern.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										55
									
								
								compiler/cl-structures/src/arena/intern.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,55 @@ | ||||
| //! A string interner with deduplication | ||||
|  | ||||
| use super::{symbol::Symbol, StringArena}; | ||||
| use std::{ | ||||
|     collections::{hash_map::RawEntryMut, HashMap}, | ||||
|     hash::{BuildHasher, RandomState}, | ||||
| }; | ||||
|  | ||||
| #[derive(Debug)] | ||||
| pub struct Interner<T: Symbol, H: BuildHasher = RandomState> { | ||||
|     map: HashMap<T, ()>, | ||||
|     arena: StringArena<T>, | ||||
|     hasher: H, | ||||
| } | ||||
|  | ||||
| impl<T: Symbol, H: BuildHasher + Default> Default for Interner<T, H> { | ||||
|     fn default() -> Self { | ||||
|         Self { map: Default::default(), arena: Default::default(), hasher: Default::default() } | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<T: Symbol, H: BuildHasher> Interner<T, H> { | ||||
|     pub fn get_or_insert(&mut self, s: &str) -> T { | ||||
|         let Self { map, arena, hasher } = self; | ||||
|         let hash = hasher.hash_one(s); | ||||
|         match map.raw_entry_mut().from_hash(hash, is_match(s, arena)) { | ||||
|             RawEntryMut::Occupied(entry) => *entry.into_key(), | ||||
|             RawEntryMut::Vacant(entry) => { | ||||
|                 let tok = arena.push_string(s); | ||||
|                 *(entry.insert_hashed_nocheck(hash, tok, ()).0) | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn get(&self, s: &str) -> Option<T> { | ||||
|         let Self { map, arena, hasher } = self; | ||||
|         map.raw_entry() | ||||
|             .from_hash(hasher.hash_one(s), is_match(s, arena)) | ||||
|             .map(|entry| *entry.0) | ||||
|     } | ||||
|  | ||||
|     pub fn get_str(&self, sym: T) -> Option<&str> { | ||||
|         self.arena.get(sym) | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn is_match<'a, T: Symbol>( | ||||
|     target: &'a str, | ||||
|     arena: &'a StringArena<T>, | ||||
| ) -> impl Fn(&T) -> bool + 'a { | ||||
|     move |sym| match arena.get(*sym) { | ||||
|         Some(sym) => sym == target, | ||||
|         None => false, | ||||
|     } | ||||
| } | ||||
							
								
								
									
										35
									
								
								compiler/cl-structures/src/arena/symbol.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								compiler/cl-structures/src/arena/symbol.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,35 @@ | ||||
| use std::{fmt, hash, num::*}; | ||||
|  | ||||
/// A small, copyable key that converts to and from a [`usize`] index
pub trait Symbol: Copy + fmt::Debug + fmt::Display + Eq + hash::Hash {
    /// The largest [`usize`] that may be stored in the [Symbol]
    const MAX: usize;
    /// Returns [`Some(Self)`](Some) if `value` is in range 0..=[Symbol::MAX]
    fn try_from_usize(value: usize) -> Option<Self>;
    /// Converts `value` into a [Symbol]
    ///
    /// # May Panic
    /// May panic if `value` is not in range 0..=[Symbol::MAX]
    fn from_usize(value: usize) -> Self {
        Self::try_from_usize(value).expect("should be within MIN and MAX")
    }
    /// Returns the [`usize`] index stored in this [Symbol]
    fn into_usize(self) -> usize;
}

#[rustfmt::skip]
impl Symbol for usize {
    const MAX: usize = usize::MAX;
    fn try_from_usize(value: usize) -> Option<Self> { Some(value) }
    fn into_usize(self) -> usize { self }
}

/// Implements [Symbol] for a `NonZero*` integer by storing `index + 1`,
/// so that index 0 is representable despite the non-zero restriction.
macro_rules! impl_symbol_for_nonzero{($($int:ident: $nonzero:ident),* $(,)?) => {$(
    impl Symbol for $nonzero {
        const MAX: usize = $int::MAX as usize - 1;
        fn try_from_usize(value: usize) -> Option<Self> {
            // Narrow with a checked conversion before adding the offset: an `as`
            // cast would silently truncate out-of-range values into valid symbols.
            $int::try_from(value).ok()?.checked_add(1).and_then($nonzero::new)
        }
        fn into_usize(self) -> usize {
            self.get() as usize - 1
        }
    }
)*}}

impl_symbol_for_nonzero!(u8: NonZeroU8, u16: NonZeroU16, u32: NonZeroU32, u64: NonZeroU64, usize: NonZeroUsize);
| @@ -2,9 +2,17 @@ | ||||
| //! - [Span](struct@span::Span): Stores a start and end [Loc](struct@span::Loc) | ||||
| //! - [Loc](struct@span::Loc): Stores the index in a stream | ||||
| #![warn(clippy::all)] | ||||
| #![feature(inline_const, dropck_eyepatch, decl_macro, get_many_mut)] | ||||
| #![feature( | ||||
|     inline_const, | ||||
|     dropck_eyepatch, | ||||
|     decl_macro, | ||||
|     get_many_mut, | ||||
|     hash_raw_entry | ||||
| )] | ||||
| #![deny(unsafe_op_in_unsafe_fn)] | ||||
|  | ||||
| pub mod arena; | ||||
|  | ||||
| pub mod span; | ||||
|  | ||||
| pub mod tree; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user