From e70ffd18958dfb538081474208d02d1e07f1e631 Mon Sep 17 00:00:00 2001 From: John Date: Wed, 24 Apr 2024 17:11:41 -0500 Subject: [PATCH] cl-structures: Global (ew!) and local string interning! - StringArena provides an arena for immutable strings, inspired by other string interners, and keeps track of the ends of every allocated string. Strings inserted into the arena are assigned a Symbol. - intern::Interner keeps track of the hashes of each inserted string, and provides deduplication for interned strings. This allows referential comparison between interned strings - global_intern::GlobalSym provides metered access to a Global Interner, and has a Display implementation which queries the Interner. The global interner is planned for use in cl-ast. TODO: the unstable raw_entry API is about to be removed from Rust. Maybe switch to hashbrown, or write my own hash table? --- compiler/cl-structures/src/arena.rs | 54 ++++++++ .../cl-structures/src/arena/global_intern.rs | 127 ++++++++++++++++++ .../src/arena/global_intern/tests.rs | 28 ++++ compiler/cl-structures/src/arena/intern.rs | 55 ++++++++ compiler/cl-structures/src/arena/symbol.rs | 35 +++++ compiler/cl-structures/src/lib.rs | 10 +- 6 files changed, 308 insertions(+), 1 deletion(-) create mode 100644 compiler/cl-structures/src/arena.rs create mode 100644 compiler/cl-structures/src/arena/global_intern.rs create mode 100644 compiler/cl-structures/src/arena/global_intern/tests.rs create mode 100644 compiler/cl-structures/src/arena/intern.rs create mode 100644 compiler/cl-structures/src/arena/symbol.rs diff --git a/compiler/cl-structures/src/arena.rs b/compiler/cl-structures/src/arena.rs new file mode 100644 index 0000000..2baa4f4 --- /dev/null +++ b/compiler/cl-structures/src/arena.rs @@ -0,0 +1,54 @@ +//! 
Simple, long-lived string buffer + +use std::marker::PhantomData; +use symbol::Symbol; + +pub mod global_intern; +pub mod intern; +pub mod symbol; + +/// Compactly stores a set of immutable strings, producing a [Symbol] for each one +#[derive(Debug)] +pub struct StringArena { + ends: Vec, + buf: String, + _t: PhantomData, +} + +impl StringArena { + pub fn new() -> Self { + Default::default() + } + /// # May panic + /// Panics if Symbol::from_usize would panic + fn next_key(&self) -> T { + Symbol::from_usize(self.ends.len()) + } + + fn get_span(&self, key: T) -> Option<(usize, usize)> { + let key = key.into_usize(); + Some((*self.ends.get(key - 1)?, *self.ends.get(key)?)) + } + + pub fn get(&self, key: T) -> Option<&str> { + let (start, end) = self.get_span(key)?; + // Safety: start and end offsets were created by push_string + Some(unsafe { self.buf.get_unchecked(start..end) }) + } + + pub fn push_string(&mut self, s: &str) -> T { + if self.ends.is_empty() { + self.ends.push(self.buf.len()) + } + let key = self.next_key(); + self.buf.push_str(s); + self.ends.push(self.buf.len()); + key + } +} + +impl Default for StringArena { + fn default() -> Self { + Self { ends: Default::default(), buf: Default::default(), _t: PhantomData } + } +} diff --git a/compiler/cl-structures/src/arena/global_intern.rs b/compiler/cl-structures/src/arena/global_intern.rs new file mode 100644 index 0000000..dcb238c --- /dev/null +++ b/compiler/cl-structures/src/arena/global_intern.rs @@ -0,0 +1,127 @@ +//! A global intern pool for strings, represented by the [GlobalSym] symbol + +use super::{intern::Interner, symbol::Symbol}; +use std::{ + fmt::Display, + num::NonZeroU32, + sync::{OnceLock, RwLock}, +}; + +static GLOBAL_INTERNER: OnceLock>> = OnceLock::new(); + +/// Gets the [GlobalSym] associated with this string, if there is one, or creates a new one +/// +/// # Blocks +/// Locks the Global Interner for writing. 
If it is already locked, blocks until the lock is released. +/// # May Panic +/// Panics if the Global Interner's lock has been poisoned by a panic in another thread +pub fn get_or_insert(s: &str) -> GlobalSym { + GLOBAL_INTERNER + .get_or_init(Default::default) + .write() + .expect("global interner should not have been held by a panicked thread") + .get_or_insert(s) +} + +/// Gets the [GlobalSym] associated with this string, if there is one +pub fn get(s: &str) -> Option { + GLOBAL_INTERNER.get()?.read().ok()?.get(s) +} + +/// Gets the [String] associated with this [GlobalSym], if there is one +/// +/// Returns `None` if the global symbol table is uninitialized or poisoned, or if the symbol is not present in it. +pub fn get_string(sym: GlobalSym) -> Option { + sym.try_into().ok() +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub struct GlobalSym(NonZeroU32); + +impl GlobalSym { + /// Gets a [GlobalSym] associated with the given string, if one exists + pub fn try_from_str(value: &str) -> Option { + GLOBAL_INTERNER.get()?.read().ok()?.get(value) + } +} + +impl Display for GlobalSym { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let Some(interner) = GLOBAL_INTERNER.get() else { + return write!(f, "[sym@{} (uninitialized)]", self.0); + }; + let Ok(interner) = interner.read() else { + return write!(f, "[sym@{} (poisoned)]", self.0); + }; + let Some(str) = interner.get_str(*self) else { + return write!(f, "[sym@{} (invalid)]", self.0); + }; + str.fmt(f) + } +} + +impl Symbol for GlobalSym { + const MAX: usize = u32::MAX as usize - 1; + fn try_from_usize(value: usize) -> Option { + Some(Self(NonZeroU32::try_from_usize(value)?)) + } + fn into_usize(self) -> usize { + self.0.into_usize() + } +} + +impl From<&str> for GlobalSym { + /// Converts to this type from the input type. + /// + /// # Blocks + /// This conversion blocks if the Global Interner lock is held. 
+ /// + /// # May Panic + /// Panics if the Global Interner's lock has been poisoned by a panic in another thread + fn from(value: &str) -> Self { + GLOBAL_INTERNER + .get_or_init(Default::default) + .write() + .expect("global interner should not be poisoned in another thread") + .get_or_insert(value) + } +} + +impl TryFrom for String { + type Error = GlobalSymError; + + fn try_from(value: GlobalSym) -> Result { + let Some(interner) = GLOBAL_INTERNER.get() else { + Err(GlobalSymError::Uninitialized)? + }; + let Ok(interner) = interner.write() else { + Err(GlobalSymError::Poisoned)? + }; + match interner.get_str(value) { + None => Err(GlobalSymError::Unseen(value)), + Some(string) => Ok(string.into()), + } + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum GlobalSymError { + Uninitialized, + Poisoned, + Unseen(GlobalSym), +} +impl std::error::Error for GlobalSymError {} +impl Display for GlobalSymError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + GlobalSymError::Uninitialized => "String pool was not initialized".fmt(f), + GlobalSymError::Poisoned => "String pool was held by panicking thread".fmt(f), + GlobalSymError::Unseen(sym) => { + write!(f, "Symbol {sym:?} not present in String pool") + } + } + } +} + +#[cfg(test)] +mod tests; diff --git a/compiler/cl-structures/src/arena/global_intern/tests.rs b/compiler/cl-structures/src/arena/global_intern/tests.rs new file mode 100644 index 0000000..1bcfff0 --- /dev/null +++ b/compiler/cl-structures/src/arena/global_intern/tests.rs @@ -0,0 +1,28 @@ +//! 
Tests for the global intern pool +use super::*; + +#[test] +fn globalsym_from_returns_unique_value_for_unique_keys() { + let foo_bar = GlobalSym::from("foo_bar"); + let foo_baz = GlobalSym::from("foo_baz"); + assert_ne!(foo_bar, foo_baz); + assert_eq!(foo_bar, GlobalSym::from("foo_bar")); + assert_eq!(foo_baz, GlobalSym::from("foo_baz")); +} +#[test] +fn get_returns_none_before_init() { + if let Some(value) = get("") { + panic!("{value}") + } +} +#[test] +fn get_returns_some_when_key_exists() { + let _ = GlobalSym::from("foo_bar"); + assert!(dbg!(get("foo_bar")).is_some()); +} + +#[test] +fn get_returns_the_same_thing_as_globalsym_from() { + let foo_bar = GlobalSym::from("foo_bar"); + assert_eq!(Some(foo_bar), get("foo_bar")); +} diff --git a/compiler/cl-structures/src/arena/intern.rs b/compiler/cl-structures/src/arena/intern.rs new file mode 100644 index 0000000..9caef1e --- /dev/null +++ b/compiler/cl-structures/src/arena/intern.rs @@ -0,0 +1,55 @@ +//! A string interner with deduplication + +use super::{symbol::Symbol, StringArena}; +use std::{ + collections::{hash_map::RawEntryMut, HashMap}, + hash::{BuildHasher, RandomState}, +}; + +#[derive(Debug)] +pub struct Interner { + map: HashMap, + arena: StringArena, + hasher: H, +} + +impl Default for Interner { + fn default() -> Self { + Self { map: Default::default(), arena: Default::default(), hasher: Default::default() } + } +} + +impl Interner { + pub fn get_or_insert(&mut self, s: &str) -> T { + let Self { map, arena, hasher } = self; + let hash = hasher.hash_one(s); + match map.raw_entry_mut().from_hash(hash, is_match(s, arena)) { + RawEntryMut::Occupied(entry) => *entry.into_key(), + RawEntryMut::Vacant(entry) => { + let tok = arena.push_string(s); + *(entry.insert_hashed_nocheck(hash, tok, ()).0) + } + } + } + + pub fn get(&self, s: &str) -> Option { + let Self { map, arena, hasher } = self; + map.raw_entry() + .from_hash(hasher.hash_one(s), is_match(s, arena)) + .map(|entry| *entry.0) + } + + pub fn 
get_str(&self, sym: T) -> Option<&str> { + self.arena.get(sym) + } +} + +fn is_match<'a, T: Symbol>( + target: &'a str, + arena: &'a StringArena, +) -> impl Fn(&T) -> bool + 'a { + move |sym| match arena.get(*sym) { + Some(sym) => sym == target, + None => false, + } +} diff --git a/compiler/cl-structures/src/arena/symbol.rs b/compiler/cl-structures/src/arena/symbol.rs new file mode 100644 index 0000000..0e05df6 --- /dev/null +++ b/compiler/cl-structures/src/arena/symbol.rs @@ -0,0 +1,35 @@ +use std::{fmt, hash, num::*}; + +pub trait Symbol: Copy + fmt::Debug + fmt::Display + Eq + hash::Hash { + /// The largest [`usize`] that may be stored in the [Symbol] + const MAX: usize; + /// Returns [`Some(Self)`](Some) if `value` is in range 0..=[Symbol::MAX] + fn try_from_usize(value: usize) -> Option; + /// # May Panic + /// May panic if `value` is not in range 0..=[Symbol::MAX] + fn from_usize(value: usize) -> Self { + Self::try_from_usize(value).expect("should be within MIN and MAX") + } + fn into_usize(self) -> usize; +} + +#[rustfmt::skip] +impl Symbol for usize { + const MAX: usize = usize::MAX; + fn try_from_usize(value: usize) -> Option { Some(value) } + fn into_usize(self) -> usize { self } +} + +macro_rules! impl_symbol_for_nonzero{($($int:ident: $nonzero:ident),* $(,)?) => {$( + impl Symbol for $nonzero { + const MAX: usize = $int::MAX as usize - 1; + fn try_from_usize(value: usize) -> Option { + $nonzero::try_from(value.wrapping_add(1) as $int).ok() + } + fn into_usize(self) -> usize { + self.get() as usize - 1 + } + } +)*}} + +impl_symbol_for_nonzero!(u8: NonZeroU8, u16: NonZeroU16, u32: NonZeroU32, u64: NonZeroU64, usize: NonZeroUsize); diff --git a/compiler/cl-structures/src/lib.rs b/compiler/cl-structures/src/lib.rs index b0728a4..8c73740 100644 --- a/compiler/cl-structures/src/lib.rs +++ b/compiler/cl-structures/src/lib.rs @@ -2,9 +2,17 @@ //! - [Span](struct@span::Span): Stores a start and end [Loc](struct@span::Loc) //! 
- [Loc](struct@span::Loc): Stores the index in a stream #![warn(clippy::all)] -#![feature(inline_const, dropck_eyepatch, decl_macro, get_many_mut)] +#![feature( + inline_const, + dropck_eyepatch, + decl_macro, + get_many_mut, + hash_raw_entry +)] #![deny(unsafe_op_in_unsafe_fn)] +pub mod arena; + pub mod span; pub mod tree;