From 81cf05cc694e77c321a01e0a2501a65a92910490 Mon Sep 17 00:00:00 2001
From: John
Date: Sat, 27 Apr 2024 20:16:36 -0500
Subject: [PATCH] cl-structures: Interning v3: ACTUALLY DO THE THING

Here we have *real* interning, producing unique references
if and only if the input is unique!

Boy am I glad I invested time into this, because it's really fun to work with.

Hopefully my logic regarding Send-ness and Sync-ness aren't completely unsound.
---
 compiler/cl-structures/Cargo.toml    |   1 +
 compiler/cl-structures/src/intern.rs | 292 +++++++++++++++++++++++++++
 compiler/cl-structures/src/lib.rs    |   2 +
 3 files changed, 295 insertions(+)
 create mode 100644 compiler/cl-structures/src/intern.rs

diff --git a/compiler/cl-structures/Cargo.toml b/compiler/cl-structures/Cargo.toml
index f12136c..aa2823e 100644
--- a/compiler/cl-structures/Cargo.toml
+++ b/compiler/cl-structures/Cargo.toml
@@ -9,3 +9,4 @@ publish.workspace = true
 
 [dependencies]
 hashbrown = { version = "0.14.3", default-features = false }
+cl-arena = { path = "../cl-arena" }
diff --git a/compiler/cl-structures/src/intern.rs b/compiler/cl-structures/src/intern.rs
new file mode 100644
index 0000000..4cd5dc3
--- /dev/null
+++ b/compiler/cl-structures/src/intern.rs
@@ -0,0 +1,292 @@
+//! Interners for [strings](string_interner) and arbitrary [types](typed_interner).
+//!
+//! An object is [Interned][1] if it is allocated within one of the interners
+//! in this module. [Interned][1] values have referential equality semantics, and
+//! [Deref](std::ops::Deref) to the value within their respective intern pool.
+//!
+//! This means, of course, that the same value interned in two different pools will be
+//! considered *not equal* by [Eq] and [Hash](std::hash::Hash).
+//!
+//! [1]: interned::Interned
+
+pub mod interned {
+    //! An [Interned] reference asserts its wrapped value has referential equality.
+    use super::string_interner::StringInterner;
+    use std::{
+        fmt::{Debug, Display},
+        hash::Hash,
+        ops::Deref,
+    };
+
+    /// An [Interned] value is one that is *referentially comparable*.
+    /// That is, the interned value is unique in memory, simplifying
+    /// its equality and hashing implementation.
+    ///
+    /// Comparing [Interned] values via [PartialOrd] or [Ord] will still
+    /// dereference to the wrapped pointers, and as such, may produce
+    /// results inconsistent with [PartialEq] or [Eq].
+    #[repr(transparent)]
+    #[derive(Eq)]
+    pub struct Interned<'a, T: ?Sized> {
+        value: &'a T,
+    }
+
+    impl<'a, T: ?Sized> Interned<'a, T> {
+        /// Gets the internal value as a pointer
+        pub fn as_ptr(interned: &Self) -> *const T {
+            interned.value
+        }
+    }
+
+    impl<'a, T: ?Sized + Debug> Debug for Interned<'a, T> {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            f.debug_struct("Interned")
+                .field("value", &self.value)
+                .finish()
+        }
+    }
+    impl<'a, T: ?Sized> Interned<'a, T> {
+        pub(super) fn new(value: &'a T) -> Self {
+            Self { value }
+        }
+    }
+    impl<'a, T: ?Sized> Deref for Interned<'a, T> {
+        type Target = T;
+        fn deref(&self) -> &Self::Target {
+            self.value
+        }
+    }
+    impl<'a, T: ?Sized> Copy for Interned<'a, T> {}
+    impl<'a, T: ?Sized> Clone for Interned<'a, T> {
+        fn clone(&self) -> Self {
+            *self
+        }
+    }
+    // TODO: These implementations are subtly incorrect, as they do not line up with `eq`
+    // impl<'a, T: ?Sized + PartialOrd> PartialOrd for Interned<'a, T> {
+    //     fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+    //         match self == other {
+    //             true => Some(std::cmp::Ordering::Equal),
+    //             false => self.value.partial_cmp(other.value),
+    //         }
+    //     }
+    // }
+    // impl<'a, T: ?Sized + Ord> Ord for Interned<'a, T> {
+    //     fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+    //         match self == other {
+    //             true => std::cmp::Ordering::Equal,
+    //             false => self.value.cmp(other.value),
+    //         }
+    //     }
+    // }
+
+    impl<'a, T: ?Sized> PartialEq for Interned<'a, T> {
+        fn eq(&self, other: &Self) -> bool {
+            std::ptr::eq(self.value, other.value)
+        }
+    }
+    impl<'a, T: ?Sized> Hash for Interned<'a, T> {
+        fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+            Self::as_ptr(self).hash(state)
+        }
+    }
+    impl<T: ?Sized + Display> Display for Interned<'_, T> {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            self.value.fmt(f)
+        }
+    }
+
+    impl<T: AsRef<str>> From<T> for Interned<'static, str> {
+        /// Types which implement [AsRef<str>] will be stored in the global [StringInterner]
+        fn from(value: T) -> Self {
+            from_str(value.as_ref())
+        }
+    }
+    fn from_str(value: &str) -> Interned<'static, str> {
+        let global_interner = StringInterner::global();
+        global_interner.get_or_insert(value)
+    }
+}
+
+pub mod string_interner {
+    //! A [StringInterner] hands out [Interned] copies of each unique string given to it.
+
+    use super::interned::Interned;
+    use cl_arena::dropless_arena::DroplessArena;
+    use std::{
+        collections::HashSet,
+        ptr::addr_of,
+        sync::{OnceLock, RwLock},
+    };
+
+    /// A string interner hands out [Interned] copies of each unique string given to it.
+    pub struct StringInterner<'a> {
+        arena: &'a DroplessArena,
+        keys: RwLock<HashSet<&'a str>>,
+    }
+
+    impl StringInterner<'static> {
+        /// Gets a reference to a global string interner whose [Interned] strings are `'static`
+        pub fn global() -> &'static Self {
+            static GLOBAL_INTERNER: OnceLock<StringInterner<'static>> = OnceLock::new();
+            static mut ARENA: DroplessArena = DroplessArena::new();
+
+            // SAFETY: The RwLock within the interner's `keys` protects the arena
+            // from being modified concurrently.
+            GLOBAL_INTERNER.get_or_init(|| StringInterner {
+                arena: unsafe { &*addr_of!(ARENA) },
+                keys: Default::default(),
+            })
+        }
+    }
+
+    impl<'a> StringInterner<'a> {
+        /// Creates a new [StringInterner] backed by the provided [DroplessArena]
+        pub fn new(arena: &'a DroplessArena) -> Self {
+            Self { arena, keys: RwLock::new(HashSet::new()) }
+        }
+
+        /// Returns an [Interned] copy of the given string,
+        /// allocating a new one if it doesn't already exist.
+        ///
+        /// # Blocks
+        /// This function blocks when the interner is held by another thread.
+        pub fn get_or_insert(&self, value: &str) -> Interned<'a, str> {
+            let Self { arena, keys } = self;
+
+            // Safety: Holding this write guard for the entire duration of this
+            // function enforces a safety invariant. See StringInterner::global.
+            let mut keys = keys.write().expect("should not be poisoned");
+
+            Interned::new(match keys.get(value) {
+                Some(value) => value,
+                None => {
+                    let value = match value {
+                        "" => "", // Arena will panic if passed an empty string
+                        _ => arena.alloc_str(value),
+                    };
+                    keys.insert(value);
+                    value
+                }
+            })
+        }
+        /// Gets a reference to the interned copy of the given value, if it exists
+        /// # Blocks
+        /// This function blocks when the interner is held by another thread.
+        pub fn get(&self, value: &str) -> Option<Interned<'a, str>> {
+            let keys = self.keys.read().expect("should not be poisoned");
+            keys.get(value).copied().map(Interned::new)
+        }
+    }
+
+    impl std::fmt::Debug for StringInterner<'_> {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            f.debug_struct("Interner")
+                .field("keys", &self.keys)
+                .finish()
+        }
+    }
+
+    // # Safety:
+    // This is fine because StringInterner::get_or_insert(v) holds a RwLock
+    // for its entire duration, and doesn't touch the non-(Send+Sync) arena
+    // unless the lock is held by a write guard.
+    unsafe impl<'a> Send for StringInterner<'a> {}
+    unsafe impl<'a> Sync for StringInterner<'a> {}
+
+    #[cfg(test)]
+    mod tests {
+        use super::StringInterner;
+
+        macro_rules! ptr_eq {
+            ($a: expr, $b: expr $(, $($t:tt)*)?) => {
+                assert_eq!(std::ptr::addr_of!($a), std::ptr::addr_of!($b) $(, $($t)*)?)
+            };
+        }
+        macro_rules! ptr_ne {
+            ($a: expr, $b: expr $(, $($t:tt)*)?) => {
+                assert_ne!(std::ptr::addr_of!($a), std::ptr::addr_of!($b) $(, $($t)*)?)
+            };
+        }
+
+        #[test]
+        fn empties_is_unique() {
+            let interner = StringInterner::global();
+            let empty = interner.get_or_insert("");
+            let empty2 = interner.get_or_insert("");
+            ptr_eq!(*empty, *empty2);
+        }
+        #[test]
+        fn non_empty_is_unique() {
+            let interner = StringInterner::global();
+            let nonempty1 = interner.get_or_insert("not empty!");
+            let nonempty2 = interner.get_or_insert("not empty!");
+            let different = interner.get_or_insert("different!");
+            ptr_eq!(*nonempty1, *nonempty2);
+            ptr_ne!(*nonempty1, *different);
+        }
+    }
+}
+
+pub mod typed_interner {
+    //! A [TypedInterner] hands out [Interned] references for arbitrary types.
+    //!
+    //! Note: It is a *logic error* to modify the returned reference via interior mutability
+    //! in a way that changes the values produced by [Eq] and [Hash].
+    //!
+    //! See the standard library [HashSet] for more details.
+    use super::interned::Interned;
+    use cl_arena::typed_arena::TypedArena;
+    use std::{collections::HashSet, hash::Hash, sync::RwLock};
+
+    /// A [TypedInterner] hands out [Interned] references for arbitrary types.
+    ///
+    /// See the [module-level documentation](self) for more information.
+    pub struct TypedInterner<'a, T: Eq + Hash> {
+        arena: &'a TypedArena<T>,
+        keys: RwLock<HashSet<&'a T>>,
+    }
+
+    impl<'a, T: Eq + Hash> TypedInterner<'a, T> {
+        /// Creates a new [TypedInterner] backed by the provided [TypedArena]
+        pub fn new(arena: &'a TypedArena<T>) -> Self {
+            Self { arena, keys: RwLock::new(HashSet::new()) }
+        }
+
+        /// Converts the given value into an [Interned] value.
+        ///
+        /// # Blocks
+        /// This function blocks when the interner is held by another thread.
+        pub fn get_or_insert(&self, value: T) -> Interned<'a, T> {
+            let Self { arena, keys } = self;
+
+            // Safety: Locking the keyset for the entire duration of this function
+            // enforces a safety invariant when the interner is stored in a global.
+            let mut keys = keys.write().expect("should not be poisoned");
+
+            Interned::new(match keys.get(&value) {
+                Some(value) => value,
+                None => {
+                    let value = arena.alloc(value);
+                    keys.insert(value);
+                    value
+                }
+            })
+        }
+        /// Returns the [Interned] copy of the given value, if one already exists
+        ///
+        /// # Blocks
+        /// This function blocks when the interner is being written to by another thread.
+        pub fn get(&self, value: &T) -> Option<Interned<'a, T>> {
+            let keys = self.keys.read().expect("should not be poisoned");
+            keys.get(value).copied().map(Interned::new)
+        }
+    }
+
+    /// # Safety
+    /// This should be safe because references yielded by
+    /// [get_or_insert](TypedInterner::get_or_insert) are unique, and the function uses
+    /// the [RwLock] around the [HashSet] to ensure mutual exclusion
+    unsafe impl<'a, T: Eq + Hash + Send> Send for TypedInterner<'a, T> where &'a T: Send {}
+    unsafe impl<'a, T: Eq + Hash + Send + Sync> Sync for TypedInterner<'a, T> {}
+}
diff --git a/compiler/cl-structures/src/lib.rs b/compiler/cl-structures/src/lib.rs
index f0c11b0..56f4883 100644
--- a/compiler/cl-structures/src/lib.rs
+++ b/compiler/cl-structures/src/lib.rs
@@ -7,6 +7,8 @@
 
 pub mod arena;
 
+pub mod intern;
+
 pub mod span;
 
 pub mod tree;