cl-structures: Global (ew!) and local string interning!
- StringArena provides an arena for immutable strings, inspired by other string interners, and keeps track of the ends of every allocated string. Strings inserted into the arena are assigned a Symbol.
- intern::Interner keeps track of the hashes of each inserted string, and provides deduplication for interned strings. This allows referential comparison between interned strings.
- global_intern::GlobalSym provides metered access to a Global Interner, and has a Display implementation which queries the Interner. The global interner is planned for use in cl-ast.

TODO: the unstable raw_entry API is about to be removed from Rust. Maybe switch to hashbrown, or write my own hash table?
This commit is contained in:
parent
f24bd10c53
commit
e70ffd1895
54
compiler/cl-structures/src/arena.rs
Normal file
54
compiler/cl-structures/src/arena.rs
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
//! Simple, long-lived string buffer
|
||||||
|
|
||||||
|
use std::marker::PhantomData;
|
||||||
|
use symbol::Symbol;
|
||||||
|
|
||||||
|
pub mod global_intern;
|
||||||
|
pub mod intern;
|
||||||
|
pub mod symbol;
|
||||||
|
|
||||||
|
/// Compactly stores a set of immutable strings, producing a [Symbol] for each one
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct StringArena<T: Symbol> {
|
||||||
|
ends: Vec<usize>,
|
||||||
|
buf: String,
|
||||||
|
_t: PhantomData<fn(T)>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Symbol> StringArena<T> {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Default::default()
|
||||||
|
}
|
||||||
|
/// # May panic
|
||||||
|
/// Panics if Symbol::from_usize would panic
|
||||||
|
fn next_key(&self) -> T {
|
||||||
|
Symbol::from_usize(self.ends.len())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_span(&self, key: T) -> Option<(usize, usize)> {
|
||||||
|
let key = key.into_usize();
|
||||||
|
Some((*self.ends.get(key - 1)?, *self.ends.get(key)?))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get(&self, key: T) -> Option<&str> {
|
||||||
|
let (start, end) = self.get_span(key)?;
|
||||||
|
// Safety: start and end offsets were created by push_string
|
||||||
|
Some(unsafe { self.buf.get_unchecked(start..end) })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn push_string(&mut self, s: &str) -> T {
|
||||||
|
if self.ends.is_empty() {
|
||||||
|
self.ends.push(self.buf.len())
|
||||||
|
}
|
||||||
|
let key = self.next_key();
|
||||||
|
self.buf.push_str(s);
|
||||||
|
self.ends.push(self.buf.len());
|
||||||
|
key
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Symbol> Default for StringArena<T> {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self { ends: Default::default(), buf: Default::default(), _t: PhantomData }
|
||||||
|
}
|
||||||
|
}
|
127
compiler/cl-structures/src/arena/global_intern.rs
Normal file
127
compiler/cl-structures/src/arena/global_intern.rs
Normal file
@ -0,0 +1,127 @@
|
|||||||
|
//! A global intern pool for strings, represented by the [GlobalSym] symbol
|
||||||
|
|
||||||
|
use super::{intern::Interner, symbol::Symbol};
|
||||||
|
use std::{
|
||||||
|
fmt::Display,
|
||||||
|
num::NonZeroU32,
|
||||||
|
sync::{OnceLock, RwLock},
|
||||||
|
};
|
||||||
|
|
||||||
|
/// The process-wide intern pool backing [GlobalSym]
///
/// Starts out empty; the inserting entry points create it on demand via `get_or_init`.
static GLOBAL_INTERNER: OnceLock<RwLock<Interner<GlobalSym>>> = OnceLock::new();
|
||||||
|
|
||||||
|
/// Gets the [GlobalSym] associated with this string, if there is one, or creates a new one
|
||||||
|
///
|
||||||
|
/// # Blocks
|
||||||
|
/// Locks the Global Interner for writing. If it is already locked,
|
||||||
|
/// # May Panic
|
||||||
|
/// T
|
||||||
|
pub fn get_or_insert(s: &str) -> GlobalSym {
|
||||||
|
GLOBAL_INTERNER
|
||||||
|
.get_or_init(Default::default)
|
||||||
|
.write()
|
||||||
|
.expect("global interner should not have been held by a panicked thread")
|
||||||
|
.get_or_insert(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets the [GlobalSym] associated with this string, if there is one
|
||||||
|
pub fn get(s: &str) -> Option<GlobalSym> {
|
||||||
|
GLOBAL_INTERNER.get()?.read().ok()?.get(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets the [String] associated with this [GlobalSym], if there is one
|
||||||
|
///
|
||||||
|
/// Returns none if the global symbol table is poisoned.
|
||||||
|
pub fn get_string(sym: GlobalSym) -> Option<String> {
|
||||||
|
sym.try_into().ok()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A [Symbol] handed out by the global intern pool
///
/// Stored as a [NonZeroU32]. Formatting it with [Display] prints the interned
/// string whenever the pool can be queried for it.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct GlobalSym(NonZeroU32);
|
||||||
|
|
||||||
|
impl GlobalSym {
|
||||||
|
/// Gets a [GlobalSym] associated with the given string, if one exists
|
||||||
|
pub fn try_from_str(value: &str) -> Option<Self> {
|
||||||
|
GLOBAL_INTERNER.get()?.read().ok()?.get(value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Display for GlobalSym {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
let Some(interner) = GLOBAL_INTERNER.get() else {
|
||||||
|
return write!(f, "[sym@{} (uninitialized)]", self.0);
|
||||||
|
};
|
||||||
|
let Ok(interner) = interner.read() else {
|
||||||
|
return write!(f, "[sym@{} (poisoned)]", self.0);
|
||||||
|
};
|
||||||
|
let Some(str) = interner.get_str(*self) else {
|
||||||
|
return write!(f, "[sym@{} (invalid)]", self.0);
|
||||||
|
};
|
||||||
|
str.fmt(f)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Symbol for GlobalSym {
|
||||||
|
const MAX: usize = u32::MAX as usize - 1;
|
||||||
|
fn try_from_usize(value: usize) -> Option<Self> {
|
||||||
|
Some(Self(NonZeroU32::try_from_usize(value)?))
|
||||||
|
}
|
||||||
|
fn into_usize(self) -> usize {
|
||||||
|
self.0.into_usize()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<&str> for GlobalSym {
|
||||||
|
/// Converts to this type from the input type.
|
||||||
|
///
|
||||||
|
/// # Blocks
|
||||||
|
/// This conversion blocks if the Global Interner lock is held.
|
||||||
|
///
|
||||||
|
/// # May Panic
|
||||||
|
/// Panics if the Global Interner's lock has been poisoned by a panic in another thread
|
||||||
|
fn from(value: &str) -> Self {
|
||||||
|
GLOBAL_INTERNER
|
||||||
|
.get_or_init(Default::default)
|
||||||
|
.write()
|
||||||
|
.expect("global interner should not be poisoned in another thread")
|
||||||
|
.get_or_insert(value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TryFrom<GlobalSym> for String {
|
||||||
|
type Error = GlobalSymError;
|
||||||
|
|
||||||
|
fn try_from(value: GlobalSym) -> Result<Self, Self::Error> {
|
||||||
|
let Some(interner) = GLOBAL_INTERNER.get() else {
|
||||||
|
Err(GlobalSymError::Uninitialized)?
|
||||||
|
};
|
||||||
|
let Ok(interner) = interner.write() else {
|
||||||
|
Err(GlobalSymError::Poisoned)?
|
||||||
|
};
|
||||||
|
match interner.get_str(value) {
|
||||||
|
None => Err(GlobalSymError::Unseen(value)),
|
||||||
|
Some(string) => Ok(string.into()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
|
||||||
|
pub enum GlobalSymError {
|
||||||
|
Uninitialized,
|
||||||
|
Poisoned,
|
||||||
|
Unseen(GlobalSym),
|
||||||
|
}
|
||||||
|
// No items are overridden: the trait's provided defaults are used as-is.
impl std::error::Error for GlobalSymError {}
|
||||||
|
impl Display for GlobalSymError {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
match self {
|
||||||
|
GlobalSymError::Uninitialized => "String pool was not initialized".fmt(f),
|
||||||
|
GlobalSymError::Poisoned => "String pool was held by panicking thread".fmt(f),
|
||||||
|
GlobalSymError::Unseen(sym) => {
|
||||||
|
write!(f, "Symbol {sym:?} not present in String pool")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests;
|
28
compiler/cl-structures/src/arena/global_intern/tests.rs
Normal file
28
compiler/cl-structures/src/arena/global_intern/tests.rs
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
//! Tests for the global intern pool
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
fn globalsym_from_returns_unique_value_for_unique_keys() {
    let bar = GlobalSym::from("foo_bar");
    let baz = GlobalSym::from("foo_baz");
    // Distinct strings must receive distinct symbols...
    assert_ne!(bar, baz);
    // ...and interning a string again must hand back the symbol it already has
    let bar_again = GlobalSym::from("foo_bar");
    assert_eq!(bar, bar_again);
    let baz_again = GlobalSym::from("foo_baz");
    assert_eq!(baz, baz_again);
}
|
||||||
|
#[test]
fn get_returns_none_before_init() {
    // None of the tests in this module intern the empty string, so this lookup
    // is expected to miss whether or not the pool exists yet
    match get("") {
        None => {}
        Some(value) => panic!("{value}"),
    }
}
|
||||||
|
#[test]
fn get_returns_some_when_key_exists() {
    // Intern the key first, so that the lookup below has something to find
    let _ = GlobalSym::from("foo_bar");
    assert!(get("foo_bar").is_some());
}
|
||||||
|
|
||||||
|
#[test]
fn get_returns_the_same_thing_as_globalsym_from() {
    let interned = GlobalSym::from("foo_bar");
    let looked_up = get("foo_bar");
    assert_eq!(Some(interned), looked_up);
}
|
55
compiler/cl-structures/src/arena/intern.rs
Normal file
55
compiler/cl-structures/src/arena/intern.rs
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
//! A string interner with deduplication
|
||||||
|
|
||||||
|
use super::{symbol::Symbol, StringArena};
|
||||||
|
use std::{
|
||||||
|
collections::{hash_map::RawEntryMut, HashMap},
|
||||||
|
hash::{BuildHasher, RandomState},
|
||||||
|
};
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct Interner<T: Symbol, H: BuildHasher = RandomState> {
|
||||||
|
map: HashMap<T, ()>,
|
||||||
|
arena: StringArena<T>,
|
||||||
|
hasher: H,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Symbol, H: BuildHasher + Default> Default for Interner<T, H> {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self { map: Default::default(), arena: Default::default(), hasher: Default::default() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Symbol, H: BuildHasher> Interner<T, H> {
|
||||||
|
pub fn get_or_insert(&mut self, s: &str) -> T {
|
||||||
|
let Self { map, arena, hasher } = self;
|
||||||
|
let hash = hasher.hash_one(s);
|
||||||
|
match map.raw_entry_mut().from_hash(hash, is_match(s, arena)) {
|
||||||
|
RawEntryMut::Occupied(entry) => *entry.into_key(),
|
||||||
|
RawEntryMut::Vacant(entry) => {
|
||||||
|
let tok = arena.push_string(s);
|
||||||
|
*(entry.insert_hashed_nocheck(hash, tok, ()).0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get(&self, s: &str) -> Option<T> {
|
||||||
|
let Self { map, arena, hasher } = self;
|
||||||
|
map.raw_entry()
|
||||||
|
.from_hash(hasher.hash_one(s), is_match(s, arena))
|
||||||
|
.map(|entry| *entry.0)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_str(&self, sym: T) -> Option<&str> {
|
||||||
|
self.arena.get(sym)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_match<'a, T: Symbol>(
|
||||||
|
target: &'a str,
|
||||||
|
arena: &'a StringArena<T>,
|
||||||
|
) -> impl Fn(&T) -> bool + 'a {
|
||||||
|
move |sym| match arena.get(*sym) {
|
||||||
|
Some(sym) => sym == target,
|
||||||
|
None => false,
|
||||||
|
}
|
||||||
|
}
|
35
compiler/cl-structures/src/arena/symbol.rs
Normal file
35
compiler/cl-structures/src/arena/symbol.rs
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
use std::{fmt, hash, num::*};
|
||||||
|
|
||||||
|
/// A small, copyable key which can be converted to and from a [`usize`] index
pub trait Symbol: Copy + fmt::Debug + fmt::Display + Eq + hash::Hash {
    /// The largest [`usize`] that may be stored in the [Symbol]
    const MAX: usize;

    /// Returns [`Some(Self)`](Some) if `value` is in range 0..=[Symbol::MAX]
    fn try_from_usize(value: usize) -> Option<Self>;

    /// Infallible counterpart of [Symbol::try_from_usize]
    ///
    /// # May Panic
    /// May panic if `value` is not in range 0..=[Symbol::MAX]
    fn from_usize(value: usize) -> Self {
        let converted = Self::try_from_usize(value);
        converted.expect("should be within MIN and MAX")
    }

    /// Converts the [Symbol] back into the [`usize`] it represents
    fn into_usize(self) -> usize;
}
|
||||||
|
|
||||||
|
#[rustfmt::skip]
|
||||||
|
impl Symbol for usize {
|
||||||
|
const MAX: usize = usize::MAX;
|
||||||
|
fn try_from_usize(value: usize) -> Option<Self> { Some(value) }
|
||||||
|
fn into_usize(self) -> usize { self }
|
||||||
|
}
|
||||||
|
|
||||||
|
macro_rules! impl_symbol_for_nonzero{($($int:ident: $nonzero:ident),* $(,)?) => {$(
|
||||||
|
impl Symbol for $nonzero {
|
||||||
|
const MAX: usize = $int::MAX as usize - 1;
|
||||||
|
fn try_from_usize(value: usize) -> Option<Self> {
|
||||||
|
$nonzero::try_from(value.wrapping_add(1) as $int).ok()
|
||||||
|
}
|
||||||
|
fn into_usize(self) -> usize {
|
||||||
|
self.get() as usize - 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)*}}
|
||||||
|
|
||||||
|
impl_symbol_for_nonzero!(u8: NonZeroU8, u16: NonZeroU16, u32: NonZeroU32, u64: NonZeroU64, usize: NonZeroUsize);
|
@ -2,9 +2,17 @@
|
|||||||
//! - [Span](struct@span::Span): Stores a start and end [Loc](struct@span::Loc)
|
//! - [Span](struct@span::Span): Stores a start and end [Loc](struct@span::Loc)
|
||||||
//! - [Loc](struct@span::Loc): Stores the index in a stream
|
//! - [Loc](struct@span::Loc): Stores the index in a stream
|
||||||
#![warn(clippy::all)]
|
#![warn(clippy::all)]
|
||||||
#![feature(inline_const, dropck_eyepatch, decl_macro, get_many_mut)]
|
#![feature(
|
||||||
|
inline_const,
|
||||||
|
dropck_eyepatch,
|
||||||
|
decl_macro,
|
||||||
|
get_many_mut,
|
||||||
|
hash_raw_entry
|
||||||
|
)]
|
||||||
#![deny(unsafe_op_in_unsafe_fn)]
|
#![deny(unsafe_op_in_unsafe_fn)]
|
||||||
|
|
||||||
|
pub mod arena;
|
||||||
|
|
||||||
pub mod span;
|
pub mod span;
|
||||||
|
|
||||||
pub mod tree;
|
pub mod tree;
|
||||||
|
Loading…
Reference in New Issue
Block a user