Last active
January 13, 2026 18:37
-
-
Save AnthonyMikh/5f6dd08e0b1e88929f9cb4187e7f91eb to your computer and use it in GitHub Desktop.
bumpalo-based thin string interner
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| use bumpalo::Bump; | |
| use std::borrow::Borrow; | |
| use std::cell::RefCell; | |
| use std::collections::HashSet; | |
| use std::fmt::{self, Debug, Display}; | |
| use std::hash::{self, Hash}; | |
| use std::marker::PhantomData; | |
| use std::ptr::NonNull; | |
| const USIZE_SIZE: usize = std::mem::size_of::<usize>(); | |
| const USIZE_ALIGN: usize = std::mem::align_of::<usize>(); | |
| #[derive(Clone, Copy, PartialEq, Eq)] | |
| pub struct InternedStr<'a> { | |
| ptr: NonNull<u8>, | |
| _lt: PhantomData<&'a ()>, | |
| } | |
| impl<'a> InternedStr<'a> { | |
| unsafe fn from_ptr(ptr: NonNull<u8>) -> Self { | |
| Self { | |
| ptr, | |
| _lt: PhantomData, | |
| } | |
| } | |
| pub fn as_str(self) -> &'a str { | |
| let len = unsafe { self.ptr.cast::<usize>().read() }; | |
| let ptr = unsafe { self.ptr.add(USIZE_SIZE) }; | |
| unsafe { str::from_utf8_unchecked(std::slice::from_raw_parts(ptr.as_ptr(), len)) } | |
| } | |
| pub fn ptr_eq(self, other: InternedStr<'_>) -> bool { | |
| self.ptr == other.ptr | |
| } | |
| unsafe fn reborrow_unchecked<'b>(self) -> InternedStr<'b> { | |
| InternedStr { | |
| ptr: self.ptr, | |
| _lt: PhantomData, | |
| } | |
| } | |
| pub fn reborrow<'b>(self) -> Self | |
| where | |
| 'a: 'b, | |
| { | |
| Self { | |
| ptr: self.ptr, | |
| _lt: PhantomData, | |
| } | |
| } | |
| } | |
| impl Borrow<str> for InternedStr<'_> { | |
| fn borrow(&self) -> &str { | |
| self.as_str() | |
| } | |
| } | |
| impl Debug for InternedStr<'_> { | |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | |
| Debug::fmt(self.as_str(), f) | |
| } | |
| } | |
| impl Display for InternedStr<'_> { | |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | |
| Display::fmt(self.as_str(), f) | |
| } | |
| } | |
| impl Hash for InternedStr<'_> { | |
| fn hash<H: hash::Hasher>(&self, state: &mut H) { | |
| self.as_str().hash(state) | |
| } | |
| } | |
| #[derive(Default)] | |
| pub struct Interner<S = std::collections::hash_map::RandomState> { | |
| strs: RefCell<HashSet<InternedStr<'static>, S>>, | |
| mem: Bump<{ USIZE_ALIGN }>, | |
| } | |
| impl Interner { | |
| pub fn new() -> Self { | |
| Self::default() | |
| } | |
| pub fn alloc(&self, s: &str) -> InternedStr<'_> { | |
| let mut strs = self.strs.borrow_mut(); | |
| if let Some(&interned) = strs.get(s) { | |
| return interned; | |
| } | |
| let len = s.len(); | |
| let ptr = { | |
| let alloc_len = len.checked_add(USIZE_SIZE).expect("length overflow"); | |
| let layout = std::alloc::Layout::array::<u8>(alloc_len) | |
| .unwrap() | |
| .align_to(USIZE_ALIGN) | |
| .expect("invalid layout"); | |
| self.mem.alloc_layout(layout) | |
| }; | |
| let s = unsafe { | |
| ptr.cast::<usize>().write(len); | |
| ptr.byte_add(USIZE_SIZE) | |
| .as_ptr() | |
| .copy_from_nonoverlapping(s.as_ptr(), len); | |
| InternedStr::from_ptr(ptr) | |
| }; | |
| strs.insert(unsafe { s.reborrow_unchecked() }); | |
| s | |
| } | |
| pub fn clear(&mut self) { | |
| self.strs.get_mut().clear(); | |
| self.mem.reset(); | |
| } | |
| } | |
| #[test] | |
| fn smoke() { | |
| let mut i = Interner::new(); | |
| let h = i.alloc("hello"); | |
| let w = i.alloc("world"); | |
| let s = format!("{h}, {w}!"); | |
| assert_eq!(s, "hello, world!"); | |
| i.clear(); | |
| let a = i.alloc("hey"); | |
| let b = i.alloc("hey"); | |
| assert!(a.ptr_eq(b)); | |
| assert_eq!(a.as_str(), b.as_str()); | |
| i.clear(); | |
| let a = i.alloc("hey"); | |
| let b = i.alloc(a.as_str()); | |
| assert!(a.ptr_eq(b)); | |
| drop(i); | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment