Skip to content

Instantly share code, notes, and snippets.

@oxalica
Created April 13, 2025 13:13
Show Gist options
  • Save oxalica/c3a67f56486ad77772f9df72f703eaa4 to your computer and use it in GitHub Desktop.
Save oxalica/c3a67f56486ad77772f9df72f703eaa4 to your computer and use it in GitHub Desktop.
Test the optimization potential for string literal merging
// [dependencies]
// bstr = "1"
// goblin = "0.9"
// rustc-hash = "2.1.1"
// suff_collections = "2"
use std::cmp::Reverse;
use std::collections::HashSet;
use std::hash::{Hash, Hasher};
use bstr::{BStr, ByteSlice};
use goblin::archive::Archive;
use goblin::elf::Elf;
use rustc_hash::{FxHashMap, FxHasher};
use suff_collections::tree::OnlineSuffixTree;
fn main() {
let bin;
let mut sections = Vec::new();
{
let path = std::env::args()
.nth(1)
.expect("expecting the first argument to be a path to *.a");
bin = std::fs::read(&path).unwrap();
let obj = Archive::parse(&bin).unwrap();
for i in 0..obj.len() {
let member = obj.get_at(i).unwrap();
let off = member.offset;
let len = member.header.size;
let member_bin = &bin[off as usize..][..len];
let elf = Elf::parse(member_bin).unwrap();
for h in &elf.section_headers {
if elf.strtab[h.sh_name].starts_with(".rodata") {
let content = &member_bin[h.file_range().unwrap()];
sections.push(content);
}
}
}
}
let info = |msg: &str, f: for<'a> fn(&[&'a [u8]]) -> Vec<&'a [u8]>| {
let ret = f(&sections);
let total = ret.iter().map(|s| s.len()).sum::<usize>();
println!("{msg}: {} sections, total bytes: {total}", ret.len());
};
info("copy .rodata*", |s| s.to_vec());
info("dedup simple", dedup_simple);
info("merge suffix", merge_suffix);
info("merge substring", merge_subarray);
}
fn dedup_simple<'a>(secs: &[&'a [u8]]) -> Vec<&'a [u8]> {
let mut h = <HashSet<&[u8]>>::new();
let mut out = Vec::with_capacity(secs.len());
for &sec in secs {
if h.insert(sec) {
out.push(sec);
}
}
out
}
fn merge_suffix<'a>(secs: &[&'a [u8]]) -> Vec<&'a [u8]> {
// Default seed 0 causes collisions between [0] and [0, 0].
const SEED: usize = 0xDEAD_BEEF_AABB_CCDDusize;
// suffix hash -> suffix string
let mut suffixes = <FxHashMap<u64, &[u8]>>::default();
let mut sorted = secs.to_vec();
sorted.sort_by_key(|s| Reverse(s.len()));
let mut ret = Vec::with_capacity(secs.len());
for data in sorted {
if data.is_empty() {
continue;
}
{
let mut h = FxHasher::with_seed(SEED);
for &b in data.iter().rev() {
b.hash(&mut h);
}
if suffixes.get(&h.finish()).is_some_and(|prev| *prev == data) {
continue;
}
}
ret.push(data);
let mut h = FxHasher::with_seed(SEED);
for (i, &b) in data.iter().enumerate().rev() {
b.hash(&mut h);
let suffix = &data[i..];
if let Some(prev) = suffixes.insert(h.finish(), suffix) {
assert_eq!(prev, suffix, "TODO: collision");
}
}
}
ret
}
fn merge_subarray<'a>(secs: &[&'a [u8]]) -> Vec<&'a [u8]> {
let mut sorted = secs.to_vec();
sorted.sort_by_key(|s| Reverse(s.len()));
let mut out = Vec::with_capacity(secs.len());
let mut suf = OnlineSuffixTree::new();
for &data in &sorted {
// Workaround: suff_collections requires &str, we escape &[u8] to String for them so
// the equality and contain-relationship are still maintained.
let data_str = BStr::new(data).escape_bytes().collect::<String>();
if suf.find(&data_str).is_some() {
continue;
}
suf.add(&data_str);
// Record the original bytes.
out.push(data);
}
out
}
$ cargo r --release -- librustc_driver-b3ba5b00671ce1f8.a
copy .rodata*: 135428 sections, total bytes: 6638549
dedup simple: 42456 sections, total bytes: 3675613
merge suffix: 37281 sections, total bytes: 3526647
merge substring: 28527 sections, total bytes: 3353123
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment