@egorsmkv
Last active July 11, 2025 15:16
Deduplicate large text datasets using https://github.com/beowolx/rensa
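
As a quick illustration of the property the script relies on (a minimal sketch, not part of the original gist): rensa's MinHash objects are updated with token lists, and two samples whose digests come out identical are treated as duplicates. Only the calls the script itself uses are assumed here.

# Minimal sketch (illustration only): identical token streams yield identical digests.
from rensa import RMinHash

a = RMinHash(num_perm=128, seed=0)
a.update("the quick brown fox".split())

b = RMinHash(num_perm=128, seed=0)
b.update("the quick brown fox".split())

assert tuple(a.digest()) == tuple(b.digest())

The full deduplication script follows.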
import pandas as pd
from datasets import load_dataset
from rensa import CMinHash, RMinHash
from tqdm import tqdm

COLUMN = "source"  # dataset column that holds the text to deduplicate on
SPLIT = "train"  # split to process
ALGORITHM = "CMinHash"  # "CMinHash" or "RMinHash"

def get_minhash(text, minhash_class, num_perm, seed=0):
    """Build a MinHash signature for a sample (a string or a list of message dicts)."""
    m = minhash_class(num_perm=num_perm, seed=seed)
    if isinstance(text, str):
        m.update(text.split())
        return m
    elif isinstance(text, list) and text and isinstance(text[0], dict):
        # Conversation-style samples: hash the text of every message
        for t in text:
            key = "content" if "content" in t else "value"
            m.update(t[key].split())
        return m
    else:
        raise ValueError(
            f"Column {COLUMN} must contain a string or a list of dicts, not {type(text)}"
        )

def deduplicate_dataset(dataset, text_column, minhash_class, num_perm=128):
    hash_to_index = {}  # Maps hash to the first occurrence index
    deduplicated_indices = []
    info = []
    minhash_class = CMinHash if minhash_class == "CMinHash" else RMinHash
    for idx, example in tqdm(
        enumerate(dataset), total=len(dataset), desc=f"{ALGORITHM} deduplication"
    ):
        try:
            minhash_obj = get_minhash(example[text_column], minhash_class, num_perm)
            if minhash_obj is None:
                continue
        except Exception as e:
            print(e)
            continue
        hash_tuple = tuple(minhash_obj.digest())
        if hash_tuple not in hash_to_index:
            # First occurrence of this hash (keep it)
            hash_to_index[hash_tuple] = idx
            deduplicated_indices.append(idx)
        else:
            # Duplicate found (record which sample it's similar to)
            original_idx = hash_to_index[hash_tuple]
            info.append(
                {
                    "removed_index": idx,
                    "similar_to_index": original_idx,
                    "hash": hash_tuple,
                }
            )
    return deduplicated_indices, info

def get_removed(dataset, info, text_column):
    """Create a simple DataFrame with removed samples"""
    if not info:
        return pd.DataFrame()
    removed_data = []
    for item in tqdm(info, desc="Collecting removed samples"):
        removed_idx = item["removed_index"]
        similar_idx = item["similar_to_index"]
        # Get the text content from both samples
        removed_text = dataset[removed_idx][text_column]
        similar_text = dataset[similar_idx][text_column]
        # Create a record
        record = {
            "removed_index": removed_idx,
            "similar_to_index": similar_idx,
            "removed_text": removed_text,
            "similar_text": similar_text,
        }
        removed_data.append(record)
    return pd.DataFrame(removed_data)

ds = load_dataset("Yehor/en-uk-translation")
print(ds)

train_dataset = ds[SPLIT]

# Process the split in two halves; this run deduplicates the first half,
# the second half ends up in dataset_dedup_2.parquet before the merge step below.
half_size = len(train_dataset) // 2
first_half_train_dataset = train_dataset.select(range(half_size))
second_half_train_dataset = train_dataset.select(range(half_size, len(train_dataset)))

my_ds = first_half_train_dataset
ds_file = "dataset_dedup.parquet"
# Deduplicate dataset
dedup_indices, info = deduplicate_dataset(
    my_ds,
    text_column=COLUMN,
    minhash_class=ALGORITHM,
)
# Create the deduplicated dataset
dedup_dataset = my_ds.select(dedup_indices)
dedup_dataset.to_parquet(ds_file)
# Build a DataFrame pairing each removed sample with the kept sample it duplicates
removed_df = get_removed(my_ds, info, COLUMN)
print(removed_df)
print(f"Original dataset size: {len(my_ds)} samples")
print(f"Deduplicated dataset size: {len(dedup_dataset)} samples")
print(f"{len(info)} samples were removed from the original dataset\n")

# Merge the deduplicated halves into a single Parquet file
import polars as pl

df1 = pl.read_parquet("dataset_dedup.parquet")
df2 = pl.read_parquet("dataset_dedup_2.parquet")

df = pl.concat([df1, df2])
df.write_parquet("dataset.parquet")

# Upload the merged dataset to the Hugging Face Hub
from huggingface_hub import HfApi

api = HfApi(token="")  # set your Hugging Face token here
api.upload_file(
    path_or_fileobj="dataset.parquet",
    path_in_repo="dataset.parquet",
    repo_id="Yehor/en-uk-translation-dedup",
    repo_type="dataset",
)
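
# Optional sanity check (not in the original gist): confirm the merged file has the
# expected number of rows before relying on the uploaded dataset.
check = pl.read_parquet("dataset.parquet")
print(f"Merged deduplicated dataset: {check.height} rows")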