Deduplicate large text datasets using https://github.com/beowolx/rensa
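The first script below hashes every row with rensa's MinHash and keeps only the first row per unique signature. As a minimal sketch of the rensa calls used there (only the constructor, update, and digest that appear in the script; the sample sentences are invented for illustration), the share of matching digest positions between two signatures approximates the Jaccard similarity of their token sets, and identical inputs always yield identical digests, which is what the exact-signature deduplication relies on:

from rensa import RMinHash

def signature(text, num_perm=128, seed=0):
    # Tokenize on whitespace, exactly as the deduplication script does
    m = RMinHash(num_perm=num_perm, seed=seed)
    m.update(text.split())
    return list(m.digest())

a = signature("the quick brown fox jumps over the lazy dog")
b = signature("the quick brown fox jumps over the lazy cat")
# Fraction of equal positions estimates the Jaccard similarity of the token sets
print(sum(x == y for x, y in zip(a, b)) / len(a))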
import pandas as pd
from datasets import load_dataset
from rensa import CMinHash, RMinHash
from tqdm import tqdm

COLUMN = "source"
SPLIT = "train"
ALGORITHM = "CMinHash"


def get_minhash(text, minhash_class, num_perm, seed=0):
    """Generate a MinHash signature for a string or a list of message dicts."""
    m = minhash_class(num_perm=num_perm, seed=seed)
    if isinstance(text, str):
        m.update(text.split())
        return m
    elif isinstance(text, list) and isinstance(text[0], dict):
        for t in text:
            key = "content" if "content" in t else "value"
            m.update(t[key].split())
        return m
    else:
        raise ValueError(
            f"Column {COLUMN} must contain a string or a list of dicts, not {type(text)}"
        )
def deduplicate_dataset(dataset, text_column, minhash_class, num_perm=128):
    hash_to_index = {}  # Maps hash to the first occurrence index
    deduplicated_indices = []
    info = []
    minhash_class = CMinHash if minhash_class == "CMinHash" else RMinHash
    for idx, example in tqdm(
        enumerate(dataset), total=len(dataset), desc=f"{ALGORITHM} deduplication"
    ):
        try:
            minhash_obj = get_minhash(example[text_column], minhash_class, num_perm)
            if minhash_obj is None:
                continue
        except Exception as e:
            print(e)
            continue
        hash_tuple = tuple(minhash_obj.digest())
        if hash_tuple not in hash_to_index:
            # First occurrence of this hash (keep it)
            hash_to_index[hash_tuple] = idx
            deduplicated_indices.append(idx)
        else:
            # Duplicate found (record which sample it's similar to)
            original_idx = hash_to_index[hash_tuple]
            info.append(
                {
                    "removed_index": idx,
                    "similar_to_index": original_idx,
                    "hash": hash_tuple,
                }
            )
    return deduplicated_indices, info
def get_removed(dataset, info, text_column):
    """Create a simple DataFrame with the removed samples"""
    if not info:
        return pd.DataFrame()
    removed_data = []
    for item in tqdm(info, desc="Collecting removed samples"):
        removed_idx = item["removed_index"]
        similar_idx = item["similar_to_index"]
        # Get the text content from both samples
        removed_text = dataset[removed_idx][text_column]
        similar_text = dataset[similar_idx][text_column]
        # Create a record
        record = {
            "removed_index": removed_idx,
            "similar_to_index": similar_idx,
            "removed_text": removed_text,
            "similar_text": similar_text,
        }
        removed_data.append(record)
    return pd.DataFrame(removed_data)
ds = load_dataset("Yehor/en-uk-translation")
print(ds)

train_dataset = ds["train"]

half_size = len(train_dataset) // 2
first_half_train_dataset = train_dataset.select(range(half_size))
second_half_train_dataset = train_dataset.select(range(half_size, len(train_dataset)))

my_ds = first_half_train_dataset
ds_file = 'dataset_dedup.parquet'

# Deduplicate dataset
dedup_indices, info = deduplicate_dataset(
    my_ds,
    text_column=COLUMN,
    minhash_class=ALGORITHM,
)

# Create the deduplicated dataset
dedup_dataset = my_ds.select(dedup_indices)
dedup_dataset.to_parquet(ds_file)

# Create simple DataFrame with only the 4 required columns
removed_df = get_removed(my_ds, info, COLUMN)
print(removed_df)

print(f"Original dataset size: {len(my_ds)} samples")
print(f"Deduplicated dataset size: {len(dedup_dataset)} samples")
print(f"{len(info)} samples were removed from the original dataset\n")
import polars as pl

df1 = pl.read_parquet('dataset_dedup.parquet')
df2 = pl.read_parquet('dataset_dedup_2.parquet')

df = pl.concat([df1, df2])
df.write_parquet('dataset.parquet')
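A quick sanity check (a sketch, not part of the original script): after concatenation the merged row count should equal the sum of the two parts.

assert len(df) == len(df1) + len(df2)
print(len(df1), len(df2), len(df))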
from huggingface_hub import HfApi

api = HfApi(token='')

api.upload_file(
    path_or_fileobj="dataset.parquet",
    path_in_repo="dataset.parquet",
    repo_id="Yehor/en-uk-translation-dedup",
    repo_type="dataset",
)
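The token is left blank above. One way to avoid hard-coding it, assuming the token is exported in the HF_TOKEN environment variable (an assumption, not something the gist sets up):

import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])  # assumes HF_TOKEN is set in the environment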