Deduplicate large text datasets using https://github.com/beowolx/rensa
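The first script below hashes every row with rensa's MinHash and keeps only the first row per unique signature. As a minimal sketch of the rensa calls used there (only the constructor, update, and digest that appear in the script; the sample sentences are invented for illustration), the share of matching digest positions between two signatures approximates the Jaccard similarity of their token sets, and identical inputs always yield identical digests, which is what the exact-signature deduplication relies on:

from rensa import RMinHash

def signature(text, num_perm=128, seed=0):
    # Tokenize on whitespace, exactly as the deduplication script does
    m = RMinHash(num_perm=num_perm, seed=seed)
    m.update(text.split())
    return list(m.digest())

a = signature("the quick brown fox jumps over the lazy dog")
b = signature("the quick brown fox jumps over the lazy cat")
# Fraction of equal positions estimates the Jaccard similarity of the token sets
print(sum(x == y for x, y in zip(a, b)) / len(a))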
import pandas as pd
from datasets import load_dataset
from rensa import CMinHash, RMinHash
from tqdm import tqdm

COLUMN = "source"
SPLIT = "train"
ALGORITHM = "CMinHash"


def get_minhash(text, minhash_class, num_perm, seed=0):
    """Generate a MinHash signature for a string or a list of message dicts."""
    m = minhash_class(num_perm=num_perm, seed=seed)
    if isinstance(text, str):
        m.update(text.split())
        return m
    elif isinstance(text, list) and isinstance(text[0], dict):
        for t in text:
            key = "content" if "content" in t else "value"
            m.update(t[key].split())
        return m
    else:
        raise ValueError(
            f"Column {COLUMN} must contain a string or a list of dicts, not {type(text)}"
        )
def deduplicate_dataset(dataset, text_column, minhash_class, num_perm=128):
    hash_to_index = {}  # Maps hash to the first occurrence index
    deduplicated_indices = []
    info = []
    minhash_class = CMinHash if minhash_class == "CMinHash" else RMinHash
    for idx, example in tqdm(
        enumerate(dataset), total=len(dataset), desc=f"{ALGORITHM} deduplication"
    ):
        try:
            minhash_obj = get_minhash(example[text_column], minhash_class, num_perm)
            if minhash_obj is None:
                continue
        except Exception as e:
            print(e)
            continue
        hash_tuple = tuple(minhash_obj.digest())
        if hash_tuple not in hash_to_index:
            # First occurrence of this hash (keep it)
            hash_to_index[hash_tuple] = idx
            deduplicated_indices.append(idx)
        else:
            # Duplicate found (record which sample it's similar to)
            original_idx = hash_to_index[hash_tuple]
            info.append(
                {
                    "removed_index": idx,
                    "similar_to_index": original_idx,
                    "hash": hash_tuple,
                }
            )
    return deduplicated_indices, info
def get_removed(dataset, info, text_column):
    """Create a simple DataFrame with the removed samples"""
    if not info:
        return pd.DataFrame()
    removed_data = []
    for item in tqdm(info, desc="Collecting removed samples"):
        removed_idx = item["removed_index"]
        similar_idx = item["similar_to_index"]
        # Get the text content from both samples
        removed_text = dataset[removed_idx][text_column]
        similar_text = dataset[similar_idx][text_column]
        # Create a record
        record = {
            "removed_index": removed_idx,
            "similar_to_index": similar_idx,
            "removed_text": removed_text,
            "similar_text": similar_text,
        }
        removed_data.append(record)
    return pd.DataFrame(removed_data)
ds = load_dataset("Yehor/en-uk-translation")
print(ds)

train_dataset = ds["train"]

half_size = len(train_dataset) // 2
first_half_train_dataset = train_dataset.select(range(half_size))
second_half_train_dataset = train_dataset.select(range(half_size, len(train_dataset)))

my_ds = first_half_train_dataset
ds_file = 'dataset_dedup.parquet'

# Deduplicate dataset
dedup_indices, info = deduplicate_dataset(
    my_ds,
    text_column=COLUMN,
    minhash_class=ALGORITHM,
)

# Create the deduplicated dataset
dedup_dataset = my_ds.select(dedup_indices)
dedup_dataset.to_parquet(ds_file)

# Create simple DataFrame with only the 4 required columns
removed_df = get_removed(my_ds, info, COLUMN)
print(removed_df)

print(f"Original dataset size: {len(my_ds)} samples")
print(f"Deduplicated dataset size: {len(dedup_dataset)} samples")
print(f"{len(info)} samples were removed from the original dataset\n")
import polars as pl

df1 = pl.read_parquet('dataset_dedup.parquet')
df2 = pl.read_parquet('dataset_dedup_2.parquet')

df = pl.concat([df1, df2])
df.write_parquet('dataset.parquet')
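A quick sanity check (a sketch, not part of the original script): after concatenation the merged row count should equal the sum of the two parts.

assert len(df) == len(df1) + len(df2)
print(len(df1), len(df2), len(df))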
from huggingface_hub import HfApi

api = HfApi(token='')

api.upload_file(
    path_or_fileobj="dataset.parquet",
    path_in_repo="dataset.parquet",
    repo_id="Yehor/en-uk-translation-dedup",
    repo_type="dataset",
)
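The token is left blank above. One way to avoid hard-coding it, assuming the token is exported in the HF_TOKEN environment variable (an assumption, not something the gist sets up):

import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])  # assumes HF_TOKEN is set in the environment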