Fast implementation of "first-fit-decreasing" document packing, with a benchmark against a baseline. It is around 5x faster than the baseline, which is taken from https://huggingface.co/DiscoResearch/Llama3-German-8B#document-packing. Note that memory usage is higher in the optimized version.
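To illustrate first-fit-decreasing with made-up numbers (not taken from the benchmark): with a block size of 10 and documents of lengths 7, 5, 4, 3 and 1, the documents are processed from longest to shortest and each one is placed into the first bin that still has room for it, opening a new bin otherwise. This yields two bins, [7, 3] and [5, 4, 1], both filled exactly to the block size.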
import gc
import time

import numpy as np
import pandas as pd
from tqdm import tqdm


def pack_documents_original(tokenized_documents, block_size: int = 8192, use_tqdm=True):
    start_time = time.perf_counter()
    # First-fit-decreasing: process documents from longest to shortest and put each
    # one into the first bin that still has room; open a new bin otherwise.
    sorted_docs = sorted(tokenized_documents, key=len, reverse=True)
    bins = []

    def find_bin(doc):
        # Re-computes the occupied length of every bin on each call,
        # which is what makes this baseline slow.
        for b in bins:
            if sum(len(d) for d in b) + len(doc) <= block_size:
                return b
        return None

    for doc in tqdm(sorted_docs, disable=not use_tqdm, leave=False):
        target_bin = find_bin(doc)
        if target_bin is not None:
            target_bin.append(doc)
        else:
            bins.append([doc])

    end_time = time.perf_counter()
    return bins, end_time - start_time


def pack_documents_optimized(tokenized_documents, block_size: int = 8192, use_tqdm=True):
    start_time = time.perf_counter()
    # Compute every document length once and keep a running total per bin, so a
    # placement check costs O(number of bins) instead of re-summing bin contents.
    doc_lengths = np.array([len(doc) for doc in tokenized_documents])
    sorted_indices = np.argsort(-doc_lengths)
    bins = []
    bin_lengths = []

    def find_bin(_doc_length):
        for i, b_length in enumerate(bin_lengths):
            if b_length + _doc_length <= block_size:
                return i
        return None

    for idx in tqdm(sorted_indices, disable=not use_tqdm, leave=False):
        doc_length = doc_lengths[idx]
        bin_idx = find_bin(doc_length)
        if bin_idx is not None:
            bins[bin_idx].append(idx)
            bin_lengths[bin_idx] += doc_length
        else:
            bins.append([idx])
            bin_lengths.append(doc_length)

    # Bins hold indices during packing; resolve them to the actual documents at the end.
    packed_bins = [[tokenized_documents[idx] for idx in b] for b in bins]
    end_time = time.perf_counter()
    return packed_bins, end_time - start_time


def benchmark():
    results = []
    for num_documents in (1000, 10_000):
        for max_doc_length in (2048, 4096, 8192):
            original_times = []
            optimized_times = []
            result = {
                "num_documents": num_documents,
                "max_doc_length": max_doc_length,
            }
            for run_idx in range(1, 4):
                # Generate `num_documents` random strings of length up to `max_doc_length`
                tokenized_documents = [
                    "".join(list(np.random.choice(['a', 'b', 'c', 'd', ' '], size=np.random.randint(1, max_doc_length))))
                    for _ in range(num_documents)
                ]
                packed_bins_optimized, time_optimized = pack_documents_optimized(tokenized_documents, max_doc_length)
                gc.collect()
                packed_bins_original, time_original = pack_documents_original(tokenized_documents, max_doc_length)
                gc.collect()
                # Ensure results are equivalent. Minor differences in sorting may have happened when two strings
                # have the same length, but both approaches should yield bins with the same total lengths.
                grouped_lens_original = [sum(len(item) for item in _bin) for _bin in packed_bins_original]
                grouped_lens_optimized = [sum(len(item) for item in _bin) for _bin in packed_bins_optimized]
                assert grouped_lens_original == grouped_lens_optimized, "Results are not equivalent"
                original_times.append(time_original)
                optimized_times.append(time_optimized)
                result[f"time_{run_idx}_original"] = time_original
                result[f"time_{run_idx}_optimized"] = time_optimized
            # Print mean and standard deviation of times across the three runs
            print(f"Number of documents: {num_documents}, max document length: {max_doc_length}")
            print(f"Optimized approach mean time: {np.mean(optimized_times):.4f} seconds (std {np.std(optimized_times):.4f})")
            print(f"Original approach mean time: {np.mean(original_times):.4f} seconds (std {np.std(original_times):.4f})")
            print()
            result["mean_time_original"] = np.mean(original_times)
            result["std_time_original"] = np.std(original_times)
            result["mean_time_optimized"] = np.mean(optimized_times)
            result["std_time_optimized"] = np.std(optimized_times)
            results.append(result)

    df = pd.DataFrame(results)
    df.to_csv("benchmark_results.csv", index=False)
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print(df[["num_documents", "max_doc_length", "mean_time_original", "mean_time_optimized"]])


if __name__ == "__main__":
    benchmark()
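A minimal usage sketch, separate from the benchmark above: the packing functions only require that each document supports len(), so pre-tokenized documents (lists of token ids) can be passed directly. The tokenizer and model name below are illustrative assumptions, not part of the original script.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative model choice
texts = ["a first example document", "a second, somewhat longer example document"]
# Each entry is a list of token ids, so len() gives the document length in tokens.
tokenized_documents = [tokenizer(text)["input_ids"] for text in texts]
packed_bins, seconds = pack_documents_optimized(tokenized_documents, block_size=8192)
print(f"Packed {len(tokenized_documents)} documents into {len(packed_bins)} bins in {seconds:.4f}s")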