Fast implementation of "first-fit-decreasing" document packing, with a benchmark against a baseline. It is around 5x faster than the baseline, which is taken from https://huggingface.co/DiscoResearch/Llama3-German-8B#document-packing. Note that memory usage is higher in the optimized version.
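To illustrate first-fit-decreasing with made-up numbers (not taken from the benchmark): with a block size of 10 and documents of lengths 7, 5, 4, 3 and 1, the documents are processed from longest to shortest and each one is placed into the first bin that still has room for it, opening a new bin otherwise. This yields two bins, [7, 3] and [5, 4, 1], both filled exactly to the block size.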
import gc
import time

import numpy as np
import pandas as pd
from tqdm import tqdm


def pack_documents_original(tokenized_documents, block_size: int = 8192, use_tqdm=True):
    start_time = time.perf_counter()
    # First-fit-decreasing: process documents from longest to shortest and put each
    # one into the first bin that still has room; open a new bin otherwise.
    sorted_docs = sorted(tokenized_documents, key=len, reverse=True)
    bins = []

    def find_bin(doc):
        # Re-computes the occupied length of every bin on each call,
        # which is what makes this baseline slow.
        for b in bins:
            if sum(len(d) for d in b) + len(doc) <= block_size:
                return b
        return None

    for doc in tqdm(sorted_docs, disable=not use_tqdm, leave=False):
        target_bin = find_bin(doc)
        if target_bin is not None:
            target_bin.append(doc)
        else:
            bins.append([doc])

    end_time = time.perf_counter()
    return bins, end_time - start_time


def pack_documents_optimized(tokenized_documents, block_size: int = 8192, use_tqdm=True):
    start_time = time.perf_counter()
    # Compute every document length once and keep a running total per bin, so a
    # placement check costs O(number of bins) instead of re-summing bin contents.
    doc_lengths = np.array([len(doc) for doc in tokenized_documents])
    sorted_indices = np.argsort(-doc_lengths)
    bins = []
    bin_lengths = []

    def find_bin(_doc_length):
        for i, b_length in enumerate(bin_lengths):
            if b_length + _doc_length <= block_size:
                return i
        return None

    for idx in tqdm(sorted_indices, disable=not use_tqdm, leave=False):
        doc_length = doc_lengths[idx]
        bin_idx = find_bin(doc_length)
        if bin_idx is not None:
            bins[bin_idx].append(idx)
            bin_lengths[bin_idx] += doc_length
        else:
            bins.append([idx])
            bin_lengths.append(doc_length)

    # Bins hold indices during packing; resolve them to the actual documents at the end.
    packed_bins = [[tokenized_documents[idx] for idx in b] for b in bins]
    end_time = time.perf_counter()
    return packed_bins, end_time - start_time


def benchmark():
    results = []
    for num_documents in (1000, 10_000):
        for max_doc_length in (2048, 4096, 8192):
            original_times = []
            optimized_times = []
            result = {
                "num_documents": num_documents,
                "max_doc_length": max_doc_length,
            }
            for run_idx in range(1, 4):
                # Generate `num_documents` random strings of length up to `max_doc_length`
                tokenized_documents = [
                    "".join(list(np.random.choice(['a', 'b', 'c', 'd', ' '], size=np.random.randint(1, max_doc_length))))
                    for _ in range(num_documents)
                ]
                packed_bins_optimized, time_optimized = pack_documents_optimized(tokenized_documents, max_doc_length)
                gc.collect()
                packed_bins_original, time_original = pack_documents_original(tokenized_documents, max_doc_length)
                gc.collect()
                # Ensure results are equivalent. Minor differences in sorting may have happened when two strings
                # have the same length, but both approaches should yield bins with the same total lengths.
                grouped_lens_original = [sum(len(item) for item in _bin) for _bin in packed_bins_original]
                grouped_lens_optimized = [sum(len(item) for item in _bin) for _bin in packed_bins_optimized]
                assert grouped_lens_original == grouped_lens_optimized, "Results are not equivalent"
                original_times.append(time_original)
                optimized_times.append(time_optimized)
                result[f"time_{run_idx}_original"] = time_original
                result[f"time_{run_idx}_optimized"] = time_optimized
            # Print mean and standard deviation of times across the three runs
            print(f"Number of documents: {num_documents}, max document length: {max_doc_length}")
            print(f"Optimized approach mean time: {np.mean(optimized_times):.4f} seconds (std {np.std(optimized_times):.4f})")
            print(f"Original approach mean time: {np.mean(original_times):.4f} seconds (std {np.std(original_times):.4f})")
            print()
            result["mean_time_original"] = np.mean(original_times)
            result["std_time_original"] = np.std(original_times)
            result["mean_time_optimized"] = np.mean(optimized_times)
            result["std_time_optimized"] = np.std(optimized_times)
            results.append(result)

    df = pd.DataFrame(results)
    df.to_csv("benchmark_results.csv", index=False)
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print(df[["num_documents", "max_doc_length", "mean_time_original", "mean_time_optimized"]])


if __name__ == "__main__":
    benchmark()
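A minimal usage sketch, separate from the benchmark above: the packing functions only require that each document supports len(), so pre-tokenized documents (lists of token ids) can be passed directly. The tokenizer and model name below are illustrative assumptions, not part of the original script.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative model choice
texts = ["a first example document", "a second, somewhat longer example document"]
# Each entry is a list of token ids, so len() gives the document length in tokens.
tokenized_documents = [tokenizer(text)["input_ids"] for text in texts]
packed_bins, seconds = pack_documents_optimized(tokenized_documents, block_size=8192)
print(f"Packed {len(tokenized_documents)} documents into {len(packed_bins)} bins in {seconds:.4f}s")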