Created
July 24, 2024 11:39
-
-
Save m4r00p/e705c22496eb805686fe31775d9d78ba to your computer and use it in GitHub Desktop.
Dedup statistical analysis (Micro optimizations)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import cProfile | |
import pstats | |
import io | |
import statistics | |
import pandas as pd | |
# Define the deduplication functions
def deduplicate_set(arr):
    """Return the distinct items of ``arr`` as a list, using a ``set``.

    Items must be hashable. The result follows set iteration order, so the
    original order of ``arr`` is not necessarily preserved.
    """
    unique_items = set(arr)
    return list(unique_items)
def deduplicate_dict(arr):
    """Return the distinct items of ``arr`` in first-occurrence order.

    The items are used as dictionary keys (dicts keep insertion order), and
    the keys are then returned as a list. Items must be hashable.
    """
    first_seen = dict.fromkeys(arr)
    return [*first_seen]
def deduplicate_loop(arr):
    """Return the distinct items of ``arr`` in first-occurrence order.

    Uses a plain loop with a list-membership test, so items only need to
    support equality comparison (they do not have to be hashable).
    """
    unique = []
    for candidate in arr:
        # Skip anything that has already been collected.
        if candidate in unique:
            continue
        unique.append(candidate)
    return unique
def deduplicate_comprehension(arr):
    """Return the distinct items of ``arr`` in first-occurrence order.

    A list comprehension filters the input against a set of already-visited
    items. Items must be hashable.
    """
    visited = set()
    return [
        element
        for element in arr
        # ``set.add`` returns None, so ``not visited.add(...)`` is always
        # true; it is only evaluated (recording the element) for new items.
        if element not in visited and not visited.add(element)
    ]
# Benchmark input: a 15-item pattern in which five values appear twice,
# repeated 100000 times (1.5 million items, 10 distinct values).
sample_list = 100_000 * [
    1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 10, 10,
]
def profile_function(func, arr, runs=30):
    """Profile ``func(arr)`` repeatedly with cProfile and summarise the timings.

    Args:
        func: Callable invoked as ``func(arr)`` on every run.
        arr: Argument handed to ``func``.
        runs: Number of profiled invocations.

    Returns:
        A tuple ``(mean, median, stdev)`` of the per-run profiled times, in
        the units of the profiler's timer (seconds by default). The standard
        deviation is ``0.0`` when only a single run was made.

    Raises:
        statistics.StatisticsError: If ``runs`` is less than 1, because no
            samples are collected.
    """
    times = []
    profiler = cProfile.Profile()
    for _ in range(runs):
        profiler.enable()
        try:
            func(arr)
        finally:
            # Never leave the profiler running if func raises.
            profiler.disable()
        # ``totaltime`` is cumulative (it includes time spent in callees), so
        # summing it over every entry counts nested calls several times.
        # ``inlinetime`` excludes sub-calls, so its sum is the time actually
        # spent inside the profiled window.
        times.append(sum(entry.inlinetime for entry in profiler.getstats()))
        profiler.clear()
    mean_time = statistics.mean(times)
    median_time = statistics.median(times)
    # The sample standard deviation is undefined for a single data point.
    stdev_time = statistics.stdev(times) if len(times) > 1 else 0.0
    return mean_time, median_time, stdev_time
# Benchmark every deduplication strategy on the same input, using the same
# number of repetitions for each.
runs = 3000
functions = [
    (candidate.__name__, candidate)
    for candidate in (
        deduplicate_set,
        deduplicate_dict,
        deduplicate_loop,
        deduplicate_comprehension,
    )
]

results = []
for func_name, func in functions:
    mean_time, median_time, stdev_time = profile_function(func, sample_list, runs)
    # One table row per strategy; key order determines the column order.
    row = {"Function": func_name}
    row["Mean Time (s)"] = mean_time
    row["Median Time (s)"] = median_time
    row["Standard Deviation (s)"] = stdev_time
    results.append(row)

# Tabulate the collected statistics and show them.
df = pd.DataFrame(results)
print(df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment