Skip to content

Instantly share code, notes, and snippets.

@mesejo
Created November 7, 2021 14:11
Show Gist options
  • Save mesejo/8969e069f7077b7a14d8eeba8d0e5ff3 to your computer and use it in GitHub Desktop.
Save mesejo/8969e069f7077b7a14d8eeba8d0e5ff3 to your computer and use it in GitHub Desktop.
defaultdict performance
from collections import defaultdict
from itertools import groupby, cycle
from operator import itemgetter as ig
import random
import perfplot
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("dark")
def setup(unique):
def fun(length):
choices = cycle(range(unique))
result = [
(i, random.randint(0, 10), random.randint(0, 10_000))
for _, i in zip(range(length), choices)
]
random.shuffle(result)
return result
return fun
def defaultdict_max_approach(lst):
d = defaultdict(lambda: (0, 0, float("-inf")))
for e in lst:
first, _, last = e
if d[first][2] < last:
d[first] = e
return [*d.values()]
def dict_max_approach(lst):
# https://stackoverflow.com/a/69025193/4001592
d = {}
for tpl in lst:
first, *_, last = tpl
if first not in d or last > d[first][-1]:
d[first] = tpl
return [*d.values()]
def groupby_max_approach(lst):
# https://stackoverflow.com/a/69025193/4001592
return [max(g, key=ig(-1)) for _, g in groupby(sorted(lst), key=ig(0))]
def equality(a, b):
da = {k: v for k, _, v in a}
db = {k: v for k, _, v in b}
return da == db
if __name__ == "__main__":
res = []
for unique in [500, 1000, 5000, 10_000]:
b = perfplot.bench(
setup=setup(unique),
kernels=[
defaultdict_max_approach,
dict_max_approach,
groupby_max_approach,
],
labels=["defaultdict", "dict", "groupby"],
n_range=[k for k in range(100_000, 1000_001, 100_000)],
xlabel="len(lst)",
equality_check=equality,
)
df = pd.DataFrame(
data=b.timings_s.T,
index=pd.MultiIndex.from_product(
[[str(unique)], b.n_range], names=["unique", "range"]
),
columns=[*b.labels],
)
res.append(df)
data = pd.concat(res).reset_index()
data = data.melt(
id_vars=["range", "unique"], value_vars=["defaultdict", "dict", "groupby"]
)
mapping = {
"unique": "Unique Keys [count]",
"variable": "Algorithm",
"range": "len(lst) [Ks]",
"value": "Runtime [s]",
}
data = data.rename(columns=mapping)
data["len(lst) [Ks]"] = data["len(lst) [Ks]"] // 1000
g = sns.FacetGrid(data, sharey=False, col="Unique Keys [count]", col_wrap=2, hue="Algorithm", legend_out=False)
g.map(sns.lineplot, "len(lst) [Ks]", "Runtime [s]")
g.tight_layout()
g.add_legend()
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment