Skip to content

Instantly share code, notes, and snippets.

@dboyliao
Last active December 31, 2017 07:17
Show Gist options
  • Select an option

  • Save dboyliao/80af9caafd9b3d84376d3d510b72961b to your computer and use it in GitHub Desktop.

Select an option

Save dboyliao/80af9caafd9b3d84376d3d510b72961b to your computer and use it in GitHub Desktop.
profiling example with yappi
#!/usr/bin/env python3
# -*- coding:utf8 -*-
import random
from collections import namedtuple
from itertools import combinations
from string import ascii_letters
import pandas as pd
import funcy as fy
import yappi
# generate data
def rand_key(length: int = 5) -> str:
    """Return a random string of ASCII letters.

    Args:
        length: Number of characters to generate. Defaults to 5, the
            length this function always produced before it was a parameter.

    Returns:
        A string of ``length`` characters, each drawn independently (with
        replacement) from ``string.ascii_letters``.
    """
    return ''.join(random.choices(ascii_letters, k=length))
def rand_id(length: int = 5) -> str:
    """Return a random string of decimal digits.

    Args:
        length: Number of digits to generate. Defaults to 5, the length
            this function always produced before it was a parameter.

    Returns:
        A string of ``length`` characters, each drawn independently (with
        replacement) from ``'0'``..``'9'``. Leading zeros are possible.
    """
    # A constant digit string replaces the list that used to be rebuilt on
    # every call; the population has the same ten items in the same order.
    return ''.join(random.choices('0123456789', k=length))
# Record type for one observation: a key, the id it is grouped by, and a rate.
KeyPair = namedtuple('KeyPair', ['key', 'id', 'rate'])

# Pools of 100 random keys and 1000 random ids to sample from.
keys = [rand_key() for _ in range(100)]
ids = [rand_id() for _ in range(1000)]

# 10000 records: a key and an id sampled (with replacement) from the pools,
# plus a rate drawn from [0, 2).
data = [
    KeyPair(key=sampled_key, id=sampled_id, rate=random.random() * 2)
    for sampled_key, sampled_id in zip(
        random.choices(keys, k=10000),
        random.choices(ids, k=10000),
    )
]

# The same records as a DataFrame, for the pandas-based implementations.
df = pd.DataFrame(data)
# data transformation with pandas
def symm_prod(df):
    """Yield symmetric ``(key, key, rate)`` tuples for rows sharing an ``id``.

    Rows of ``df`` are grouped by the ``id`` column. For every unordered
    pair of rows inside a group, two tuples are produced (one per ordering
    of the two keys), each carrying the smaller of the two rows' rates.

    This variant looks every row up individually with ``df.loc``.

    Args:
        df: DataFrame with ``key``, ``id`` and ``rate`` columns.
            NOTE(review): the per-label lookup assumes index labels are
            unique; a duplicated label would make ``df.loc`` return several
            rows and break the unpacking below.

    Yields:
        ``(key_1, key_2, rate)`` immediately followed by
        ``(key_2, key_1, rate)`` for each pair of rows in the same group.
    """
    grouped = df.groupby('id')
    # ``groups`` maps each id to the index labels of the rows in that group.
    for indices in grouped.groups.values():
        for idx1, idx2 in combinations(indices, 2):
            # Fetch each row by label, then pick out its key and rate.
            key_1, rate_1 = df.loc[idx1][['key', 'rate']]
            key_2, rate_2 = df.loc[idx2][['key', 'rate']]
            rate = min(rate_1, rate_2)
            yield key_1, key_2, rate
            yield key_2, key_1, rate
# Profile symm_prod (the per-row df.loc lookup implementation) with yappi and
# save the function statistics in callgrind format.
print('profiling pandas implementation')
yappi.clear_stats() # clear func_stats first, so earlier runs do not leak into this one
yappi.set_clock_type('cpu') # time with yappi's 'cpu' clock
yappi.start(builtins=True) # start profiler (builtins=True: see yappi docs; presumably includes built-in calls)
# symm_prod is a generator; building the DataFrame consumes it, so the whole
# computation runs between start() and stop(). The result itself is discarded.
_ = pd.DataFrame(symm_prod(df))
yappi.stop() # stop profiler
stat = yappi.get_func_stats()
stat.save('callgrind.symmprod_pd.prof', type='callgrind')
def symm_prod2(df):
    """Yield symmetric ``(key, key, rate)`` tuples using ``itertuples``.

    Rows of ``df`` are grouped by the ``id`` column. For every unordered
    pair of rows inside a group, two tuples are produced (one per ordering
    of the two keys), each carrying the smaller of the two rows' rates.

    This variant iterates each group's rows as tuples instead of looking
    rows up one by one.

    Args:
        df: DataFrame with ``key``, ``id`` and ``rate`` columns.

    Yields:
        ``(key_1, key_2, rate)`` immediately followed by
        ``(key_2, key_1, rate)`` for each pair of rows in the same group.
    """
    grouped = df.groupby('id')
    for _, group in grouped:
        # Keep only the two columns needed so each row tuple is (key, rate).
        pairs = group[['key', 'rate']]
        for t1, t2 in combinations(pairs.itertuples(index=False), 2):
            key_1, rate_1 = t1
            key_2, rate_2 = t2
            rate = min(rate_1, rate_2)
            yield key_1, key_2, rate
            yield key_2, key_1, rate
# Profile symm_prod2 (the itertuples implementation) with yappi and save the
# function statistics in callgrind format.
print('profiling pandas implementation (itertuples)')
yappi.clear_stats() # clear func_stats first, so earlier runs do not leak into this one
yappi.set_clock_type('cpu') # time with yappi's 'cpu' clock
yappi.start(builtins=True) # start profiler (builtins=True: see yappi docs; presumably includes built-in calls)
# symm_prod2 is a generator; building the DataFrame consumes it, so the whole
# computation runs between start() and stop(). The result itself is discarded.
_ = pd.DataFrame(symm_prod2(df))
yappi.stop() # stop profiler
stat = yappi.get_func_stats()
stat.save('callgrind.symmprod2_pd.prof', type='callgrind')
# data transformation with funcy
def symm_prod_fy(data):
    """Yield symmetric ``(key, key, rate)`` tuples using funcy helpers.

    Records are grouped by their second field (the id). For every unordered
    pair of records inside a group, two tuples are produced (one per
    ordering of the two keys), each carrying the smaller of the two rates.

    Args:
        data: Iterable of records whose second positional field is the
            grouping id and which expose ``key`` and ``rate`` attributes.

    Yields:
        ``(key_1, key_2, rate)`` immediately followed by
        ``(key_2, key_1, rate)`` for each pair of records in the same group.
    """
    # Bucket the records by id (index 1 of each record).
    grouped = fy.group_by(lambda t: t[1], data)

    def trans_prods(records):
        # For each unordered pair of records, emit both orderings together.
        return [[(t1, t2), (t2, t1)] for t1, t2 in combinations(records, 2)]

    # mapcat concatenates the per-group lists; cat then flattens the
    # two-element [(t1, t2), (t2, t1)] lists into one stream of ordered pairs.
    prods = fy.cat(fy.mapcat(trans_prods, grouped.values()))

    def trans_min_rate(prod):
        # ``prod`` is one ordered pair of records.
        rec_1, rec_2 = prod
        rate = min(rec_1.rate, rec_2.rate)
        return (rec_1.key, rec_2.key, rate)

    rates = fy.map(trans_min_rate, prods)
    yield from rates
# Profile symm_prod_fy (the funcy implementation, fed the raw record list
# rather than the DataFrame) and save the statistics in callgrind format.
print('profiling funcy implementation')
yappi.clear_stats()  # drop stats left over from the previous profiling run
yappi.set_clock_type('cpu')  # time with yappi's 'cpu' clock
yappi.start(builtins=True)  # start profiler (builtins=True: see yappi docs)
# symm_prod_fy is a generator; building the DataFrame consumes it, so the
# whole computation runs between start() and stop(). The result is discarded.
_ = pd.DataFrame(symm_prod_fy(data))
yappi.stop()  # stop profiler
stat = yappi.get_func_stats()
stat.save('callgrind.symmprod_fy.prof', type='callgrind')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment