Last active
December 31, 2017 07:17
-
-
Save dboyliao/80af9caafd9b3d84376d3d510b72961b to your computer and use it in GitHub Desktop.
Profiling example with yappi
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # -*- coding:utf8 -*- | |
| import random | |
| from collections import namedtuple | |
| from itertools import combinations | |
| from string import ascii_letters | |
| import pandas as pd | |
| import funcy as fy | |
| import yappi | |
| # generate data | |
def rand_key(length=5):
    """Return a random string of ASCII letters.

    Args:
        length: Number of characters to generate. Defaults to 5, the
            length that was previously hard-coded.

    Returns:
        A ``str`` of ``length`` characters, each drawn independently (with
        replacement) from ``string.ascii_letters``.
    """
    return ''.join(random.choices(ascii_letters, k=length))
def rand_id(length=5):
    """Return a random string of decimal digits.

    Args:
        length: Number of digits to generate. Defaults to 5, the length
            that was previously hard-coded.

    Returns:
        A ``str`` of ``length`` characters, each drawn independently (with
        replacement) from ``'0'``-``'9'``. Leading zeros are possible.
    """
    # A string is already a sequence of one-character strings, so it can be
    # sampled directly instead of rebuilding a list of digits on every call.
    return ''.join(random.choices('0123456789', k=length))
# One record: a key, the id it is associated with, and a rate.
KeyPair = namedtuple('KeyPair', ['key', 'id', 'rate'])

# Pools to sample from: 100 random keys and 1000 random ids.
keys = [rand_key() for _ in range(100)]
ids = [rand_id() for _ in range(1000)]

# 10000 records: each pairs a key and an id sampled (with replacement) from
# the pools above with a rate drawn uniformly from [0, 2).
data = [
    KeyPair(key=sampled_key, id=sampled_id, rate=random.random() * 2)
    for sampled_key, sampled_id in zip(
        random.choices(keys, k=10000),
        random.choices(ids, k=10000),
    )
]

# The same records as a DataFrame, for the pandas-based implementations.
df = pd.DataFrame(data)
| # data transformation with pandas | |
def symm_prod(df):
    """Yield symmetric key pairs for rows of ``df`` that share an ``id``.

    Rows are grouped by the ``id`` column. For every unordered pair of rows
    within a group, two tuples are yielded: ``(key_a, key_b, rate)`` and
    ``(key_b, key_a, rate)``, where ``rate`` is the smaller of the two rows'
    ``rate`` values.

    Each row is looked up by index label through ``df.loc`` once for every
    pair it takes part in.

    Args:
        df: DataFrame with ``key``, ``id`` and ``rate`` columns.

    Yields:
        ``(key, key, rate)`` tuples, two per pair of rows sharing an ``id``.
    """
    by_id = df.groupby('id')
    for labels in by_id.groups.values():
        # ``labels`` holds the index labels of the rows in one id-group.
        for left, right in combinations(labels, 2):
            left_row = df.loc[left][['key', 'rate']]
            right_row = df.loc[right][['key', 'rate']]
            key_a, rate_a = left_row
            key_b, rate_b = right_row
            lower = min(rate_a, rate_b)
            # Emit the pair in both orders so the result is symmetric.
            yield from ((key_a, key_b, lower), (key_b, key_a, lower))
print('profiling pandas implementation')
# Clear first: per the yappi docs the clock type cannot be changed while
# stats from an earlier run are still held.
yappi.clear_stats()
yappi.set_clock_type('cpu')
yappi.start(builtins=True)  # start profiler, including builtin functions
try:
    # Building the DataFrame consumes the generator, so the whole
    # computation runs while the profiler is active.
    _ = pd.DataFrame(symm_prod(df))
finally:
    # Stop the profiler even if the profiled call raises, so it is never
    # left running.
    yappi.stop()
stat = yappi.get_func_stats()
stat.save('callgrind.symmprod_pd.prof', type='callgrind')
def symm_prod2(df):
    """Yield symmetric key pairs per ``id`` group, using ``itertuples``.

    Rows are grouped by the ``id`` column. Within each group the ``key`` and
    ``rate`` columns are iterated as tuples, and for every unordered pair of
    rows two tuples are yielded: ``(key_a, key_b, rate)`` and
    ``(key_b, key_a, rate)``, where ``rate`` is the smaller of the two rows'
    ``rate`` values.

    Args:
        df: DataFrame with ``key``, ``id`` and ``rate`` columns.

    Yields:
        ``(key, key, rate)`` tuples, two per pair of rows sharing an ``id``.
    """
    for _, rows in df.groupby('id'):
        # index=False leaves exactly (key, rate) in each row tuple.
        records = rows[['key', 'rate']].itertuples(index=False)
        for (key_a, rate_a), (key_b, rate_b) in combinations(records, 2):
            lower = min(rate_a, rate_b)
            # Emit the pair in both orders so the result is symmetric.
            yield key_a, key_b, lower
            yield key_b, key_a, lower
print('profiling pandas implementation (itertuples)')
# Clear first: per the yappi docs the clock type cannot be changed while
# stats from an earlier run are still held.
yappi.clear_stats()
yappi.set_clock_type('cpu')
yappi.start(builtins=True)  # start profiler, including builtin functions
try:
    # Building the DataFrame consumes the generator, so the whole
    # computation runs while the profiler is active.
    _ = pd.DataFrame(symm_prod2(df))
finally:
    # Stop the profiler even if the profiled call raises, so it is never
    # left running.
    yappi.stop()
stat = yappi.get_func_stats()
stat.save('callgrind.symmprod2_pd.prof', type='callgrind')
| # data transformation with funcy | |
def symm_prod_fy(data):
    """Yield symmetric key pairs per id group, built with funcy helpers.

    Records are grouped by their element at index 1 (the ``id`` field of the
    ``KeyPair`` records built in this file). For every unordered pair of
    records within a group, two tuples are yielded: ``(key_a, key_b, rate)``
    and ``(key_b, key_a, rate)``, where ``rate`` is the smaller of the two
    records' ``rate`` attributes.

    Args:
        data: Iterable of records that support ``record[1]`` and expose
            ``key`` and ``rate`` attributes.

    Yields:
        ``(key, key, rate)`` tuples, two per pair of records in a group.
    """
    def both_orders(records):
        # For each unordered pair in one group, list it in both orders.
        return [[(a, b), (b, a)] for a, b in combinations(records, 2)]

    def to_row(ordered_pair):
        # Collapse an ordered pair of records into (key, key, min rate).
        first, second = ordered_pair
        return (first.key, second.key, min(first.rate, second.rate))

    by_id = fy.group_by(lambda record: record[1], data)
    # mapcat concatenates the per-group lists; cat then flattens each
    # two-element [(a, b), (b, a)] list into individual ordered pairs.
    nested = fy.mapcat(both_orders, by_id.values())
    ordered_pairs = fy.cat(nested)
    yield from fy.map(to_row, ordered_pairs)
print('profiling funcy implementation')
# Clear first: per the yappi docs the clock type cannot be changed while
# stats from an earlier run are still held.
yappi.clear_stats()
yappi.set_clock_type('cpu')
yappi.start(builtins=True)  # start profiler, including builtin functions
try:
    # Building the DataFrame consumes the generator, so the whole
    # computation runs while the profiler is active.
    _ = pd.DataFrame(symm_prod_fy(data))
finally:
    # Stop the profiler even if the profiled call raises, so it is never
    # left running.
    yappi.stop()
stat = yappi.get_func_stats()
stat.save('callgrind.symmprod_fy.prof', type='callgrind')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment