Created
March 5, 2018 09:50
-
-
Save sneakers-the-rat/dffe995c508c9ebf2593eb973f393d7a to your computer and use it in GitHub Desktop.
multiprocessing cleaning ops
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import pandas as pd | |
import numpy as np | |
from itertools import cycle | |
from tqdm import tqdm | |
from multiprocessing import Pool | |
import time | |
def concat_sport(sport, i):
    """Merge every pickled DataFrame of one sport, clean it and save it.

    Relies on the module-level names ``files``, ``comp_dir`` and
    ``final_dir`` assigned in the ``__main__`` block, so worker processes
    must be able to see them (e.g. by being forked from the parent).

    Args:
        sport: Sport name; a file belongs to it when the part of its name
            before the first "_" equals ``sport``.
        i: Value passed to tqdm as ``position`` for this job's progress bar.

    Returns:
        None. The merged frame is pickled to ``final_dir + sport + ".pkl"``.
    """
    # Compare the exact sport token (the same split the caller uses to build
    # the sport list); a plain startswith() would also pull in the files of
    # any other sport whose name merely begins with this one.
    sport_files = [f for f in files if f.split("_")[0] == sport]
    if not sport_files:
        print("{} has no files to concatenate".format(sport))
        return

    sport_df = pd.read_pickle("".join([comp_dir, sport_files[0]]))

    # The first file is already loaded, so the bar only counts the rest.
    remaining = sport_files[1:]
    for f in tqdm(remaining, total=len(remaining), position=i):
        # Concatenate incrementally so that one bad file is skipped instead
        # of aborting the whole sport.
        try:
            temp_df = pd.read_pickle("".join([comp_dir, f]))
            sport_df = pd.concat([sport_df, temp_df], axis=0)
        except ValueError:
            print("{} concat error".format(sport))

    # Because of the way we've been grouping, there will be lots of all na columns, drop them
    sport_df = sport_df.dropna(axis=1, how="all")
    sport_df = sport_df.infer_objects()

    # replace Nones with nans
    sport_df = sport_df.replace([None], [np.nan])

    # fix up ranks (the column may be absent, e.g. dropped above as all-NA)
    if 'rank' in sport_df.columns:
        def _tidy(value):
            # Strip strings and turn blanks into NaN; leave every other
            # value alone (Series.str.strip would turn non-strings into NaN).
            if isinstance(value, str):
                return value.strip() or np.nan
            return value

        rank = sport_df['rank']
        if rank.dtype == object:
            rank = rank.map(_tidy)
        try:
            # Builtin float: the np.float alias no longer exists in NumPy.
            rank = rank.astype(float)
        except (ValueError, TypeError):
            # Non-numeric ranks present: keep the tidied values as they are.
            pass
        sport_df['rank'] = rank

    save_fn = "{}{}.pkl".format(final_dir, sport)
    sport_df.to_pickle(save_fn)
if __name__ == "__main__":
    # Module-level names that concat_sport reads inside the worker processes;
    # they must be assigned before the pool is created.
    comp_dir = '<dir>'
    final_dir = '<dir>'
    files = os.listdir(comp_dir)

    # One job per distinct sport: the file-name text before the first "_".
    uq_sports = sorted({s.split("_")[0] for s in files})

    # Progress-bar rows 1..7 are handed out round-robin to the jobs;
    # row 0 is kept for the overall bar.
    cyc = cycle(range(1, 8))

    p = Pool(processes=7)
    pbar = tqdm(total=len(uq_sports), position=0)
    try:
        # next(cyc) instead of cyc.next(): the method form is Python 2 only.
        res = [p.apply_async(concat_sport, (chunk, next(cyc)))
               for chunk in uq_sports]
        p.close()  # no further jobs will be submitted

        sports_done = 0
        while True:
            new_sports_done = sum(r.ready() for r in res)
            if new_sports_done > sports_done:
                pbar.update(new_sports_done - sports_done)
                sports_done = new_sports_done
            # Test for completion only after updating, so the jobs that
            # finish last are still counted on the bar.
            if sports_done == len(res):
                break
            time.sleep(1)

        # Fetch each result so that an exception raised inside a worker is
        # reported instead of being silently discarded.
        for chunk, r in zip(uq_sports, res):
            try:
                r.get()
            except Exception as err:
                print("{} failed: {!r}".format(chunk, err))
    finally:
        pbar.close()
        # On the normal path every job is already finished; on an error or
        # Ctrl-C this stops the workers instead of leaving them running.
        p.terminate()
        p.join()

    print('\ncompleted :)')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment