Last active
March 7, 2018 06:50
-
-
Save wolframalpha/e1683f27b78d6ae5448f90576cdf1157 to your computer and use it in GitHub Desktop.
This works with a file of any size, since it reads and writes the data lazily in chunks; however, it may fail if the number of columns explodes (which is unlikely). You can verify this on your local machine by downloading the file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
# --- File locations -------------------------------------------------------
ip_filename = '/home/wolfram/Downloads/DataParserData.csv'  # CSV read by both passes
op_filename = 'datafile.csv'                                # CSV written by the second pass

# --- Processing parameters ------------------------------------------------
chunksize = 1000            # rows handed to pandas per chunk when reading the input
prefix = 'cleansedquery_'   # prepended to every key to form its output column name

# --- Mutable state shared by the passes below -----------------------------
all_keys = set()     # every key seen in the CleansedQuery column (filled by the first pass)
first_write = True   # True until the first chunk, with its header, has been written
def update_values(key_value, all_keys=all_keys):
    """Fill in a zero for every known key that ``key_value`` lacks.

    Args:
        key_value: dict mapping key -> value for a single row; it is
            modified in place.
        all_keys: set of every key that should be present. The default is
            bound to the module-level ``all_keys`` set object, so keys added
            to that set before the call are taken into account.

    Returns:
        The same ``key_value`` dict, now containing every key in ``all_keys``.
    """
    absent = all_keys - set(key_value)
    for name in absent:
        key_value[name] = 0
    return key_value
# Pass 1: stream only the CleansedQuery column and record every distinct key.
# Each cell is a space-separated list of tokens; the key is whatever precedes
# the first ':' in a token.
for chunk in pd.read_csv(ip_filename, chunksize=chunksize, usecols=['CleansedQuery']):
    for query in chunk['CleansedQuery']:
        all_keys.update(token.split(':')[0] for token in query.split(' '))
def _parse_query(query):
    """Turn a 'key:value key:value ...' string into a {key: int(value)} dict.

    Raises IndexError for a token without ':' and ValueError for a
    non-integer value, exactly as the inline expression it replaces did.
    """
    parsed = {}
    for token in query.split(' '):
        parts = token.split(':')  # split once instead of once per field
        parsed[parts[0]] = int(parts[1])
    return parsed


# Pass 2: re-read the full file chunk by chunk, expand CleansedQuery into one
# integer column per known key (0 when the row lacks that key) and append the
# chunk to the output CSV.

# Iterating a set of strings gives a different order on every run (hash
# randomisation); sorting once makes the output column layout reproducible.
ordered_keys = sorted(all_keys)

for df in pd.read_csv(ip_filename, chunksize=chunksize):
    # One dict per row, padded so that every known key is present.
    records = [update_values(_parse_query(query)) for query in df.CleansedQuery]

    for key in ordered_keys:
        df[prefix + key] = [record[key] for record in records]

    # Passing the path (rather than an open() handle that is never closed)
    # lets pandas open, flush and close the file for each chunk.
    if first_write:
        df.to_csv(op_filename, mode='w')
        columns = df.columns  # remember the header layout for later chunks
        first_write = False
    else:
        # Select *columns* in the header's order. The previous
        # `df.loc[: columns]` was a row slice with an Index as its bound,
        # not a column selection.
        df.loc[:, columns].to_csv(op_filename, mode='a', header=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment