Skip to content

Instantly share code, notes, and snippets.

@wolframalpha
Last active March 7, 2018 06:50
Show Gist options
  • Select an option

  • Save wolframalpha/e1683f27b78d6ae5448f90576cdf1157 to your computer and use it in GitHub Desktop.

Select an option

Save wolframalpha/e1683f27b78d6ae5448f90576cdf1157 to your computer and use it in GitHub Desktop.
This will work with a file of any size, since it reads and writes the data lazily (in chunks); however, it may fail if the number of columns explodes (which is unlikely). You can check this on your local machine by downloading the file.
import pandas as pd
# --- Input / output locations -------------------------------------------
ip_filename = '/home/wolfram/Downloads/DataParserData.csv'
op_filename = 'datafile.csv'

# --- Processing settings ------------------------------------------------
# Number of CSV rows pandas reads per chunk.
chunksize = 1000
# Text prepended to each key name to form its output column name.
prefix = 'cleansedquery_'

# --- Mutable state shared by the rest of the script ----------------------
# Every key discovered in the CleansedQuery column; filled in place.
all_keys = set()
# True until the first chunk (the one carrying the header) has been written.
first_write = True
def update_values(key_value, all_keys=all_keys):
    """Give ``key_value`` a 0 entry for every key in ``all_keys`` it lacks.

    The dict is modified in place and the same object is returned, so the
    function can be used directly as an ``apply`` callback.

    Args:
        key_value: Mapping of key -> value for a single row.
        all_keys: Set of keys every row must end up with. The default is
            the module-level ``all_keys`` set, bound when the function is
            defined; keys added to that set in place are therefore seen
            by later calls.

    Returns:
        ``key_value`` itself, now containing every key in ``all_keys``.
    """
    for absent in all_keys - set(key_value):
        key_value[absent] = 0
    return key_value
# Pass 1: read only the CleansedQuery column, chunk by chunk, and collect
# every key that occurs anywhere in the file. Each cell is a space-separated
# list of "key:value" pairs. `all_keys` is updated in place (never rebound)
# because update_values() holds a reference to this same set object.
for df in pd.read_csv(ip_filename, chunksize=chunksize, usecols=['CleansedQuery']):
    for query in df['CleansedQuery']:
        all_keys.update(pair.split(':')[0] for pair in query.split(' '))

# Fix the column order once. Iterating the set directly gives an order that
# can differ from run to run for string keys; sorting makes the output
# reproducible.
ordered_keys = sorted(all_keys)

# Pass 2: re-read the full file, expand every query into one integer column
# per key (0 where the row does not mention the key) and stream the result
# to the output CSV.
for df in pd.read_csv(ip_filename, chunksize=chunksize):
    # One dict per row, padded with zeros so every key in all_keys is present.
    # Kept in a local list rather than a temporary DataFrame column so an
    # input column that happens to share the name is never overwritten.
    key_values = [
        update_values({
            pair.split(':')[0]: int(pair.split(':')[1])
            for pair in query.split(' ')
        })
        for query in df['CleansedQuery']
    ]
    for key in ordered_keys:
        df[prefix + key] = [row[key] for row in key_values]
    if first_write:
        # First chunk: create/truncate the file and emit the header row.
        # Passing the path (not an open handle) lets pandas open and close
        # the file itself, so no handle is leaked.
        df.to_csv(op_filename, mode='w')
        columns = df.columns
        first_write = False
    else:
        # Later chunks: append without a header, selecting the columns in
        # the same order as the first chunk. `df[columns]` selects columns;
        # a `.loc[:columns]` slice would address rows instead.
        df[columns].to_csv(op_filename, mode='a', header=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment