Skip to content

Instantly share code, notes, and snippets.

@johntbush
Created April 22, 2017 03:35
Show Gist options
  • Save johntbush/cf332edbe9e8e5559543e1228bb580f2 to your computer and use it in GitHub Desktop.
Save johntbush/cf332edbe9e8e5559543e1228bb580f2 to your computer and use it in GitHub Desktop.
pandas_load_test
import pandas as pd
import timeit
import numpy as np
import StringIO
import io
def generate_csv(width,height):
    """Create a random test data set and write it as both CSV and HDF5.

    Builds a DataFrame of ``height`` rows by ``width`` columns filled with
    random integers in [0, 1000000), converts the leading columns to
    strings, and writes the frame to ``data/test.csv`` and to
    ``data/test.h5`` (key ``'df'``, table format).

    Args:
        width: number of columns to generate.
        height: number of rows to generate.
    """
    import os

    df = pd.DataFrame(np.random.randint(0, 1000000, size=(height, width)))

    # Turn up to the first six columns into strings so the data set mixes
    # text and numeric columns. Bounding by ``width`` avoids a KeyError on
    # frames narrower than six columns.
    for col in range(min(width, 6)):
        df[col] = df[col].astype(str)

    # The writers below do not create missing directories.
    if not os.path.isdir('data'):
        os.makedirs('data')

    df.to_csv('data/test.csv')
    # ``key`` is passed by keyword: newer pandas releases deprecate passing
    # it positionally.
    df.to_hdf('data/test.h5', key='df', mode='a', format='table')
def load_csv():
    """Parse ``data/test.csv`` with pandas and discard the resulting frame.

    Used as a timing target: only the cost of the read matters, so nothing
    is returned.
    """
    pd.read_csv('data/test.csv')
def load_hdf():
    """Read every row of ``data/test.h5`` in chunks of 10000 rows.

    Used as a timing target: the chunks are read and discarded, and nothing
    is returned.
    """
    chunks = pd.read_hdf('data/test.h5', chunksize=10000)
    # With ``chunksize`` set, read_hdf hands back a lazy iterator and reads
    # no rows until it is iterated. Walking it here makes the timing cover
    # the actual load; exhausting the iterator also closes the HDF store.
    for _chunk in chunks:
        pass
# Build a 100-column by 100000-row data set, then time a single load of each
# on-disk format.
generate_csv(100,100000)

# timeit returns seconds; multiply by 1000 to report milliseconds.
csv_time = timeit.timeit(load_csv, number=1) * 1000
# Parenthesised single-argument print works on both Python 2 and Python 3.
print("load csv in {} ms".format(csv_time))

hdf_time = timeit.timeit(load_hdf, number=1) * 1000
print("load hdf in {} ms".format(hdf_time))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment