Created
September 25, 2018 14:45
-
-
Save usmcamp0811/a6dd9e217f7e72fbe4e117ea725464c0 to your computer and use it in GitHub Desktop.
Quick test to see whether loading Feather files into Dask is any better or worse than loading Parquet files into Dask. The two seem comparable in performance, and the Feather files do not need much extra code.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import feather | |
from tqdm import tqdm | |
import dask | |
import datetime | |
from dask import delayed | |
from dask import visualize | |
import pandas as pd | |
import dask.dataframe as dd | |
"""" | |
Example code showing how to load lots of binary files into Dask.
Loading custom file types like Feather files takes a little more boilerplate,
but it condenses down to the same Dask dataframe object, so there is no
real loss in using Feather or other custom data types.
""" | |
# --- Feather benchmark -------------------------------------------------------
# Builds a Dask dataframe from a directory of Feather files (one partition per
# file), computes the std/mean of the "Fp1" column, and prints cumulative
# timings. T2, T3 and T4 are all measured from the same start time t1.

# Directories holding the dataset in the two on-disk formats being compared.
# `dirp` is consumed by the Parquet benchmark further down the script.
feather_dir = "/media/mcamp/HDD1/Datasets/bci_data/train/feather"
dirp = "/media/mcamp/HDD1/Datasets/bci_data/train/parquet"

t1 = datetime.datetime.now()

# os.listdir returns entries in arbitrary order and may include files that are
# not Feather files; filter by extension and sort so the partition order is
# deterministic and only readable files reach feather.read_dataframe.
filepaths = sorted(
    name for name in os.listdir(feather_dir) if name.endswith(".feather")
)
if not filepaths:
    raise FileNotFoundError(f"No .feather files found in {feather_dir}")

print("Starting the Feather files")
lazy_dataframes = []
for filename in filepaths:
    # Strip only the extension; str.replace would also remove ".feather"
    # occurring elsewhere in the name.
    stem = os.path.splitext(filename)[0]
    part = delayed(feather.read_dataframe)(os.path.join(feather_dir, filename))
    # Tag every row with the file it came from.
    part = delayed(pd.DataFrame.assign)(part, file=stem)
    lazy_dataframes.append(part)

# Dask needs the column names/dtypes up front. The first partition is read
# eagerly to discover them (this read is included in T2); only the schema is
# needed, so keep an empty slice rather than the full frame.
meta = lazy_dataframes[0].compute().iloc[:0]
df = dd.from_delayed(lazy_dataframes, meta=meta)
t2 = datetime.datetime.now()

# Building the reductions is lazy; nothing is read until dask.compute below.
_std = df.Fp1.std()
_mean = df.Fp1.mean()
t3 = datetime.datetime.now()

# Compute both statistics in a single call to dask.compute.
_std1, _mean1 = dask.compute(_std, _mean)
t4 = datetime.datetime.now()

print(_std1, _mean1)
print("T2:", t2-t1)
print("T3:", t3-t1)
print("T4:", t4-t1)

# Drop the Feather-backed dataframe before starting the Parquet run.
del df
# --- Parquet benchmark -------------------------------------------------------
# Same measurement as the Feather run, but letting Dask read the Parquet files
# directly from a glob pattern. All printed timings are cumulative from t1.
print("Starting the Parquet files")
t1 = datetime.datetime.now()

parquet_glob = os.path.join(dirp, '*.gzip')
df = dd.read_parquet(parquet_glob)
t2 = datetime.datetime.now()

# Lazy reductions over the "Fp1" column; evaluated together below.
fp1 = df["Fp1"]
_std = fp1.std()
_mean = fp1.mean()
t3 = datetime.datetime.now()

_std1, _mean1 = dask.compute(_std, _mean)
t4 = datetime.datetime.now()

print(_std1, _mean1)
for label, stamp in (("T2:", t2), ("T3:", t3), ("T4:", t4)):
    print(label, stamp - t1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment