usmcamp0811 · September 25, 2018 14:45
diff --git a/load_dfs_in_dask.py b/load_dfs_in_dask.py
 import os
 import feather
 from tqdm import tqdm
 import dask
 import datetime
 from dask import delayed
 from dask import visualize
 import pandas as pd
 import dask.dataframe as dd

 """"
 Example code to show how to load lots of binary files into Dask

 Loading custom file types like Feather files is a little more boiler plate
 but it seems to condense down to the same dask dataframe object so no 
 real loss by using feathers or other custom data types

 """

 # Feather benchmark: wrap every Feather file in a delayed pandas load, stitch
 # the pieces into one Dask DataFrame, then time std/mean of the Fp1 column.
 lazy_dataframes = []
 feather_dir = "/media/mcamp/HDD1/Datasets/bci_data/train/feather"
 dirp = "/media/mcamp/HDD1/Datasets/bci_data/train/parquet"
 # The Parquet listing is not read by the Feather benchmark, so take it before
 # the timer starts instead of charging it to the Feather timings.
 pfilepaths = os.listdir(dirp)
 t1 = datetime.datetime.now()
 # Keep only Feather files (a stray entry would fail inside read_dataframe at
 # compute time) and sort so the partition order is reproducible.
 filepaths = sorted(
     name for name in os.listdir(feather_dir) if name.endswith(".feather")
 )

 print("Starting the Feather files")
 for filename in filepaths:
     df = delayed(feather.read_dataframe)(os.path.join(feather_dir, filename))
     # Tag each partition with its source file name, minus the extension.
     df = delayed(pd.DataFrame.assign)(df, file=os.path.splitext(filename)[0])
     lazy_dataframes.append(df)

 # Fail with a readable message rather than an IndexError on the lookup below.
 if not lazy_dataframes:
     raise FileNotFoundError("No .feather files found in " + feather_dir)

 # from_delayed needs the column names/dtypes up front. The first partition is
 # loaded eagerly to obtain them (this cost is part of T2); only the empty
 # schema is handed over as meta.
 df = dd.from_delayed(lazy_dataframes, meta=lazy_dataframes[0].compute().head(0))
 t2 = datetime.datetime.now()
 _std = df.Fp1.std()
 _mean = df.Fp1.mean()
 t3 = datetime.datetime.now()
 # Evaluate both reductions in a single dask.compute call.
 _std1, _mean1 = dask.compute(_std, _mean)
 t4 = datetime.datetime.now()
 print(_std1, _mean1)
 # All three timings are cumulative: each is measured from t1.
 print("T2:", t2-t1)
 print("T3:", t3-t1)
 print("T4:", t4-t1)

 # Parquet benchmark: same std/mean measurement on Fp1, but loaded through
 # Dask's own Parquet reader. Drop the Feather-backed frame first.
 del df
 print("Starting the Parquet files")
 now = datetime.datetime.now
 t1 = now()
 parquet_glob = os.path.join(dirp, '*.gzip')
 df = dd.read_parquet(parquet_glob)
 t2 = now()
 fp1 = df.Fp1
 _std, _mean = fp1.std(), fp1.mean()
 t3 = now()
 # Evaluate both reductions in a single dask.compute call.
 _std1, _mean1 = dask.compute(_std, _mean)
 t4 = now()
 print(_std1, _mean1)
 # Every timing is reported relative to t1 (cumulative).
 for label, stamp in (("T2:", t2), ("T3:", t3), ("T4:", t4)):
     print(label, stamp - t1)
	import os
	import feather
	from tqdm import tqdm
	import dask
	import datetime
	from dask import delayed
	from dask import visualize
	import pandas as pd
	import dask.dataframe as dd

	""""
	Example code to show how to load lots of binary files into Dask

	Loading custom file types like Feather files is a little more boiler plate
	but it seems to condense down to the same dask dataframe object so no
	real loss by using feathers or other custom data types

	"""

	# Feather benchmark: wrap every Feather file in a delayed pandas load, stitch
	# the pieces into one Dask DataFrame, then time std/mean of the Fp1 column.
	lazy_dataframes = []
	feather_dir = "/media/mcamp/HDD1/Datasets/bci_data/train/feather"
	dirp = "/media/mcamp/HDD1/Datasets/bci_data/train/parquet"
	# The Parquet listing is not read by the Feather benchmark, so take it before
	# the timer starts instead of charging it to the Feather timings.
	pfilepaths = os.listdir(dirp)
	t1 = datetime.datetime.now()
	# Keep only Feather files (a stray entry would fail inside read_dataframe at
	# compute time) and sort so the partition order is reproducible.
	filepaths = sorted(
		name for name in os.listdir(feather_dir) if name.endswith(".feather")
	)

	print("Starting the Feather files")
	# The loop body must be indented under the for statement; without that the
	# script does not parse.
	for filename in filepaths:
		df = delayed(feather.read_dataframe)(os.path.join(feather_dir, filename))
		# Tag each partition with its source file name, minus the extension.
		df = delayed(pd.DataFrame.assign)(df, file=os.path.splitext(filename)[0])
		lazy_dataframes.append(df)

	# Fail with a readable message rather than an IndexError on the lookup below.
	if not lazy_dataframes:
		raise FileNotFoundError("No .feather files found in " + feather_dir)

	# from_delayed needs the column names/dtypes up front. The first partition is
	# loaded eagerly to obtain them (this cost is part of T2); only the empty
	# schema is handed over as meta.
	df = dd.from_delayed(lazy_dataframes, meta=lazy_dataframes[0].compute().head(0))
	t2 = datetime.datetime.now()
	_std = df.Fp1.std()
	_mean = df.Fp1.mean()
	t3 = datetime.datetime.now()
	# Evaluate both reductions in a single dask.compute call.
	_std1, _mean1 = dask.compute(_std, _mean)
	t4 = datetime.datetime.now()
	print(_std1, _mean1)
	# All three timings are cumulative: each is measured from t1.
	print("T2:", t2-t1)
	print("T3:", t3-t1)
	print("T4:", t4-t1)

	# Parquet benchmark: same std/mean measurement on Fp1, but loaded through
	# Dask's own Parquet reader. Drop the Feather-backed frame first.
	del df
	print("Starting the Parquet files")
	now = datetime.datetime.now
	t1 = now()
	parquet_glob = os.path.join(dirp, '*.gzip')
	df = dd.read_parquet(parquet_glob)
	t2 = now()
	fp1 = df.Fp1
	_std, _mean = fp1.std(), fp1.mean()
	t3 = now()
	# Evaluate both reductions in a single dask.compute call.
	_std1, _mean1 = dask.compute(_std, _mean)
	t4 = now()
	print(_std1, _mean1)
	# Every timing is reported relative to t1 (cumulative).
	for label, stamp in (("T2:", t2), ("T3:", t3), ("T4:", t4)):
		print(label, stamp - t1)
No results found