Created
August 8, 2022 07:23
-
-
Save P8H/cd3808bda583419e2879a4d9cbef5994 to your computer and use it in GitHub Desktop.
Load a TileDB DataFrame with Dask Delayed - Example with a timeseries (Slicing on DateTimeIndex)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import dask | |
import numpy as np | |
import pandas as pd | |
import dask.dataframe as ddf | |
# Number of Dask partitions the requested time range is split into.
dask_chunks = 20

# Partition the dataframe.
# N partitions need N + 1 boundary timestamps, because consecutive boundaries
# are paired into (start, end) ranges below. Using `periods=dask_chunks` would
# produce one partition fewer than requested.
boundaries = [
    np.datetime64(ts)
    for ts in pd.date_range(datetm_start, datetm_end, periods=dask_chunks + 1)
]

# NOTE(review): adjacent ranges share their boundary timestamp. If the TileDB
# `.df[]` indexer treats both slice bounds as inclusive, rows that fall exactly
# on a boundary would be read by two partitions -- confirm against the array.
#
# Materialized as a list (not a bare `zip` iterator) so the ranges can be
# inspected or iterated more than once without silently becoming empty.
slices = list(zip(boundaries[:-1], boundaries[1:]))
def slice_tiledb(sl):
    """Read one range of the TileDB array and return the query result.

    Args:
        sl: Sequence of slice arguments, typically a ``(start, end)`` pair.
            It is unpacked into ``slice(*sl)`` and applied to the ``.df[]``
            indexer of the array query.

    Returns:
        The result of the ``.df[]`` query, restricted to the attributes in
        ``columns_to_select``.
    """
    # The array is opened inside the function so every call gets its own
    # handle; the context manager closes that handle once the query result
    # has been read, instead of leaking one open array per call.
    with tiledb.open("s3://bucker-timeseries/main", mode="r", ctx=ctx) as tdf:
        query = tdf.query(attrs=columns_to_select, use_arrow=True)
        return query.df[slice(*sl)]
# Wrap the reader once, then create one lazy task per entry of `slices`.
lazy_slice_tiledb = dask.delayed(slice_tiledb)
delayed_slices = list(map(lazy_slice_tiledb, slices))

# Combine the lazy per-range results into a single Dask DataFrame.
# NOTE(review): no `meta` is passed here, so Dask presumably has to infer the
# schema itself (possibly by computing a partition) -- confirm and consider
# supplying `meta` if that is too costly.
df = ddf.from_delayed(delayed_slices)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment