Created
December 11, 2020 03:10
-
-
Save qiuwei/e1118d19692412a94f53abdb0536c441 to your computer and use it in GitHub Desktop.
Benchmark read and write speed of TileDB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#%% | |
import glob | |
import os | |
import datetime | |
import tiledb | |
import pandas as pd | |
from tiledb.dataframe_ import _tiledb_result_as_dataframe | |
from tqdm import tqdm | |
import time | |
import numpy as np | |
# Take the configuration of a fresh TileDB context, override the Python
# read-buffer size, and hand the result to tiledb.default_ctx().
cfg = tiledb.Ctx().config()
cfg.update({"py.init_buffer_bytes": 1024**2 * 50})  # 50MB per attribute
tiledb.default_ctx(cfg)
#%% | |
def load_data(data_path="eodprices.parquet"):
    """Read a Parquet file into a DataFrame and print how long it took.

    Args:
        data_path: Path of the Parquet file to load.

    Returns:
        The DataFrame produced by ``pd.read_parquet``.
    """
    ts = time.perf_counter()
    df = pd.read_parquet(data_path)
    te = time.perf_counter()
    # Report before returning: previously the function returned first, so
    # the elapsed time was never measured or printed.
    print(f"loading data takes {te-ts}s")
    return df
#%% | |
def write_tiledb(df, path, tile=1024**2):
    """Write a DataFrame to a dense TileDB array and print the elapsed time.

    Args:
        df: DataFrame to store.
        path: URI of the TileDB array to create.
        tile: Value forwarded as the ``tile`` argument of
            ``tiledb.from_pandas``.
    """
    # perf_counter is monotonic, so the interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    started = time.perf_counter()
    tiledb.from_pandas(path, df, sparse=False, tile=tile)
    finished = time.perf_counter()
    print(f"Saving to tiledb takes {finished - started}")
def read_tiledb(path):
    """Read a whole TileDB array as a DataFrame and print the elapsed time.

    Args:
        path: URI of the TileDB array to open.

    Returns:
        The DataFrame returned by ``tiledb.open_dataframe``.
    """
    # perf_counter is monotonic, so the interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    started = time.perf_counter()
    df = tiledb.open_dataframe(path)
    finished = time.perf_counter()
    print(f"reading from tiledb takes {finished - started}")
    return df
#%% | |
def read_single_column(attrs, path, ctx=None):
    """Read only the selected attributes of a TileDB array, with timing.

    The query spans the array's whole non-empty domain, restricted to the
    attributes named in ``attrs``.

    Args:
        attrs: Attribute names to retrieve, passed to ``A.query``.
        path: URI of the TileDB array to open.
        ctx: TileDB context to use; ``tiledb.default_ctx()`` when ``None``.

    Returns:
        The DataFrame built by ``_tiledb_result_as_dataframe`` from the
        query result.
    """
    # perf_counter is monotonic, so the interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    if ctx is None:
        ctx = tiledb.default_ctx()
    # TODO support `distributed=True` option?
    with tiledb.open(path, ctx=ctx) as A:
        nonempty = A.nonempty_domain()
        # One slice per dimension, built from that dimension's non-empty bounds.
        full_extent = tuple(slice(lo, hi) for lo, hi in nonempty)
        data = A.query(attrs=attrs).multi_index[full_extent]
        new_df = _tiledb_result_as_dataframe(A, data)
    t1 = time.perf_counter()
    print(f"reading single column form tiledb takes {t1-t0}s")
    return new_df
#%% | |
def read_slicing(uri, start="2005-02-25", end="2010-11-03"):
    """Read a date-range slice of a TileDB array as a DataFrame.

    Args:
        uri: URI of the TileDB array to open. The original signature had a
            dangling ``uri=`` with no default value, which is a syntax
            error; the parameter is now required.
        start: Lower bound of the slice on the first dimension; converted
            with ``np.datetime64``.
        end: Upper bound of the slice on the first dimension; converted
            with ``np.datetime64``.

    Returns:
        The DataFrame built by ``_tiledb_result_as_dataframe`` from the
        sliced query result.

    Note:
        Assumes the array's first dimension is a datetime index and that
        the array has two dimensions - TODO confirm against the schema
        written by the benchmark.
    """
    with tiledb.open(uri) as A:
        # Indexing the Query object retrieves only the selected
        # attribute(s); pass e.g. attrs=('S_FA_ROA',) to A.query() to
        # restrict the read to specific attributes.
        q = A.query()
        data = q[np.datetime64(start):np.datetime64(end), :]
        new_df = _tiledb_result_as_dataframe(A, data)
    return new_df
#%% | |
# benchmark 1: load the dataset, write it to a TileDB array, then time a
# full read and a read restricted to one attribute.
benchmark_uri = "benchmark1.tdb"
df = load_data()
write_tiledb(df, benchmark_uri)
read_tiledb(benchmark_uri)
read_single_column(["S_DQ_AVGPRICE"], benchmark_uri)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, could you also provide the code for the Parquet benchmark test?