Last active
February 2, 2023 15:33
-
-
Save yhilpisch/e642c1a8b5fa91162fce to your computer and use it in GitHub Desktop.
Python file to test TsTables Performance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# TsTables -- High Frequency Time Series Data with PyTables
#
# to install -- pip install tstables
# GitHub repo -- https://github.com/afiedler/tstables
#
# Dr. Yves J. Hilpisch
#
# The Python Quants GmbH
# http://quant-platform.com
# http://pythonquants.com
#
import os
import random
from datetime import datetime
from time import time

import numpy as np
import pandas as pd
import tables as tb
import tstables
## Preliminaries
# First, some basic assumptions.
path = '/notebooks/ssd/data/'  # directory of the sample file; delete or adjust
co = 10  # number of time series
# One second as a fraction of a 360-day year (12 months x 30 days).
# The sample data is generated at 1-second frequency, so the denominator
# has to count seconds (the former 12 * 30 * 24 * 60 counted minutes).
dt = 1. / (12 * 30 * 24 * 60 * 60)
vol = 0.2  # volatility for stochastic processes
# Second, a function to generate paths based on geometric Brownian motion.
def generate_paths(no, initials):
    """Simulate geometric Brownian motion paths.

    The log-increments are ``-0.5 * vol ** 2 * dt`` plus a normally
    distributed shock scaled by ``vol * sqrt(dt)``; ``vol`` and ``dt`` are
    read from module level.

    Parameters
    ----------
    no : int
        Number of time steps (rows) to generate.
    initials : array-like
        One starting value per time series (column).

    Returns
    -------
    numpy.ndarray
        Array with ``no`` rows and one column per entry of ``initials``;
        when ``no > 0`` the first row equals ``initials``.
    """
    initials = np.asarray(initials, dtype=float)
    # Size the random draw from ``initials`` itself so the function does not
    # silently depend on the module-level series count.
    shocks = np.random.standard_normal((no, len(initials)))
    log_steps = -0.5 * vol ** 2 * dt + vol * np.sqrt(dt) * shocks
    paths = initials * np.exp(np.cumsum(log_steps, axis=0))
    if no > 0:
        # Pin the first row to the starting values so consecutive chunks
        # join without a jump; skipped for an empty request.
        paths[0] = initials
    return paths
## Sample Data and Storage
# Row layout of the PyTables table that holds the time series data.
class TS(tb.IsDescription):
    # 64-bit integer timestamp in the leading column.
    timestamp = tb.Int64Col(pos=0)
    # Ten float64 value columns; ``pos`` fixes their order in the table.
    ts1 = tb.Float64Col(pos=1)
    ts2 = tb.Float64Col(pos=2)
    ts3 = tb.Float64Col(pos=3)
    ts4 = tb.Float64Col(pos=4)
    ts5 = tb.Float64Col(pos=5)
    ts6 = tb.Float64Col(pos=6)
    ts7 = tb.Float64Col(pos=7)
    ts8 = tb.Float64Col(pos=8)
    ts9 = tb.Float64Col(pos=9)
    ts10 = tb.Float64Col(pos=10)
# Open the database file for writing ...
h5 = tb.open_file(path + 'ts_sec.h5', 'w')
# ... and create the time series table in its root group.  ``create_ts``
# is the function TsTables adds to PyTables.
ts = h5.create_ts('/', 'TS', TS)
# We can now generate and write sample data to the table object,
# one calendar month per iteration.
t0 = time()
start = datetime(2015, 1, 1, 0, 0)
initials = np.array([100.] * co)
total = 0
for i in range(18):
    # First day of the following month (rolling over the year in December).
    if start.month == 12:
        end = datetime(start.year + 1, 1, 1)
    else:
        end = datetime(start.year, start.month + 1, 1)
    # NOTE(review): date_range includes both endpoints, so the last
    # timestamp of a month is written again as the first one of the next
    # month -- confirm this duplicate boundary row is acceptable.
    index = pd.date_range(start, end, freq='1s')
    start = end
    data = pd.DataFrame(generate_paths(len(index), initials),
                        index=index)
    # Continue the next month from where this one ended.
    initials = data.values[-1]
    print("%d %d" % (i, len(index)))
    total += len(index)
    ts.append(data)
size = os.path.getsize(path + 'ts_sec.h5') / 1e6
# Measure the elapsed time once so every figure below uses the same value.
duration = time() - t0
print("\ndata written to disk in mega bytes %d" % size)
print("time to write all data in seconds %5.3f" % duration)
# No int() here: it would truncate the value the %.1f format is meant to show.
print("\nwriting speed in mega bytes per second %.1f" % (size / duration))
print("writing speed in records per second %d" % (total / duration))
# The strength of TsTables lies in retrieving chunks of time series
# data defined by a start date and an end date (which obviously is a
# typical case in finance, e.g. in backtesting strategies or risk
# management).
t0 = time()
read_start_dt = datetime(2015, 2, 7, 0, 0)  # one day
# NOTE(review): 23:59 means 23:59:00, so the last 59 seconds of the day
# are not part of the requested range -- confirm whether that is intended.
read_end_dt = datetime(2015, 2, 7, 23, 59)  # worth of data
# TsTables tries to make such an operation as fast as possible.
# Conveniently, the returned object is a pandas DataFrame.
rows = ts.read_range(read_start_dt, read_end_dt)
print("\ntime to retrieve data slice in seconds %4.3f" % (time() - t0))
# .iloc[0] selects the first column's count by position explicitly instead
# of relying on integer-key fallback of Series indexing.
print("number of rows in results object %d" % rows.count().iloc[0])
print("\nthe first 5 rows of this object:")
print(np.round(rows.head(), 2).to_string())
# The following simulates a randomized access where
# chunks of daily data sets are accessed and retrieved
# as pandas DataFrame objects.
t0 = time()
its = 250
total = 0
for _ in range(its):
    # day <= 27 keeps ``day + 1`` a valid date in every month.
    day = random.randint(1, 27)
    month = random.randint(1, 11)
    read_start_dt = datetime(2015, month, day, 0, 0)
    read_end_dt = datetime(2015, month, day + 1, 0, 0)
    rows = ts.read_range(read_start_dt, read_end_dt)
    # Row count taken from the first column, selected by position.
    total += rows.count().iloc[0]
t1 = time()
duration = t1 - t0
print("\ntime for %d random accesses %5.3f seconds" % (its, duration))
print("average time for random access %5.3f seconds" % (duration / its))
print("number of records per second %d" % (total / duration))
# Clean up: close the database handle first, then delete the sample
# data file from disk.
h5.close()
os.remove(path + 'ts_sec.h5')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
What exactly is not working? Is there any error message?