Skip to content

Instantly share code, notes, and snippets.

@yhilpisch
Last active February 2, 2023 15:33
Show Gist options
  • Save yhilpisch/e642c1a8b5fa91162fce to your computer and use it in GitHub Desktop.
Save yhilpisch/e642c1a8b5fa91162fce to your computer and use it in GitHub Desktop.
Python file to test TsTables Performance
#
# TsTables -- High Frequency Times Series Data with PyTables
#
# to install -- pip install tstables
# Github repo -- https://github.com/afiedler/tstables
#
# Dr. Yves J. Hilpisch
#
# The Python Quants GmbH
# http://quant-platform.com
# http://pythonquants.com
#
import os
import numpy as np
import pandas as pd
import tables as tb
import random
from time import time
import tstables
from datetime import datetime
## Preliminaries
# First, some basic assumptions.
path = '/notebooks/ssd/data/' # delete or adjust
co = 10 # number of time series
dt = 1. / (12 * 30 * 24 * 60) # second as year fraction
vol = 0.2 # volatility for stochastic processes
# Second, a function to generate paths based on geometric Brownian motion.
def generate_paths(no, initials):
paths = initials * np.exp(np.cumsum(-0.5 * vol ** 2 * dt +
vol * np.sqrt(dt) * np.random.standard_normal((no, co)), axis=0))
paths[0] = initials
return paths
## Sample Data and Storage
# To store the time series data in a PyTables table we define
# the table structure.
class TS(tb.IsDescription):
timestamp = tb.Int64Col(pos=0)
ts1 = tb.Float64Col(pos=1)
ts2 = tb.Float64Col(pos=2)
ts3 = tb.Float64Col(pos=3)
ts4 = tb.Float64Col(pos=4)
ts5 = tb.Float64Col(pos=5)
ts6 = tb.Float64Col(pos=6)
ts7 = tb.Float64Col(pos=7)
ts8 = tb.Float64Col(pos=8)
ts9 = tb.Float64Col(pos=9)
ts10 = tb.Float64Col(pos=10)
# Then open a database file and create the table object.
h5 = tb.open_file(path + 'ts_sec.h5', 'w')
# TsTables adds a new function ``create_ts`` to PyTables.
ts = h5.create_ts('/', 'TS', TS)
# We can now generate and write sample data to the table object.
t0 = time()
start = datetime(2015, 1, 1, 0, 0)
initials = np.array([100.] * co)
total = 0
for i in xrange(18):
if start.month == 12:
end = datetime(start.year + 1, 1, 1)
else:
end = datetime(start.year, start.month + 1, 1)
index = pd.date_range(start, end, freq='1s')
start = end
data = pd.DataFrame(generate_paths(len(index), initials),
index=index)
initials = data.values[-1]
print i, len(index)
total += len(index)
ts.append(data)
size = os.path.getsize(path + 'ts_sec.h5') / 1000000
duration = (time() - t0)
print "\ndata written to disk in mega bytes %d" % size
print "time to write all data in seconds %5.3f" % (time() - t0)
print "\nwriting speed in mega bytes per second %.1f" % int(size / duration)
print "writing speed in records per second %d" % int(total / duration)
# The strength of TsTables lies in retrieving chunks of time series
# data defined by a start date and an end date (which obviously is a
# typical case in finance, e.g. in backtesting strategies or risk
# management).
t0 = time()
read_start_dt = datetime(2015, 2, 7, 0, 0) # one day
read_end_dt = datetime(2015, 2, 7, 23, 59) # worth of data
# TsTables tries to make such an operation as fast as possible.
# Conveniently, the returned object is a pandas DataFrame.
rows = ts.read_range(read_start_dt, read_end_dt)
print "\ntime to retrieve data slice in seconds %4.3f" % (time()- t0)
print "number of rows in results object %d" % rows.count()[0]
print "\nthe first 5 rows of this object:"
print np.round(rows.head(), 2).to_string()
# The following simulates a randomized access where
# chunks of daily data sets are accessed and retrieved
# as pandas DataFrame objects.
t0 = time()
its = 250
total = 0
for _ in xrange(its):
day = random.randint(1, 27)
month = random.randint(1, 11)
read_start_dt = datetime(2015, month, day, 0, 0)
read_end_dt = datetime(2015, month, day + 1, 0, 0)
rows = ts.read_range(read_start_dt, read_end_dt)
total += rows.count()[0]
t1 = time()
duration = t1 - t0
print "\ntime for %d random accesses %5.3f seconds" % (its, duration)
print "average time for random access %5.3f seconds" % (duration / its)
print "number of records per second %d" % (total / duration)
# close database and remove sample data file
h5.close()
os.remove(path + 'ts_sec.h5')
@schwed2
Copy link

schwed2 commented Feb 2, 2023

What exactly is n ot working? Is there any error message?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment