yhilpisch · February 2, 2023 15:33 · schwed2 · Feb 2, 2023
diff --git a/big_tstables.py b/big_tstables.py
 #
 # TsTables -- High Frequency Time Series Data with PyTables
 #
 # to install -- pip install tstables
 # Github repo -- https://github.com/afiedler/tstables
 #
 # Dr. Yves J. Hilpisch
 #
 # The Python Quants GmbH
 # http://quant-platform.com
 # http://pythonquants.com
 #
 import os
 import numpy as np
 import pandas as pd
 import tables as tb
 import random
 from time import time
 import tstables
 from datetime import datetime

 ## Preliminaries

 # First, some basic assumptions.

 path = '/notebooks/ssd/data/'  # directory for the sample file; delete or adjust

 co = 10  # number of time series
 # One second as a year fraction (12 months x 30 days x 24 h x 60 min x 60 s);
 # the sample data in this script is generated at a one-second frequency.
 dt = 1. / (12 * 30 * 24 * 60 * 60)
 vol = 0.2  # volatility for stochastic processes


 # Second, a function to generate paths based on geometric Brownian motion.

 def generate_paths(no, initials):
     """Simulate ``no`` steps of geometric Brownian motion per series.

     Reads the module-level settings ``co`` (number of series), ``vol``
     (volatility) and ``dt`` (step length) and draws the shocks from
     NumPy's global random state.

     The first row of the result is overwritten with ``initials`` so that
     every path starts exactly at its initial value.
     """
     drift = -0.5 * vol ** 2 * dt
     shock_scale = vol * np.sqrt(dt)
     shocks = np.random.standard_normal((no, co))
     log_levels = np.cumsum(drift + shock_scale * shocks, axis=0)
     paths = initials * np.exp(log_levels)
     paths[0] = initials
     return paths


 ## Sample Data and Storage

 # To store the time series data in a PyTables table we define
 # the table structure.

 # Row layout of the time series table: one 64-bit integer timestamp
 # column followed by ten 64-bit float value columns (ts1 .. ts10).
 class TS(tb.IsDescription):
    # `pos` pins the position of each column within a row.
    timestamp = tb.Int64Col(pos=0)
    # One value column per simulated series; the count matches `co = 10`
    # used for the generated sample data.
    ts1 = tb.Float64Col(pos=1)
    ts2 = tb.Float64Col(pos=2)
    ts3 = tb.Float64Col(pos=3)
    ts4 = tb.Float64Col(pos=4)
    ts5 = tb.Float64Col(pos=5)
    ts6 = tb.Float64Col(pos=6)
    ts7 = tb.Float64Col(pos=7)
    ts8 = tb.Float64Col(pos=8)
    ts9 = tb.Float64Col(pos=9)
    ts10 = tb.Float64Col(pos=10)


 # Then open a database file and create the table object.

 h5 = tb.open_file(path + 'ts_sec.h5', 'w')


 # TsTables adds a new function ``create_ts`` to PyTables.

 ts = h5.create_ts('/', 'TS', TS)


 # We can now generate and write sample data to the table object.
 # The data is produced month by month for 18 consecutive months,
 # starting in January 2015, and appended chunk by chunk.
 # NOTE(review): pd.date_range includes both endpoints, so the first
 # timestamp of each month repeats the last one of the previous
 # month -- confirm this is acceptable for the benchmark.

 t0 = time()
 start = datetime(2015, 1, 1, 0, 0)
 initials = np.array([100.] * co)
 total = 0
 for i in range(18):
     # first day of the following month, rolling over the year in December
     if start.month == 12:
         end = datetime(start.year + 1, 1, 1)
     else:
         end = datetime(start.year, start.month + 1, 1)
     index = pd.date_range(start, end, freq='1s')
     start = end
     data = pd.DataFrame(generate_paths(len(index), initials),
                         index=index)
     # the next chunk continues from the last simulated values
     initials = data.values[-1]
     print("%d %d" % (i, len(index)))
     total += len(index)
     ts.append(data)

 # float division keeps the result identical on Python 2 and Python 3
 size = os.path.getsize(path + 'ts_sec.h5') / 1e6
 duration = (time() - t0)

 print("\ndata written to disk in mega bytes        %d" % size)
 # report the measured duration instead of taking a second, later reading
 print("time to write all data in seconds         %5.3f" % duration)

 # no int() truncation here, so the one-decimal format is meaningful
 print("\nwriting speed in mega bytes per second    %.1f" % (size / duration))
 print("writing speed in records per second       %d" % int(total / duration))


 # The strength of TsTables lies in retrieving chunks of time series
 # data defined by a start date and an end date (which obviously is a
 # typical case in finance, e.g. in backtesting strategies or risk
 # management).

 t0 = time()

 # One full day of one-second data: the end bound is the last second of
 # the day (the previous 23:59:00 bound dropped the final 59 seconds).
 read_start_dt = datetime(2015, 2, 7, 0, 0)  # one day
 read_end_dt = datetime(2015, 2, 7, 23, 59, 59)  # worth of data

 # TsTables tries to make such an operation as fast as possible.
 # Conveniently, the returned object is a pandas DataFrame.

 rows = ts.read_range(read_start_dt, read_end_dt)

 # print(...) with a single pre-formatted string behaves the same on
 # Python 2 and Python 3
 print("\ntime to retrieve data slice in seconds    %4.3f" % (time() - t0))


 print("number of rows in results object          %d" % rows.count()[0])

 print("\nthe first 5 rows of this object:")

 print(np.round(rows.head(), 2).to_string())

 # The following simulates a randomized access where
 # chunks of daily data sets are accessed and retrieved
 # as pandas DataFrame objects.

 t0 = time()
 its = 250  # number of random daily slices to read
 total = 0  # rows retrieved over all reads
 for _ in range(its):
     # day <= 27 keeps ``day + 1`` a valid date in every month
     day = random.randint(1, 27)
     month = random.randint(1, 11)
     read_start_dt = datetime(2015, month, day, 0, 0)
     read_end_dt = datetime(2015, month, day + 1, 0, 0)
     rows = ts.read_range(read_start_dt, read_end_dt)
     total += rows.count()[0]
 t1 = time()

 duration = t1 - t0
 # print(...) with a single pre-formatted string behaves the same on
 # Python 2 and Python 3
 print("\ntime for %d random accesses       %5.3f seconds" % (its, duration))
 print("average time for random access     %5.3f seconds" % (duration / its))
 print("number of records per second       %d" % (total / duration))


 # clean up: release the HDF5 file handle first, then delete the
 # sample data file from disk

 h5.close()
 os.unlink(path + 'ts_sec.h5')
	#
	# TsTables -- High Frequency Time Series Data with PyTables
	#
	# to install -- pip install tstables
	# Github repo -- https://github.com/afiedler/tstables
	#
	# Dr. Yves J. Hilpisch
	#
	# The Python Quants GmbH
	# http://quant-platform.com
	# http://pythonquants.com
	#
	import os
	import numpy as np
	import pandas as pd
	import tables as tb
	import random
	from time import time
	import tstables
	from datetime import datetime

	## Preliminaries

	# First, some basic assumptions.

	path = '/notebooks/ssd/data/'  # directory for the sample file; delete or adjust

	co = 10  # number of time series
	# One second as a year fraction (12 months x 30 days x 24 h x 60 min x 60 s);
	# the sample data in this script is generated at a one-second frequency.
	dt = 1. / (12 * 30 * 24 * 60 * 60)
	vol = 0.2  # volatility for stochastic processes


	# Second, a function to generate paths based on geometric Brownian motion.

	def generate_paths(no, initials):
	    """Simulate ``no`` steps of geometric Brownian motion per series.

	    Reads the module-level settings ``co`` (number of series), ``vol``
	    (volatility) and ``dt`` (step length) and draws the shocks from
	    NumPy's global random state.

	    The first row of the result is overwritten with ``initials`` so that
	    every path starts exactly at its initial value.
	    """
	    paths = initials * np.exp(np.cumsum(
	        -0.5 * vol ** 2 * dt +
	        vol * np.sqrt(dt) * np.random.standard_normal((no, co)), axis=0))
	    paths[0] = initials
	    return paths


	## Sample Data and Storage

	# To store the time series data in a PyTables table we define
	# the table structure.

	# Row layout of the time series table: one 64-bit integer timestamp
	# column followed by ten 64-bit float value columns (ts1 .. ts10).
	class TS(tb.IsDescription):
	    # `pos` pins the position of each column within a row.
	    timestamp = tb.Int64Col(pos=0)
	    # One value column per simulated series; the count matches `co = 10`
	    # used for the generated sample data.
	    ts1 = tb.Float64Col(pos=1)
	    ts2 = tb.Float64Col(pos=2)
	    ts3 = tb.Float64Col(pos=3)
	    ts4 = tb.Float64Col(pos=4)
	    ts5 = tb.Float64Col(pos=5)
	    ts6 = tb.Float64Col(pos=6)
	    ts7 = tb.Float64Col(pos=7)
	    ts8 = tb.Float64Col(pos=8)
	    ts9 = tb.Float64Col(pos=9)
	    ts10 = tb.Float64Col(pos=10)


	# Then open a database file and create the table object.

	h5 = tb.open_file(path + 'ts_sec.h5', 'w')


	# TsTables adds a new function ``create_ts`` to PyTables.

	ts = h5.create_ts('/', 'TS', TS)


	# We can now generate and write sample data to the table object.
	# The data is produced month by month for 18 consecutive months,
	# starting in January 2015, and appended chunk by chunk.
	# NOTE(review): pd.date_range includes both endpoints, so the first
	# timestamp of each month repeats the last one of the previous
	# month -- confirm this is acceptable for the benchmark.

	t0 = time()
	start = datetime(2015, 1, 1, 0, 0)
	initials = np.array([100.] * co)
	total = 0
	for i in range(18):
	    # first day of the following month, rolling over the year in December
	    if start.month == 12:
	        end = datetime(start.year + 1, 1, 1)
	    else:
	        end = datetime(start.year, start.month + 1, 1)
	    index = pd.date_range(start, end, freq='1s')
	    start = end
	    data = pd.DataFrame(generate_paths(len(index), initials),
	                        index=index)
	    # the next chunk continues from the last simulated values
	    initials = data.values[-1]
	    print("%d %d" % (i, len(index)))
	    total += len(index)
	    ts.append(data)

	# float division keeps the result identical on Python 2 and Python 3
	size = os.path.getsize(path + 'ts_sec.h5') / 1e6
	duration = (time() - t0)

	print("\ndata written to disk in mega bytes %d" % size)
	# report the measured duration instead of taking a second, later reading
	print("time to write all data in seconds %5.3f" % duration)

	# no int() truncation here, so the one-decimal format is meaningful
	print("\nwriting speed in mega bytes per second %.1f" % (size / duration))
	print("writing speed in records per second %d" % int(total / duration))


	# The strength of TsTables lies in retrieving chunks of time series
	# data defined by a start date and an end date (which obviously is a
	# typical case in finance, e.g. in backtesting strategies or risk
	# management).

	t0 = time()

	# One full day of one-second data: the end bound is the last second of
	# the day (the previous 23:59:00 bound dropped the final 59 seconds).
	read_start_dt = datetime(2015, 2, 7, 0, 0)  # one day
	read_end_dt = datetime(2015, 2, 7, 23, 59, 59)  # worth of data

	# TsTables tries to make such an operation as fast as possible.
	# Conveniently, the returned object is a pandas DataFrame.

	rows = ts.read_range(read_start_dt, read_end_dt)

	# print(...) with a single pre-formatted string behaves the same on
	# Python 2 and Python 3
	print("\ntime to retrieve data slice in seconds %4.3f" % (time() - t0))


	print("number of rows in results object %d" % rows.count()[0])

	print("\nthe first 5 rows of this object:")

	print(np.round(rows.head(), 2).to_string())

	# The following simulates a randomized access where
	# chunks of daily data sets are accessed and retrieved
	# as pandas DataFrame objects.

	t0 = time()
	its = 250  # number of random daily slices to read
	total = 0  # rows retrieved over all reads
	for _ in range(its):
	    # day <= 27 keeps ``day + 1`` a valid date in every month
	    day = random.randint(1, 27)
	    month = random.randint(1, 11)
	    read_start_dt = datetime(2015, month, day, 0, 0)
	    read_end_dt = datetime(2015, month, day + 1, 0, 0)
	    rows = ts.read_range(read_start_dt, read_end_dt)
	    total += rows.count()[0]
	# the stop time is taken once, after all reads have finished
	t1 = time()

	duration = t1 - t0
	# print(...) with a single pre-formatted string behaves the same on
	# Python 2 and Python 3
	print("\ntime for %d random accesses %5.3f seconds" % (its, duration))
	print("average time for random access %5.3f seconds" % (duration / its))
	print("number of records per second %d" % (total / duration))


	# clean up: release the HDF5 file handle first, then delete the
	# sample data file from disk

	h5.close()
	os.unlink(path + 'ts_sec.h5')