Created
September 18, 2019 12:57
-
-
Save ranaroussi/6c3345bf974c04f66c62e4d772959607 to your computer and use it in GitHub Desktop.
QTPyLib timeseries resampler
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd
_OHLCV_COLUMNS = ["open", "high", "low", "close", "volume"]


def _finalize(data, tz=None):
    """Return `data` sorted by index, timezone-converted and de-duplicated.

    :param data: frame to finalize; it is not modified in place.
    :param tz: target timezone. When ``None`` the timezone already carried by
        the index (if any) is kept.
    :return: a new frame, sorted by index, holding only the last row for each
        distinct index value.
    """
    data = data.sort_index()

    # Timezone handling only makes sense for a DatetimeIndex (an empty frame,
    # for instance, typically does not have one).
    if isinstance(data.index, pd.DatetimeIndex):
        if tz is None:
            tz = data.index.tz
        # the str() comparison keeps treating the literal 'None' as "no timezone"
        if tz is not None and str(tz) != "None":
            try:
                data.index = data.index.tz_convert(tz)
            except TypeError:
                # tz-naive index: localize to UTC first, then convert
                data.index = data.index.tz_localize("UTC").tz_convert(tz)

    # keep only the last row for every duplicated timestamp
    return data.loc[~data.index.duplicated(keep="last")]


def _resample_ticks(data, freq=1000, by="last"):
    """Re-sample tick data into N-tick or N-volume OHLCV bars.

    :param data: raw tick data with either ``last``/``lastsize`` or
        ``close``/``volume`` columns.
    :param freq: number of ticks (or amount of volume) per bar.
    :param by: ``'size'``, ``'lastsize'`` or ``'volume'`` groups by cumulative
        volume; any other value groups by tick count.
    :return: frame with ``open``/``high``/``low``/``close``/``volume`` columns,
        indexed by the timestamp of the first tick of each bar (index name
        ``datetime``).
    :raises KeyError: if neither column pair is present.
    """
    data = data.fillna(value=np.nan)

    # prefer tick columns, fall back to bar columns
    if {"last", "lastsize"}.issubset(data.columns):
        price_col, size_col = "last", "lastsize"
    else:
        price_col, size_col = "close", "volume"
    df = data[[price_col, size_col]].copy()

    # mark the first row of every group; the rows in between are forward-filled
    if by in ("size", "lastsize", "volume"):
        cumvol = df[size_col].cumsum()
        mark = freq * ((cumvol / freq / .1).round() * .1 // 1)
        step = mark.diff().fillna(0).astype(int)
        df["grp"] = np.where(step >= freq - 1, mark / freq, np.nan)
    else:
        position = np.arange(len(df))
        df["grp"] = np.where(position % freq == 0, position, np.nan)

    # Seed the first row so the forward-fill has a starting group. This is done
    # positionally because an integer label slice fails on a DatetimeIndex.
    if len(df):
        df.iloc[0, df.columns.get_loc("grp")] = 0
    df = df.ffill()

    # keep the timestamps around: they become the index of the new frame
    df["T"] = df.index
    grouped = df.groupby("grp", sort=False)

    bars = pd.DataFrame({
        "open": grouped[price_col].first(),
        "high": grouped[price_col].max(),
        "low": grouped[price_col].min(),
        "close": grouped[price_col].last(),
        "volume": grouped[size_col].sum(),
    })

    # index every bar by the timestamp of its first tick
    bars["datetime"] = grouped["T"].first()
    return bars.set_index("datetime")


def resample(data, resolution="1T", tz=None, ffill=True, dropna=False):
    """Resample tick or bar data into OHLCV bars.

    Column names are matched case-insensitively; the returned frame uses
    lower-case names. The input frame is not modified.

    :param data: frame holding ``last``/``lastsize`` ticks or
        ``open``/``high``/``low``/``close``/``volume`` bars.
    :param resolution: a pandas resample rule, or a tick rule: ``'<N>K'``
        groups every N ticks and ``'<N>V'`` groups roughly every N units of
        volume. A rule without digits is treated as a count of 1.
    :param tz: timezone the resulting index is converted to (time-based
        rules only); ``None`` keeps the timezone of the index.
    :param ffill: when true, bars without volume take their open/high/low
        from the close; otherwise those fields become NaN and the bar is
        dropped by the final cleanup.
    :param dropna: when true, drop every row containing a NaN in any column.
    :return: the resampled frame. Tick rules (``K``/``V``) return right after
        grouping, without timezone conversion, sorting or de-duplication.

    >>> resample(df, '500K')  # doctest: +SKIP  (500 ticks, counting trades)
    >>> resample(df, '500V')  # doctest: +SKIP  (~500 volume, counting volume)
    """
    if data.empty:
        return _finalize(data, tz)

    # rename() returns a new frame, so the caller's columns are left alone
    data = data.rename(columns=str.lower)

    digits = "".join(ch for ch in resolution if ch.isdigit())
    periods = int(digits) if digits else 1

    # tick-count ("K") and volume ("V") rules
    for flag, by in (("K", "last"), ("V", "lastsize")):
        if flag in resolution:
            if periods > 1:
                data = _resample_ticks(data, freq=periods, by=by)
                data = data.dropna(subset=_OHLCV_COLUMNS)
            return data

    # time-based rules
    if "last" in data.columns:
        price_bars = data["last"].resample(resolution).ohlc()
        volume = data["lastsize"].resample(resolution).sum()
        data = volume.to_frame("volume").join(price_bars)
    else:
        original_length = len(data)
        data = data.resample(resolution).agg({
            "open": "first",
            "high": "max",
            "low": "min",
            "close": "last",
            "volume": "sum",
        })

        # deal with new rows caused by resample
        if len(data) > original_length:
            # volume is 0 on rows created using resample
            data["volume"] = data["volume"].fillna(0)
            data = data.ffill()

    # bars without volume: copy the close, or blank the prices out
    no_volume = data["volume"] <= 0
    filler = data["close"] if ffill else np.nan
    for col in ("open", "high", "low"):
        data[col] = np.where(no_volume, filler, data[col])

    # drop NANs
    if dropna:
        data = data.dropna()

    # cleanup
    data = data.dropna(subset=_OHLCV_COLUMNS).astype({"volume": int})

    return _finalize(data, tz)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Make sure your data has either `last`/`lastsize` or `close`/`volume` columns for tick data, or OHLCV columns for bar data at one-second resolution or coarser.