Skip to content

Instantly share code, notes, and snippets.

@casperkaae
Last active August 7, 2018 07:38
Show Gist options
  • Save casperkaae/6df9448124d872f9a27fb0c4aadd9540 to your computer and use it in GitHub Desktop.
Save casperkaae/6df9448124d872f9a27fb0c4aadd9540 to your computer and use it in GitHub Desktop.
def update_dynamic_datasets(outputfolder=None, recompute_out_of_sample_static_y_hat=True):
    """
    Rebuild the dynamic input dataset from the newest market bar and static data and save it.

    :param outputfolder: forwarded to ``save_dataset`` as ``output_folder`` when the
        dynamic dataset is written.
    :param recompute_out_of_sample_static_y_hat: when True the out-of-sample ``y_hat``
        predictions are recomputed with the default static earnings model and saved;
        when False the newest saved predictions are loaded and reindexed onto ``df``.
    :return: None
    """
    ###### COMBINED PREPROCESSING ###########
    logger.info('Preprocessing - making out of sample predictions')
    target_col = 'intraday_pct_change_next'

    # The static data used to be assembled here by hand (load static + shifted
    # datasets, multiply the shifted index by 2 to keep it unique, concat).
    # That is now delegated to _load_static_data. Note that this returns
    #   shifted data with index * 2
    #   non recomputed price events with index * 3
    ds = load_newest_dataset_from_fileserver(MARKET_BAR_DATASET_NAME)
    # Function-level import kept as in the original.
    from trading.airflow.retrain_model.retrain_model import _load_static_data
    df, _, _, _, _ = _load_static_data()
    # NOTE(review): `settings` is not referenced below (only the removed, commented-out
    # code used it); the import is kept in case it has side effects - confirm and drop.
    import settings
    df.dropna(subset=[target_col], inplace=True)

    if recompute_out_of_sample_static_y_hat:
        model = get_default_static_earnings_model()
        df_train = add_out_of_sample_y_hat(df, model, cross_to_forward_split_year='2017')
        df = df_train.loc[df_train['is_shifted_earningsday'] == False]  # remove shifted data
        logger.info('Saving y_hat predictions')
        save_dataset(df["y_hat"], OUT_OF_SAMPLE_STATIC_Y_HAT_DATA_NAME, overwrite_newest=True)
    else:
        # .copy() so the column assignment below writes to an independent frame
        # rather than to a slice of the loaded data.
        df = df.loc[df['is_shifted_earningsday'] == False].copy()  # remove shifted data
        y_hat = load_newest_dataset_from_fileserver(OUT_OF_SAMPLE_STATIC_Y_HAT_DATA_NAME)
        df["y_hat"] = y_hat.reindex(df.index.values)

    # We now have a df with the recomputed static_y_hat at index * 1 and the
    # non-recomputed static_y_hat at index * 3; the shifted data have been removed.
    # NOTE(review): the 'static_y_hat' column read below is not created in this
    # function; presumably it comes from _load_static_data / add_out_of_sample_y_hat
    # - confirm.
    df_premarket = df.loc[df['overwrite_overnight_pct_change'] == True].copy()
    df_open = df.loc[df['overwrite_overnight_pct_change'] == False].copy()
    # Change back to the original index. The original code read
    # `df_open = df_open.index = ...`, which rebound df_open to the Index itself
    # and broke the column lookup below; only the index must be replaced.
    df_open.index = (df_open.index // 3).astype('int')
    static_y_hat = dict(premarket=df_premarket['static_y_hat'].copy(),
                        open=df_open['static_y_hat'].copy())

    # Arbitrarily use df_premarket to extract features from in the function below.
    df = df_premarket

    ##### DYNAMIC INPUT DATA - PRETRAINED CLASSIFIER #####
    ds = get_dynamic_earnings_input_data(df.copy(), ds.copy(), static_y_hat, add_static_y_hat=True,
                                         subset_and_add_target=False)
    logger.info('saving dynamic dataset')
    save_dataset(ds, DYNAMIC_INPUT_DATA_NAME, output_folder=outputfolder)
    logger.info('all done')
def get_dynamic_earnings_input_data(df, ds,
                                    static_y_hat=None,
                                    overnight_jump_thres=1.0, remove_big_overnight_jumpers=False,
                                    subset_and_add_target=True, min_num_trades=10, add_static_y_hat=True,
                                    earnings_day_to_last_trading_day_traded_value_ratio=3,
                                    featnames=None, exclude_samples_with_no_static_y_hat=True,
                                    adjust_overnight_pct_change=True, exit_price=None):
    """
    Add the derived dynamic features to the market bar dataset ``ds`` and return it.

    The price/spread variables computed first are written directly onto the ``ds``
    that was passed in, so callers that need the untouched input should pass a copy.

    :param df: static data, forwarded to add_features_from_static_data_to_market_bar_dataset
    :param ds: market bar dataset providing the variables read below
    :param static_y_hat: forwarded to add_features_from_static_data_to_market_bar_dataset
    :param overnight_jump_thres: threshold passed to remove_too_high_overnight_jumpers
    :param remove_big_overnight_jumpers: when True, apply remove_too_high_overnight_jumpers
    :param subset_and_add_target: when True, finish with subset_dataset_and_add_target
    :param min_num_trades: forwarded to subset_dataset_and_add_target
    :param add_static_y_hat: forwarded to add_features_from_static_data_to_market_bar_dataset
    :param earnings_day_to_last_trading_day_traded_value_ratio: forwarded likewise
    :param featnames: forwarded likewise
    :param exclude_samples_with_no_static_y_hat: forwarded likewise
    :param adjust_overnight_pct_change: forwarded likewise
    :param exit_price: forwarded to subset_dataset_and_add_target
    :return: the dataset with the added variables
    """
    # Mid prices for the "last" and the "mean" quotes, shaped like the ask price they derive from.
    quote_triples = (
        ("last_ask_price", "last_bid_price", "last_mid_price"),
        ("mean_ask_price", "mean_bid_price", "mean_mid_price"),
    )
    for ask_name, bid_name, mid_name in quote_triples:
        ask = ds[ask_name]
        mid_values = compute_midprice(ask.values, ds[bid_name].values)
        ds[mid_name] = xr.DataArray(mid_values, coords=ask.coords, dims=ask.dims)

    ds["last_spread"] = ds["last_ask_price"] - ds["last_bid_price"]
    ds["relative_last_spread"] = ds["last_spread"] / ds["last_mid_price"]

    if remove_big_overnight_jumpers:
        ds = remove_too_high_overnight_jumpers(ds, thres=overnight_jump_thres)

    ratio = earnings_day_to_last_trading_day_traded_value_ratio
    ds = add_features_from_static_data_to_market_bar_dataset(
        df, ds, static_y_hat,
        add_static_y_hat=add_static_y_hat,
        copy=True,
        earnings_day_to_last_trading_day_traded_value_ratio=ratio,
        featnames=featnames,
        exclude_samples_with_no_static_y_hat=exclude_samples_with_no_static_y_hat,
        adjust_overnight_pct_change=adjust_overnight_pct_change,
    )

    # float32 variable shaped like company_id, then zeroed in place.
    ds["is_short_target"] = ds["company_id"].astype(np.float32).copy()
    ds["is_short_target"].values.fill(0)

    ds = set_first_prices(ds, copy=True)
    ds["timedelta_feature"] = get_timedelta_feature_dataarray(ds)

    # Traded value features, normalised by the 40 trading day median value.
    median_value = ds["40_tdays_median_value"]
    ds["traded_value_normalized"] = get_traded_value_normalized_dataarray(ds["traded_value"], median_value)
    ds["overnight_value_normalized"] = ds["overnight_value"] / ds["40_tdays_median_value"]
    ds["cumsum_traded_value_normalized"] = get_cumsum_traded_value_normalized_dataarray(
        ds["traded_value_normalized"],
        ds["overnight_value_normalized"],
    )

    # VWAP relative to the two reference prices.
    ds['vwap_relative_to_previous_close_price'] = get_relative_price_dataarray(
        ds["vwap"], ds["previous_close_price"])
    ds['vwap_relative_to_starting_price'] = get_relative_price_dataarray(
        ds["vwap"], ds["starting_price"])

    ds["static_y_hat_corrected"] = get_static_y_hat_corrected_dataarray(
        ds["vwap_relative_to_starting_price"], ds["static_y_hat"])

    # Micro price vs. mid price difference, computed in float32 and shaped like vwap.
    microprice = ds["mean_microprice"].values.astype(np.float32)
    midprice = ds["mean_mid_price"].values.astype(np.float32)
    micro_midprice_diff = compute_vwap_midprice_diff(microprice, midprice).astype(np.float32)
    vwap = ds["vwap"]
    ds["micro_midprice_diff"] = xr.DataArray(micro_midprice_diff, coords=vwap.coords, dims=vwap.dims)

    # Return values are ignored, exactly as in the original; presumably these
    # transformers update ds in place - confirm.
    for transformer_cls in (VWAPMidPriceDiffComputer, QuotePriceSmoother, SectorPriceBetaAdjuster):
        transformer_cls().forward(ds)

    if subset_and_add_target:
        ds = subset_dataset_and_add_target(ds, min_num_trades=min_num_trades,
                                           exit_price=exit_price)
    return ds
def add_features_from_static_data_to_market_bar_dataset(
        df, ds,
        static_y_hat=None,
        add_static_y_hat=True,
        copy=True,
        earnings_day_to_last_trading_day_traded_value_ratio=3,
        featnames=None,
        exclude_samples_with_no_static_y_hat=True,
        adjust_overnight_pct_change=True,
        add_prev_value_features=False
):
    """
    Add features taken from the static earnings frame to the market bar dataset.

    :param df: output from trading.dataset_creation.static_earnings_features.get_static_earnings_input_data
        with "static_y_hat" if add_static_y_hat is True. Its index must be named "ts_id".
        ``df`` is not modified by this function.
    :param ds: output from trading.dataset_creation.market_bar_dataset.create_market_bar_dataset
    :param static_y_hat: mapping with 'premarket' and 'open' entries; required when
        add_static_y_hat is True. The entries are currently only looked up (see TODO below).
    :param add_static_y_hat: when True the static_y_hat entries are accessed; writing them
        into ``ds`` is not implemented yet.
    :param copy: when True, work on ``ds.copy()`` instead of the passed object
    :param earnings_day_to_last_trading_day_traded_value_ratio: multiplier applied to
        "previous_open_value" when filling missing "median_prev_open_value"
    :param featnames: names of ``df`` columns to copy onto ``ds`` (reindexed by ts_id);
        defaults to DEFAULT_DYNAMIC_FEATNAMES_FROM_STATIC
    :param exclude_samples_with_no_static_y_hat: when True, keep only the ts_id entries
        selected by ``ds["static_y_hat"].notnull()``
    :param adjust_overnight_pct_change: when True, rescale "overnight_pct_change" by the ratio
        of the next to the last close/closeunadj factor
    :param add_prev_value_features: when True, add the previous-value based features
    :return: the dataset with the added features
    """
    assert df.index.name == "ts_id"
    if copy:
        ds = ds.copy()
    if featnames is None:
        featnames = list(DEFAULT_DYNAMIC_FEATNAMES_FROM_STATIC)

    if add_static_y_hat:
        # TODO: make ds["static_y_hat"] into a timeseries, put the premarket/open
        # predictions into the right positions and forward fill. The previous static
        # implementation was:
        #     ds["static_y_hat"] = df["y_hat"].reindex(ds.ts_id.values)
        # After that, static_y_hat only needs to change from a static into a dynamic
        # feature in the dynamic earning regressors.
        # Until then the two lookups below only fail early when an entry is missing.
        static_y_hat['premarket']
        static_y_hat['open']
        # NOTE(review): indentation was lost in the source; this filter is assumed to
        # belong to the add_static_y_hat branch - confirm. ds["static_y_hat"] is not
        # assigned in this function, so it must already exist on ds for this to work.
        if exclude_samples_with_no_static_y_hat:
            ds = ds.sel(ts_id=ds.ts_id.loc[ds["static_y_hat"].notnull()]).copy(deep=True)

    for featname in featnames:
        ds[featname] = df[featname].reindex(ds.ts_id.values)

    if adjust_overnight_pct_change:
        # Kept as local Series: the original stored these factors as new columns on
        # the caller's df, silently mutating the input.
        adjustment_factor_last = df["close_last"] / df["closeunadj_last"]
        adjustment_factor_next = df["close_next"] / df["closeunadj_next"]
        a = (adjustment_factor_next / adjustment_factor_last).reindex(ds.ts_id.values)
        # ts_ids without a factor are left unadjusted.
        a = a.fillna(1.0)
        ds["overnight_pct_change"] = ds["overnight_pct_change"] * a + a - 1.0

    # NOTE(review): assumed to run unconditionally (nesting lost in the source) - confirm.
    assert np.all((ds["next_trading_date_1"] == ds["date"]) | ds["next_trading_date_1"].isnull())

    if add_prev_value_features:
        ds["previous_open_value"] = (ds["previous_open_price"] * ds["previous_open_size"])
        no_prev_open_value = ds["median_prev_open_value"].isnull()
        # Fill missing medians from the previous open value scaled by the configured ratio.
        ds["median_prev_open_value"].loc[no_prev_open_value] = (
            ds["previous_open_value"].loc[no_prev_open_value]
            * earnings_day_to_last_trading_day_traded_value_ratio
        )
        # NOTE(review): assumed to be part of this branch (nesting lost in the source) - confirm.
        ds["median_prev_total_traded_value_normalized"] = (
            ds["median_prev_total_traded_value"]
            / (ds["40_tdays_median_value_spy"] / 1000)
        )
    return ds
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment