-
-
Save casperkaae/6df9448124d872f9a27fb0c4aadd9540 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def update_dynamic_datasets(outputfolder=None, recompute_out_of_sample_static_y_hat=True):
    """Rebuild the dynamic earnings input dataset and save it to the fileserver.

    Loads the combined static earnings data and the market bar dataset,
    optionally recomputes the out-of-sample static model predictions
    (``y_hat``), splits the events into premarket/open groups, and feeds
    everything into :func:`get_dynamic_earnings_input_data`.

    :param outputfolder: folder passed to ``save_dataset`` for the dynamic
        dataset; ``None`` means the default location.
    :param recompute_out_of_sample_static_y_hat: if True, recompute the
        out-of-sample static ``y_hat`` predictions and persist them;
        otherwise load the newest previously saved predictions.
    """
    # Local import: presumably avoids a circular import at module load time
    # (retrain_model imports from this package) — TODO confirm.
    from trading.airflow.retrain_model.retrain_model import _load_static_data

    ###### COMBINED PREPROCESSING ###########
    logger.info('Preprocessing - making out of sample predictions')
    target_col = 'intraday_pct_change_next'

    ds = load_newest_dataset_from_fileserver(MARKET_BAR_DATASET_NAME)
    # _load_static_data returns the combined static dataset:
    #   shifted data with index * 2, non-recomputed price events with index * 3.
    df, _, _, _, _ = _load_static_data()
    df.dropna(subset=[target_col], inplace=True)

    if recompute_out_of_sample_static_y_hat:
        model = get_default_static_earnings_model()
        df_train = add_out_of_sample_y_hat(df, model, cross_to_forward_split_year='2017')
        df = df_train.loc[df_train['is_shifted_earningsday'] == False]  # remove shifted data
        logger.info('Saving y_hat predictions')
        save_dataset(df["y_hat"], OUT_OF_SAMPLE_STATIC_Y_HAT_DATA_NAME, overwrite_newest=True)
    else:
        df = df.loc[df['is_shifted_earningsday'] == False]  # remove shifted data
        y_hat = load_newest_dataset_from_fileserver(OUT_OF_SAMPLE_STATIC_Y_HAT_DATA_NAME)
        df["y_hat"] = y_hat.reindex(df.index.values)

    # df now holds the recomputed static_y_hat at index * 1 and the
    # non-recomputed static_y_hat at index * 3; shifted data has been removed.
    df_premarket = df.loc[df['overwrite_overnight_pct_change'] == True].copy()
    df_open = df.loc[df['overwrite_overnight_pct_change'] == False].copy()
    # BUG FIX: the original chained assignment (`df_open = df_open.index = ...`)
    # rebound df_open to the Index object itself, which would break the
    # column access below. Only the index should be rewritten here.
    df_open.index = (df_open.index // 3).astype('int')  # change back to original index

    static_y_hat = dict(premarket=df_premarket['static_y_hat'].copy(),
                        open=df_open['static_y_hat'].copy())

    # Arbitrarily use df_premarket to extract features from in the call below.
    df = df_premarket

    ##### DYNAMIC INPUT DATA - PRETRAINED CLASSIFIER #####
    ds = get_dynamic_earnings_input_data(df.copy(), ds.copy(), static_y_hat, add_static_y_hat=True,
                                         subset_and_add_target=False)
    logger.info('saving dynamic dataset')
    save_dataset(ds, DYNAMIC_INPUT_DATA_NAME, output_folder=outputfolder)
    logger.info('all done')
def get_dynamic_earnings_input_data(df, ds,
                                    static_y_hat=None,
                                    overnight_jump_thres=1.0, remove_big_overnight_jumpers=False,
                                    subset_and_add_target=True, min_num_trades=10, add_static_y_hat=True,
                                    earnings_day_to_last_trading_day_traded_value_ratio=3,
                                    featnames=None, exclude_samples_with_no_static_y_hat=True,
                                    adjust_overnight_pct_change=True, exit_price=None):
    """Build the dynamic (intraday) earnings input dataset.

    Derives mid-price and spread features on the market-bar dataset ``ds``,
    merges per-event features from the static dataframe ``df`` (including the
    static model predictions), adds normalized volume and relative price
    features, runs the in-place feature computers, and optionally subsets the
    data and attaches the prediction target.

    :param df: static earnings input dataframe indexed by ts_id.
    :param ds: market bar xarray dataset.
    :param static_y_hat: dict of 'premarket'/'open' static prediction series.
    :param subset_and_add_target: when True, also subset and add the target.
    :return: the enriched (and possibly subsetted) xarray dataset.
    """
    def _like(values, template):
        # Wrap raw ndarray values in a DataArray sharing template's coords/dims.
        return xr.DataArray(values, coords=template.coords, dims=template.dims)

    ds["last_mid_price"] = _like(
        compute_midprice(ds["last_ask_price"].values, ds["last_bid_price"].values),
        ds["last_ask_price"])
    ds["mean_mid_price"] = _like(
        compute_midprice(ds["mean_ask_price"].values, ds["mean_bid_price"].values),
        ds["mean_ask_price"])
    ds["last_spread"] = ds["last_ask_price"] - ds["last_bid_price"]
    ds["relative_last_spread"] = ds["last_spread"] / ds["last_mid_price"]

    if remove_big_overnight_jumpers:
        ds = remove_too_high_overnight_jumpers(ds, thres=overnight_jump_thres)

    ds = add_features_from_static_data_to_market_bar_dataset(
        df, ds, static_y_hat,
        add_static_y_hat=add_static_y_hat,
        copy=True,
        earnings_day_to_last_trading_day_traded_value_ratio=earnings_day_to_last_trading_day_traded_value_ratio,
        featnames=featnames,
        exclude_samples_with_no_static_y_hat=exclude_samples_with_no_static_y_hat,
        adjust_overnight_pct_change=adjust_overnight_pct_change,
    )

    # All-zero short-target placeholder shaped like company_id.
    ds["is_short_target"] = ds["company_id"].astype(np.float32).copy()
    ds["is_short_target"].values.fill(0)

    ds = set_first_prices(ds, copy=True)
    ds["timedelta_feature"] = get_timedelta_feature_dataarray(ds)

    # Volume features normalized by the 40-trading-day median traded value.
    ds["traded_value_normalized"] = get_traded_value_normalized_dataarray(
        ds["traded_value"], ds["40_tdays_median_value"])
    ds["overnight_value_normalized"] = ds["overnight_value"] / ds["40_tdays_median_value"]
    ds["cumsum_traded_value_normalized"] = get_cumsum_traded_value_normalized_dataarray(
        ds["traded_value_normalized"],
        ds["overnight_value_normalized"])

    # VWAP relative to reference prices, and the correspondingly corrected
    # static prediction.
    ds['vwap_relative_to_previous_close_price'] = get_relative_price_dataarray(
        ds["vwap"], ds["previous_close_price"])
    ds['vwap_relative_to_starting_price'] = get_relative_price_dataarray(
        ds["vwap"], ds["starting_price"])
    ds["static_y_hat_corrected"] = get_static_y_hat_corrected_dataarray(
        ds["vwap_relative_to_starting_price"], ds["static_y_hat"])

    microprice_gap = compute_vwap_midprice_diff(
        ds["mean_microprice"].values.astype(np.float32),
        ds["mean_mid_price"].values.astype(np.float32)).astype(np.float32)
    ds["micro_midprice_diff"] = _like(microprice_gap, ds["vwap"])

    # In-place feature computers (mutate ds).
    VWAPMidPriceDiffComputer().forward(ds)
    QuotePriceSmoother().forward(ds)
    SectorPriceBetaAdjuster().forward(ds)

    if subset_and_add_target:
        ds = subset_dataset_and_add_target(ds, min_num_trades=min_num_trades,
                                           exit_price=exit_price)
    return ds
def add_features_from_static_data_to_market_bar_dataset(
        df, ds,
        static_y_hat=None,
        add_static_y_hat=True,
        copy=True,
        earnings_day_to_last_trading_day_traded_value_ratio=3,
        featnames=None,
        exclude_samples_with_no_static_y_hat=True,
        adjust_overnight_pct_change=True,
        add_prev_value_features=False
):
    """Copy per-event features from the static dataframe onto the market bar dataset.

    :param df: output from trading.dataset_creation.static_earnings_features.get_static_earnings_input_data with "static_y_hat" if add_static_y_hat is True. Must be indexed by "ts_id".
    :param ds: output from trading.dataset_creation.market_bar_dataset.create_market_bar_dataset
    :param static_y_hat: dict of 'premarket'/'open' static prediction series
        (only relevant when add_static_y_hat is True; see TODO below).
    :param add_static_y_hat: whether to attach static predictions as a feature.
    :param copy: work on a copy of ds instead of mutating the argument.
    :param earnings_day_to_last_trading_day_traded_value_ratio: fallback multiplier
        used to impute a missing median_prev_open_value.
    :param featnames: static feature columns to copy; defaults to
        DEFAULT_DYNAMIC_FEATNAMES_FROM_STATIC.
    :param exclude_samples_with_no_static_y_hat: drop ts_ids with null static_y_hat.
    :param adjust_overnight_pct_change: correct overnight_pct_change for changes
        in the price adjustment factor between sessions.
    :param add_prev_value_features: also derive previous-day value features.
    :return: the (possibly copied) dataset with the added features.
    """
    assert df.index.name == "ts_id"
    if copy:
        ds = ds.copy()
    if featnames is None:
        featnames = list(DEFAULT_DYNAMIC_FEATNAMES_FROM_STATIC)
    if add_static_y_hat:
        # TODO: turn ds["static_y_hat"] into a time series — place the
        # 'premarket'/'open' predictions from `static_y_hat` at the correct
        # positions and forward fill; then static_y_hat can be switched from a
        # static to a dynamic feature in the dynamic earnings regressors.
        # The original code only evaluated (and discarded) the two dict
        # lookups here; those no-op statements have been removed.
        # NOTE(review): ds["static_y_hat"] is assumed to already exist on ds
        # (it is consumed below and by callers) — confirm where it is set.
        pass
    if exclude_samples_with_no_static_y_hat:
        ds = ds.sel(ts_id=ds.ts_id.loc[ds["static_y_hat"].notnull()]).copy(deep=True)
    for featname in featnames:
        ds[featname] = df[featname].reindex(ds.ts_id.values)
    if adjust_overnight_pct_change:
        # Ratio between the next-day and last-day price adjustment factors.
        # Computed locally: the original wrote the two intermediate factor
        # columns onto the caller's df as a side effect.
        adjustment_factor_last = df["close_last"] / df["closeunadj_last"]
        adjustment_factor_next = df["close_next"] / df["closeunadj_next"]
        a = (adjustment_factor_next / adjustment_factor_last).reindex(ds.ts_id.values)
        a = a.fillna(1.0)  # no adjustment info -> assume the factor is unchanged
        ds["overnight_pct_change"] = ds["overnight_pct_change"] * a + a - 1.0
    # Sanity check: the static row's next trading date must match the bar date.
    assert np.all((ds["next_trading_date_1"] == ds["date"]) | ds["next_trading_date_1"].isnull())
    if add_prev_value_features:
        ds["previous_open_value"] = ds["previous_open_price"] * ds["previous_open_size"]
        # Impute a missing median previous-open value from the previous open
        # auction value scaled by the configured ratio.
        no_prev_open_value = ds["median_prev_open_value"].isnull()
        ds["median_prev_open_value"].loc[no_prev_open_value] = (
            ds["previous_open_value"].loc[no_prev_open_value]
            * earnings_day_to_last_trading_day_traded_value_ratio)
        ds["median_prev_total_traded_value_normalized"] = (
            ds["median_prev_total_traded_value"]
            / (ds["40_tdays_median_value_spy"] / 1000))
    return ds
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment