casperkaae · August 7, 2018 07:38
diff --git a/..py b/..py
 def update_dynamic_datasets(outputfolder=None, recompute_out_of_sample_static_y_hat=True):
    """Rebuild the dynamic earnings input dataset and save it.

    Loads the newest market-bar dataset and the static dataset, attaches the
    out-of-sample ``y_hat`` predictions (either recomputed and saved, or
    loaded from the file server), splits the static rows into a "premarket"
    and an "open" subset, and hands both prediction series to
    ``get_dynamic_earnings_input_data`` before saving the result.

    :param outputfolder: forwarded to ``save_dataset`` as ``output_folder``
        when the dynamic dataset is written.
    :param recompute_out_of_sample_static_y_hat: when True the predictions
        are recomputed with the default static earnings model and the newest
        stored copy is overwritten; when False the stored predictions are
        loaded and aligned to the static rows instead.
    :return: None
    """
    ###### COMBINED PREPROCESSING ###########
    logger.info('Preprocessing - making out of sample predictions')
    target_col = 'intraday_pct_change_next'

    # _load_static_data returns the static data set. Note that it contains
    #   shifted data with index * 2
    #   non recomputed price events with index * 3
    # (the indexes are scaled by an integer so that they stay unique).
    ds = load_newest_dataset_from_fileserver(MARKET_BAR_DATASET_NAME)
    # Function-level import kept from the original code
    # (presumably to avoid an import cycle -- TODO confirm).
    from trading.airflow.retrain_model.retrain_model import _load_static_data
    df, _, _, _, _ = _load_static_data()
    import settings  # not referenced below; kept in case the import has side effects

    df.dropna(subset=[target_col], inplace=True)

    if recompute_out_of_sample_static_y_hat:
        model = get_default_static_earnings_model()
        df_train = add_out_of_sample_y_hat(df, model, cross_to_forward_split_year='2017')
        df = df_train.loc[df_train['is_shifted_earningsday'] == False]  # remove shifted data
        logger.info('Saving y_hat predictions')
        save_dataset(df["y_hat"], OUT_OF_SAMPLE_STATIC_Y_HAT_DATA_NAME, overwrite_newest=True)
    else:
        # .copy() so the column assignment below writes to an independent
        # frame instead of a slice of the loaded data.
        df = df.loc[df['is_shifted_earningsday'] == False].copy()  # remove shifted data
        y_hat = load_newest_dataset_from_fileserver(OUT_OF_SAMPLE_STATIC_Y_HAT_DATA_NAME)
        df["y_hat"] = y_hat.reindex(df.index.values)

    # We now have a df with the recomputed predictions at index * 1 and the
    # non-recomputed ones at index * 3; the shifted data have been removed.
    df_premarket = df.loc[df['overwrite_overnight_pct_change'] == True].copy()
    df_open = df.loc[df['overwrite_overnight_pct_change'] == False].copy()
    # Change back to the original index. Only the index is reassigned here:
    # the previous chained form ``df_open = df_open.index = ...`` rebound
    # ``df_open`` itself to the Index, so the column lookup below could not work.
    df_open.index = (df_open.index // 3).astype('int')

    # NOTE(review): the predictions are stored in the "y_hat" column above but
    # read from "static_y_hat" here -- confirm which column _load_static_data /
    # add_out_of_sample_y_hat actually provide.
    static_y_hat = dict(premarket=df_premarket['static_y_hat'].copy(),
                        open=df_open['static_y_hat'].copy())

    # Arbitrarily use df_premarket to extract features from in the call below.
    df = df_premarket

    ##### DYNAMIC INPUT DATA - PRETRAINED CLASSIFIER #####
    ds = get_dynamic_earnings_input_data(df.copy(), ds.copy(), static_y_hat, add_static_y_hat=True,
                                         subset_and_add_target=False)
    logger.info('saving dynamic dataset')
    save_dataset(ds, DYNAMIC_INPUT_DATA_NAME, output_folder=outputfolder)

    logger.info('all done')
    
    
    
 def get_dynamic_earnings_input_data(df, ds,
                                     static_y_hat=None,
                                     overnight_jump_thres=1.0, remove_big_overnight_jumpers=False,
                                     subset_and_add_target=True, min_num_trades=10, add_static_y_hat=True,
                                     earnings_day_to_last_trading_day_traded_value_ratio=3,
                                     featnames=None, exclude_samples_with_no_static_y_hat=True,
                                     adjust_overnight_pct_change=True, exit_price=None):
    """Add the dynamic earnings features to a market-bar dataset.

    Note that the mid-price and spread variables are written to the ``ds``
    object that is passed in, before any copy is taken.

    :param df: static data, forwarded to
        ``add_features_from_static_data_to_market_bar_dataset``.
    :param ds: market-bar dataset that the features are added to.
    :param static_y_hat: forwarded unchanged to
        ``add_features_from_static_data_to_market_bar_dataset``.
    :param overnight_jump_thres: threshold passed to
        ``remove_too_high_overnight_jumpers``.
    :param remove_big_overnight_jumpers: apply that removal step when True.
    :param subset_and_add_target: when True the result is passed through
        ``subset_dataset_and_add_target`` before it is returned.
    :param min_num_trades: forwarded to ``subset_dataset_and_add_target``.
    :param add_static_y_hat: forwarded to the static-feature step.
    :param earnings_day_to_last_trading_day_traded_value_ratio: forwarded to
        the static-feature step.
    :param featnames: forwarded to the static-feature step.
    :param exclude_samples_with_no_static_y_hat: forwarded to the
        static-feature step.
    :param adjust_overnight_pct_change: forwarded to the static-feature step.
    :param exit_price: forwarded to ``subset_dataset_and_add_target``.
    :return: the dataset with the added variables.
    """
    # Mid prices, built from the ask/bid pair and laid out like the ask series.
    for mid_key, ask_key, bid_key in (
            ("last_mid_price", "last_ask_price", "last_bid_price"),
            ("mean_mid_price", "mean_ask_price", "mean_bid_price"),
    ):
        ask = ds[ask_key]
        bid = ds[bid_key]
        ds[mid_key] = xr.DataArray(compute_midprice(ask.values, bid.values),
                                   coords=ask.coords, dims=ask.dims)

    ds["last_spread"] = ds["last_ask_price"] - ds["last_bid_price"]
    ds["relative_last_spread"] = ds["last_spread"] / ds["last_mid_price"]

    if remove_big_overnight_jumpers:
        ds = remove_too_high_overnight_jumpers(ds, thres=overnight_jump_thres)

    # Options handed through to the static-feature step.
    static_feature_options = dict(
        add_static_y_hat=add_static_y_hat,
        copy=True,
        earnings_day_to_last_trading_day_traded_value_ratio=earnings_day_to_last_trading_day_traded_value_ratio,
        featnames=featnames,
        exclude_samples_with_no_static_y_hat=exclude_samples_with_no_static_y_hat,
        adjust_overnight_pct_change=adjust_overnight_pct_change,
    )
    ds = add_features_from_static_data_to_market_bar_dataset(df, ds, static_y_hat,
                                                             **static_feature_options)

    # float32 variable laid out like "company_id", then zero-filled in place.
    short_target = ds["company_id"].astype(np.float32).copy()
    ds["is_short_target"] = short_target
    ds["is_short_target"].values.fill(0)

    ds = set_first_prices(ds, copy=True)

    ds["timedelta_feature"] = get_timedelta_feature_dataarray(ds)

    traded_value_normalized = get_traded_value_normalized_dataarray(ds["traded_value"],
                                                                    ds["40_tdays_median_value"])
    ds["traded_value_normalized"] = traded_value_normalized
    ds["overnight_value_normalized"] = ds["overnight_value"] / ds["40_tdays_median_value"]
    ds["cumsum_traded_value_normalized"] = get_cumsum_traded_value_normalized_dataarray(
        ds["traded_value_normalized"],
        ds["overnight_value_normalized"],
    )

    # vwap relative to the two reference prices.
    for relative_key, reference_key in (
            ('vwap_relative_to_previous_close_price', "previous_close_price"),
            ('vwap_relative_to_starting_price', "starting_price"),
    ):
        ds[relative_key] = get_relative_price_dataarray(ds["vwap"], ds[reference_key])

    ds["static_y_hat_corrected"] = get_static_y_hat_corrected_dataarray(
        ds["vwap_relative_to_starting_price"],
        ds["static_y_hat"],
    )

    microprice = ds["mean_microprice"].values.astype(np.float32)
    mean_mid_price = ds["mean_mid_price"].values.astype(np.float32)
    micro_midprice_diff = compute_vwap_midprice_diff(microprice, mean_mid_price).astype(np.float32)
    vwap = ds["vwap"]
    ds["micro_midprice_diff"] = xr.DataArray(micro_midprice_diff, coords=vwap.coords, dims=vwap.dims)

    # The return values of forward() are discarded, so these steps are
    # presumably applied to ds in place -- TODO confirm.
    for transform in (VWAPMidPriceDiffComputer, QuotePriceSmoother, SectorPriceBetaAdjuster):
        transform().forward(ds)

    if not subset_and_add_target:
        return ds

    return subset_dataset_and_add_target(ds, min_num_trades=min_num_trades,
                                         exit_price=exit_price)


 def add_features_from_static_data_to_market_bar_dataset(
        df, ds,
        static_y_hat=None,
        add_static_y_hat=True,
        copy=True,
        earnings_day_to_last_trading_day_traded_value_ratio=3,
        featnames=None,
        exclude_samples_with_no_static_y_hat=True,
        adjust_overnight_pct_change=True,
        add_prev_value_features=False
    ):
    """Copy per-``ts_id`` features from the static frame onto the market-bar dataset.

    Side effect: when ``adjust_overnight_pct_change`` is True the columns
    ``adjustment_factor_last`` and ``adjustment_factor_next`` are added to
    ``df`` in place.

    :param df: output from trading.dataset_creation.static_earnings_features.get_static_earnings_input_data;
        its index must be named "ts_id"
    :param ds: output from trading.dataset_creation.market_bar_dataset.create_market_bar_dataset
    :param static_y_hat: mapping with the entries 'premarket' and 'open';
        required when ``add_static_y_hat`` is True
    :param add_static_y_hat: validate ``static_y_hat`` and, if requested, drop
        samples without a static prediction
    :param copy: work on ``ds.copy()`` instead of the object passed in
    :param earnings_day_to_last_trading_day_traded_value_ratio: factor applied
        to "previous_open_value" where "median_prev_open_value" is missing
        (only used when ``add_prev_value_features`` is True)
    :param featnames: columns of ``df`` to copy onto ``ds``; defaults to
        ``DEFAULT_DYNAMIC_FEATNAMES_FROM_STATIC``
    :param exclude_samples_with_no_static_y_hat: keep only the ``ts_id``
        entries whose ``ds["static_y_hat"]`` is not null
    :param adjust_overnight_pct_change: rescale ``ds["overnight_pct_change"]``
        with the ratio of the next/last adjustment factors
    :param add_prev_value_features: also add the previous-open-value features
    :return: the dataset with the added variables
    :raises TypeError: if ``add_static_y_hat`` is True and ``static_y_hat`` is None
    :raises KeyError: if ``static_y_hat`` lacks the 'premarket' or 'open' entry
    """
    assert df.index.name == "ts_id"
    if copy:
        ds = ds.copy()

    if featnames is None:
        featnames = list(DEFAULT_DYNAMIC_FEATNAMES_FROM_STATIC)

    if add_static_y_hat:
        # Fail with a clear message instead of an opaque subscript error; the
        # exception types match what the bare lookups used to raise.
        if static_y_hat is None:
            raise TypeError(
                "static_y_hat must be a mapping with 'premarket' and 'open' "
                "entries when add_static_y_hat is True, got None"
            )
        missing = [key for key in ("premarket", "open") if key not in static_y_hat]
        if missing:
            raise KeyError(
                "static_y_hat is missing required entries: " + ", ".join(missing)
            )

        # TODO: turn ds["static_y_hat"] into a time series, put the premarket
        # and open predictions into the right positions and forward fill.
        # Afterwards static_y_hat only needs to change from a static into a
        # dynamic feature in the dynamic earnings regressors.
        # The previous (static) implementation was:
        #   ds["static_y_hat"] = df["y_hat"].reindex(ds.ts_id.values)
        # NOTE(review): until the TODO is done nothing in this function
        # assigns ds["static_y_hat"], so the filter below relies on the
        # incoming dataset already carrying that variable.
        if exclude_samples_with_no_static_y_hat:
            ds = ds.sel(ts_id=ds.ts_id.loc[ds["static_y_hat"].notnull()]).copy(deep=True)

    for featname in featnames:
        ds[featname] = df[featname].reindex(ds.ts_id.values)

    if adjust_overnight_pct_change:
        df["adjustment_factor_last"] = df["close_last"] / df["closeunadj_last"]
        df["adjustment_factor_next"] = df["close_next"] / df["closeunadj_next"]
        a = (df["adjustment_factor_next"] / df["adjustment_factor_last"]).reindex(ds.ts_id.values)
        # ts_ids without an adjustment ratio are left unadjusted (factor 1).
        a.fillna(1.0, inplace=True)
        ds["overnight_pct_change"] = ds["overnight_pct_change"] * a + a - 1.0

    assert np.all((ds["next_trading_date_1"] == ds["date"]) | ds["next_trading_date_1"].isnull())

    if add_prev_value_features:
        ds["previous_open_value"] = (ds["previous_open_price"] * ds["previous_open_size"])
        # Where no median is available, fall back to the scaled previous open value.
        no_prev_open_value = ds["median_prev_open_value"].isnull()
        ds["median_prev_open_value"].loc[no_prev_open_value] = (ds["previous_open_value"].loc[no_prev_open_value]
                                                                * earnings_day_to_last_trading_day_traded_value_ratio)

        ds["median_prev_total_traded_value_normalized"] = (ds["median_prev_total_traded_value"]
                                                           / (ds["40_tdays_median_value_spy"] / 1000)
                                                           )

    return ds
	def update_dynamic_datasets(outputfolder=None, recompute_out_of_sample_static_y_hat=True):
	    """Rebuild the dynamic earnings input dataset and save it.

	    Loads the newest market-bar dataset and the static dataset, attaches the
	    out-of-sample ``y_hat`` predictions (either recomputed and saved, or
	    loaded from the file server), splits the static rows into a "premarket"
	    and an "open" subset, and hands both prediction series to
	    ``get_dynamic_earnings_input_data`` before saving the result.

	    :param outputfolder: forwarded to ``save_dataset`` as ``output_folder``
	        when the dynamic dataset is written.
	    :param recompute_out_of_sample_static_y_hat: when True the predictions
	        are recomputed with the default static earnings model and the newest
	        stored copy is overwritten; when False the stored predictions are
	        loaded and aligned to the static rows instead.
	    :return: None
	    """
	    ###### COMBINED PREPROCESSING ###########
	    logger.info('Preprocessing - making out of sample predictions')
	    target_col = 'intraday_pct_change_next'

	    # _load_static_data returns the static data set. Note that it contains
	    #   shifted data with index * 2
	    #   non recomputed price events with index * 3
	    # (the indexes are scaled by an integer so that they stay unique).
	    ds = load_newest_dataset_from_fileserver(MARKET_BAR_DATASET_NAME)
	    # Function-level import kept from the original code
	    # (presumably to avoid an import cycle -- TODO confirm).
	    from trading.airflow.retrain_model.retrain_model import _load_static_data
	    df, _, _, _, _ = _load_static_data()
	    import settings  # not referenced below; kept in case the import has side effects

	    df.dropna(subset=[target_col], inplace=True)

	    if recompute_out_of_sample_static_y_hat:
	        model = get_default_static_earnings_model()
	        df_train = add_out_of_sample_y_hat(df, model, cross_to_forward_split_year='2017')
	        df = df_train.loc[df_train['is_shifted_earningsday'] == False]  # remove shifted data
	        logger.info('Saving y_hat predictions')
	        save_dataset(df["y_hat"], OUT_OF_SAMPLE_STATIC_Y_HAT_DATA_NAME, overwrite_newest=True)
	    else:
	        # .copy() so the column assignment below writes to an independent
	        # frame instead of a slice of the loaded data.
	        df = df.loc[df['is_shifted_earningsday'] == False].copy()  # remove shifted data
	        y_hat = load_newest_dataset_from_fileserver(OUT_OF_SAMPLE_STATIC_Y_HAT_DATA_NAME)
	        df["y_hat"] = y_hat.reindex(df.index.values)

	    # We now have a df with the recomputed predictions at index * 1 and the
	    # non-recomputed ones at index * 3; the shifted data have been removed.
	    df_premarket = df.loc[df['overwrite_overnight_pct_change'] == True].copy()
	    df_open = df.loc[df['overwrite_overnight_pct_change'] == False].copy()
	    # Change back to the original index. Only the index is reassigned here:
	    # the previous chained form ``df_open = df_open.index = ...`` rebound
	    # ``df_open`` itself to the Index, so the column lookup below could not work.
	    df_open.index = (df_open.index // 3).astype('int')

	    # NOTE(review): the predictions are stored in the "y_hat" column above but
	    # read from "static_y_hat" here -- confirm which column _load_static_data /
	    # add_out_of_sample_y_hat actually provide.
	    static_y_hat = dict(premarket=df_premarket['static_y_hat'].copy(),
	                        open=df_open['static_y_hat'].copy())

	    # Arbitrarily use df_premarket to extract features from in the call below.
	    df = df_premarket

	    ##### DYNAMIC INPUT DATA - PRETRAINED CLASSIFIER #####
	    ds = get_dynamic_earnings_input_data(df.copy(), ds.copy(), static_y_hat, add_static_y_hat=True,
	                                         subset_and_add_target=False)
	    logger.info('saving dynamic dataset')
	    save_dataset(ds, DYNAMIC_INPUT_DATA_NAME, output_folder=outputfolder)

	    logger.info('all done')



	def get_dynamic_earnings_input_data(df, ds,
	                                    static_y_hat=None,
	                                    overnight_jump_thres=1.0, remove_big_overnight_jumpers=False,
	                                    subset_and_add_target=True, min_num_trades=10, add_static_y_hat=True,
	                                    earnings_day_to_last_trading_day_traded_value_ratio=3,
	                                    featnames=None, exclude_samples_with_no_static_y_hat=True,
	                                    adjust_overnight_pct_change=True, exit_price=None):
	    """Add the dynamic earnings features to a market-bar dataset.

	    Note that the mid-price and spread variables are written to the ``ds``
	    object that is passed in, before any copy is taken.

	    :param df: static data, forwarded to
	        ``add_features_from_static_data_to_market_bar_dataset``.
	    :param ds: market-bar dataset that the features are added to.
	    :param static_y_hat: forwarded unchanged to
	        ``add_features_from_static_data_to_market_bar_dataset``.
	    :param overnight_jump_thres: threshold passed to
	        ``remove_too_high_overnight_jumpers``.
	    :param remove_big_overnight_jumpers: apply that removal step when True.
	    :param subset_and_add_target: when True the result is passed through
	        ``subset_dataset_and_add_target`` before it is returned.
	    :param min_num_trades: forwarded to ``subset_dataset_and_add_target``.
	    :param add_static_y_hat: forwarded to the static-feature step.
	    :param earnings_day_to_last_trading_day_traded_value_ratio: forwarded to
	        the static-feature step.
	    :param featnames: forwarded to the static-feature step.
	    :param exclude_samples_with_no_static_y_hat: forwarded to the
	        static-feature step.
	    :param adjust_overnight_pct_change: forwarded to the static-feature step.
	    :param exit_price: forwarded to ``subset_dataset_and_add_target``.
	    :return: the dataset with the added variables.
	    """
	    # Mid prices, built from the ask/bid pair and laid out like the ask series.
	    for mid_key, ask_key, bid_key in (
	            ("last_mid_price", "last_ask_price", "last_bid_price"),
	            ("mean_mid_price", "mean_ask_price", "mean_bid_price"),
	    ):
	        ask = ds[ask_key]
	        bid = ds[bid_key]
	        ds[mid_key] = xr.DataArray(compute_midprice(ask.values, bid.values),
	                                   coords=ask.coords, dims=ask.dims)

	    ds["last_spread"] = ds["last_ask_price"] - ds["last_bid_price"]
	    ds["relative_last_spread"] = ds["last_spread"] / ds["last_mid_price"]

	    if remove_big_overnight_jumpers:
	        ds = remove_too_high_overnight_jumpers(ds, thres=overnight_jump_thres)

	    # Options handed through to the static-feature step.
	    static_feature_options = dict(
	        add_static_y_hat=add_static_y_hat,
	        copy=True,
	        earnings_day_to_last_trading_day_traded_value_ratio=earnings_day_to_last_trading_day_traded_value_ratio,
	        featnames=featnames,
	        exclude_samples_with_no_static_y_hat=exclude_samples_with_no_static_y_hat,
	        adjust_overnight_pct_change=adjust_overnight_pct_change,
	    )
	    ds = add_features_from_static_data_to_market_bar_dataset(df, ds, static_y_hat,
	                                                             **static_feature_options)

	    # float32 variable laid out like "company_id", then zero-filled in place.
	    short_target = ds["company_id"].astype(np.float32).copy()
	    ds["is_short_target"] = short_target
	    ds["is_short_target"].values.fill(0)

	    ds = set_first_prices(ds, copy=True)

	    ds["timedelta_feature"] = get_timedelta_feature_dataarray(ds)

	    traded_value_normalized = get_traded_value_normalized_dataarray(ds["traded_value"],
	                                                                    ds["40_tdays_median_value"])
	    ds["traded_value_normalized"] = traded_value_normalized
	    ds["overnight_value_normalized"] = ds["overnight_value"] / ds["40_tdays_median_value"]
	    ds["cumsum_traded_value_normalized"] = get_cumsum_traded_value_normalized_dataarray(
	        ds["traded_value_normalized"],
	        ds["overnight_value_normalized"],
	    )

	    # vwap relative to the two reference prices.
	    for relative_key, reference_key in (
	            ('vwap_relative_to_previous_close_price', "previous_close_price"),
	            ('vwap_relative_to_starting_price', "starting_price"),
	    ):
	        ds[relative_key] = get_relative_price_dataarray(ds["vwap"], ds[reference_key])

	    ds["static_y_hat_corrected"] = get_static_y_hat_corrected_dataarray(
	        ds["vwap_relative_to_starting_price"],
	        ds["static_y_hat"],
	    )

	    microprice = ds["mean_microprice"].values.astype(np.float32)
	    mean_mid_price = ds["mean_mid_price"].values.astype(np.float32)
	    micro_midprice_diff = compute_vwap_midprice_diff(microprice, mean_mid_price).astype(np.float32)
	    vwap = ds["vwap"]
	    ds["micro_midprice_diff"] = xr.DataArray(micro_midprice_diff, coords=vwap.coords, dims=vwap.dims)

	    # The return values of forward() are discarded, so these steps are
	    # presumably applied to ds in place -- TODO confirm.
	    for transform in (VWAPMidPriceDiffComputer, QuotePriceSmoother, SectorPriceBetaAdjuster):
	        transform().forward(ds)

	    if not subset_and_add_target:
	        return ds

	    return subset_dataset_and_add_target(ds, min_num_trades=min_num_trades,
	                                         exit_price=exit_price)


	def add_features_from_static_data_to_market_bar_dataset(
	        df, ds,
	        static_y_hat=None,
	        add_static_y_hat=True,
	        copy=True,
	        earnings_day_to_last_trading_day_traded_value_ratio=3,
	        featnames=None,
	        exclude_samples_with_no_static_y_hat=True,
	        adjust_overnight_pct_change=True,
	        add_prev_value_features=False
	    ):
	    """Copy per-``ts_id`` features from the static frame onto the market-bar dataset.

	    Side effect: when ``adjust_overnight_pct_change`` is True the columns
	    ``adjustment_factor_last`` and ``adjustment_factor_next`` are added to
	    ``df`` in place.

	    :param df: output from trading.dataset_creation.static_earnings_features.get_static_earnings_input_data;
	        its index must be named "ts_id"
	    :param ds: output from trading.dataset_creation.market_bar_dataset.create_market_bar_dataset
	    :param static_y_hat: mapping with the entries 'premarket' and 'open';
	        required when ``add_static_y_hat`` is True
	    :param add_static_y_hat: validate ``static_y_hat`` and, if requested, drop
	        samples without a static prediction
	    :param copy: work on ``ds.copy()`` instead of the object passed in
	    :param earnings_day_to_last_trading_day_traded_value_ratio: factor applied
	        to "previous_open_value" where "median_prev_open_value" is missing
	        (only used when ``add_prev_value_features`` is True)
	    :param featnames: columns of ``df`` to copy onto ``ds``; defaults to
	        ``DEFAULT_DYNAMIC_FEATNAMES_FROM_STATIC``
	    :param exclude_samples_with_no_static_y_hat: keep only the ``ts_id``
	        entries whose ``ds["static_y_hat"]`` is not null
	    :param adjust_overnight_pct_change: rescale ``ds["overnight_pct_change"]``
	        with the ratio of the next/last adjustment factors
	    :param add_prev_value_features: also add the previous-open-value features
	    :return: the dataset with the added variables
	    :raises TypeError: if ``add_static_y_hat`` is True and ``static_y_hat`` is None
	    :raises KeyError: if ``static_y_hat`` lacks the 'premarket' or 'open' entry
	    """
	    assert df.index.name == "ts_id"
	    if copy:
	        ds = ds.copy()

	    if featnames is None:
	        featnames = list(DEFAULT_DYNAMIC_FEATNAMES_FROM_STATIC)

	    if add_static_y_hat:
	        # Fail with a clear message instead of an opaque subscript error; the
	        # exception types match what the bare lookups used to raise.
	        if static_y_hat is None:
	            raise TypeError(
	                "static_y_hat must be a mapping with 'premarket' and 'open' "
	                "entries when add_static_y_hat is True, got None"
	            )
	        missing = [key for key in ("premarket", "open") if key not in static_y_hat]
	        if missing:
	            raise KeyError(
	                "static_y_hat is missing required entries: " + ", ".join(missing)
	            )

	        # TODO: turn ds["static_y_hat"] into a time series, put the premarket
	        # and open predictions into the right positions and forward fill.
	        # Afterwards static_y_hat only needs to change from a static into a
	        # dynamic feature in the dynamic earnings regressors.
	        # The previous (static) implementation was:
	        #   ds["static_y_hat"] = df["y_hat"].reindex(ds.ts_id.values)
	        # NOTE(review): until the TODO is done nothing in this function
	        # assigns ds["static_y_hat"], so the filter below relies on the
	        # incoming dataset already carrying that variable.
	        if exclude_samples_with_no_static_y_hat:
	            ds = ds.sel(ts_id=ds.ts_id.loc[ds["static_y_hat"].notnull()]).copy(deep=True)

	    for featname in featnames:
	        ds[featname] = df[featname].reindex(ds.ts_id.values)

	    if adjust_overnight_pct_change:
	        df["adjustment_factor_last"] = df["close_last"] / df["closeunadj_last"]
	        df["adjustment_factor_next"] = df["close_next"] / df["closeunadj_next"]
	        a = (df["adjustment_factor_next"] / df["adjustment_factor_last"]).reindex(ds.ts_id.values)
	        # ts_ids without an adjustment ratio are left unadjusted (factor 1).
	        a.fillna(1.0, inplace=True)
	        ds["overnight_pct_change"] = ds["overnight_pct_change"] * a + a - 1.0

	    # Element-wise OR of the two masks (the pasted text had a stray
	    # backslash in front of the operator).
	    assert np.all((ds["next_trading_date_1"] == ds["date"]) | ds["next_trading_date_1"].isnull())

	    if add_prev_value_features:
	        ds["previous_open_value"] = (ds["previous_open_price"] * ds["previous_open_size"])
	        # Where no median is available, fall back to the scaled previous open value.
	        no_prev_open_value = ds["median_prev_open_value"].isnull()
	        ds["median_prev_open_value"].loc[no_prev_open_value] = (ds["previous_open_value"].loc[no_prev_open_value]
	                                                                * earnings_day_to_last_trading_day_traded_value_ratio)

	        ds["median_prev_total_traded_value_normalized"] = (ds["median_prev_total_traded_value"]
	                                                           / (ds["40_tdays_median_value_spy"] / 1000)
	                                                           )

	    return ds