Last active
January 14, 2023 05:43
-
-
Save the-moliver/dcdd2862dc2c78dda600f1b449071c93 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 168, | |
"id": "798d5d25", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import matplotlib.pyplot as plt\n", | |
"from scipy.stats import rankdata\n", | |
"from sklearn.linear_model import Ridge\n", | |
"from sklearn.mixture import BayesianGaussianMixture\n", | |
"from xgboost import XGBRegressor" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "f8637ccc", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# load the training set; later cells use its era, feature* and target columns\n", | |
"training_data = pd.read_parquet('numerai_training_data.parquet')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 189, | |
"id": "c919eb30", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# load the validation set that both XGBoost models are scored on further down\n", | |
"val_data = pd.read_parquet('numerai_validation_data.parquet')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "c57156b7", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# ridge model reused for every era; no intercept because features and target\n", | |
"# are shifted by -.5 before fitting. alpha=1.0 is the scikit-learn default.\n", | |
"rlm = Ridge(alpha=1.0, fit_intercept=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "fadbc9ea", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Fit one ridge regression per era and collect the weight vectors in `coefs`.\n", | |
"era_list = training_data.era.unique()\n", | |
"coefs = []\n", | |
"for era in era_list:\n", | |
"    # select this era's rows once and reuse them for both features and target\n", | |
"    era_data = training_data[training_data.era == era]\n", | |
"    # shift features and target by -.5 so the intercept-free model is appropriate\n", | |
"    features = era_data.filter(like='feature').values - .5\n", | |
"    target = era_data['target'] - .5\n", | |
"    # fit ridge regression model for this era and keep its coefficients\n", | |
"    rlm.fit(features, target)\n", | |
"    coefs.append(rlm.coef_)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"id": "c87696da", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# stack the per-era weight vectors into one 2-D array, one row per era\n", | |
"coefs = np.vstack(coefs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 177, | |
"id": "04b505f4", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Initialization 0\n", | |
"Initialization converged: True\n" | |
] | |
} | |
], | |
"source": [ | |
"# To generate lots of fake data, run everything from here down in a loop.\n", | |
"\n", | |
"# draw the number of mixture components uniformly from 2..9\n", | |
"n_components = np.random.choice(np.arange(2, 10))\n", | |
"\n", | |
"# model the distribution of per-era regression weights with a Gaussian mixture\n", | |
"gm = BayesianGaussianMixture(n_components=n_components, verbose=True)\n", | |
"gm.fit(coefs)\n", | |
"\n", | |
"# overwrite the fitted mixing weights in place so every component is sampled\n", | |
"# with equal probability, to better balance rare regimes\n", | |
"gm.weights_.fill(1 / n_components)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 194, | |
"id": "239b279b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0573\r" | |
] | |
} | |
], | |
"source": [ | |
"# Build a synthetic target per era from sampled linear weights, alongside the\n", | |
"# real target and the shifted features for the same rows.\n", | |
"fake_target = []\n", | |
"real_target = []\n", | |
"all_features = []\n", | |
"# quantile edges: the buckets hold 5/20/50/20/5% of each era's ranked rows\n", | |
"bins = [0, 0.05, 0.25, 0.75, 0.95, 1]\n", | |
"\n", | |
"# use every 4th era, starting at a random offset in 0..3\n", | |
"for era in era_list[np.random.choice(4)::4]:\n", | |
"    print(era, end='\\r')\n", | |
"    # select this era's rows once and reuse them for features and real target\n", | |
"    era_data = training_data[training_data.era == era]\n", | |
"    features = era_data.filter(like='feature').values - .5\n", | |
"    real_target.append(era_data['target'])\n", | |
"\n", | |
"    # sample a set of weights from GMM\n", | |
"    beta, _ = gm.sample(1)\n", | |
"\n", | |
"    # create fake continuous target\n", | |
"    fake_targ = features @ beta[0]\n", | |
"\n", | |
"    # bin fake target like real target: rank to (0, 1), then map the five\n", | |
"    # buckets to the values 0, .25, .5, .75, 1\n", | |
"    fake_targ = (rankdata(fake_targ) - .5)/len(fake_targ)\n", | |
"    fake_targ = (np.digitize(fake_targ, bins) - 1)/4\n", | |
"\n", | |
"    fake_target.append(fake_targ)\n", | |
"    all_features.append(features)\n", | |
"\n", | |
"all_features = np.concatenate(all_features)\n", | |
"fake_target = np.concatenate(fake_target)\n", | |
"real_target = np.concatenate(real_target)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 195, | |
"id": "573e329c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# model trained on the fake target; same hyperparameters as xgbr2\n", | |
"xgbr = XGBRegressor(\n", | |
"    n_estimators=100,\n", | |
"    max_depth=5,\n", | |
"    learning_rate=0.01,\n", | |
"    colsample_bytree=0.1,\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 196, | |
"id": "659e0652", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n", | |
" colsample_bynode=1, colsample_bytree=0.1, gamma=0, gpu_id=-1,\n", | |
" importance_type='gain', interaction_constraints='',\n", | |
" learning_rate=0.01, max_delta_step=0, max_depth=5,\n", | |
" min_child_weight=1, missing=nan, monotone_constraints='()',\n", | |
" n_estimators=100, n_jobs=32, num_parallel_tree=1, random_state=0,\n", | |
" reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,\n", | |
" tree_method='exact', validate_parameters=1, verbosity=None)" | |
] | |
}, | |
"execution_count": 196, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# train on the shifted features against the synthetic target\n", | |
"xgbr.fit(X=all_features, y=fake_target)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 197, | |
"id": "f35a1f3d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# shift validation features by -.5 exactly as the training features were shifted\n", | |
"# before fitting; otherwise the trees' split thresholds are applied to values\n", | |
"# on a different scale than the one they were learned on\n", | |
"pred = xgbr.predict(val_data.filter(like='feature').values - .5)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 216, | |
"id": "2a247340", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"val corr trained on fake target: 0.013349771549412833\n" | |
] | |
} | |
], | |
"source": [ | |
"# Pearson correlation between predictions and the real validation target,\n", | |
"# computed over all validation rows pooled together\n", | |
"corr_fake = np.corrcoef(pred, val_data['target'])\n", | |
"c1 = corr_fake[0, 1]\n", | |
"print(f'val corr trained on fake target: {c1}')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 199, | |
"id": "8803f5e6", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# baseline model trained on the real target; same hyperparameters as xgbr\n", | |
"xgbr2 = XGBRegressor(\n", | |
"    n_estimators=100,\n", | |
"    max_depth=5,\n", | |
"    learning_rate=0.01,\n", | |
"    colsample_bytree=0.1,\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 200, | |
"id": "7845e9e7", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n", | |
" colsample_bynode=1, colsample_bytree=0.1, gamma=0, gpu_id=-1,\n", | |
" importance_type='gain', interaction_constraints='',\n", | |
" learning_rate=0.01, max_delta_step=0, max_depth=5,\n", | |
" min_child_weight=1, missing=nan, monotone_constraints='()',\n", | |
" n_estimators=100, n_jobs=32, num_parallel_tree=1, random_state=0,\n", | |
" reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,\n", | |
" tree_method='exact', validate_parameters=1, verbosity=None)" | |
] | |
}, | |
"execution_count": 200, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# train on the same shifted features, this time against the real target\n", | |
"xgbr2.fit(X=all_features, y=real_target)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 201, | |
"id": "afc525ce", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# shift validation features by -.5 exactly as the training features were shifted\n", | |
"# before fitting; otherwise the trees' split thresholds are applied to values\n", | |
"# on a different scale than the one they were learned on\n", | |
"pred2 = xgbr2.predict(val_data.filter(like='feature').values - .5)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 215, | |
"id": "147cbf25", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"val corr trained on real target: 0.010239539249306307\n" | |
] | |
} | |
], | |
"source": [ | |
"# Pearson correlation between predictions and the real validation target,\n", | |
"# computed over all validation rows pooled together\n", | |
"corr_real = np.corrcoef(pred2, val_data['target'])\n", | |
"c2 = corr_real[0, 1]\n", | |
"print(f'val corr trained on real target: {c2}')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "fdb6f431", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment