{
"cells": [
{
"cell_type": "code",
"execution_count": 168,
"id": "798d5d25",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from scipy.stats import rankdata\n",
"from sklearn.linear_model import Ridge\n",
"from sklearn.mixture import BayesianGaussianMixture\n",
"from xgboost import XGBRegressor"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f8637ccc",
"metadata": {},
"outputs": [],
"source": [
"training_data = pd.read_parquet(\"numerai_training_data.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 189,
"id": "c919eb30",
"metadata": {},
"outputs": [],
"source": [
"val_data = pd.read_parquet(\"numerai_validation_data.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c57156b7",
"metadata": {},
"outputs": [],
"source": [
"rlm = Ridge(fit_intercept=False)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "fadbc9ea",
"metadata": {},
"outputs": [],
"source": [
"era_list = training_data.era.unique()\n",
"coefs = []\n",
"for ii, era in enumerate(era_list):\n",
" # get features and target from data and center\n",
" features = training_data[training_data.era == era].filter(like='feature').values - .5\n",
" target = training_data[training_data.era == era]['target'] - .5\n",
" # fit ridge regression model for each era\n",
" rlm.fit(features, target)\n",
" coefs.append(rlm.coef_)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "c87696da",
"metadata": {},
"outputs": [],
"source": [
"\n",
"coefs = np.vstack(coefs)"
]
},
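{
"cell_type": "markdown",
"id": "coefplot-md",
"metadata": {},
"source": [
"An optional look at the stacked coefficients (a sketch added here, not part of the original gist): each row of `coefs` holds one era's ridge weights, so banding across rows hints at the regime structure the mixture model below tries to capture. It only uses the `matplotlib` import from the first cell."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "coefplot-code",
"metadata": {},
"outputs": [],
"source": [
"# sketch (not in the original gist): visualize the per-era ridge coefficients\n",
"# each row is one era, each column a feature; banding across rows hints at regimes\n",
"plt.figure(figsize=(12, 6))\n",
"plt.imshow(coefs, aspect='auto', cmap='RdBu_r')\n",
"plt.xlabel('feature')\n",
"plt.ylabel('era')\n",
"plt.colorbar(label='ridge coefficient')\n",
"plt.show()"
]
},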
{
"cell_type": "code",
"execution_count": 177,
"id": "04b505f4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initialization 0\n",
"Initialization converged: True\n"
]
}
],
"source": [
"# For generating lots of fake data, you could just run all of below in a loop\n",
"\n",
"# choose a random number of components for a Gaussian Mixture Model\n",
"n_components = np.random.choice(range(2,10))\n",
"\n",
"# fit a Gaussian Mixture Model of linear regression weights\n",
"gm = BayesianGaussianMixture(n_components=n_components, verbose=True)\n",
"gm.fit(coefs)\n",
"\n",
"# make probability of sampling each component equal to better balance rare regimes\n",
"gm.weights_[:] = 1/n_components"
]
},
{
"cell_type": "code",
"execution_count": 194,
"id": "239b279b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0573\r"
]
}
],
"source": [
"fake_target = []\n",
"real_target = []\n",
"all_features = []\n",
"bins = [0, 0.05, 0.25, 0.75, 0.95, 1]\n",
"\n",
"for era in era_list[np.random.choice(4)::4]:\n",
" print(era, end='\\r')\n",
" features = training_data[training_data.era == era].filter(like='feature').values - .5\n",
" real_target.append(training_data[training_data.era == era]['target'])\n",
" # sample a set of weights from GMM\n",
" beta, _ = gm.sample(1)\n",
" \n",
" # create fake continuous target\n",
" fake_targ = features @ beta[0]\n",
" \n",
" # bin fake target like real target\n",
" fake_targ = (rankdata(fake_targ) - .5)/len(fake_targ)\n",
" fake_targ = (np.digitize(fake_targ, bins) - 1)/4\n",
" \n",
" fake_target.append(fake_targ)\n",
" all_features.append(features)\n",
" \n",
"all_features = np.concatenate(all_features)\n",
"fake_target = np.concatenate(fake_target)\n",
"real_target = np.concatenate(real_target)\n"
]
},
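{
"cell_type": "markdown",
"id": "fakeloop-md",
"metadata": {},
"source": [
"The comment in the mixture-model cell above suggests generating lots of fake data by looping. Below is a minimal sketch of that loop (an assumption about how it could be organized, not code from the original gist): each run re-fits a `BayesianGaussianMixture` with a fresh random component count and produces one `(features, fake_target)` pair. `n_fake_datasets` is an illustrative name and value."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fakeloop-code",
"metadata": {},
"outputs": [],
"source": [
"# sketch (assumption, not in the original gist): repeat the GMM fit and the\n",
"# sampling loop above to generate many (features, fake_target) training sets\n",
"n_fake_datasets = 10  # illustrative value\n",
"\n",
"fake_datasets = []\n",
"for run in range(n_fake_datasets):\n",
"    # re-fit a mixture with a fresh random number of components\n",
"    n_components = np.random.choice(range(2, 10))\n",
"    gm_run = BayesianGaussianMixture(n_components=n_components)\n",
"    gm_run.fit(coefs)\n",
"    gm_run.weights_[:] = 1/n_components\n",
"\n",
"    run_features, run_target = [], []\n",
"    for era in era_list[np.random.choice(4)::4]:\n",
"        feats = training_data[training_data.era == era].filter(like='feature').values - .5\n",
"        beta, _ = gm_run.sample(1)\n",
"        targ = feats @ beta[0]\n",
"        targ = (rankdata(targ) - .5)/len(targ)\n",
"        targ = (np.digitize(targ, bins) - 1)/4\n",
"        run_features.append(feats)\n",
"        run_target.append(targ)\n",
"\n",
"    fake_datasets.append((np.concatenate(run_features), np.concatenate(run_target)))"
]
},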
{
"cell_type": "code",
"execution_count": 195,
"id": "573e329c",
"metadata": {},
"outputs": [],
"source": [
"xgbr = XGBRegressor(colsample_bytree=0.1, learning_rate=0.01, max_depth=5, n_estimators=100)"
]
},
{
"cell_type": "code",
"execution_count": 196,
"id": "659e0652",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
" colsample_bynode=1, colsample_bytree=0.1, gamma=0, gpu_id=-1,\n",
" importance_type='gain', interaction_constraints='',\n",
" learning_rate=0.01, max_delta_step=0, max_depth=5,\n",
" min_child_weight=1, missing=nan, monotone_constraints='()',\n",
" n_estimators=100, n_jobs=32, num_parallel_tree=1, random_state=0,\n",
" reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,\n",
" tree_method='exact', validate_parameters=1, verbosity=None)"
]
},
"execution_count": 196,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"xgbr.fit(all_features, fake_target)"
]
},
{
"cell_type": "code",
"execution_count": 197,
"id": "f35a1f3d",
"metadata": {},
"outputs": [],
"source": [
"pred = xgbr.predict(val_data.filter(like='feature').values)"
]
},
{
"cell_type": "code",
"execution_count": 216,
"id": "2a247340",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"val corr trained on fake target: 0.013349771549412833\n"
]
}
],
"source": [
"c1 = np.corrcoef(pred, val_data['target'])[0,1]\n",
"print(f'val corr trained on fake target: {c1}')"
]
},
{
"cell_type": "code",
"execution_count": 199,
"id": "8803f5e6",
"metadata": {},
"outputs": [],
"source": [
"xgbr2 = XGBRegressor(colsample_bytree=0.1, learning_rate=0.01, max_depth=5, n_estimators=100)"
]
},
{
"cell_type": "code",
"execution_count": 200,
"id": "7845e9e7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
" colsample_bynode=1, colsample_bytree=0.1, gamma=0, gpu_id=-1,\n",
" importance_type='gain', interaction_constraints='',\n",
" learning_rate=0.01, max_delta_step=0, max_depth=5,\n",
" min_child_weight=1, missing=nan, monotone_constraints='()',\n",
" n_estimators=100, n_jobs=32, num_parallel_tree=1, random_state=0,\n",
" reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,\n",
" tree_method='exact', validate_parameters=1, verbosity=None)"
]
},
"execution_count": 200,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"xgbr2.fit(all_features, real_target)"
]
},
{
"cell_type": "code",
"execution_count": 201,
"id": "afc525ce",
"metadata": {},
"outputs": [],
"source": [
"pred2 = xgbr2.predict(val_data.filter(like='feature').values)"
]
},
{
"cell_type": "code",
"execution_count": 215,
"id": "147cbf25",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"val corr trained on real target: 0.010239539249306307\n"
]
}
],
"source": [
"c2 = np.corrcoef(pred2, val_data['target'])[0,1]\n",
"print(f'val corr trained on real target: {c2}')"
]
},
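{
"cell_type": "markdown",
"id": "peraera-md",
"metadata": {},
"source": [
"As a follow-up check (a sketch, not part of the original gist), the pooled correlations above can also be broken out per validation era, which is closer to how Numerai scores predictions; it reuses `pred`, `pred2`, and `val_data` from the cells above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "peraera-code",
"metadata": {},
"outputs": [],
"source": [
"# sketch (not in the original gist): per-era validation correlation for both models\n",
"val_eras = val_data.era.values\n",
"per_era_fake, per_era_real = [], []\n",
"for era in val_data.era.unique():\n",
"    mask = val_eras == era\n",
"    targ = val_data['target'].values[mask]\n",
"    per_era_fake.append(np.corrcoef(pred[mask], targ)[0, 1])\n",
"    per_era_real.append(np.corrcoef(pred2[mask], targ)[0, 1])\n",
"print(f'mean per-era val corr, fake target: {np.mean(per_era_fake)}')\n",
"print(f'mean per-era val corr, real target: {np.mean(per_era_real)}')"
]
},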
{
"cell_type": "code",
"execution_count": null,
"id": "fdb6f431",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}