@fwhigh
Last active July 10, 2017 15:19
Debiasing regularized regression
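Fits a cross-validated elastic net, an ordinary least-squares model, and a debiased elastic net (an OLS refit of the targets on the elastic net predictions) on svmlight-format regression data, reporting test MSE and saving prediction-vs-truth plots. Includes a run script for several LIBSVM regression datasets and a script that simulates sparse regression data.
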
# debiased_reg.py (invoked by the run script below)
import argparse
import os.path
from svmlight_loader import load_svmlight_files, load_svmlight_file  # third-party loader; sklearn.datasets provides equivalents
import numpy as np
from scipy.sparse import csr_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import ElasticNetCV, LinearRegression
from sklearn.dummy import DummyRegressor
import joblib  # was `from sklearn.externals import joblib`, removed in newer scikit-learn
from sklearn.model_selection import train_test_split
import logging
# global stuff
logger = logging.getLogger()
handler = logging.StreamHandler()
formatter = logging.Formatter(
    '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
)
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)
sns.set(style="ticks")
def mse(a, b):
    return np.mean((a - b)**2)
def print_element(name, obj):
    if hasattr(obj, name):
        logger.info("{} {}".format(name, getattr(obj, name)))
def regression_report(pred, y, model):
    logger.info("MSE {}".format(mse(pred, y)))
    print_element('alpha_', model)
    print_element('l1_ratio_', model)
    print_element('intercept_', model)
    # print_element('coef_', model)
def io_plot(x=None, y=None, data=None, filename=None, plot_range=None):
    """Scatter predictions against the truth, with the identity line for reference."""
    fig = plt.figure()
    ax = fig.add_subplot(111, aspect='equal')
    sns.regplot(x=x, y=y, data=data, ax=ax)
    plt.ylim(plot_range[0], plot_range[1])
    plt.xlim(plot_range[0], plot_range[1])
    plt.plot(plot_range, plot_range, 'k')
    plt.xlabel(r'$\hat y_{\mathrm{test}}$')
    plt.ylabel(r'$y_{\mathrm{test}}$')
    plt.tight_layout()
    plt.savefig(filename, bbox_inches='tight')
    plt.clf()
    plt.close('all')
def coef_hist(x=None, filename=None):
    """Histogram of the nonzero fitted coefficients."""
    fig = plt.figure()
    sns.distplot(x[x != 0], kde=False, axlabel=r'$\hat w_i \neq 0$')
    plt.tight_layout()
    plt.savefig(filename, bbox_inches='tight')
    plt.clf()
    plt.close('all')
parser = argparse.ArgumentParser(
    description='Debiased regularized regression',
)
parser.add_argument(
    '--experiment_name',
    help="Experiment name",
    type=str,
    required=True)
parser.add_argument(
    '--train_data_filename', type=str,
    required=True)
parser.add_argument(
    '--test_data_filename', type=str,
    default=None)
parser.add_argument(
    '--data_dir', type=str,
    default='data')
args = parser.parse_args()
reg_model_filename = os.path.join(
    args.data_dir,
    "{}_elasticnet_regression_model.pickle".format(args.experiment_name)
)
ord_model_filename = os.path.join(
    args.data_dir,
    "{}_ordinary_regression_model.pickle".format(args.experiment_name)
)
# Here we go
logger.info("Exeriment: {}".format(args.experiment_name))
logger.info("Loading data")
if args.test_data_filename is not None:
    X_train, y_train, X_test, y_test = load_svmlight_files(
        (os.path.join(args.data_dir, args.train_data_filename),
         os.path.join(args.data_dir, args.test_data_filename))
    )
else:
    X, y = load_svmlight_file(
        os.path.join(args.data_dir, args.train_data_filename)
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=1/float(3), random_state=42
    )
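# Baselines: dummy regressors that always predict the training mean or median,
# giving reference MSEs for the models below.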
logger.info("Fitting mean dummy regression model")
dummy_model = DummyRegressor(strategy='mean').fit(X_train,y_train)
regression_report(dummy_model.predict(X_test), y_test, dummy_model)
logger.info("Fitting median dummy regression model")
dummy_model = DummyRegressor(strategy='median').fit(X_train,y_train)
regression_report(dummy_model.predict(X_test), y_test, dummy_model)
if not os.path.isfile(reg_model_filename):
logger.info("Fitting regularized regression model")
reg_model = ElasticNetCV(
cv=10,
l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
n_alphas=10,
n_jobs=-1).fit(X_train,y_train)
joblib.dump(reg_model, reg_model_filename)
else:
logger.info("Loading regularized regression model")
reg_model = joblib.load(reg_model_filename)
logger.info("Testing regularized regression model")
pred_reg_test = reg_model.predict(X_test)
df = pd.DataFrame({'pred_reg_test':pred_reg_test, 'y_test':y_test})
regression_report(df['pred_reg_test'], df['y_test'], reg_model)
plot_range = np.array([df['y_test'].min(),df['y_test'].max()]) + [-1,1]
logger.info("Plotting regularized regression result")
io_plot(
    x='pred_reg_test', y='y_test', data=df,
    filename=os.path.join(args.data_dir, "{}_elasticnet_regression_result.png".format(args.experiment_name)),
    plot_range=plot_range
)
coef_hist(
    x=reg_model.coef_,
    filename=os.path.join(args.data_dir, "{}_elasticnet_regression_coef_hist.png".format(args.experiment_name))
)
if not os.path.isfile(ord_model_filename):
logger.info("Fitting ordinary regression model")
ord_model = LinearRegression(n_jobs=-1).fit(X_train,y_train)
joblib.dump(ord_model, ord_model_filename)
else:
logger.info("Loading ordinary regression model")
ord_model = joblib.load(ord_model_filename)
logger.info("Testing ordinary regression model")
df['pred_ord_test'] = ord_model.predict(X_test)
regression_report(df['pred_ord_test'], df['y_test'], ord_model)
logger.info("Plotting ordinary regression result")
io_plot(
    x='pred_ord_test', y='y_test', data=df,
    filename=os.path.join(args.data_dir, "{}_ordinary_regression_result.png".format(args.experiment_name)),
    plot_range=plot_range
)
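# Debias the regularized model by refitting OLS of y_train on the ElasticNet
# predictions: L1/L2 shrinkage tends to pull predictions toward the mean, and
# this one-dimensional refit corrects the overall slope and intercept.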
logger.info("Debiasing regularized regression model")
pred_reg_debias = reg_model.predict(X_train)
debias_model = LinearRegression(n_jobs=-1).fit(pred_reg_debias[np.newaxis].T,y_train)
logger.info("Testing debiased regularized regression model")
pred_reg_test = reg_model.predict(X_test)
df['pred_debiased_reg_test'] = debias_model.predict(pred_reg_test[np.newaxis].T)
regression_report(df['pred_debiased_reg_test'], df['y_test'], debias_model)
logger.info("Plotting debiased regularized regression result")
io_plot(
    x='pred_debiased_reg_test', y='y_test', data=df,
    filename=os.path.join(args.data_dir, "{}_debiased_reg_regression_result.png".format(args.experiment_name)),
    plot_range=plot_range
)
#!/usr/bin/env bash
# Fetch the LIBSVM regression datasets into data/ (matching debiased_reg.py's
# default --data_dir) and run debiased_reg.py on each experiment.
mkdir -p data

if [ ! -f data/simulated_data_train ]; then
    python simulate_regression_data.py
fi
python ./debiased_reg.py --experiment_name simulated_data --train_data_filename simulated_data_train --test_data_filename simulated_data_test

if [ ! -f data/cpusmall ]; then
    wget -P data http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/cpusmall
fi
python ./debiased_reg.py --experiment_name cpusmall --train_data_filename <(awk '$1>10' data/cpusmall)

if [ ! -f data/triazines ]; then
    wget -P data http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/triazines
fi
python ./debiased_reg.py --experiment_name triazines --train_data_filename triazines

if [ ! -f data/eunite2001 ]; then
    wget -P data http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/eunite2001
    wget -P data http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/eunite2001.t
fi
python ./debiased_reg.py --experiment_name eunite2001 --train_data_filename eunite2001 --test_data_filename eunite2001.t

if [ ! -f data/YearPredictionMSD ]; then
    wget -P data http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/YearPredictionMSD.bz2
    wget -P data http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/YearPredictionMSD.t.bz2
    bunzip2 data/YearPredictionMSD.bz2
    bunzip2 data/YearPredictionMSD.t.bz2
fi
python ./debiased_reg.py --experiment_name YearPredictionMSD --train_data_filename YearPredictionMSD --test_data_filename YearPredictionMSD.t

if [ ! -f data/E2006.train ]; then
    wget -P data http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2
    wget -P data http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.test.bz2
    bunzip2 data/E2006.train.bz2
    bunzip2 data/E2006.test.bz2
fi
python ./debiased_reg.py --experiment_name E2006 --train_data_filename E2006.train --test_data_filename E2006.test
# simulate_regression_data.py (invoked by the run script above)
import os.path
import numpy as np
from scipy.sparse import csr_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import dump_svmlight_file
sns.set(style="ticks")
data_dir="data"
N=int(1e3)
D=int(1e4)
sparsity=1e-2
frac_predictive=1e2/float(D)
var=1e-1
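# Generative model: each feature is N(0,1) but nonzero only with probability
# `sparsity`; each of the D features independently gets a true weight of 1 with
# probability `frac_predictive` (0 otherwise), and y = w0 + x.w + Gaussian noise
# with variance `var`.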
def simulate_data(n_data, data_dim):
    return np.random.normal(size=(n_data, data_dim)) * np.random.binomial(1, sparsity, size=(n_data, data_dim))
w0, w = 1.0, np.random.binomial(1, frac_predictive, size=D)
x_train = simulate_data(N, D)
x_test = simulate_data(N, D)
epsilon_train = np.random.normal(size=N, scale=np.sqrt(var))
epsilon_test = np.random.normal(size=N, scale=np.sqrt(var))
f_of_x_train = w0 + np.dot(x_train,w.T)
f_of_x_test = w0 + np.dot(x_test,w.T)
y_train = f_of_x_train + epsilon_train
y_test = f_of_x_test + epsilon_test
dump_svmlight_file(x_train, y_train, os.path.join(data_dir,"simulated_data_train"))
dump_svmlight_file(x_test, y_test, os.path.join(data_dir,"simulated_data_test"))
df = pd.DataFrame({'f_of_x_test': f_of_x_test, 'y_test': y_test})
plot_range = np.array([df['y_test'].min(),df['y_test'].max()]) + [-1,1]
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
sns.regplot(x='f_of_x_test', y='y_test', data=df, ax=ax)
plt.ylim(plot_range[0], plot_range[1])
plt.xlim(plot_range[0], plot_range[1])
plt.plot(plot_range, plot_range, 'k')
plt.xlabel(r'$f(x_{\mathrm{test}})$')
plt.ylabel(r'$y_{\mathrm{test}}$')
plt.tight_layout()
plt.savefig(os.path.join(data_dir, "simulated_data.png"), bbox_inches='tight')
plt.clf()