Skip to content

Instantly share code, notes, and snippets.

@albertotb
Forked from agramfort/bench_lasso_screening.py
Created December 11, 2015 23:04
Show Gist options
  • Save albertotb/00284b4e2f5dc67cd2dd to your computer and use it in GitHub Desktop.
Save albertotb/00284b4e2f5dc67cd2dd to your computer and use it in GitHub Desktop.
import time
import numpy as np
from scipy import linalg, io, sparse
import matplotlib.pyplot as plt
from sklearn.externals.joblib import Memory
from sklearn.linear_model import lasso_path
from sklearn.datasets.mldata import fetch_mldata
from sklearn import datasets
# Legend labels for the curves, looked up by the position of each entry in
# the ``screenings`` list used by the benchmark loop.
screening_methods = {
    0: "No screening",
    1: "GAP SAFE (sphere)",
}

# One marker per plotted curve, indexed the same way as ``screening_methods``.
markers = ['s', 'd', '^', 'v', '<', 'p', '>']

# The 'axes.color_cycle' rcParam was deprecated in matplotlib 1.5 and removed
# in later releases; 'axes.prop_cycle' is its replacement.  Fall back to the
# legacy key so the script still runs on old installations.
try:
    colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
except (KeyError, AttributeError):
    colors = plt.rcParams['axes.color_cycle']

plt.close('all')
import sys


def get_Xy_news():
    """Build a two-class text problem from the 20 newsgroups corpus.

    Fetches the 'comp.graphics' and 'talk.religion.misc' categories,
    vectorizes the documents with TF-IDF and maps the class labels to
    -1. / +1.

    Returns
    -------
    X : TF-IDF document-term matrix produced by ``TfidfVectorizer``.
    y : float array of labels, with class 0 recoded as -1.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    data = datasets.fetch_20newsgroups(categories=['comp.graphics',
                                                   'talk.religion.misc'])
    vect = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
    X = vect.fit_transform(data.data)
    # ``np.float`` was only an alias of the builtin and has been removed
    # from NumPy; the builtin ``float`` gives the same dtype.
    y = data.target.astype(float)
    y[y == 0] = -1.
    return X, y


# The dataset is chosen by the first command-line argument:
# 0 -> leukemia (default), 1 -> news, 2 -> rcv1.
dataset_id = 0
if len(sys.argv) > 1:
    dataset_id = int(sys.argv[1])

eps = 1e-2  # the smaller it is the longer is the path

if dataset_id == 0:
    dataset_name = 'leukemia'
    data = fetch_mldata(dataset_name)
    X = data.data
    y = data.target
    X = X.astype(float)
    y = y.astype(float)
    y /= linalg.norm(y)
elif dataset_id == 1:
    dataset_name = 'news'
    # Cache the fetched/vectorized data on disk in the current directory.
    mem = Memory('.')
    X, y = mem.cache(get_Xy_news)()
elif dataset_id == 2:
    dataset_name = 'rcv1'
    data = io.loadmat('rcv1_train.binary.mat')
    X = data['X'].astype(float)
    y = data['y'].astype(float).ravel()
    eps = 1e-2  # same as the default; per-dataset override point
else:
    # Fail here with a clear message instead of a NameError on ``X`` later.
    raise ValueError("Unknown dataset_id %d: expected 0 (leukemia), "
                     "1 (news) or 2 (rcv1)" % dataset_id)
# Only dense design matrices are rescaled; sparse ones are used as they are.
if not sparse.issparse(X):
    # Standardize data (easier to set the l1_ratio parameter)
    X /= np.sqrt((X ** 2).sum(axis=0))
    # Keep only the columns without any NaN (a zero-norm column becomes NaN
    # after the division above) -- unless that would leave no column at all.
    mask = ~np.isnan(X).any(axis=0)
    if mask.any():
        X = X[:, mask]

print(X.shape)
y /= linalg.norm(y)  # to correct for tol scaling
# Compute paths
# For every tolerance 10**-tol and every screening setting, time a full
# lasso path and plot the duality gap reached along the path.
tols = range(2, 11, 2)
# NOTE(review): 10 presumably enables GAP SAFE screening in the patched
# ``lasso_path`` (see ``screening_methods``) -- confirm against that API.
screenings = [0, 10]
# times[i, j]: wall-clock seconds for screenings[i] at tolerance tols[j].
times = np.zeros((len(screenings), len(tols)))
plt.close('all')
for itol, tol in enumerate(tols):
    plt.figure()
    for iscreening, screening in enumerate(screenings):
        begin = time.time()
        print("Computing regularization path using the lasso "
              "with screening=%d..." % screening)
        alphas, coefs, gaps = lasso_path(X, y, eps=eps, n_alphas=100,
                                         precompute=False, tol=10 ** (-tol),
                                         verbose=0, max_iter=3000,
                                         screening=screening)
        duration = time.time() - begin
        print(duration)
        times[iscreening, itol] = duration

        # Display results
        # Clamp the gaps away from zero (and drop their sign) so that the
        # log10 below never produces -inf or NaN.
        gap_lasso = np.maximum(np.abs(gaps), 1e-15)
        plt.plot(-np.log10(alphas / np.max(alphas)), np.log10(gap_lasso.T),
                 label=screening_methods[iscreening],
                 marker=markers[iscreening])

    # Dashed line at the requested tolerance for this figure.
    plt.axhline(-tol, linestyle='--', color='k')
    plt.xlabel('-Log(alpha)')
    plt.ylabel('Log(gap)')
    plt.axis('tight')
    plt.legend(loc="lower right")
    # Adjust the layout *before* showing: with a blocking backend the figure
    # no longer exists once ``show()`` returns.
    plt.tight_layout()
    plt.show()
# Summary figure: grouped bar chart of the running times, one group per
# tolerance and one bar per screening setting.
import os

import pandas as pd

screenings_names = ['No screening', "GAP SAFE (sphere)"]
# ``times`` is (n_screenings, n_tols); transpose so that each screening
# setting becomes a column (one bar series) and each tolerance a row.
df = pd.DataFrame(times.T, columns=screenings_names)
fig, ax = plt.subplots(1, 1, figsize=(9, 6))
df.plot(kind='bar', ax=ax)
plt.xticks(range(len(tols)), [str(t) for t in tols])
plt.xlabel("-log10(duality gap)")
plt.ylabel("Time (s)")
plt.title("Lasso on %s" % dataset_name)
leg = plt.legend(frameon=True, loc='upper left')
leg.get_frame().set_alpha(0.5)
plt.tight_layout()

# Save before showing: with a blocking backend the figure is gone once
# ``show()`` returns, so saving afterwards writes an empty image.
if not os.path.isdir('img'):
    os.makedirs('img')
plt.savefig('img/lasso_bench_%s.png' % dataset_name)
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment