Created July 17, 2014 14:55
Trying dropout with simple off-the-shelf scikit-learn models. Not really working.
from sklearn.datasets import fetch_20newsgroups, load_digits
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
import numpy as np
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics

newsgroups_train = fetch_20newsgroups(subset='train')
vectorizer = TfidfVectorizer(encoding='latin-1', max_features=10000)
vectors = vectorizer.fit_transform(newsgroups_train.data)
dense_vectors = vectors.todense()
dense_vectors = np.asarray(dense_vectors)
newsgroups_test = fetch_20newsgroups(subset='test')
vectors_test = vectorizer.transform(newsgroups_test.data)

digits = load_digits()
d_train_x, d_test_x, d_train_y, d_test_y = train_test_split(digits.data,
                                                             digits.target,
                                                             test_size=0.2)

DO_ALL = True
N_TIMES = 20  # number of dropped-out copies of the training set
DROPOUT_RATE = 0.5  # TODO explore 0.0->0.5
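# An untested sketch for the TODO above (dropout_mask is a hypothetical
# helper name, not used elsewhere in this script): draw the Bernoulli
# keep-mask for a given rate, so the experiment below could be rerun for
# several rates.
#def dropout_mask(shape, rate):
#    # keep each entry with probability 1 - rate
#    return np.random.binomial(n=1, p=1.-rate, size=shape)
#for rate in (0.0, 0.1, 0.2, 0.3, 0.4, 0.5):
#    dropped = d_train_x * dropout_mask(d_train_x.shape, rate)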
#class Dropout(object):
#    def __init__(self, p=0.5):
#        self.p = p
#
#    def fit(self, X, y):
#        return self
#
#    def transform(self, X):
#        return np.random.binomial(n=1, p=1.-self.p, size=X.shape) * X
#
#    def get_params(self, **kwargs):
#        return {"p": self.p}
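# A rough, untested sketch of how a transformer like the commented-out one
# above could be plugged into a scikit-learn Pipeline; `pipe` is a
# hypothetical name. Caveat: as written, the transformer would also mask
# features at predict time, which is usually not what you want for dropout.
#from sklearn.pipeline import Pipeline
#pipe = Pipeline([('dropout', Dropout(p=DROPOUT_RATE)),
#                 ('clf', LogisticRegression(C=1.E6))])
#pipe.fit(dense_vectors, newsgroups_train.target)
#print metrics.f1_score(pipe.predict(np.asarray(vectors_test.todense())),
#                       newsgroups_test.target)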
for dname, x_train, y_train, x_test, y_test in (('digits', d_train_x,
        d_train_y, d_test_x, d_test_y), ('20newsgroups', dense_vectors,
        newsgroups_train.target, vectors_test, newsgroups_test.target)):
    classifiers = [LogisticRegression(), SGDClassifier()]
    # default penalty for LogisticRegression and SGDClassifier is L2,
    # and dropout approximates an L2-type penalty
    if dname == '20newsgroups':
        classifiers += [MultinomialNB(alpha=0.01), BernoulliNB(alpha=0.01)]
    print "==> dataset name:", dname
    print "-> without dropout"
    if DO_ALL:
        for clf in classifiers:
            print clf
            clf.fit(x_train, y_train)
            pred = clf.predict(x_test)
            print metrics.f1_score(pred, y_test)
    # build N_TIMES independently dropped-out copies of the training matrix;
    # note this always uses the 20newsgroups dense_vectors, whichever dataset
    # the loop is currently on
    tmp_l = [dense_vectors * np.random.binomial(n=1, p=1.-DROPOUT_RATE,
                                                size=dense_vectors.shape)
             for _ in xrange(N_TIMES)]
    X = np.concatenate(tmp_l, axis=0)
    y = np.concatenate([newsgroups_train.target for _ in xrange(N_TIMES)],
                       axis=0)
    print "-> now with", N_TIMES, "dropouts, with rate", DROPOUT_RATE
    classifiers = [LogisticRegression(C=1.E6), SGDClassifier(alpha=1.E-9)]
    # default penalty for LogisticRegression and SGDClassifier is L2, and
    # dropout approximates an L2-type penalty ==> we try to remove the L2
    # (hence the huge C / tiny alpha)
    if dname == '20newsgroups':
        classifiers += [MultinomialNB(alpha=0.01), BernoulliNB(alpha=0.01)]
    for clf in classifiers:
        print clf
        # NOTE: this still fits on the original x_train/y_train; the
        # dropout-augmented X, y built above are never used here (see the
        # commented sketch after this loop)
        clf.fit(x_train, y_train)
        pred = clf.predict(x_test)
        print metrics.f1_score(pred, y_test)
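    # A rough, untested sketch (not what produced the log below) of actually
    # fitting on dropout-augmented data built from the current dataset's
    # x_train / y_train; X_aug and y_aug are hypothetical names:
    #X_aug = np.concatenate([x_train * np.random.binomial(n=1,
    #                            p=1.-DROPOUT_RATE, size=x_train.shape)
    #                        for _ in xrange(N_TIMES)], axis=0)
    #y_aug = np.concatenate([y_train for _ in xrange(N_TIMES)], axis=0)
    #for clf in (LogisticRegression(C=1.E6), SGDClassifier(alpha=1.E-9)):
    #    clf.fit(X_aug, y_aug)
    #    print metrics.f1_score(clf.predict(x_test), y_test)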
# Three things to keep in mind:
# - dropout is usually done on the hidden-unit activations, not on the input
# - dropout is good when models are strongly overfitting
# - dropout is good with lots of data
# ==> dataset name: digits
# -> without dropout
# LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
#     intercept_scaling=1, penalty=l2, random_state=None, tol=0.0001)
# 0.96389747273
# SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
#     fit_intercept=True, l1_ratio=0.15, learning_rate=optimal,
#     loss=hinge, n_iter=5, n_jobs=1, penalty=l2, power_t=0.5,
#     random_state=None, rho=None, shuffle=False, verbose=0,
#     warm_start=False)
# 0.93351689353
# -> now with 20 dropouts, with rate 0.5
# LogisticRegression(C=1000000.0, class_weight=None, dual=False,
#     fit_intercept=True, intercept_scaling=1, penalty=l2,
#     random_state=None, tol=0.0001)
# 0.935424946443
# SGDClassifier(alpha=1e-09, class_weight=None, epsilon=0.1, eta0=0.0,
#     fit_intercept=True, l1_ratio=0.15, learning_rate=optimal,
#     loss=hinge, n_iter=5, n_jobs=1, penalty=l2, power_t=0.5,
#     random_state=None, rho=None, shuffle=False, verbose=0,
#     warm_start=False)
# 0.94890380291
# ==> dataset name: 20newsgroups
# -> without dropout
# LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
#     intercept_scaling=1, penalty=l2, random_state=None, tol=0.0001)
# 0.810228116561
# SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
#     fit_intercept=True, l1_ratio=0.15, learning_rate=optimal,
#     loss=hinge, n_iter=5, n_jobs=1, penalty=l2, power_t=0.5,
#     random_state=None, rho=None, shuffle=False, verbose=0,
#     warm_start=False)
# 0.813840047475
# MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
# 0.806747433797
# BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True)
# 0.71331798034
# -> now with 20 dropouts, with rate 0.5
# LogisticRegression(C=1000000.0, class_weight=None, dual=False,
#     fit_intercept=True, intercept_scaling=1, penalty=l2,
#     random_state=None, tol=0.0001)
# 0.813512528347
# SGDClassifier(alpha=1e-09, class_weight=None, epsilon=0.1, eta0=0.0,
#     fit_intercept=True, l1_ratio=0.15, learning_rate=optimal,
#     loss=hinge, n_iter=5, n_jobs=1, penalty=l2, power_t=0.5,
#     random_state=None, rho=None, shuffle=False, verbose=0,
#     warm_start=False)
# 0.764192166602
# MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
# 0.806747433797
# BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True)
# 0.71331798034