Skip to content

Instantly share code, notes, and snippets.

@jnothman
Created March 22, 2017 11:33
Show Gist options
  • Save jnothman/7af35286c61daac570a307ea0568eb5a to your computer and use it in GitHub Desktop.
Save jnothman/7af35286c61daac570a307ea0568eb5a to your computer and use it in GitHub Desktop.
example/test for scikit-learn#7602
"""
============================
Classifier Chain
============================
An ensemble of 10 logistic regression classifier chains trained on a
multi-label dataset achieves a higher Jaccard similarity score than a set
of independently trained logistic regression models.
"""
import numpy as np
from sklearn.multioutput import ClassifierChain
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import jaccard_similarity_score
from sklearn.linear_model import LogisticRegression
from scipy.sparse import coo_matrix
import arff # pypi:liac-arff
def load_bibtex(path):
    """Load a sparse ARFF file and split it into features and labels.

    Attributes whose name starts with ``TAG_`` are treated as label
    columns; every other attribute is treated as a feature column.

    Parameters
    ----------
    path : str
        Location of the ARFF file to read.

    Returns
    -------
    X : scipy.sparse matrix
        The non-``TAG_`` columns, limited to the first 10000 rows.
    Y : numpy.ndarray
        Dense array of the ``TAG_`` columns, limited to the same rows.
    """
    # Use a context manager so the file handle is closed as soon as the
    # ARFF content has been parsed.
    with open(path) as arff_file:
        bibtex = arff.load(arff_file,
                           return_type=arff.COO, encode_nominal=True)
    data, row, col = bibtex['data']
    data, row, col = np.array(data), np.array(row), np.array(col)
    # ``data`` holds one entry per stored value, so its length is not the
    # number of rows. Derive the row count from the largest row index
    # instead; otherwise the matrix is padded with empty rows (or
    # construction fails when there are fewer stored values than rows).
    # NOTE(review): rows without any stored value at the very end of the
    # file cannot be recovered from the COO triple alone.
    n_rows = int(row.max()) + 1 if row.size else 0
    M = coo_matrix((data, (row, col)),
                   shape=(n_rows, len(bibtex['attributes']))).tocsc()
    # Explicit bool dtype keeps ``~y_mask`` valid even for an empty list.
    y_mask = np.array([attr.startswith('TAG_')
                       for attr, _ in bibtex['attributes']], dtype=bool)
    # ``toarray()`` replaces the deprecated ``.A`` shorthand.
    Y = M[:, y_mask][:10000].toarray()
    X = M[:, ~y_mask][:10000]
    print(X.shape, Y.shape)
    return X, Y
# Read the training and test splits from their ARFF files.
X_train, Y_train = load_bibtex('/Users/joel/Downloads/bibtex-train.arff')
X_test, Y_test = load_bibtex('/Users/joel/Downloads/bibtex-test.arff')

# Baseline: wrap logistic regression in OneVsRestClassifier so that one
# independent model is fitted per class, then score its test predictions.
ovr = OneVsRestClassifier(LogisticRegression())
ovr.fit(X_train, Y_train)
Y_pred_ovr = ovr.predict(X_test)
ovr_score = jaccard_similarity_score(Y_test, Y_pred_ovr)
print("Independent models Jaccard similarity score:",
      ovr_score)
# Build 10 logistic regression classifier chains (each created with
# order='random'), fit every chain, and average their predictions.
chains = [ClassifierChain(LogisticRegression(), cv=3, order='random')
          for _ in range(10)]

Y_preds = []
scores = []
for chain in chains:
    chain.fit(X_train, Y_train)
    Y_pred = chain.predict(X_test)
    chain_score = jaccard_similarity_score(Y_test, Y_pred)
    scores.append(chain_score)
    # Report each chain's individual score as soon as it is computed.
    print(chain_score)
    Y_preds.append(Y_pred)

# The ensemble predicts a label wherever the mean prediction across the
# chains is at least 0.5.
Y_pred_ensemble = np.mean(Y_preds, axis=0)
ensemble_score = jaccard_similarity_score(Y_test, Y_pred_ensemble >= 0.5)
print("Classifier chain ensemble Jaccard similarity score:",
      ensemble_score)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment