Created
March 30, 2016 16:34
-
-
Save mikkelam/0c333310ee47c14dad4b8bd51589a8a5 to your computer and use it in GitHub Desktop.
Extends Vowpal Wabbit's scikit-learn interface to support ECT (error-correcting tournament) multiclass classification
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# pylint: disable=line-too-long, unused-argument, invalid-name, too-many-arguments, too-many-locals | |
""" | |
Utilities to support integration of Vowpal Wabbit and scikit-learn | |
""" | |
import numpy as np | |
import sklearn | |
from pyvw import vw | |
import re | |
from scipy.sparse import csr_matrix | |
from sklearn import metrics | |
from sklearn.base import BaseEstimator, RegressorMixin | |
from sklearn.cross_validation import train_test_split | |
from sklearn.ensemble import GradientBoostingClassifier | |
from sklearn.linear_model.base import LinearClassifierMixin, SparseCoefMixin | |
from sklearn.datasets.svmlight_format import dump_svmlight_file | |
from sklearn.metrics import accuracy_score | |
from sklearn.utils.validation import check_is_fitted | |
import StringIO | |
# Namespace written directly after the '|' in every example string built by tovw
# (empty string, i.e. no explicit namespace name).
DEFAULT_NS = ''
# Weight index that VW.get_intercept reads to obtain the model intercept.
CONSTANT_HASH = 116060
# Matches runs of '|', ':', space or newline; tovw replaces each run with '.'
# in string-valued feature arrays.
INVALID_CHARS = re.compile(r"[\|: \n]+")
class VW(BaseEstimator):
    """ Vowpal Wabbit Scikit-learn Base Estimator wrapper

    Attributes
    ----------
    params : {dict}
        dictionary of model parameter keys and values passed to pyvw.vw
    fit_ : {bool}
        this variable is only created after the model is fitted
    passes_ : {int}
        number of passes fit makes over the training data (default 1)
    convert_to_vw_ : {bool}
        whether fit/predict convert each row of X with tovw (default True)
    vw_ : {pyvw.vw or None}
        underlying model, created lazily by get_vw
    classes_ : {array}
        unique values of y seen by the last call to fit
    n_classes_ : {int}
        number of entries in classes_
    """

    params = dict()

    def __init__(self,
                 probabilities=None,
                 random_seed=None,
                 ring_size=None,
                 convert_to_vw=None,
                 bfgs=None,
                 mem=None,
                 learning_rate=None,
                 l=None,
                 power_t=None,
                 decay_learning_rate=None,
                 initial_t=None,
                 feature_mask=None,
                 initial_regressor=None,
                 i=None,
                 initial_weight=None,
                 random_weights=None,
                 input_feature_regularizer=None,
                 audit=None,
                 a=None,
                 progress=None,
                 P=None,
                 quiet=None,
                 no_stdin=None,
                 hash=None,
                 ignore=None,
                 keep=None,
                 redefine=None,
                 bit_precision=None,
                 b=None,
                 noconstant=None,
                 constant=None,
                 C=None,
                 ngram=None,
                 skips=None,
                 feature_limit=None,
                 affix=None,
                 spelling=None,
                 dictionary=None,
                 dictionary_path=None,
                 interactions=None,
                 permutations=None,
                 leave_duplicate_interactions=None,
                 quadratic=None,
                 q=None,
                 cubic=None,
                 testonly=None,
                 t=None,
                 min_prediction=None,
                 max_prediction=None,
                 sort_features=None,
                 loss_function=None,
                 link=None,
                 quantile_tau=None,
                 l1=None,
                 l2=None,
                 named_labels=None,
                 final_regressor=None,
                 f=None,
                 readable_model=None,
                 invert_hash=None,
                 passes=None,
                 save_resume=None,
                 output_feature_regularizer_binary=None,
                 output_feature_regularizer_text=None,
                 oaa=None,
                 ect=None):
        """ VW model constructor, exposing all supported parameters to keep sklearn happy

        Every argument left as None is omitted from ``params``; ``quiet`` defaults to True.
        ``passes`` and ``convert_to_vw`` are estimator settings and are stored on the
        instance (``passes_``, ``convert_to_vw_``) instead of being forwarded to pyvw.

        Parameters
        ----------
        probabilities
        random_seed (int): seed random number generator
        ring_size (int): size of example ring
        convert_to_vw (bool): flag to convert X input to vw format

        Update options
        bfgs
        mem
        learning_rate,l (float): Set learning rate
        power_t (float): t power value
        decay_learning_rate (float): Set Decay factor for learning_rate between passes
        initial_t (float): initial t value
        feature_mask (str): Use existing regressor to determine which parameters may be updated.
            If no initial_regressor given, also used for initial weights.

        Weight options
        initial_regressor,i (str): Initial regressor(s)
        initial_weight (float): Set all weights to an initial value of arg.
        random_weights (bool): make initial weights random
        input_feature_regularizer (str): Per feature regularization input file

        Diagnostic options
        audit,a (bool): print weights of features
        progress,P (str): Progress update frequency. int: additive, float: multiplicative
        quiet (bool): Don't output diagnostics and progress updates
        no_stdin

        Feature options
        hash (str): how to hash the features. Available options: strings, all
        ignore (str): ignore namespaces beginning with character <arg>
        keep (str): keep namespaces beginning with character <arg>
        redefine (str): Redefine namespaces beginning with characters of string S as namespace N. <arg> shall be in
            form 'N:=S' where := is operator. Empty N or S are treated as default namespace.
            Use ':' as a wildcard in S.
        bit_precision,b (int): number of bits in the feature table
        noconstant (bool): Don't add a constant feature
        constant,C (float): Set initial value of constant
        ngram (str): Generate N grams. To generate N grams for a single namespace 'foo', arg should be fN.
        skips (str): Generate skips in N grams. This in conjunction with the ngram tag can be used to generate
            generalized n-skip-k-gram. To generate n-skips for a single namespace 'foo', arg should be fN.
        feature_limit (str): limit to N features. To apply to a single namespace 'foo', arg should be fN
        affix (str): generate prefixes/suffixes of features; argument '+2a,-3b,+1' means generate 2-char prefixes for
            namespace a, 3-char suffixes for b and 1 char prefixes for default namespace
        spelling (str): compute spelling features for a given namespace (use '_' for default namespace)
        dictionary (str): read a dictionary for additional features (arg either 'x:file' or just 'file')
        dictionary_path (str): look in this directory for dictionaries; defaults to current directory or env{PATH}
        interactions (str): Create feature interactions of any level between namespaces.
        permutations (bool): Use permutations instead of combinations for feature interactions of same namespace.
        leave_duplicate_interactions (bool): Don't remove interactions with duplicate combinations of namespaces. For
            ex. this is a duplicate: '-q ab -q ba' and a lot more in '-q ::'.
        quadratic,q (str): Create and use quadratic features, q:: corresponds to a wildcard for all printable characters
        cubic (str): Create and use cubic features

        Example options
        testonly,t (bool): Ignore label information and just test
        min_prediction (float): Smallest prediction to output
        max_prediction (float): Largest prediction to output
        sort_features (bool): turn this on to disregard order in which features have been defined. This will lead to
            smaller cache sizes
        loss_function (str): Specify the loss function to be used, uses squared by default.
            Currently available ones are squared, classic, hinge, logistic and quantile.
        link (str): apply a link function to convert output: e.g. 'logistic'
        quantile_tau (float): Parameter \\tau associated with Quantile loss. Defaults to 0.5
        l1 (float): l_1 lambda
        l2 (float): l_2 lambda
        named_labels (str): use names for labels (multiclass, etc.) rather than integers, argument specified all
            possible labels, comma-sep, eg "--named_labels Noun,Verb,Adj,Punc"

        Output model
        final_regressor,f (str): Final regressor
        readable_model (str): Output human-readable final regressor with numeric features
        invert_hash (str): Output human-readable final regressor with feature names. Computationally expensive.
        passes (int): Number of training passes
        save_resume (bool): save extra state so learning can be resumed later with new data
        output_feature_regularizer_binary (str): Per feature regularization output file
        output_feature_regularizer_text (str): Per feature regularization output file, in text

        Multiclass options
        oaa (int): Use one-against-all multiclass learning with labels
        ect (int): Use error-correcting-tournament multiclass learning with labels
        """
        # Snapshot the arguments first: any local created before this call
        # would otherwise leak into the parameter dictionary.
        args = dict(locals())

        # clear estimator attributes (set_params re-runs __init__ on a live instance)
        for stale in ('fit_', 'passes_', 'convert_to_vw_', 'vw_'):
            if hasattr(self, stale):
                delattr(self, stale)

        # reset params and quiet models by default
        self.params = {'quiet': True}

        # assign all valid args to params dict
        for k, v in args.items():
            if k != 'self' and v is not None:
                self.params[k] = v

        # store passes separately to be used in fit
        self.passes_ = self.params.pop('passes', 1)

        # pull out convert_to_vw from params
        self.convert_to_vw_ = self.params.pop('convert_to_vw', True)

        self.vw_ = None

        super(VW, self).__init__()

    def get_vw(self):
        """
        Factory to create a vw instance on demand

        Returns
        -------
        pyvw.vw instance, built from ``self.params`` on first use and cached in ``self.vw_``
        """
        if self.vw_ is None:
            self.vw_ = vw(**self.params)
        return self.vw_

    def fit(self, X, y=None, sample_weight=None):
        """ Fit the model according to the given training data

        TODO: for first pass create and store example objects.
              for N-1 passes use example objects directly (simulate cache file...but in memory for faster processing)

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features or 1 if not convert_to_vw) or
            Training vector, where n_samples in the number of samples and
            n_features is the number of features.
            if not using convert_to_vw, X is expected to be a list of vw formatted feature vector strings with labels
        y : array-like, shape (n_samples,), optional if not convert_to_vw
            Target vector relative to X.
        sample_weight : array-like, shape (n_samples,)
            sample weight vector relative to X.

        Returns
        -------
        return self so pipeline can call transform() after fit
        """
        self.classes_ = np.unique(y)
        self.n_classes_ = int(len(self.classes_))

        # add examples to model
        for _ in range(self.passes_):
            for idx, x in enumerate(X):
                if self.convert_to_vw_:
                    # tovw receives a single row here and indexes its weights
                    # from 0, so hand it only this row's weight; passing the
                    # full vector would give every example sample_weight[0].
                    row_weight = None if sample_weight is None else [sample_weight[idx]]
                    x = tovw(x=x, y=y[idx], sample_weight=row_weight)[0]
                self.get_vw().learn(x)
        self.fit_ = True
        return self

    def transform(self, X, y=None):
        """ Transform does nothing by default besides closing the model. Transform is required for any estimator
         in a sklearn pipeline that isn't the final estimator

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features or 1 if not convert_to_vw) or
            Training vector, where n_samples in the number of samples and
            n_features is the number of features.
            if not using convert_to_vw, X is expected to be a list of vw formatted feature vector strings with labels
        y : array-like, shape (n_samples,), optional if not convert_to_vw
            Target vector relative to X.

        Returns
        -------
        return X to be passed into next estimator in pipeline
        """
        if not self.get_vw().finished:
            self.get_vw().finish()
        return X

    def predict(self, X):
        """ Predict with Vowpal Wabbit model

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features or 1)
            Training vector, where n_samples in the number of samples and
            n_features is the number of features.
            if not using convert_to_vw, X is expected to be a list of vw formatted feature vector strings with labels

        Returns
        -------
        y : array-like, shape (n_samples,)
            Output vector relative to X. When 'oaa' or 'ect' is set, the multiclass
            prediction minus one (undoing the +1 label shift applied by tovw);
            otherwise the simple-label prediction.
        """
        check_is_fitted(self, 'fit_')

        try:
            num_samples = X.shape[0] if X.ndim > 1 else len(X)
        except AttributeError:
            num_samples = len(X)

        # both multiclass reductions report their prediction the same way
        multiclass = 'oaa' in self.params or 'ect' in self.params

        # add test examples to model
        y = np.empty([num_samples])
        for idx, x in enumerate(X):
            if self.convert_to_vw_:
                x = tovw(x)[0]
            ex = self.get_vw().example(x)
            # need to set test bit to skip learning
            ex.set_test_only(True)
            ex.learn()
            if multiclass:
                y[idx] = ex.get_multiclass_prediction() - 1
            else:
                y[idx] = ex.get_simplelabel_prediction()
            ex.finish()

        # IMPORTANT (flagged by the original author): the model is deliberately
        # not finished here, i.e. self.get_vw().finish() stays disabled.
        # NOTE(review): presumably so the model remains usable afterwards - confirm.
        return y

    def score(self, X, y=None):
        """ Return the accuracy of the model's predictions on the given data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Input data, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples]
            True targets relative to X.

        Returns
        -------
        score : float
            sklearn.metrics.accuracy_score of y against self.predict(X)
        """
        return metrics.accuracy_score(y, self.predict(X))

    def __str__(self):
        # always return a string; returning None from __str__ raises TypeError
        return str(self.params)

    def __repr__(self):
        return self.__str__()

    def get_params(self, deep=True):
        """ Return the vw parameters plus the estimator-level 'passes' and 'convert_to_vw' settings

        Parameters
        ----------
        deep : {bool}
            accepted for scikit-learn compatibility; not used

        Returns
        -------
        {dict} parameter names mapped to their current values
        """
        out = dict()
        # add in the vw params
        out.update(self.params)
        # add in the estimator params
        out['passes'] = self.passes_
        out['convert_to_vw'] = self.convert_to_vw_
        return out

    def set_params(self, **params):
        """ This destroys and recreates the Vowpal Wabbit model with updated parameters
        any parameters not provided will remain as they were initialized to at construction

        Parameters
        ----------
        params : {dict}
            dictionary of model parameter keys and values to update

        Returns
        -------
        self
        """
        self.params.update(params)

        # manage passes and convert_to_vw params different because they are estimator params, not vw params
        if 'passes' not in params:
            self.params['passes'] = self.passes_
        if 'convert_to_vw' not in params:
            self.params['convert_to_vw'] = self.convert_to_vw_

        self.__init__(**self.params)
        return self

    def get_coefs(self):
        """ Returns coefficient weights as ordered sparse matrix

        Returns
        -------
        {sparse matrix} coefficient weights for model
        """
        model = self.get_vw()
        return csr_matrix([model.get_weight(i) for i in range(model.num_weights())])

    def get_intercept(self):
        """ Returns intercept weight for model

        Returns
        -------
        {int} intercept value, 0 if noconstant
        """
        return self.get_vw().get_weight(CONSTANT_HASH)
class ThresholdingLinearClassifierMixin(LinearClassifierMixin):
    """
    Mixin for linear classifiers.  A threshold is used to specify the positive
    class cutoff

    Handles prediction for sparse and dense X.

    Attributes
    ----------
    pos_threshold : {float}
        scores greater than or equal to this value select ``classes_[1]``
        in the one-dimensional (binary) case; defaults to 0.0
    """

    def __init__(self, **params):
        # assume 0 as positive score threshold; the key is removed so the
        # remaining params can be forwarded unchanged to the next __init__
        self.pos_threshold = params.pop('pos_threshold', 0.0)

        super(ThresholdingLinearClassifierMixin, self).__init__(**params)

    def predict(self, X):
        """Predict class labels for samples in X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Samples.

        Returns
        -------
        C : array, shape = [n_samples]
            Predicted class label per sample.
        """
        scores = self.decision_function(X)
        if scores.ndim == 1:
            # builtin int replaces the np.int alias, which newer NumPy
            # releases no longer provide; the resulting dtype is the same
            indices = (scores >= self.pos_threshold).astype(int)
        else:
            indices = scores.argmax(axis=1)
        return self.classes_[indices]
# class VWClassifier(SparseCoefMixin, LinearClassifierMixin, VW): | |
class VWClassifier(SparseCoefMixin, ThresholdingLinearClassifierMixin, VW):
    """ Vowpal Wabbit Classifier model
    Only supports binary classification currently.
    note - don't try to apply link='logistic' on top of the existing functionality
    """

    # default binary labels; indexed by the 0/1 result of thresholding in predict
    classes_ = np.array([-1., 1.])

    def __init__(self, **params):
        """ Build the classifier, defaulting ``loss_function`` to 'logistic' when not supplied

        Parameters
        ----------
        params : {dict}
            keyword arguments forwarded up the constructor chain
        """
        # assume logistic loss functions
        if 'loss_function' not in params:
            params['loss_function'] = 'logistic'

        super(VWClassifier, self).__init__(**params)

    def decision_function(self, X):
        """Predict confidence scores for samples.
        The confidence score for a sample is the signed distance of that
        sample to the hyperplane.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = (n_samples, n_features)
            Samples.

        Returns
        -------
        array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)
            Confidence scores per (sample, class) combination. In the binary
            case, confidence score for self.classes_[1] where >0 means this
            class would be predicted.
        """
        # call VW.predict explicitly: self.predict resolves to the thresholding
        # mixin, which itself calls decision_function
        return VW.predict(self, X=X)

    def __del__(self):
        # The VW base class in this file defines no __del__, so calling
        # VW.__del__ unconditionally raised AttributeError during garbage
        # collection. Delegate only when a parent finalizer actually exists.
        parent_del = getattr(super(VWClassifier, self), '__del__', None)
        if parent_del is not None:
            parent_del()
class VWRegressor(VW, RegressorMixin):
    """ Vowpal Wabbit Regressor model

    Combines the VW estimator with scikit-learn's RegressorMixin and adds no
    members of its own.  VW comes first in the base list, so any method that
    both bases define (e.g. ``score``) resolves to VW's implementation.
    """
def tovw(x, y=None, sample_weight=None):
    """ Convert array or sparse matrix to Vowpal Wabbit format

    Each output line has the form ``<label> <weight> |<namespace> <features>``.
    Labels are written as ``y + 1`` (or ``1`` when y is omitted) and weights
    default to ``1``.

    NOTE(review): the +1 label shift is applied unconditionally; it matches the
    -1 that VW.predict applies to multiclass predictions, but it also shifts
    regression / binary targets - confirm this is intended.

    Parameters
    ----------
    x : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.  A 1-d input is treated as a
        single sample.
    y : {array-like}, shape (n_samples,), optional
        Target vector relative to X.  A scalar is treated as a single target.
    sample_weight : {array-like}, shape (n_samples,), optional
        sample weight vector relative to X.  A scalar is treated as a single weight.

    Returns
    -------
    out : {array-like}, shape (n_samples, 1)
        Training vectors in VW string format
    """
    use_truth = y is not None
    use_weight = sample_weight is not None

    # convert to numpy array if needed
    if not isinstance(x, (np.ndarray, csr_matrix)):
        x = np.array(x)
    if not isinstance(y, np.ndarray):
        y = np.array(y)

    # make sure this is a 2d array
    if x.ndim == 1:
        x = x.reshape(1, -1)
    if y.ndim == 0:
        y = y.reshape(1)
    if use_weight:
        # accept a scalar weight the same way a scalar y is accepted
        sample_weight = np.atleast_1d(sample_weight)

    n_rows, n_cols = x.shape

    # check for invalid characters if array has string values
    if x.dtype.char == 'S':
        # iterate over index ranges; the shape entries themselves are ints
        # and cannot be looped over directly
        for row in range(n_rows):
            for col in range(n_cols):
                x[row, col] = INVALID_CHARS.sub('.', x[row, col])

    # convert input to svmlight format (dummy zero labels, replaced below)
    buf = StringIO.StringIO()
    try:
        dump_svmlight_file(x, np.zeros(n_rows), buf)
        # drop the empty string that follows the final newline
        lines = buf.getvalue().split('\n')[:-1]
    finally:
        buf.close()

    # parse entries to construct VW format
    out = []
    for idx, line in enumerate(lines):
        truth = y[idx] + 1 if use_truth else 1
        weight = sample_weight[idx] if use_weight else 1
        # everything after the first space is the feature list; this also
        # yields '' for a line that carries only the dummy label
        features = line.partition(' ')[2]
        # only using a single namespace and no tags
        out.append('{y} {w} |{ns} {x}'.format(y=truth, w=weight, ns=DEFAULT_NS, x=features))

    return out
# Demo: train an error-correcting-tournament model on the iris data set and
# print its accuracy on a held-out third of the samples.
data = sklearn.datasets.load_iris()
features, labels = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.33, random_state=42)

# one label per iris class: targets are 0-based, so the count is max + 1
n_labels = labels.max() + 1
vw_model = VW(ect=n_labels, passes=10, b=22)
vw_model.fit(X_train, y_train)

predictions = vw_model.predict(X_test)
print(accuracy_score(y_test, predictions))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment