#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
This file defines model factories. It is quite messy.
"""
import logging

import numpy as np
import sklearn
import sklearn.metrics
from scipy.stats import spearmanr
from sklearn.base import BaseEstimator
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.svm import SVR

# Keras 1.x imports used by SentenceCNN
from keras import backend as K
from keras import optimizers
from keras.callbacks import EarlyStopping
from keras.layers import Convolution1D, Dense, Embedding, Lambda
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.regularizers import activity_l2, l2

from sklearn_utils.metrics import wac_score
from src.experiments.transformers import SentenceEmbedder, DoubleListEmbedder, DoubleListDotProduct

logger = logging.getLogger(__name__)

# make_scorer returns a fresh object each call (scorers lack __eq__), so the
# class_weight checks below compare with `is` against this shared instance.
WAC_SCORER = sklearn.metrics.make_scorer(wac_score)
class SentenceCNN(BaseEstimator):
    """1D convolutional sentence classifier (Keras 1.x API) wrapped as an sklearn estimator."""

    def __init__(self, embedding, filter_length=3, nb_filter=500, hidden_dims=500,
                 nb_epoch=10, batch_size=32,
                 dense_W_l2=0.0, dense_activity_l2=0.01, dropout=0.2, optimizer="adam",
                 lr=0.001,
                 maxlen=60,
                 verbose=0, validation_split=0.):
        self.embedding = embedding
        self.filter_length = filter_length
        self.nb_filter = nb_filter
        self.hidden_dims = hidden_dims
        self.nb_epoch = nb_epoch
        self.batch_size = batch_size
        self.dense_W_l2 = dense_W_l2
        self.dense_activity_l2 = dense_activity_l2
        self.dropout = dropout
        self.optimizer = optimizer
        self.maxlen = maxlen
        self.lr = lr
        self.verbose = verbose
        self.validation_split = validation_split
    def _build(self):
        self.model = Sequential()
        # TODO: why is the embedding layer frozen (trainable=False) instead of fine-tuned?
        self.model.add(Embedding(
            dropout=self.dropout,
            trainable=False,
            mask_zero=False,
            output_dim=self.embedding_weights.shape[1],
            input_dim=self.embedding_weights.shape[0],
            weights=[self.embedding_weights]))
        self.model.add(Convolution1D(nb_filter=self.nb_filter,
                                     filter_length=self.filter_length,
                                     border_mode='valid',
                                     activation='relu',
                                     subsample_length=1))

        # Global max pooling over the time dimension
        def max_1d(X):
            return K.max(X, axis=1)

        self.model.add(Lambda(max_1d, output_shape=(self.nb_filter,)))
        if self.hidden_dims > 0:
            self.model.add(Dense(self.hidden_dims, activation="relu", W_regularizer=l2(self.dense_W_l2),
                                 activity_regularizer=activity_l2(self.dense_activity_l2)))
        self.model.add(Dense(1, activation="sigmoid"))
        self.model.compile(loss='binary_crossentropy',
                           optimizer=self._optimizer,
                           metrics=['accuracy'])
    def _initialize(self):
        if self.optimizer == "adam":
            self._optimizer = optimizers.Adam(lr=self.lr)
        else:
            # Pass the optimizer name through and let Keras instantiate it with defaults
            self._optimizer = self.optimizer
        words = self.embedding.vocabulary.words
        vectors = self.embedding.vectors
        # Reserve index 0 for padding/masking; words get indices 1..len(words)
        self.index_dict = {}
        for i, word in enumerate(words):
            self.index_dict[word] = i + 1
        word_vectors = {}
        for word, vector in zip(words, vectors):
            word_vectors[word] = vector
        vocab_dim = vectors.shape[1]
        n_symbols = len(self.index_dict) + 1  # adding 1 to account for 0th index (for masking)
        self.embedding_weights = np.zeros((n_symbols, vocab_dim))
        for word, index in self.index_dict.items():
            self.embedding_weights[index, :] = word_vectors[word]
        self._build()
    def _prepare_X(self, X):
        X_tr = X.copy()
        X_tr = X_tr.ravel()
        # Map each sentence (a list of words) to a list of vocabulary indices;
        # out-of-vocabulary words will raise a KeyError.
        for i, sentence in enumerate(X.ravel()):
            X_tr[i] = [self.index_dict[word] for word in sentence]
        X_tr = sequence.pad_sequences(X_tr, maxlen=self.maxlen)
        return X_tr

    def _prepare_y(self, y):
        if self._y_type == "-1,1":
            return (y + 1) / 2
        elif self._y_type == "0,1":
            return y
        else:
            raise RuntimeError("Unknown label encoding: " + str(self._y_type))

    def _transform_prediction(self, y_pred):
        if self._y_type == "-1,1":
            return 2 * (y_pred > 0.5).astype(np.int64).ravel() - 1
        elif self._y_type == "0,1":
            return (y_pred > 0.5).astype(np.int64).ravel()
        else:
            raise RuntimeError("Unknown label encoding: " + str(self._y_type))
    def fit(self, X, y):
        if set(y) == set([0, 1]):
            self._y_type = "0,1"
        elif set(y) == set([-1, 1]):
            self._y_type = "-1,1"
        else:
            raise RuntimeError("Labels must be encoded as {0, 1} or {-1, 1}")
        self._initialize()
        X_tr = self._prepare_X(X)
        y_tr = self._prepare_y(y)
        if self.validation_split != 0.0:
            callbacks = [EarlyStopping(monitor='val_loss', patience=1, verbose=0, mode='auto')]
        else:
            callbacks = []
        self.model.fit(X_tr, y_tr, batch_size=self.batch_size, nb_epoch=self.nb_epoch,
                       verbose=self.verbose, callbacks=callbacks,
                       validation_split=self.validation_split)
        return self

    def predict(self, X, y=None):
        X_tr = self._prepare_X(X)
        return self._transform_prediction(self.model.predict(X_tr))
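# Minimal SentenceCNN usage sketch. `E` is assumed to expose `.vocabulary.words`
# and `.vectors`, and `X` to be an object array of tokenized sentences whose
# words are all covered by `E` (data names here are hypothetical):
#
#   cnn = SentenceCNN(embedding=E, nb_epoch=5, validation_split=0.1)
#   cnn.fit(X_train, y_train)    # y_train in {0, 1} or {-1, 1}
#   y_pred = cnn.predict(X_test)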
class DummyModel(BaseEstimator):
    """Identity model: predictions are simply the raveled input features."""

    def __init__(self):
        pass

    def fit(self, X, y):
        return self

    def predict(self, X):
        return X.ravel()
class SKFGridSearchCV(GridSearchCV):
    # Passing explicit parameters, otherwise get_params() won't work
    def __init__(self, scoring, param_grid, estimator, skf_rng=777, cv=5, n_jobs=1):
        assert (type(cv) is int)
        assert (cv > 1)
        GridSearchCV.__init__(self, cv=cv, scoring=scoring, param_grid=param_grid,
                              estimator=estimator, n_jobs=n_jobs)
        self.skf_rng = skf_rng

    def fit(self, X, y):
        # Recover the fold count first: self.cv is overwritten with a
        # StratifiedKFold object here, which would otherwise break refitting.
        n_folds = self.cv if isinstance(self.cv, int) else self.cv.n_folds
        self.cv = StratifiedKFold(y, n_folds=n_folds, shuffle=True, random_state=self.skf_rng)
        return GridSearchCV.fit(self, X, y)
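# Quick self-check sketch for SKFGridSearchCV on synthetic data (sklearn
# 0.17-era API assumed; the dataset and grid are made up for illustration):
#
#   from sklearn.datasets import make_classification
#   X, y = make_classification(n_samples=200, random_state=0)
#   gcv = SKFGridSearchCV(scoring="accuracy",
#                         param_grid={"C": [0.1, 1.0, 10.0]},
#                         estimator=LogisticRegression(),
#                         cv=3)
#   gcv.fit(X, y)
#   print(gcv.best_params_, gcv.best_score_)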
class FallBackGridSearchCV(BaseEstimator):
    """
    Fits a GridSearchCV, but also cross-validates the supplied fall_back_estimator
    and picks it instead if it scores better.

    Notes
    -----
    For now it is assumed that fall_back_estimator is constant
    """

    def __init__(self, fall_back_estimator, gcv):
        self.gcv = None
        assert gcv.scoring is not None, "gcv has to define scoring explicitly"
        if isinstance(fall_back_estimator, GridSearchCV) or isinstance(fall_back_estimator, SKFGridSearchCV):
            if hasattr(gcv, "scoring") and not fall_back_estimator.scoring == gcv.scoring:
                raise RuntimeError("scorings must match")
            if hasattr(gcv, "cv") and not fall_back_estimator.cv == gcv.cv:
                raise RuntimeError("cv must match")
            self.fall_back_cv = fall_back_estimator
        else:
            assert isinstance(gcv, GridSearchCV) or isinstance(gcv, SKFGridSearchCV)
            self.fall_back_cv = sklearn.base.clone(gcv, safe=True)
            self.fall_back_cv.estimator = fall_back_estimator
            self.fall_back_cv.param_grid = {}  # Hack: it still has to have a parameter grid
        self.original_gcv = gcv
self.original_gcv = gcv | |
def fit(self, X, y): | |
self.original_gcv.fit(X, y) | |
self.fall_back_cv.fit(X, y) | |
if self.original_gcv.best_score_ >= self.fall_back_cv.best_score_: | |
# Score is always the larget the better | |
self.gcv = self.original_gcv | |
else: | |
self.gcv = self.fall_back_cv | |
return self | |
def predict(self, X): | |
return self.gcv.predict(X) | |
def predict_proba(self, X): | |
return self.gcv.predict_proba(X) | |
def __setattr__(self, k, v): | |
self.__dict__[k] = v | |
# Fun question: can you implement it any better? | |
def __getattr__(self, name): | |
try: | |
return self.__dict__[name] | |
except KeyError: | |
if name != "gcv" and self.gcv: | |
try: | |
return self.gcv.__dict__[name] | |
except KeyError: | |
raise AttributeError | |
else: | |
raise AttributeError | |
def lambda_fallback(fall_back_estimator, gcv):
    """Compose two factories into a factory of FallBackGridSearchCV models."""

    def model(E):
        return FallBackGridSearchCV(fall_back_estimator=fall_back_estimator(E), gcv=gcv(E))

    return model
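# Usage sketch for the fallback wrapper (factory names come from below, the
# data names are hypothetical):
#
#   factory = lambda_fallback(fall_back_estimator=OneWordLR(), gcv=OneWordSVMrbf())
#   model = factory(E)            # cross-validates both, keeps the better one
#   model.fit(X_train, y_train)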
###################
# One word models #
###################
def OneWordLR(scorer=WAC_SCORER):
    # `is`, not `==`: scorer objects lack __eq__, see WAC_SCORER above
    class_weight = "balanced" if scorer is WAC_SCORER else None

    def lambda_model(E):
        estimator = Pipeline([("emb", SentenceEmbedder(embedding=E, method="concat")),
                              ("lgr", LogisticRegression(class_weight=class_weight))])
        model_lr = SKFGridSearchCV(skf_rng=777,
                                   estimator=estimator,
                                   scoring=scorer,
                                   param_grid={"lgr__C": [10. ** i for i in range(-4, 5)]},
                                   cv=3,
                                   n_jobs=1)
        return model_lr

    return lambda_model
def OneWordRFRegularized3(scorer=WAC_SCORER):
    class_weight = "balanced" if scorer is WAC_SCORER else None

    def lambda_model(E):
        estimator = Pipeline([("emb", SentenceEmbedder(embedding=E, method="concat")),
                              ("rf", RandomForestClassifier(n_estimators=500, class_weight=class_weight))])
        model_RF = SKFGridSearchCV(skf_rng=777,
                                   estimator=estimator,
                                   scoring=scorer,
                                   param_grid={"rf__max_depth": [None, 10, 5, 3],
                                               "rf__min_samples_leaf": [1, 2, 5, 10],
                                               "rf__max_features": ["sqrt"]},
                                   cv=3,
                                   n_jobs=1)  # n_jobs=1, because joblib serializes to disk
        return model_RF

    return lambda_model


# Old results were produced by OneWordRFRegularized2()
def OneWordKnn(scorer=WAC_SCORER):
    def lambda_model(E):
        estimator = Pipeline([("emb", SentenceEmbedder(embedding=E, method="concat")),
                              ("scaler", StandardScaler()),
                              ("knn", KNeighborsClassifier(n_neighbors=1))])
        model = SKFGridSearchCV(skf_rng=777,
                                estimator=estimator,
                                scoring=scorer,
                                param_grid={"knn__n_neighbors": [2, 3, 5]},
                                cv=3,  # TODO: check if this is important for fit quality
                                n_jobs=1)  # TODO: I think n_jobs > 1 doesn't work correctly
        return model

    return lambda_model
def OneWordNB(scorer="accuracy"):
    assert scorer == "accuracy"

    def lambda_model(E):
        estimator = Pipeline([("emb", SentenceEmbedder(embedding=E, method="concat")),
                              ("scaler", StandardScaler()),
                              ("nb", GaussianNB())])
        return estimator

    return lambda_model
def OneWordSVMrbf(scorer=WAC_SCORER):
    class_weight = "balanced" if scorer is WAC_SCORER else None

    # 60 tasks
    def lambda_model(E):
        estimator = Pipeline([("emb", SentenceEmbedder(embedding=E, method="concat")),
                              ("scaler", StandardScaler()),
                              ("svm", SVC(kernel="rbf", gamma=0.01, max_iter=1e7, C=0.1, class_weight=class_weight))])
        model_svm = SKFGridSearchCV(skf_rng=777,
                                    estimator=estimator,
                                    scoring=scorer,
                                    param_grid={"svm__gamma": [10. ** i for i in range(-6, 0)],
                                                "svm__C": [10. ** i for i in range(-6, 6)]},
                                    cv=3,
                                    n_jobs=3)
        return model_svm

    return lambda_model
############################################
##                                        ##
##             SENTENCE MODELS            ##
##                                        ##
############################################
def SentimentCNN(mode="extended"):
    if mode == "vanilla":
        def lambda_model(E, maxlen=200):
            return SentenceCNN(embedding=E, maxlen=maxlen)
    elif mode == "extended":
        param_distributions = {
            "nb_filter": [50, 100],
            "filter_length": [3],
            "hidden_dims": [0, 10, 50],
            "nb_epoch": [30],  # This is a slight trick - essentially we don't want to use a validation holdout
            "dropout": [0., 0.2, 0.4],
            "optimizer": ["adam"],
            "validation_split": [0.1],
            "maxlen": [60],
            "lr": [0.001, 0.01]
        }
        n_iter = 25
        cv = 5
        scoring = "accuracy"

        def lambda_model(E):
            model = RandomizedSearchCV(estimator=SentenceCNN(embedding=E), param_distributions=param_distributions,
                                       n_iter=n_iter, random_state=777, n_jobs=20, cv=cv, scoring=scoring)
            return model
    else:
        raise ValueError("Unknown mode: " + str(mode))
    return lambda_model
def AvgSVMRbf2():
    def lambda_model(E):
        pipe = [("emb", SentenceEmbedder(embedding=E, on_missing="raise"))]
        pipe.append(("scaler", StandardScaler()))
        pipe.append(('svm', SVC(kernel="rbf", max_iter=5e7)))
        param_grid = {
            "svm__gamma": [10. ** i for i in range(-6, 0)],
            "svm__C": [10. ** i for i in range(-5, 5)]
        }
        return SKFGridSearchCV(estimator=Pipeline(pipe),
                               param_grid=param_grid,
                               n_jobs=3,
                               skf_rng=777,
                               scoring="accuracy")

    return lambda_model
def AvgLR():
    def lambda_model(E):
        pipe = [("emb", SentenceEmbedder(embedding=E, on_missing="raise"))]
        pipe.append(("scaler", StandardScaler()))
        pipe.append(("lgr", LogisticRegression()))
        estimator = Pipeline(pipe)
        model = SKFGridSearchCV(estimator=estimator,
                                scoring="accuracy",
                                param_grid={"lgr__C": [10. ** i for i in range(-4, 5)]},
                                skf_rng=777,
                                n_jobs=3)
        return model

    return lambda_model
def AvgGNB():
    def lambda_model(E):
        pipe = [("emb", SentenceEmbedder(embedding=E, on_missing="raise"))]
        pipe.append(("scaler", StandardScaler()))
        pipe.append(("nb", GaussianNB()))
        estimator = Pipeline(pipe)
        return estimator

    return lambda_model
############################################
##                                        ##
##            SIMILARITY MODELS           ##
##                                        ##
############################################
def SimilarityDotProduct():
    def lambda_model(E):
        return Pipeline([('emb', DoubleListEmbedder(E)),
                         ('dot', DoubleListDotProduct(method="single", pairs=[[0, 1]])),
                         ('identity', DummyModel())])

    return lambda_model
def SimilarityRidge(mode="vanilla"):
    if mode == "vanilla":
        alpha_grid = [10.0 ** n for n in range(-5, 6)]
        method_grid = ["diagonal", "double_diagonal", "quadruple_diagonal", "concat"]
        n_folds = 5
    elif mode == "linear":
        alpha_grid = [10.0 ** n for n in range(-5, 6)]
        method_grid = ["diagonal", "triple_diagonal", "concat"]
        n_folds = 5
    else:
        raise ValueError("Unknown mode: " + str(mode))

    def scorer(estimator, X, y):
        return spearmanr(estimator.predict(X), y).correlation

    def lambda_model(E):
        return GridSearchCV(
            estimator=Pipeline([('emb', DoubleListEmbedder(E)), ('dot', DoubleListDotProduct()), ('ridge', Ridge())]),
            param_grid={'ridge__alpha': alpha_grid, 'dot__method': method_grid},
            scoring=scorer,
            cv=n_folds)

    return lambda_model
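# Usage sketch for the similarity factories. Based on the DoubleListEmbedder
# naming, `X` is assumed to hold sentence pairs and `y` gold similarity
# scores; that input format is an assumption, not confirmed here:
#
#   model = SimilarityRidge(mode="vanilla")(E)
#   model.fit(X_pairs_train, y_sim_train)
#   rho = spearmanr(model.predict(X_pairs_test), y_sim_test).correlation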
def SimilarityRandomForestDiagonal(mode="vanilla"):
    if mode == "vanilla":
        n_estimators_grid = [50, 500]
        max_features_grid = [None]
        max_depth_grid = [None]
        n_folds = 5
    elif mode == "extended":
        n_estimators_grid = [50, 500]
        max_features_grid = ["sqrt", "log2", None]
        max_depth_grid = [3, 10, None]
        n_folds = 5
    else:
        raise ValueError("Unknown mode: " + str(mode))

    def scorer(estimator, X, y):
        return spearmanr(estimator.predict(X), y).correlation

    def lambda_model(E):
        estimator = Pipeline([('emb', DoubleListEmbedder(E)), ('dot', DoubleListDotProduct(method="diagonal")),
                              ('rfr', RandomForestRegressor())])
        param_grid = {
            'rfr__n_estimators': n_estimators_grid,
            'rfr__max_features': max_features_grid,
            'rfr__max_depth': max_depth_grid,
        }
        return GridSearchCV(estimator=estimator,
                            param_grid=param_grid,
                            scoring=scorer,
                            cv=n_folds)

    return lambda_model
def SimilaritySVR(mode="vanilla"):
    if mode == "vanilla":
        C_grid = [0.01, 0.1, 1.0, 3.0]
        gamma_grid = [0.01, 0.001, 0.0001]
        method_grid = ["diagonal", "double_diagonal", "quadruple_diagonal", "concat"]
        n_folds = 5
    else:
        raise ValueError("Unknown mode: " + str(mode))

    def lambda_model(E):
        estimator = Pipeline([
            ('emb', DoubleListEmbedder(E)),
            ('dot', DoubleListDotProduct()),
            ('scaler', StandardScaler()),
            ('svm', SVR(kernel="rbf", max_iter=1e7))])
        param_grid = {
            'dot__method': method_grid,
            'svm__C': C_grid,
            'svm__gamma': gamma_grid,
        }

        def scorer(estimator, X, y):
            return spearmanr(estimator.predict(X), y).correlation

        return GridSearchCV(estimator=estimator,
                            param_grid=param_grid,
                            cv=n_folds,
                            scoring=scorer)

    return lambda_model