Quick and dirty binary matrix factorization via alternating logistic regression
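The model behind the code below: each observed entry is treated as Bernoulli with P(Y_ij = 1) = sigmoid(w_i · v_j), where w_i is the embedding of row i and v_j the embedding of column j. Holding V fixed, each row of W is the coefficient vector of an L2-regularized logistic regression of that row's observed entries on V (and symmetrically for V with W fixed), which is the alternating scheme implemented here; missing entries (NaN) are simply dropped from each regression.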
import numpy as np
from sklearn.linear_model import LogisticRegression
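
# The two functions below are referenced by binary_mf but were not included in this
# snippet; these are minimal nan-aware sketches of what they are assumed to compute.
def ilogit(x):
    # Inverse logit (sigmoid): map real-valued scores to probabilities in (0, 1)
    return 1. / (1. + np.exp(-x))

def cross_entropy(y, mu, eps=1e-10):
    # Mean binary cross-entropy between labels y and probabilities mu,
    # ignoring missing (NaN) entries in y
    mu = np.clip(mu, eps, 1 - eps)
    return -np.nanmean(y * np.log(mu) + (1 - y) * np.log(1 - mu))
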
def binary_mf(Y, nembeds, lam=None, lams=30, cv=5, max_steps=30, tol=1e-4, verbose=False):
    # Convert the number of lambdas to a log-space grid
    if lam is None and isinstance(lams, int):
        lams = np.exp(np.linspace(np.log(1e-2), np.log(1), lams))
    # If no lambda is provided, select it via cross-validation
    if lam is None:
        from sklearn.model_selection import KFold
        # Cross-validation setup over the observed (non-missing) entries
        cv_scores = np.zeros((len(lams), cv))
        indices = np.array([[i,j] for i,j in np.ndindex(Y.shape) if not np.isnan(Y[i,j])])
        kf = KFold(n_splits=cv, shuffle=True)
        for cv_idx, (train_index, test_index) in enumerate(kf.split(indices)):
            if verbose:
                print('Fold {}/{}'.format(cv_idx+1, cv))
            W_prev, V_prev = None, None
            for lam_idx, cur_lam in enumerate(lams):
                # Get the training data by masking out the held-out entries
                Y_train = np.copy(Y)
                for i,j in indices[test_index]:
                    Y_train[i,j] = np.nan
                # Fit the model
                W, V = binary_mf(Y_train, nembeds, lam=cur_lam, verbose=verbose>1)
                # Evaluate on the held-out indices
                Mu = ilogit(W.dot(V.T))
                Y_test = np.array([Y[i,j] for i,j in indices[test_index]])
                Mu_test = np.array([Mu[i,j] for i,j in indices[test_index]])
                cv_scores[lam_idx, cv_idx] = cross_entropy(Y_test, Mu_test)
                if verbose:
                    print('\tLam {}/{} ({:.4f}) loss: {:.6f}'.format(lam_idx+1, len(lams), cur_lam, cv_scores[lam_idx, cv_idx]))
                W_prev, V_prev = W, V
        # Select the lambda with the lowest mean held-out loss, refit on all the data, and return
        best_lam = lams[np.argmin(cv_scores.mean(axis=1))]
        if verbose:
            print('Best lam: {:.6f}'.format(best_lam))
        return binary_mf(Y, nembeds, lam=best_lam, verbose=verbose)
    # Initialize the embeddings
    W = np.random.normal(0, 1/np.sqrt(nembeds), size=(Y.shape[0], nembeds))
    V = np.random.normal(0, 1/np.sqrt(nembeds), size=(Y.shape[1], nembeds))
    # Setup the logistic regression optimizer (sklearn's C is an inverse regularization strength)
    clf = LogisticRegression(C=lam, fit_intercept=False, solver='lbfgs')
    # Measure the initial reconstruction loss
    prev_loss = cross_entropy(Y, ilogit(W.dot(V.T)))
    missing = np.isnan(Y)
    for step in range(max_steps):
        if verbose:
            print('Step {}/{}'.format(step+1, max_steps))
        # W-step: refit each row embedding with V held fixed
        for i in range(Y.shape[0]):
            clf.fit(V[~missing[i]], Y[i,~missing[i]])
            W[i] = clf.coef_[0]
        # V-step: refit each column embedding with W held fixed
        for i in range(Y.shape[1]):
            clf.fit(W[~missing[:,i]], Y[~missing[:,i],i])
            V[i] = clf.coef_[0]
        # Track progress and stop early once the improvement falls below tolerance
        loss = cross_entropy(Y, ilogit(W.dot(V.T)))
        if verbose:
            print('Loss: {:.6f}'.format(loss))
        if prev_loss - loss < tol:
            break
        prev_loss = loss
    return W, V
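
A minimal usage sketch, assuming the helper definitions above; the matrix size, missingness rate, and nembeds value are purely illustrative:

if __name__ == '__main__':
    np.random.seed(0)
    # Simulate a 50x40 binary matrix from a rank-3 model
    W_true = np.random.normal(size=(50, 3))
    V_true = np.random.normal(size=(40, 3))
    Y = (np.random.uniform(size=(50, 40)) < ilogit(W_true.dot(V_true.T))).astype(float)
    # Hide roughly 20% of the entries as missing
    Y[np.random.uniform(size=Y.shape) < 0.2] = np.nan
    # Factorize with a fixed regularization strength (pass lam=None to select it by CV instead)
    W, V = binary_mf(Y, nembeds=3, lam=1., verbose=True)
    print('Final reconstruction loss: {:.6f}'.format(cross_entropy(Y, ilogit(W.dot(V.T)))))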