""" Poisson-loss Factorization Machines with Numba | |
Follows the vanilla FM model from: | |
Steffen Rendle (2012): Factorization Machines with libFM. | |
In: ACM Trans. Intell. Syst. Technol., 3(3), May. | |
http://doi.acm.org/10.1145/2168752.2168771 | |
See also: https://github.com/coreylynch/pyFM | |
""" | |
# Author: Vlad Niculae <[email protected]> | |
# License: Simplified BSD | |
from __future__ import print_function

import numpy as np
from numba import jit
from scipy import sparse as sp
from sklearn.utils import check_random_state, check_array, check_X_y
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
@jit("f8(f8[:], i4[:], f8[:], f8[:], f8[:,:], f8[:])", nopython=True) | |
def _fm_predict(row_data, row_indices, scaling, w, V, Vx): | |
"""Factorization machine forward pass""" | |
pred = 0 | |
for i in range(row_indices.shape[0]): | |
pred += w[row_indices[i]] * row_data[i] | |
pred *= scaling[1] # pred = wscale * dot(x, w/wscale) | |
pred += scaling[0] # add bias | |
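    # The second-order term below uses the identity from Rendle (2012):
    #   sum_{i<j} <v_i, v_j> x_i x_j
    #     = 0.5 * sum_d [(sum_i V[i, d] x_i)^2 - sum_i V[i, d]^2 x_i^2]
    # so it costs O(nnz(x) * n_latent) rather than forming all pairwise
    # interactions explicitly.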
    second_order = 0
    for d in range(V.shape[1]):
        Vx[d] = 0
        for i in range(row_indices.shape[0]):
            v_id = V[row_indices[i], d]
            Vx[d] += v_id * row_data[i]
            second_order -= (v_id ** 2) * (row_data[i] ** 2)
        second_order += Vx[d] ** 2
        Vx[d] *= scaling[2]  # Vx = Vscale * dot(V/Vscale, x)
    second_order *= 0.5 * scaling[2] ** 2
    # second_order = 1/2 Vscale^2 * [dot(V/Vscale, x)^2 - (V/Vscale)^2 * x^2]
    pred += second_order
    return pred
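# Illustrative sketch (not part of the original gist): a naive dense reference
# for the forward pass, handy for sanity-checking _fm_predict when
# scaling == [bias, 1, 1]. The name `_fm_predict_dense_ref` is hypothetical.
def _fm_predict_dense_ref(x, bias, w, V):
    """Dense NumPy reference for the FM forward pass (sanity check only)."""
    Vx = np.dot(V.T, x)  # shape (n_latent,)
    second_order = 0.5 * (np.dot(Vx, Vx) - np.sum((V ** 2).T * (x ** 2)))
    return bias + np.dot(w, x) + second_order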
@jit("f8[:](f8[:], i4[:], i4[:], f8[:], f8[:], f8[:,:], i4)") | |
def _fm_predict_batch_fast(X_data, X_indices, X_indptr, scaling, w, V, | |
n_samples): | |
res = np.zeros(n_samples, dtype=np.double) | |
for i in range(n_samples): | |
for k in range(X_indptr[i], X_indptr[i + 1]): | |
res[i] += w[X_indices[k]] * X_data[k] | |
res[i] *= scaling[1] | |
res[i] += scaling[0] | |
second_order = 0 | |
for d in range(V.shape[1]): | |
vx = 0 | |
for k in range(X_indptr[i], X_indptr[i + 1]): | |
v_id = V[X_indices[k], d] | |
vx += v_id * X_data[k] | |
second_order -= (v_id ** 2) * (X_data[i] ** 2) | |
second_order += vx ** 2 | |
second_order *= 0.5 * scaling[2] ** 2 | |
res[i] += second_order | |
return res | |
def _fm_predict_batch(X, scaling, w, V):
    """Factorization machine prediction over a batch."""
    return _fm_predict_batch_fast(X.data, X.indices, X.indptr, scaling, w, V,
                                  X.shape[0])
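# Note: _fm_predict_batch returns the linear predictor z = log(expected
# count); apply np.exp to obtain predicted counts, as the verbose monitoring
# in factorization_machine_poisson does.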
@jit('void(f8[:], i4[:], f8, f8[:], f8[:], f8[:, :], f8, f8, f8)',
     nopython=True)
def _fm_poisson_step(data, indices, y, scaling, w, V, learning_rate, reg_w,
                     reg_V):
    Vx = np.empty(V.shape[1], dtype=np.double)  # byproduct of _fm_predict
    z = _fm_predict(data, indices, scaling, w, V, Vx)
    # loss: exp(z) - y * z + ln(y!), where ln(y!) = ln gamma(y + 1)
    dloss_dz = np.exp(z) - y
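    # d/dz [exp(z) - y * z] = exp(z) - y; the updates below multiply this
    # scalar by each parameter's gradient of z.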
    # update bias:
    scaling[0] -= learning_rate * dloss_dz
    # update w: gradw_i = x_i
    for k in range(indices.shape[0]):
        w[indices[k]] -= learning_rate * dloss_dz * data[k] / scaling[1]
    # update V: d indexes latent dims, i=indices[k] indexes nonzero features
    # gradv_id = x_i * sum(v_jd x_j) - v_id * x_i^2
    for d in range(V.shape[1]):
        for k in range(indices.shape[0]):
            grad_v = data[k] * Vx[d] / scaling[2]
            grad_v -= V[indices[k], d] * data[k] ** 2
            V[indices[k], d] -= learning_rate * dloss_dz * grad_v
    # L2-regularize without touching the data.
    # Inspired by the scikit-learn stochastic gradient descent approach:
    # do not scale to negative values if reg_* is too big; instead, set to 0.
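    # The effective parameters are scaling[1] * w and scaling[2] * V, so
    # shrinking the scale factors below regularizes every coefficient at O(1)
    # cost per step instead of touching all of w and V.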
    scaling[1] *= max(0, 1.0 - learning_rate * reg_w)
    scaling[2] *= max(0, 1.0 - learning_rate * reg_V)
@jit("void(f8[:], i4[:], i4[:], f8[:], f8[:], f8[:], f8[:, :], f8, f8, f8)", | |
nopython=True) | |
def _fm_poisson_epoch(X_data, X_indices, X_indptr, y, scaling, w, V, | |
learning_rate, reg_w, reg_V): | |
n_samples = y.shape[0] | |
for i in range(n_samples): | |
_fm_poisson_step(X_data[X_indptr[i]:X_indptr[i + 1]], | |
X_indices[X_indptr[i]:X_indptr[i + 1]], | |
y[i], scaling, w, V, learning_rate, reg_w, reg_V) | |
def factorization_machine_poisson(X, y, X_val=None, y_val=None, n_latent=3,
                                  reg_w=1., reg_V=1., init_std=0.001,
                                  warm_start=None, learning_rate=0.01,
                                  n_iter=10, verbose=False, random_state=None):
    """Learns a factorization machine model with Poisson loss by SGD.

    The response is modeled as:

        y = exp(intercept + <w, x> + sum_i sum_{j > i} <x_i * v_i, x_j * v_j>)

    where <., .> is the dot product,
    w is a vector of length n_features capturing first-degree effects
    and V is an array (n_features, n_latent) for second-degree interactions.

    The Poisson loss is appropriate for count and rate outputs.

    Parameters
    ----------
    X: array-like or sparse, shape: (n_samples, n_features)
        Training data points.
    y: array, shape: (n_samples,)
        Target labels.
    X_val: array-like or sparse, shape: (n_samples_val, n_features), optional
        Validation data points. Used only if verbose != False.
    y_val: array, shape: (n_samples_val,), optional
        Validation labels. Used only if verbose != False.
    n_latent: int,
        Latent dimension of the second-degree model. Default: 3
    reg_w: float,
        Regularization amount for first-degree weights. Default: 1
    reg_V: float,
        Regularization amount for second-degree weights. Default: 1
    init_std: float,
        Initial scale of the weight values. (Used only if warm_start is None.)
        Weights are initialized from a centered Gaussian with init_std scale.
        Default: 0.001
    warm_start: tuple (intercept, w, V) or None,
        Initial setting for the weights.
    learning_rate: float,
        Step size for stochastic gradient descent. Default: 0.01
    n_iter: int,
        Maximum number of passes over the training data. Default: 10
    verbose: int,
        If > 0, prints statistics every `verbose` iterations. Default: False
    random_state: int, RandomState or None,
        Seed for the random number generator.

    Returns
    -------
    scaling: array [intercept, w_scale, V_scale],
        Scales of the learned parameters.
    w: array, shape (n_features,)
        First-degree coefficients, divided by w_scale (see scaling above).
    V: array, shape (n_features, n_latent)
        Embedded second-degree interaction weights, divided by V_scale.
    """
    X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.double)
    y = y.astype(np.double)
    if not sp.issparse(X):  # force sparse
        X = sp.csr_matrix(X)
    if X_val is not None:
        X_val, y_val = check_X_y(X_val, y_val, accept_sparse='csr',
                                 dtype=np.double)
        if not sp.issparse(X_val):
            X_val = sp.csr_matrix(X_val)
    n_samples, n_features = X.shape
    rng = check_random_state(random_state)
    reg_w *= 0.5
    reg_V *= 0.5 * n_latent  # Unsure about the correct scaling

    # initialize
    if warm_start is not None:
        bias, w, V = warm_start
        scaling = np.array([bias, 1, 1], dtype=np.double)
    else:
        w = rng.randn(n_features)
        V = rng.randn(n_features, n_latent)
        scaling = np.array([0, init_std, init_std], dtype=np.double)
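    # With the lazy scaling, the effective initial weights are
    # scaling[1] * w and scaling[2] * V, i.e. drawn from roughly
    # Normal(0, init_std ** 2), as documented above.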
    for it in range(n_iter):
        if verbose and it % verbose == 0:
            pred_tr = _fm_predict_batch(X, scaling, w, V)
            pred_te = _fm_predict_batch(X_val, scaling, w, V)
            np.exp(pred_tr, out=pred_tr)
            np.exp(pred_te, out=pred_te)
            dloss = np.mean(y - pred_tr)
            print(
                "{:>3}: dloss={:<8.4f} MSE tr={:<4.2f}, valid={:4.2f}".format(
                    it,
                    dloss,
                    mean_squared_error(y, pred_tr),
                    mean_squared_error(y_val, pred_te)),
                end=" ")
        _fm_poisson_epoch(X.data, X.indices, X.indptr, y, scaling, w, V,
                          learning_rate, reg_w, reg_V)
        # rescale if needed
        if scaling[1] <= 1e-9:
            w *= scaling[1]
            scaling[1] = 1.0
        if scaling[2] <= 1e-9:
            V *= scaling[2]
            scaling[2] = 1.0
        if verbose and it % verbose == 0:
            print("w_0={:<6.2f} ||w||={:<6.2f} ||V||={:<6.2f}".format(
                scaling[0],
                np.linalg.norm(w * scaling[1]),
                np.linalg.norm(V.ravel() * scaling[2])))
    return scaling, w, V
def array_to_fm_format(X):
    """Converts a dense or sparse array X to factorization machine format.

    If x[i, j] is represented (if X is sparse) or not nan (if dense),
    the output will have a row:

        [one_hot(i), one_hot(j)] -> x[i, j]

    Parameters
    ----------
    X, array-like or sparse
        Input array

    Returns
    -------
    X_one_hot, sparse array, shape (n_x_entries, X.shape[0] + X.shape[1])
        Indices of non-empty values in X, in factorization machine format.
    y: array, shape (n_x_entries,)
        Non-empty values in X.
    """
    X = check_array(X, accept_sparse='coo', force_all_finite=False)
    n_rows, n_cols = X.shape
    encoder = OneHotEncoder(n_values=[n_rows, n_cols])
    if sp.issparse(X):
        y = X.data
        X_ix = np.column_stack([X.row, X.col])
    else:
        ix = np.isfinite(X)
        X_ix = np.column_stack(np.where(ix))
        y = X[ix].ravel()
    X_oh = encoder.fit_transform(X_ix)
    return X_oh, y
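# Illustrative example (not executed): for a dense 2 x 3 matrix with a single
# nan entry, array_to_fm_format returns X_one_hot with 5 rows of width
# 2 + 3 = 5 (a row indicator concatenated with a column indicator), and y
# holds the 5 observed values.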
if __name__ == '__main__':
    from scipy.stats.distributions import poisson
    from sklearn.cross_validation import train_test_split
    from time import time

    X_dense = poisson.rvs(mu=3, size=(500, 600), random_state=0)
    X, y = array_to_fm_format(X_dense)
    X, X_val, y, y_val = train_test_split(X, y, random_state=0)

    for reg in (0, 1e-5):
        print("Regularization: {}".format(reg))
        t0 = time()
        w_0, w, V = factorization_machine_poisson(
            X, y, X_val, y_val, n_latent=50,
            reg_w=reg, reg_V=reg,
            init_std=0.01, learning_rate=5e-3, n_iter=31, verbose=10,
            random_state=0)
        print("Done in {}".format(time() - t0))