Created
October 17, 2019 09:44
-
-
Save eldrin/1ea9ce0ea3397ea1dea8abaf39b99e0d to your computer and use it in GitHub Desktop.
Quick sanity check for a WRMF baseline on a given sparse interaction matrix, evaluated with a ranking metric (NDCG)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from functools import partial | |
import numpy as np | |
import numba as nb | |
from scipy import sparse as sp | |
from implicit.als import AlternatingLeastSquares | |
from skopt import gp_minimize | |
from skopt.space import Real, Integer | |
from skopt.utils import use_named_args | |
import fire | |
@nb.jit
def ndcg(actual, predicted, k=10):
    """ Normalized discounted cumulative gain for binary relevance.

    Inputs:
        actual (ndarray): ground-truth item ids (unordered; binary relevance)
        predicted (ndarray): ranked list of recommended item ids
        k (int): evaluation cutoff; only the top-k predictions are scored

    Returns:
        float: NDCG@k in [0, 1]; 0. when `actual` or `predicted` is empty
    """
    if len(actual) == 0:
        return 0.
    if len(predicted) > k:
        predicted = predicted[:k]
    dcg = 0.
    idcg = 0.
    for i, p in enumerate(predicted):
        # credit a hit only the first time an item appears in the list
        if np.any(actual == p) and np.all(predicted[:i] != p):
            dcg += 1. / np.log2(i + 2.)
        # the ideal ranking puts one relevant item at every position
        # until the ground truth is exhausted
        if i < len(actual):
            idcg += 1. / np.log2(i + 2.)
    if idcg == 0.:
        # BUG FIX: empty `predicted` used to divide by zero (NaN)
        return 0.
    return dcg / idcg
def load_data(fn, delim=',', shape=None):
    """ Load an interaction matrix from disk and convert it to CSR.

    Inputs:
        fn (str): path to either a scipy `.npz` sparse-matrix dump or a
                  `.txt` / `.csv` triplet file (row, col, value per line)
        delim (str): field delimiter for the triplet file
        shape (tuple or None): optional (n_rows, n_cols) of the output

    Returns:
        scipy.sparse.csr_matrix: the loaded matrix

    Raises:
        ValueError: if the file extension is not supported
    """
    if fn.endswith('.npz'):
        return sp.load_npz(fn).tocsr()
    if fn.endswith('.txt') or fn.endswith('.csv'):
        # read file
        with open(fn, 'r') as f:
            lines = [line.strip('\n').split(delim) for line in f]
        # parse triplet
        r, c, v = zip(*lines)
        # BUG FIX: fields are parsed as strings; scipy rejects string
        # data/indices, so cast them before building the matrix
        r = np.asarray(r, dtype=np.int64)
        c = np.asarray(c, dtype=np.int64)
        v = np.asarray(v, dtype=np.float64)
        return sp.coo_matrix((v, (r, c)), shape=shape).tocsr()
    # previously an unknown extension crashed with UnboundLocalError
    raise ValueError('only .npz / .txt / .csv files are supported: ' + fn)
def split_user(csr, ratio=0.8):
    """ Split the rows (users) of a sparse matrix into two disjoint parts.

    For each user, a random `ratio` fraction of their interactions goes to
    the first output and the remainder to the second (train/valid style
    split). Both outputs keep the input's shape and sum back to it.

    Inputs:
        csr (scipy.sparse.csr_matrix): (user, item) interaction matrix
        ratio (float): per-user fraction of entries kept in the first part

    Returns:
        tuple of csr_matrix: (A, B) disjoint splits of `csr`
    """
    def _push(store, rows, cols, vals):
        # accumulate one user's triplets
        store['rows'].append(rows)
        store['cols'].append(cols)
        store['vals'].append(vals)

    def _to_csr(store, shape):
        # concatenate the per-user triplets and assemble the sparse matrix
        store = {k: np.concatenate(v) for k, v in store.items()}
        return sp.coo_matrix(
            (store['vals'], (store['rows'], store['cols'])),
            shape=shape
        ).tocsr()

    A = {'rows': [], 'cols': [], 'vals': []}
    B = {'rows': [], 'cols': [], 'vals': []}
    for user in range(csr.shape[0]):
        slc = slice(csr.indptr[user], csr.indptr[user + 1])
        items = csr.indices[slc]
        vals = csr.data[slc]
        rnd_idx = np.random.permutation(len(items))
        bound = int(len(items) * ratio)
        # NOTE(review): users with very few interactions may get an empty
        # side; callers should filter such users if that matters
        _push(A, np.full((bound,), user),
              items[rnd_idx[:bound]], vals[rnd_idx[:bound]])
        _push(B, np.full((len(items) - bound,), user),
              items[rnd_idx[bound:]], vals[rnd_idx[bound:]])
    return _to_csr(A, csr.shape), _to_csr(B, csr.shape)
def evaluate_model(train, test, mf, cutoff):
    """ Compute the mean NDCG@cutoff of a fitted model over all test users.

    Inputs:
        train (csr_matrix): training interactions, passed to `recommend`
                            so already-seen items are handled by the model
        test (csr_matrix): held-out interactions used as ground truth
        mf: fitted model exposing `recommend(user, train, N)`
        cutoff (int): length of the ranked list to evaluate

    Returns:
        float: average NDCG@cutoff across all users in `test`
    """
    n_users = test.shape[0]
    per_user = np.empty((n_users,), dtype=np.float64)
    for u in range(n_users):
        truth = test[u].indices.astype(np.float64)
        ranked = np.array([pair[0] for pair in mf.recommend(u, train, N=cutoff)])
        per_user[u] = ndcg(truth, ranked, cutoff)
    return per_user.mean()
def find_optimal_wrmf(train, valid, num_factors=32, num_calls=50,
                      cutoff=5, verbose=False, random_state=0):
    """ Find optimal hyper parameters of WRMF for a given train/valid set.

    Runs Bayesian optimization (skopt `gp_minimize`) over regularization,
    confidence weight and number of ALS iterations, maximizing NDCG@cutoff
    on the validation set, then refits the best model.

    Inputs:
        train (csr_matrix): (user, item) training interactions
        valid (csr_matrix): (user, item) validation interactions
        num_factors (int): latent dimensionality of the ALS model
        num_calls (int): number of hyper-parameter evaluations
        cutoff (int): ranked-list cutoff for the validation metric
        verbose (bool): passed through to `gp_minimize`
        random_state (int): seed for the hyper-parameter search

    Returns:
        tuple: (fitted model, skopt optimization result)
    """
    search_space = [
        Real(10**-5, 10**0, "log-uniform", name='regularization'),
        Real(10**-3, 10**3, "log-uniform", name='confidence'),
        Integer(10, 30, name='iterations')
    ]

    @use_named_args(search_space)
    def evaluate(**setup):
        mf = AlternatingLeastSquares(
            num_factors,
            regularization=setup['regularization'],
            iterations=setup['iterations']
        )
        # implicit expects an (item, user) matrix; `confidence` scales the
        # raw counts into confidence weights
        mf.fit(train.T * setup['confidence'], show_progress=False)
        # skopt minimizes, so negate the ranking metric
        return -evaluate_model(train, valid, mf, cutoff)

    res = gp_minimize(
        evaluate, search_space,
        n_calls=num_calls,
        # BUG FIX: was hard-coded to 0, silently ignoring `random_state`
        random_state=random_state,
        verbose=verbose
    )

    # refit the best configuration found by the search
    best_reg, best_conf, best_iter = res['x']
    mf = AlternatingLeastSquares(
        num_factors,
        regularization=best_reg,
        iterations=best_iter
    )
    mf.fit(train.T * best_conf, show_progress=False)
    return mf, res
def main(train_fn, test_fn, valid_fn=None, cutoff=5, verbose=False):
    """ Test given interaction data with WRMF.

    Inputs:
        train_fn (str): filename of the training data
        test_fn (str): filename of the test data
        valid_fn (str): filename of the validation data; when omitted, a
                        validation split is carved out of the training set
        cutoff (int): cutoff of the ranked list used for evaluation
        verbose (bool): controls verbosity
    """
    train = load_data(train_fn)
    test = load_data(test_fn)
    if valid_fn is not None:
        valid = load_data(valid_fn)
    else:
        # carve an internal 80/20 validation split out of the training set
        train, valid = split_user(train, ratio=0.8)
    mf, res = find_optimal_wrmf(train, valid, verbose=verbose, cutoff=cutoff)
    # final evaluation trains nothing further: score the tuned model on
    # the test set, treating train+valid as the seen interactions
    print(evaluate_model(train + valid, test, mf, cutoff))
if __name__ == "__main__":
    # expose `main` as a command-line interface via python-fire
    fire.Fire(main)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment