Skip to content

Instantly share code, notes, and snippets.

@eldrin
Created October 17, 2019 09:44
Show Gist options
  • Save eldrin/1ea9ce0ea3397ea1dea8abaf39b99e0d to your computer and use it in GitHub Desktop.
Save eldrin/1ea9ce0ea3397ea1dea8abaf39b99e0d to your computer and use it in GitHub Desktop.
Quick sanity check of a WRMF baseline on a given sparse interaction matrix, evaluated with a ranking metric (NDCG)
import os
from functools import partial
import numpy as np
import numba as nb
from scipy import sparse as sp
from implicit.als import AlternatingLeastSquares
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args
import fire
@nb.jit
def ndcg(actual, predicted, k=10):
    """ Compute NDCG@k for binary relevance

    Inputs:
        actual (np.ndarray): ids of the relevant (ground-truth) items
        predicted (np.ndarray): recommended item ids, best first
        k (int): cutoff; only the first `k` predictions are scored

    Returns:
        float: DCG of the top-k predictions divided by the ideal DCG.
               0. if there is no relevant item or `k` is smaller than 1
    """
    # without relevant items (or a usable cutoff) the ideal DCG would be 0
    if len(actual) == 0 or k < 1:
        return 0.

    if len(predicted) > k:
        predicted = predicted[:k]

    dcg = 0.
    for i, p in enumerate(predicted):
        # a hit is counted once: repeated predictions of an item are ignored
        if np.any(actual == p) and np.all(predicted[:i] != p):
            dcg += 1. / np.log2(i + 2.)

    # the ideal ranking puts every relevant item (up to k) on top. it must
    # not depend on how many predictions were given: otherwise a short list
    # is over-rated and an empty one ends up dividing by zero
    idcg = 0.
    for i in range(min(k, len(actual))):
        idcg += 1. / np.log2(i + 2.)

    return dcg / idcg
def load_data(fn, delim=',', shape=None):
    """ Load an interaction matrix from the disk and convert it to csr matrix

    Inputs:
        fn (str): filename. either a scipy sparse matrix (`.npz`) or a
                  triplet text file (`.txt` / `.csv`) holding one
                  `row<delim>col<delim>value` entry per line
        delim (str): column delimiter of the triplet text file
        shape (tuple): (n_rows, n_cols) of the matrix built from a triplet
                       file. if not given, it is inferred from the largest
                       row / column index. ignored for `.npz` files

    Returns:
        scipy.sparse.csr_matrix: the loaded matrix

    Raises:
        ValueError: if the extension is not supported, a line can not be
                    parsed, or the triplet file is empty and no `shape`
                    is given
    """
    if fn.endswith('.npz'):
        return sp.load_npz(fn).tocsr()

    if not fn.endswith(('.txt', '.csv')):
        raise ValueError(
            'unsupported file type (expected .npz, .txt or .csv): {}'
            .format(fn)
        )

    # parse triplets; indices and values must be numeric, not strings
    rows, cols, vals = [], [], []
    with open(fn, 'r') as f:
        for lineno, line in enumerate(f, 1):
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            try:
                r, c, v = line.rstrip('\r\n').split(delim)
                rows.append(int(r))
                cols.append(int(c))
                vals.append(float(v))
            except ValueError as err:
                raise ValueError(
                    '{}:{}: cannot parse {!r} as a "row{d}col{d}value" triplet'
                    .format(fn, lineno, line, d=delim)
                ) from err

    if not rows and shape is None:
        raise ValueError(
            'no triplet found in {} and no shape is given'.format(fn)
        )

    # convert to csr matrix
    return sp.coo_matrix(
        (
            np.asarray(vals, dtype=np.float64),
            (np.asarray(rows, dtype=np.int64),
             np.asarray(cols, dtype=np.int64))
        ),
        shape=shape
    ).tocsr()
def split_user(csr, ratio=0.8, random_state=None):
    """ Split given sparse matrix X by user

    The stored entries of every row (user) are shuffled and divided into
    two disjoint matrices of the same shape as the input.

    Inputs:
        csr (scipy.sparse matrix): user-by-item interaction matrix
        ratio (float): fraction of the entries of each user assigned to the
                       first split (rounded down per user)
        random_state (int): seed for a reproducible split. if not given,
                            the global numpy random state is used

    Returns:
        tuple of scipy.sparse.csr_matrix: (A, B), where A holds the `ratio`
            part of each user's entries and B holds the remainder
    """
    csr = sp.csr_matrix(csr)  # `indptr` / `indices` below need the CSR layout
    rng = (
        np.random if random_state is None
        else np.random.RandomState(random_state)
    )

    n_users = csr.shape[0]
    indptr = csr.indptr
    nnz = int(indptr[-1])

    # flag, per stored entry, whether it goes to the first split
    in_a = np.zeros(nnz, dtype=bool)
    for user in range(n_users):
        start = int(indptr[user])
        n_items = int(indptr[user + 1]) - start
        bound = int(n_items * ratio)
        in_a[start + rng.permutation(n_items)[:bound]] = True

    # row index of every stored entry, in storage order
    rows = np.repeat(np.arange(n_users), np.diff(indptr))
    cols = csr.indices[:nnz]
    vals = csr.data[:nnz]

    def _build_csr(mask):
        return sp.coo_matrix(
            (vals[mask], (rows[mask], cols[mask])),
            shape=csr.shape
        ).tocsr()

    return _build_csr(in_a), _build_csr(~in_a)
def evaluate_model(train, test, mf, cutoff):
    """ Average NDCG@cutoff of a fitted model over every row of `test`

    Inputs:
        train: interaction matrix handed over to `mf.recommend`
        test: sparse matrix whose row `u` stores, as column indices, the
              ground-truth items of user `u` (assumes CSR row slicing)
        mf: fitted model exposing `recommend(user, train, N=...)`; the
            first element of each returned entry is taken as the item id
        cutoff (int): number of recommendations requested and scored

    Returns:
        float: mean of the per-user NDCG over all `test.shape[0]` users
    """
    def _user_score(user):
        # ground truth is cast to float64 before being passed to `ndcg`
        truth = test[user].indices.astype(np.float64)
        ranked = np.array(
            [rec[0] for rec in mf.recommend(user, train, N=cutoff)]
        )
        return ndcg(truth, ranked, cutoff)

    n_users = test.shape[0]
    return np.mean([_user_score(user) for user in range(n_users)])
def find_optimal_wrmf(train, valid, num_factors=32, num_calls=50,
                      cutoff=5, verbose=False, random_state=0):
    """ Find optimal hyper parameter of WRMF for given train/valid set

    The regularization, the confidence scale applied to `train` and the
    number of ALS iterations are tuned with `gp_minimize`, using the
    negated `evaluate_model` score on `valid` as the objective.

    Inputs:
        train (scipy.sparse matrix): user-by-item training interactions
        valid (scipy.sparse matrix): user-by-item validation interactions
        num_factors (int): number of latent factors of the model
        num_calls (int): number of objective evaluations of the search
        cutoff (int): cutoff for the ranked list for evaluation
        verbose (bool): controls verbosity of the search
        random_state (int): seed handed over to `gp_minimize`

    Returns:
        tuple: (model re-fitted on `train` with the best setup found,
                result object returned by `gp_minimize`)
    """
    search_space = [
        Real(10**-5, 10**0, "log-uniform", name='regularization'),
        Real(10**-3, 10**3, "log-uniform", name='confidence'),
        Integer(10, 30, name='iterations')
    ]

    def _fit(regularization, confidence, iterations):
        # single place where a model is built, shared by search and refit
        mf = AlternatingLeastSquares(
            num_factors,
            regularization=regularization,
            # the optimizer may hand over a numpy integer; pass a plain int
            iterations=int(iterations)
        )
        mf.fit(train.T * confidence, show_progress=False)
        return mf

    @use_named_args(search_space)
    def evaluate(**setup):
        mf = _fit(**setup)
        # gp_minimize minimizes, hence the negated score
        return -evaluate_model(train, valid, mf, cutoff)

    res = gp_minimize(
        evaluate, search_space,
        # honor the caller's seed (it used to be hard-coded to 0)
        n_calls=num_calls, random_state=random_state, verbose=verbose
    )

    # fit the best model; res['x'] follows the order of `search_space`
    mf = _fit(*res['x'])
    return mf, res
def _align_shapes(*mats):
    """ Resize given sparse matrices in place to their common largest shape
    """
    n_rows = max(m.shape[0] for m in mats)
    n_cols = max(m.shape[1] for m in mats)
    for m in mats:
        if m.shape != (n_rows, n_cols):
            m.resize((n_rows, n_cols))


def main(train_fn, test_fn, valid_fn=None, cutoff=5, verbose=False):
    """ Test given interaction data with WRMF

    Inputs:
        train_fn (str): filename of the training data
        test_fn (str): filename of the test data
        valid_fn (str): filename of the validation data. if not given,
                        split it from the training set
        cutoff (int): cutoff for the ranked list for evaluation
        verbose (bool): controls verbosity
    """
    Xtr = load_data(train_fn)
    Xts = load_data(test_fn)
    Xvl = None if valid_fn is None else load_data(valid_fn)

    # each file is loaded on its own, so the shapes can disagree (e.g. the
    # last user / item is missing in one of them). align them before the
    # split and the fit: `Xtr + Xvl` needs equal shapes and the model
    # needs a row for every user that is evaluated
    _align_shapes(*[X for X in (Xtr, Xts, Xvl) if X is not None])

    if Xvl is None:
        # split internal validation split
        Xtr, Xvl = split_user(Xtr, ratio=0.8)

    mf, res = find_optimal_wrmf(Xtr, Xvl, verbose=verbose, cutoff=cutoff)
    print(evaluate_model(Xtr + Xvl, Xts, mf, cutoff))


if __name__ == "__main__":
    # expose `main` as a command line interface
    fire.Fire(main)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment