Skip to content

Instantly share code, notes, and snippets.

@reachsumit
Last active March 1, 2025 03:07
Show Gist options
  • Save reachsumit/35598b25c7abdc0c034f03db823c9a7d to your computer and use it in GitHub Desktop.
Save reachsumit/35598b25c7abdc0c034f03db823c9a7d to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
from numpy import bincount, log, log1p
from scipy.sparse import coo_matrix, linalg
class ImplicitCF:
    """Implicit-feedback collaborative filtering over (user, artist, plays) data.

    The play counts are loaded into a sparse artist x user matrix, re-weighted
    with BM25, and factorised with alternating least squares (ALS).  The
    resulting artist factors can then be used to find similar artists via
    cosine similarity.

    Attributes set by the methods below:
        df: DataFrame with categorical ``user`` / ``artist`` columns and a
            ``plays`` column.
        plays: ``coo_matrix`` of play counts; rows are artist category codes,
            columns are user category codes.
        factors: row-normalised factor matrix set by :meth:`init_predict`.
    """

    def __init__(self, path="lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv"):
        """Load the tab-separated play-count file and build the sparse matrix.

        Args:
            path: location of the TSV file.  Columns 0, 2 and 3 are read as
                user, artist and plays; the file has no header row.
        """
        self.df = pd.read_csv(
            path,
            sep="\t",
            header=None,
            names=["user", "artist", "plays"],
            usecols=[0, 2, 3],
        )
        self.df["user"] = self.df["user"].astype("category")
        self.df["artist"] = self.df["artist"].astype("category")
        self.df.dropna(inplace=True)
        # Category codes double as matrix indices: artists on rows, users on columns.
        self.plays = coo_matrix(
            (
                self.df["plays"].astype(float).to_numpy(),
                (
                    self.df["artist"].cat.codes.to_numpy(),
                    self.df["user"].cat.codes.to_numpy(),
                ),
            )
        )

    def _bm25_weight(self, X, K1=100, B=0.8):
        """Weigh each row of a sparse matrix X by BM25 weighting.

        Args:
            X: sparse matrix (rows act as documents, columns as terms).
            K1: BM25 saturation parameter.
            B: BM25 length-normalisation strength.

        Returns:
            A ``coo_matrix`` with the same sparsity pattern and re-weighted data.
        """
        X = coo_matrix(X)
        N = float(X.shape[0])

        # idf per column: columns that appear in many rows get a lower weight
        idf = log(N) - log1p(bincount(X.col))

        # length normalisation per row
        row_sums = np.ravel(X.sum(axis=1))
        average_length = row_sums.mean()
        length_norm = (1.0 - B) + B * row_sums / average_length

        # Rebinding X.data (rather than writing in place) leaves the caller's
        # data array untouched.
        X.data = X.data * (K1 + 1.0) / (K1 * length_norm[X.row] + X.data) * idf[X.col]
        return X

    def _alternating_least_squares(self, Cui, factors, regularization, iterations=20):
        """Factorise the confidence matrix ``Cui`` with alternating least squares.

        Args:
            Cui: sparse confidence matrix in CSR format.
            factors: number of latent factors.
            regularization: L2 regularisation weight.
            iterations: number of full alternating sweeps.

        Returns:
            Tuple ``(X, Y)`` of dense factor matrices with shapes
            ``(Cui.shape[0], factors)`` and ``(Cui.shape[1], factors)``.
        """
        artists, users = Cui.shape
        X = np.random.rand(artists, factors) * 0.01
        Y = np.random.rand(users, factors) * 0.01
        Ciu = Cui.T.tocsr()
        for _ in range(iterations):
            # Each call updates its second argument in place, holding the third fixed.
            self._least_squares(Cui, X, Y, regularization)
            self._least_squares(Ciu, Y, X, regularization)
        return X, Y

    def _least_squares(self, Cui, X, Y, regularization):
        """Solve for every row of ``X`` in place while holding ``Y`` fixed.

        For each row u this solves
        ``(YtY + Yt(Cu - I)Y + regularization * I) x_u = Yt Cu p_u``
        where ``p_u`` is 1 for every stored entry of row u.

        Args:
            Cui: sparse confidence matrix with one row per row of ``X``.
            X: dense factor matrix that is overwritten row by row.
            Y: dense factor matrix of the other side (read only).
            regularization: L2 regularisation weight.
        """
        Cui = Cui.tocsr()
        n_rows, factors = X.shape
        # Loop-invariant part of the normal equations.
        base = Y.T.dot(Y) + regularization * np.eye(factors)
        indptr, indices, data = Cui.indptr, Cui.indices, Cui.data
        for u in range(n_rows):
            start, stop = indptr[u], indptr[u + 1]
            cols = indices[start:stop]
            confidence = data[start:stop]
            Yu = Y[cols]
            # sum_i (c_ui - 1) * y_i y_i^T, reading the CSR arrays directly
            # instead of indexing the sparse matrix element by element.
            A = base + (Yu.T * (confidence - 1.0)).dot(Yu)
            # sum_i c_ui * y_i
            b = confidence.dot(Yu)
            X[u] = np.linalg.solve(A, b)

    def factorize(self, factors=50, regularization=1, iterations=10):
        """Run BM25 weighting followed by ALS on ``self.plays``.

        Args:
            factors: number of latent factors.
            regularization: L2 regularisation weight.
            iterations: number of ALS sweeps.

        Returns:
            Tuple ``(artist_factor, user_factor)`` of dense factor matrices.
        """
        weighted = self._bm25_weight(self.plays).tocsr()
        artist_factor, user_factor = self._alternating_least_squares(
            weighted, factors, regularization, iterations
        )
        return artist_factor, user_factor

    def init_predict(self, x_factors):
        """Store row-normalised factors so a dot product gives cosine similarity.

        Rows whose norm is zero are left as all-zero rows instead of turning
        into NaN through a division by zero.
        """
        norms = np.linalg.norm(x_factors, axis=-1)
        safe_norms = np.where(norms == 0, 1.0, norms)
        self.factors = x_factors / safe_norms[:, np.newaxis]

    def get_related(self, x_id, N=5):
        """Print the ``N`` items most similar to ``x_id``, best first.

        Args:
            x_id: row index into ``self.factors``; this is the artist category
                code, not a DataFrame row label.
            N: number of results to print; clamped to the number of items.
        """
        scores = self.factors.dot(self.factors[x_id])
        count = min(N, len(scores))
        if count > 0:
            best = np.argpartition(scores, -count)[-count:]
        else:
            best = np.array([], dtype=int)
        # Ids are category codes, so names come from the category index;
        # indexing the column by row label would return an unrelated row.
        artist_names = self.df["artist"].cat.categories
        print("Recommendations:")
        for _id, score in sorted(zip(best, scores[best]), key=lambda x: -x[1]):
            print(f"artist id: {_id}, artist_name: {artist_names[_id]} score: {score:.5f}")
# --- Demo: load the data, factorize, and print related artists -------------
# The values shown in the trailing/inline comments below appear to be output
# recorded from an earlier run on the Last.fm 360K dataset; they are kept for
# reference and may not match what the current code prints -- TODO confirm.

# Constructing the object reads the TSV file and builds the sparse play matrix.
cf_object = ImplicitCF()
print(cf_object.df.head())
# user artist plays
#0 00000c289a1829a808ac09c00daf10bc3c4e223b betty blowtorch 2137
#1 00000c289a1829a808ac09c00daf10bc3c4e223b die Ärzte 1099
#2 00000c289a1829a808ac09c00daf10bc3c4e223b melissa etheridge 897
#3 00000c289a1829a808ac09c00daf10bc3c4e223b elvenking 717
#4 00000c289a1829a808ac09c00daf10bc3c4e223b juliette & the licks 706
# Basic dataset statistics: distinct users, distinct artists, play-count summary.
print(cf_object.df.user.nunique()) # 358868
print(cf_object.df.artist.nunique()) # 292364
print(cf_object.df.plays.describe())
#count 1.753565e+07
#mean 2.151932e+02
#std 6.144815e+02
#min 0.000000e+00
#25% 3.500000e+01
#50% 9.400000e+01
#75% 2.240000e+02
#max 4.191570e+05
#Name: plays, dtype: float64
# The sparse matrix has one row per artist and one column per user.
print(cf_object.plays.shape) # (292364, 358868)
# Factorize the weighted play matrix into artist and user factor matrices.
artist_factor, user_factor = cf_object.factorize()
print(artist_factor.shape) # (292364, 50)
print(user_factor.shape) # (358868, 50)
# Normalize the artist factors so that dot products can be used as similarity scores.
cf_object.init_predict(artist_factor)
print(cf_object.factors.shape) # (292364, 50)
# Print the artists whose factors score highest against artist id 2170.
cf_object.get_related(2170)
#Recommendations:
#artist id: 2170, artist_name: maroon 5 score: 1.00000
#artist id: 170436, artist_name: vilma palma e vampiros score: 1.00000
#artist id: 257, artist_name: the beatles score: 1.00000
#artist id: 24297, artist_name: dizzee rascal score: 0.77604
#artist id: 257575, artist_name: the velvet underground score: 0.71697
@snehalkumawat
Copy link

Hello... I went through your article. Have you done this part?
Next, I went over one of the most popular research papers on a factor model that is specially tailored for implicit feedback recommenders. We also implemented factorization-based recommender systems in Python for both explicit and implicit datasets.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment