import numpy as np
import pandas as pd
from numpy import bincount, log, log1p
from scipy.sparse import coo_matrix, linalg
class ImplicitCF:
    def __init__(self):
        # Last.fm 360K dataset: one (user, artist, play count) triple per row
        self.df = pd.read_csv("lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv",
                              sep='\t', header=None,
                              names=['user', 'artist', 'plays'], usecols=[0, 2, 3])
        self.df['user'] = self.df['user'].astype("category")
        self.df['artist'] = self.df['artist'].astype("category")
        self.df.dropna(inplace=True)
        # sparse artist x user matrix of raw play counts
        self.plays = coo_matrix((self.df['plays'].astype(float),
                                 (self.df['artist'].cat.codes,
                                  self.df['user'].cat.codes)))
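    # _bm25_weight below rescales each raw play count r to the Okapi BM25
    # term weight  idf(user) * r * (K1 + 1) / (K1 * length_norm(artist) + r),
    # which damps very large play counts and down-weights users who listen
    # to a very large number of artists before the matrix is handed to ALS.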
    def _bm25_weight(self, X, K1=100, B=0.8):
        """Weighs each row of a sparse matrix X by BM25 weighting"""
        # calculate idf per term (user)
        X = coo_matrix(X)
        N = float(X.shape[0])
        idf = log(N) - log1p(bincount(X.col))
        # calculate length_norm per document (artist)
        row_sums = np.ravel(X.sum(axis=1))
        average_length = row_sums.mean()
        length_norm = (1.0 - B) + B * row_sums / average_length
        # weight matrix rows by bm25
        X.data = X.data * (K1 + 1.0) / (K1 * length_norm[X.row] + X.data) * idf[X.col]
        return X
    def _alternating_least_squares(self, Cui, factors, regularization, iterations=20):
        artists, users = Cui.shape
        X = np.random.rand(artists, factors) * 0.01
        Y = np.random.rand(users, factors) * 0.01
        Ciu = Cui.T.tocsr()
        for iteration in range(iterations):
            # alternate: solve for X with Y held fixed, then for Y with X held fixed
            self._least_squares(Cui, X, Y, regularization)
            self._least_squares(Ciu, Y, X, regularization)
        return X, Y
    def _least_squares(self, Cui, X, Y, regularization):
        artists, factors = X.shape
        YtY = Y.T.dot(Y)
        for u in range(artists):
            # accumulate YtCuY + regularization * I in A
            A = YtY + regularization * np.eye(factors)
            # accumulate YtCuPu in b
            b = np.zeros(factors)
            for i in Cui[u, :].indices:
                confidence = Cui[u, i]
                factor = Y[i]
                A += (confidence - 1) * np.outer(factor, factor)
                b += confidence * factor
            # Xu = (YtCuY + regularization * I)^-1 (YtCuPu)
            X[u] = np.linalg.solve(A, b)
    def factorize(self):
        artist_factor, user_factor = self._alternating_least_squares(
            self._bm25_weight(self.plays).tocsr(), factors=50, regularization=1, iterations=10)
        return artist_factor, user_factor

    def init_predict(self, x_factors):
        # fully normalize factors, so can compare with only the dot product
        norms = np.linalg.norm(x_factors, axis=-1)
        self.factors = x_factors / norms[:, np.newaxis]

    def get_related(self, x_id, N=5):
        scores = self.factors.dot(self.factors[x_id])
        best = np.argpartition(scores, -N)[-N:]
        # factor rows are indexed by artist category code, so map codes back to names
        artist_names = self.df['artist'].cat.categories
        print("Recommendations:")
        for _id, score in sorted(zip(best, scores[best]), key=lambda x: -x[1]):
            print(f"artist id: {_id}, artist_name: {artist_names[_id]} score: {score:.5f}")
cf_object = ImplicitCF()
print(cf_object.df.head())
#                                        user                artist  plays
# 0  00000c289a1829a808ac09c00daf10bc3c4e223b       betty blowtorch   2137
# 1  00000c289a1829a808ac09c00daf10bc3c4e223b             die Ärzte   1099
# 2  00000c289a1829a808ac09c00daf10bc3c4e223b     melissa etheridge    897
# 3  00000c289a1829a808ac09c00daf10bc3c4e223b             elvenking    717
# 4  00000c289a1829a808ac09c00daf10bc3c4e223b  juliette & the licks    706

print(cf_object.df.user.nunique())    # 358868
print(cf_object.df.artist.nunique())  # 292364

print(cf_object.df.plays.describe())
# count    1.753565e+07
# mean     2.151932e+02
# std      6.144815e+02
# min      0.000000e+00
# 25%      3.500000e+01
# 50%      9.400000e+01
# 75%      2.240000e+02
# max      4.191570e+05
# Name: plays, dtype: float64

print(cf_object.plays.shape)  # (292364, 358868)

artist_factor, user_factor = cf_object.factorize()
print(artist_factor.shape)  # (292364, 50)
print(user_factor.shape)    # (358868, 50)

cf_object.init_predict(artist_factor)
print(cf_object.factors.shape)  # (292364, 50)

cf_object.get_related(2170)
# Recommendations:
# artist id: 2170, artist_name: maroon 5 score: 1.00000
# artist id: 170436, artist_name: vilma palma e vampiros score: 1.00000
# artist id: 257, artist_name: the beatles score: 1.00000
# artist id: 24297, artist_name: dizzee rascal score: 0.77604
# artist id: 257575, artist_name: the velvet underground score: 0.71697
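
# A minimal sketch (not part of the original gist): the same factors can also
# score artists for a given user by taking dot products between that user's
# row of user_factor and every row of artist_factor. recommend_for_user is a
# hypothetical helper name; it assumes the artist_factor and user_factor
# returned by factorize() above.
def recommend_for_user(user_id, artist_factor, user_factor, N=5):
    scores = artist_factor.dot(user_factor[user_id])  # one score per artist
    best = np.argpartition(scores, -N)[-N:]           # indices of the top-N scores
    artist_names = cf_object.df['artist'].cat.categories
    print("Recommendations:")
    for _id, score in sorted(zip(best, scores[best]), key=lambda x: -x[1]):
        print(f"artist id: {_id}, artist_name: {artist_names[_id]} score: {score:.5f}")

recommend_for_user(0, artist_factor, user_factor)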
Hello... I went through your article. Have you done this part?
Next, I went over one of the most popular research papers on a factor model specially tailored for implicit-feedback recommenders. We also implemented factorization-based recommender systems in Python for both explicit and implicit datasets.
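For reference, the factorization in the gist above follows the standard implicit-feedback ALS formulation (Hu, Koren and Volinsky, "Collaborative Filtering for Implicit Feedback Datasets"). Assuming $p_{ui} = 1$ for every observed (artist, user) pair and confidence $c_{ui}$ equal to the BM25-weighted play count for observed pairs (and 1 otherwise), the code minimizes

$$\min_{x_*, y_*} \sum_{u,i} c_{ui}\,(p_{ui} - x_u^\top y_i)^2 + \lambda \Big(\sum_u \|x_u\|^2 + \sum_i \|y_i\|^2\Big)$$

and `_least_squares` solves the resulting closed-form update $x_u = (Y^\top C_u Y + \lambda I)^{-1} Y^\top C_u p(u)$ for one side of the factorization while the other side is held fixed.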