erap129’s gists

erap129 / recsys_ratings_user_1.csv

Last active May 6, 2022 07:59

	user_id	movie_id	rating	time	movie_name	genre
0	user_1	movie_1193	5	978300760	One Flew Over the Cuckoo's Nest (1975)	['Drama']
45	user_1	movie_1028	5	978301777	Mary Poppins (1964)	["Children's", 'Comedy', 'Musical']
39	user_1	movie_150	5	978301777	Apollo 13 (1995)	['Drama']
23	user_1	movie_527	5	978824195	Schindler's List (1993)	['Drama', 'War']
4	user_1	movie_2355	5	978824291	Bug's Life, A (1998)	['Animation',"Children's", 'Comedy']
6	user_1	movie_1287	5	978302039	Ben-Hur (1959)	['Action', 'Adventure', 'Drama']
10	user_1	movie_595	5	978824268	Beauty and the Beast (1991)	['Animation',"Children's", 'Musical']
48	user_1	movie_2028	5	978301619	Saving Private Ryan (1998)	['Action', 'Drama', 'War']
46	user_1	movie_1029	5	978302205	Dumbo (1941)	['Animation',"Children's", 'Musical']

erap129 / recsys_image_embedding.py

Last active May 6, 2022 07:27

	img2vec = Img2Vec(cuda=False)
	movie_df_with_plots_posters_filepath = f'{BASE_FOLDER}/MovieLens-1M/movie_df_with_plots_posters.csv'

	if os.path.exists(movie_df_with_plots_posters_filepath):
	movies_df = pd.read_csv(movie_df_with_plots_posters_filepath)
	else:
	def get_movie_embedding(url):
	try:
	response = requests.get(url)
	img = Image.open(BytesIO(response.content)).convert('RGB')

erap129 / recsys_removing_people_names.py

Last active May 6, 2022 07:26

	nltk.download('punkt')
	text = ' '.join(list(feature_list))
	st = StanfordNERTagger(f'{BASE_FOLDER}/MovieLens-1M/english.all.3class.distsim.crf.ser.gz',
	f'{BASE_FOLDER}/MovieLens-1M/stanford-ner.jar')
	people = []
	for sent in nltk.sent_tokenize(text):
	tokens = nltk.tokenize.word_tokenize(sent)
	tags = st.tag(tokens)
	for tag in tags:
	if tag[1] == "PERSON":

erap129 / recsys_tfidf_less_features.py

Last active May 5, 2022 18:56

	# Keep only the TF-IDF columns that pass a variance threshold of 0.0002, then report them.
	constant_filter = VarianceThreshold(threshold=0.0002).fit(tfidf_df)
	selected_column_positions = constant_filter.get_support(indices=True)
	feature_list = tfidf_df.columns[selected_column_positions]
	selected_feature_names = list(feature_list)
	print('Number of selected features: ', len(selected_feature_names), '\n')
	print('List of selected features: \n', selected_feature_names)

	# Rebuild the item matrices from the reduced vocabulary. A self-similarity matrix is
	# computed here only for the `train_loocv` split (presumably leave-one-out CV, per the
	# naming — confirm against where the split is created).
	item_matrix_filtered_words_trainset_loocv = get_item_matrix_with_inner_ids(
	    tfidf_df[feature_list].values,
	    movies_df,
	    train_loocv,
	)
	cosine_sim_filtered_words_trainset_loocv = cosine_similarity(
	    item_matrix_filtered_words_trainset_loocv,
	    item_matrix_filtered_words_trainset_loocv,
	)
	item_matrix_filtered_words_trainset = get_item_matrix_with_inner_ids(
	    tfidf_df[feature_list].values,
	    movies_df,
	    trainset,
	)

erap129 / recsys_tfidf_results.py

Last active May 5, 2022 18:43

	# Build item matrices from the full TF-IDF vocabulary and their cosine self-similarity,
	# once for `train_loocv` and once for `trainset`.
	item_matrix_all_words_trainset_loocv = get_item_matrix_with_inner_ids(
	    tfidf_matrix.todense(), movies_df, train_loocv
	)
	cosine_sim_all_words_trainset_loocv = cosine_similarity(
	    item_matrix_all_words_trainset_loocv,
	    item_matrix_all_words_trainset_loocv,
	)
	item_matrix_all_words_trainset = get_item_matrix_with_inner_ids(
	    tfidf_matrix.todense(), movies_df, trainset
	)
	cosine_sim_all_words_trainset = cosine_similarity(
	    item_matrix_all_words_trainset,
	    item_matrix_all_words_trainset,
	)

	# Evaluate the custom-similarity KNN; each training set gets the similarity matrix
	# computed from its own item matrix, with user_based disabled.
	get_algorithm_report(
	    CustomSimKNNAlgorithm,
	    trainset,
	    testset,
	    train_loocv,
	    test_loocv,
	    movies_df,
	    target_movie_id='movie_1',
	    target_user_id='user_1',
	    top_k=10,
	    algo_kwargs_trainset={
	        'similarities': cosine_sim_all_words_trainset,
	        'sim_options': {'user_based': False},
	    },
	    algo_kwargs_trainset_loocv={
	        'similarities': cosine_sim_all_words_trainset_loocv,
	        'sim_options': {'user_based': False},
	    },
	)

erap129 / recsys_tfidf_matrix.py

Created February 19, 2022 12:25

	# Stem every whitespace-separated token of each plot and write the result back into
	# the `movie_plot` column.
	stemmer = SnowballStemmer('english')
	movies_df['movie_plot'] = movies_df['movie_plot'].apply(
	    lambda plot: ' '.join(stemmer.stem(word) for word in plot.split())
	)

	# NOTE(review): the English stop-word list is applied to text that is already stemmed,
	# so stop words whose stem differs from the listed form may survive — confirm this is intended.
	tfidf = TfidfVectorizer(stop_words='english')
	tfidf_matrix = tfidf.fit_transform(movies_df['movie_plot'])

	# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2; use its
	# replacement when available and fall back to the old accessor on older installs.
	get_tfidf_feature_names = (
	    getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
	)

	# `toarray()` yields a plain ndarray with the same values that `todense()` would give
	# as an `np.matrix`, a type NumPy discourages.
	tfidf_df = pd.DataFrame(
	    tfidf_matrix.toarray(),
	    columns=get_tfidf_feature_names(),
	)

erap129 / recsys_custom_sim_knn.py

Created February 19, 2022 12:23

	from surprise import AlgoBase, KNNBasic
	from surprise.prediction_algorithms.knns import SymmetricAlgo

	class CustomSimKNNAlgorithm(KNNBasic):
	def __init__(self, sim_options, k=40, min_k=1):
	SymmetricAlgo.__init__(self)
	self.sim_options = sim_options
	self.k = k
	self.min_k = min_k

erap129 / recsys_knn_item.py

Last active May 5, 2022 16:28

	# KNN with k=50, Pearson similarity and user_based=False; the same kwargs are passed
	# for both training sets, and most-similar calculation is requested.
	algo_kwargs = {
	    'k': 50,
	    'sim_options': {'name': 'pearson', 'user_based': False, 'verbose': True},
	}
	get_algorithm_report(
	    KNNBasicWithTqdm,
	    trainset,
	    testset,
	    train_loocv,
	    test_loocv,
	    movies_df,
	    target_movie_id='movie_1',
	    target_user_id='user_1',
	    top_k=10,
	    algo_kwargs_trainset=algo_kwargs,
	    algo_kwargs_trainset_loocv=algo_kwargs,
	    calc_most_similar=True,
	)

erap129 / recsys_knn_tqdm_and_knn_user.py

Last active May 5, 2022 16:22

	# KNN with k=50, Pearson similarity and user_based=True; the same kwargs are passed
	# for both training sets, and most-similar calculation is switched off.
	algo_kwargs = {
	    'k': 50,
	    'sim_options': {'name': 'pearson', 'user_based': True, 'verbose': True},
	}
	get_algorithm_report(
	    KNNBasicWithTqdm,
	    trainset,
	    testset,
	    train_loocv,
	    test_loocv,
	    movies_df,
	    target_movie_id='movie_1',
	    target_user_id='user_1',
	    top_k=10,
	    algo_kwargs_trainset=algo_kwargs,
	    algo_kwargs_trainset_loocv=algo_kwargs,
	    calc_most_similar=False,
	)

erap129 / recsys_svd.py

Last active May 5, 2022 16:21

	# Run the report for SVDWithTqdm with no extra algorithm kwargs, targeting
	# movie_1 / user_1 and a top-10 list.
	get_algorithm_report(
	    SVDWithTqdm,
	    trainset,
	    testset,
	    train_loocv,
	    test_loocv,
	    movies_df,
	    target_movie_id='movie_1',
	    target_user_id='user_1',
	    top_k=10,
	)

Elad Rapaport erap129