Skip to content

Instantly share code, notes, and snippets.

user_id movie_id rating time movie_name genre
0 user_1 movie_1193 5 978300760 One Flew Over the Cuckoo's Nest (1975) ['Drama']
45 user_1 movie_1028 5 978301777 Mary Poppins (1964) ["Children's", 'Comedy', 'Musical']
39 user_1 movie_150 5 978301777 Apollo 13 (1995) ['Drama']
23 user_1 movie_527 5 978824195 Schindler's List (1993) ['Drama', 'War']
4 user_1 movie_2355 5 978824291 Bug's Life, A (1998) ['Animation',"Children's", 'Comedy']
6 user_1 movie_1287 5 978302039 Ben-Hur (1959) ['Action', 'Adventure', 'Drama']
10 user_1 movie_595 5 978824268 Beauty and the Beast (1991) ['Animation',"Children's", 'Musical']
48 user_1 movie_2028 5 978301619 Saving Private Ryan (1998) ['Action', 'Drama', 'War']
46 user_1 movie_1029 5 978302205 Dumbo (1941) ['Animation',"Children's", 'Musical']
# Image embedding helper, created with CUDA disabled (cuda=False).
img2vec = Img2Vec(cuda=False)
# Location of the movies CSV under BASE_FOLDER; the code that follows loads it
# with pandas when the file already exists.
movie_df_with_plots_posters_filepath = f'{BASE_FOLDER}/MovieLens-1M/movie_df_with_plots_posters.csv'
if os.path.exists(movie_df_with_plots_posters_filepath):
movies_df = pd.read_csv(movie_df_with_plots_posters_filepath)
else:
def get_movie_embedding(url):
try:
response = requests.get(url)
img = Image.open(BytesIO(response.content)).convert('RGB')
# Fetch the NLTK 'punkt' resource (presumably needed by the NLTK tokenizers
# called further down — confirm against the NLTK docs).
nltk.download('punkt')
# Concatenate the entries of feature_list into one space-separated string.
text = ' '.join(list(feature_list))
# Stanford NER tagger built from the model file and jar stored under
# BASE_FOLDER/MovieLens-1M.
st = StanfordNERTagger(f'{BASE_FOLDER}/MovieLens-1M/english.all.3class.distsim.crf.ser.gz',
f'{BASE_FOLDER}/MovieLens-1M/stanford-ner.jar')
# Starts empty; NOTE(review): presumably collects tokens tagged "PERSON" —
# the code that fills it is not fully visible here.
people = []
for sent in nltk.sent_tokenize(text):
tokens = nltk.tokenize.word_tokenize(sent)
tags = st.tag(tokens)
for tag in tags:
if tag[1] == "PERSON":
# Variance-based feature selection on the TF-IDF frame: fit the selector with
# a 0.0002 threshold, keep the column labels it retains, and print a summary.
constant_filter = VarianceThreshold(threshold=0.0002).fit(tfidf_df)
feature_list = tfidf_df.columns[constant_filter.get_support(indices=True)]
print('Number of selected features: ', len(feature_list), '\n')
print('List of selected features: \n', list(feature_list))
# Item matrices built from the selected TF-IDF columns only, one per training
# set, plus the item-to-item cosine similarity for the leave-one-out set.
item_matrix_filtered_words_trainset_loocv = get_item_matrix_with_inner_ids(
    tfidf_df[feature_list].values,
    movies_df,
    train_loocv,
)
# With a single argument the matrix is compared against itself
# (assumes scikit-learn's cosine_similarity, where Y defaults to X).
cosine_sim_filtered_words_trainset_loocv = cosine_similarity(
    item_matrix_filtered_words_trainset_loocv
)
item_matrix_filtered_words_trainset = get_item_matrix_with_inner_ids(
    tfidf_df[feature_list].values,
    movies_df,
    trainset,
)
# Same pipeline as for the filtered vocabulary, but using every TF-IDF column:
# build the item matrix for each training set and its self-similarity.
item_matrix_all_words_trainset_loocv = get_item_matrix_with_inner_ids(
    tfidf_matrix.todense(),
    movies_df,
    train_loocv,
)
# With a single argument the matrix is compared against itself
# (assumes scikit-learn's cosine_similarity, where Y defaults to X).
cosine_sim_all_words_trainset_loocv = cosine_similarity(
    item_matrix_all_words_trainset_loocv
)

item_matrix_all_words_trainset = get_item_matrix_with_inner_ids(
    tfidf_matrix.todense(),
    movies_df,
    trainset,
)
cosine_sim_all_words_trainset = cosine_similarity(
    item_matrix_all_words_trainset
)
# Report for CustomSimKNNAlgorithm using the all-words cosine similarities;
# each training set gets the similarity matrix computed from its own items.
get_algorithm_report(
    CustomSimKNNAlgorithm,
    trainset,
    testset,
    train_loocv,
    test_loocv,
    movies_df,
    target_movie_id='movie_1',
    target_user_id='user_1',
    top_k=10,
    algo_kwargs_trainset={
        'similarities': cosine_sim_all_words_trainset,
        'sim_options': {'user_based': False},
    },
    algo_kwargs_trainset_loocv={
        'similarities': cosine_sim_all_words_trainset_loocv,
        'sim_options': {'user_based': False},
    },
)
# Stem every whitespace-separated token of each movie plot in place, then
# vectorise the stemmed plots with TF-IDF (English stop words removed).
stemmer = SnowballStemmer('english')
movies_df['movie_plot'] = movies_df['movie_plot'].apply(
    lambda plot: ' '.join(stemmer.stem(word) for word in plot.split())
)
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_df['movie_plot'])

# Dense frame with one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and only fall back on releases that predate it.
# toarray() yields a plain ndarray instead of the discouraged np.matrix that
# todense() returns; the resulting DataFrame is the same.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=(
        tfidf.get_feature_names_out()
        if hasattr(tfidf, 'get_feature_names_out')
        else tfidf.get_feature_names()
    ),
)
from surprise import AlgoBase, KNNBasic
from surprise.prediction_algorithms.knns import SymmetricAlgo
class CustomSimKNNAlgorithm(KNNBasic):
    """KNNBasic variant whose ``sim_options`` argument is mandatory.

    Only construction is customised in this class body; everything else is
    inherited from ``KNNBasic``.
    """

    def __init__(self, sim_options, k=40, min_k=1):
        """Initialise the algorithm.

        Args:
            sim_options: Similarity options dict (e.g. ``{'user_based': False}``).
            k: Maximum number of neighbours considered (default 40).
            min_k: Minimum number of neighbours required (default 1).
        """
        # Hand the options to the base initialiser instead of overwriting
        # self.sim_options afterwards, so the base class can fill in its
        # default for a missing 'user_based' key; it also sets k and min_k.
        super().__init__(k=k, min_k=min_k, sim_options=sim_options)
# KNN report with Pearson similarity, k=50 and user_based=False; the same
# options dict is shared by the regular and leave-one-out runs, and the
# most-similar-items computation is enabled.
algo_kwargs = {
    'k': 50,
    'sim_options': {'name': 'pearson', 'user_based': False, 'verbose': True},
}
get_algorithm_report(
    KNNBasicWithTqdm,
    trainset,
    testset,
    train_loocv,
    test_loocv,
    movies_df,
    target_movie_id='movie_1',
    target_user_id='user_1',
    top_k=10,
    algo_kwargs_trainset=algo_kwargs,
    algo_kwargs_trainset_loocv=algo_kwargs,
    calc_most_similar=True,
)
# KNN report with Pearson similarity, k=50 and user_based=True; the same
# options dict is shared by both runs and the most-similar computation is off.
algo_kwargs = {
    'k': 50,
    'sim_options': {'name': 'pearson', 'user_based': True, 'verbose': True},
}
get_algorithm_report(
    KNNBasicWithTqdm,
    trainset,
    testset,
    train_loocv,
    test_loocv,
    movies_df,
    target_movie_id='movie_1',
    target_user_id='user_1',
    top_k=10,
    algo_kwargs_trainset=algo_kwargs,
    algo_kwargs_trainset_loocv=algo_kwargs,
    calc_most_similar=False,
)
# Report for SVDWithTqdm; no algorithm keyword arguments are passed.
get_algorithm_report(
    SVDWithTqdm,
    trainset,
    testset,
    train_loocv,
    test_loocv,
    movies_df,
    target_movie_id='movie_1',
    target_user_id='user_1',
    top_k=10,
)