This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| | user_id | movie_id | rating | time | movie_name | genre | |
---|---|---|---|---|---|---|---|
0 | user_1 | movie_1193 | 5 | 978300760 | One Flew Over the Cuckoo's Nest (1975) | ['Drama'] | |
45 | user_1 | movie_1028 | 5 | 978301777 | Mary Poppins (1964) | ["Children's", 'Comedy', 'Musical'] | |
39 | user_1 | movie_150 | 5 | 978301777 | Apollo 13 (1995) | ['Drama'] | |
23 | user_1 | movie_527 | 5 | 978824195 | Schindler's List (1993) | ['Drama', 'War'] | |
4 | user_1 | movie_2355 | 5 | 978824291 | Bug's Life, A (1998) | ['Animation',"Children's", 'Comedy'] | |
6 | user_1 | movie_1287 | 5 | 978302039 | Ben-Hur (1959) | ['Action', 'Adventure', 'Drama'] | |
10 | user_1 | movie_595 | 5 | 978824268 | Beauty and the Beast (1991) | ['Animation',"Children's", 'Musical'] | |
48 | user_1 | movie_2028 | 5 | 978301619 | Saving Private Ryan (1998) | ['Action', 'Drama', 'War'] | |
46 | user_1 | movie_1029 | 5 | 978302205 | Dumbo (1941) | ['Animation',"Children's", 'Musical'] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
img2vec = Img2Vec(cuda=False) | |
movie_df_with_plots_posters_filepath = f'{BASE_FOLDER}/MovieLens-1M/movie_df_with_plots_posters.csv' | |
if os.path.exists(movie_df_with_plots_posters_filepath): | |
movies_df = pd.read_csv(movie_df_with_plots_posters_filepath) | |
else: | |
def get_movie_embedding(url): | |
try: | |
response = requests.get(url) | |
img = Image.open(BytesIO(response.content)).convert('RGB') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
nltk.download('punkt') | |
text = ' '.join(list(feature_list)) | |
st = StanfordNERTagger(f'{BASE_FOLDER}/MovieLens-1M/english.all.3class.distsim.crf.ser.gz', | |
f'{BASE_FOLDER}/MovieLens-1M/stanford-ner.jar') | |
people = [] | |
for sent in nltk.sent_tokenize(text): | |
tokens = nltk.tokenize.word_tokenize(sent) | |
tags = st.tag(tokens) | |
for tag in tags: | |
if tag[1] == "PERSON": |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Keep only the TF-IDF columns whose variance is above the threshold; columns
# at or below it are dropped by VarianceThreshold.
constant_filter = VarianceThreshold(threshold=0.0002)
constant_filter.fit(tfidf_df)
feature_list = tfidf_df.columns[constant_filter.get_support(indices=True)]
print('Number of selected features: ', len(feature_list), '\n')
print('List of selected features: \n', list(feature_list))

# Item matrix restricted to the selected columns, built against the
# leave-one-out training set, and its pairwise cosine similarities.
item_matrix_filtered_words_trainset_loocv = get_item_matrix_with_inner_ids(
    tfidf_df[feature_list].values, movies_df, train_loocv)
cosine_sim_filtered_words_trainset_loocv = cosine_similarity(
    item_matrix_filtered_words_trainset_loocv,
    item_matrix_filtered_words_trainset_loocv)

# Same selected columns, built against the regular training set.
item_matrix_filtered_words_trainset = get_item_matrix_with_inner_ids(
    tfidf_df[feature_list].values, movies_df, trainset)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Item matrices over the full TF-IDF vocabulary. `toarray()` is used instead
# of `todense()` so the helper receives a plain ndarray (as it does for the
# filtered-feature matrices) rather than the legacy `numpy.matrix` type.
item_matrix_all_words_trainset_loocv = get_item_matrix_with_inner_ids(
    tfidf_matrix.toarray(), movies_df, train_loocv)
cosine_sim_all_words_trainset_loocv = cosine_similarity(
    item_matrix_all_words_trainset_loocv,
    item_matrix_all_words_trainset_loocv)

item_matrix_all_words_trainset = get_item_matrix_with_inner_ids(
    tfidf_matrix.toarray(), movies_df, trainset)
cosine_sim_all_words_trainset = cosine_similarity(
    item_matrix_all_words_trainset,
    item_matrix_all_words_trainset)

# Report for the custom-similarity KNN, handing each training set the
# similarity matrix computed from its own item matrix.
# NOTE(review): how `get_algorithm_report` consumes the `similarities` entry
# is not visible here - confirm it matches the algorithm's constructor.
get_algorithm_report(
    CustomSimKNNAlgorithm, trainset, testset, train_loocv, test_loocv, movies_df,
    target_movie_id='movie_1', target_user_id='user_1', top_k=10,
    algo_kwargs_trainset=dict(
        similarities=cosine_sim_all_words_trainset,
        sim_options={'user_based': False}),
    algo_kwargs_trainset_loocv=dict(
        similarities=cosine_sim_all_words_trainset_loocv,
        sim_options={'user_based': False}))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Stem every whitespace-separated token of each plot in place, so the TF-IDF
# vocabulary below is built from stems rather than surface forms.
stemmer = SnowballStemmer('english')
movies_df['movie_plot'] = movies_df['movie_plot'].apply(
    lambda plot: ' '.join(stemmer.stem(token) for token in plot.split()))

# Fit TF-IDF on the stemmed plots, dropping English stop words.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_df['movie_plot'])

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    tfidf_feature_names = tfidf.get_feature_names_out()
else:
    tfidf_feature_names = tfidf.get_feature_names()

# Dense frame with one row per `movies_df` row and one column per vocabulary
# term. `toarray()` returns a plain ndarray instead of `numpy.matrix`.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_feature_names,
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from surprise import AlgoBase, KNNBasic
from surprise.prediction_algorithms.knns import SymmetricAlgo


class CustomSimKNNAlgorithm(KNNBasic):
    """KNNBasic variant configured through an explicit ``sim_options`` dict.

    NOTE(review): elsewhere in this file the class is configured with a
    ``similarities=`` keyword that this constructor does not accept, and only
    ``__init__`` is visible here - confirm against the full class definition.
    """

    def __init__(self, sim_options, k=40, min_k=1):
        """Store the neighbourhood settings.

        Args:
            sim_options: Similarity options dict forwarded to the base class.
            k: Value stored on ``self.k`` (default 40).
            min_k: Value stored on ``self.min_k`` (default 1).
        """
        # Forward sim_options instead of overwriting the attribute afterwards,
        # so the base initialiser applies its own sim_options handling to the
        # dict that is actually used.
        SymmetricAlgo.__init__(self, sim_options=sim_options)
        self.k = k
        self.min_k = min_k
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# KNN with Pearson similarity, k=50, and 'user_based': False. The same kwargs
# are used for both the regular and the leave-one-out training set.
algo_kwargs = dict(
    k=50,
    sim_options={'name': 'pearson', 'user_based': False, 'verbose': True},
)
get_algorithm_report(
    KNNBasicWithTqdm, trainset, testset, train_loocv, test_loocv, movies_df,
    target_movie_id='movie_1', target_user_id='user_1', top_k=10,
    algo_kwargs_trainset=algo_kwargs, algo_kwargs_trainset_loocv=algo_kwargs,
    calc_most_similar=True)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# KNN with Pearson similarity, k=50, and 'user_based': True. The same kwargs
# are used for both the regular and the leave-one-out training set.
algo_kwargs = dict(
    k=50,
    sim_options={'name': 'pearson', 'user_based': True, 'verbose': True},
)
get_algorithm_report(
    KNNBasicWithTqdm, trainset, testset, train_loocv, test_loocv, movies_df,
    target_movie_id='movie_1', target_user_id='user_1', top_k=10,
    algo_kwargs_trainset=algo_kwargs, algo_kwargs_trainset_loocv=algo_kwargs,
    calc_most_similar=False)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Report for SVDWithTqdm, constructed without any explicit algorithm kwargs.
get_algorithm_report(
    SVDWithTqdm, trainset, testset, train_loocv, test_loocv, movies_df,
    target_movie_id='movie_1', target_user_id='user_1', top_k=10)