Last active
May 14, 2021 07:49
-
-
Save morrisalp/b6cb81bf73411c40f861f865a2b12fa9 to your computer and use it in GitHub Desktop.
loading MovieLens 100K dataset with Surprise & Pandas
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import surprise | |
import pandas as pd | |
# Directory where Surprise keeps its built-in datasets.
ddir = surprise.get_dataset_dir()

# Built-in MovieLens 100K ratings dataset.
data = surprise.Dataset.load_builtin('ml-100k')

# Item lookup table: only the first two pipe-separated columns of u.item are
# read and they are named `iid` and `item_name`.
item_data = pd.read_csv(
    f'{ddir}/ml-100k/ml-100k/u.item',
    sep='|',
    header=None,
    encoding='ISO-8859-1',
    usecols=[0, 1],
    names=['iid', 'item_name'],
)
def dataset2df(ds, train=True, items=None):
    """Convert Surprise ratings into a DataFrame joined with item names.

    Args:
        ds: When ``train`` is true, an object exposing ``all_ratings()``,
            ``to_raw_uid()`` and ``to_raw_iid()`` (e.g. a Surprise trainset).
            Otherwise an iterable of ``(uid, iid, rating)`` triples that
            already carry raw ids (e.g. a Surprise testset).
        train: Whether ``ds`` is a trainset whose inner ids must be mapped
            back to raw ids.
        items: Optional item lookup table with an ``iid`` column. Defaults
            to the module-level ``item_data``.

    Returns:
        A DataFrame with ``uid``, ``iid`` and ``rating`` columns plus the
        remaining columns of the item table. Ratings without a matching
        item are kept (left join).
    """
    if items is None:
        items = item_data

    rows = ds.all_ratings() if train else ds
    df = pd.DataFrame(rows, columns=['uid', 'iid', 'rating'])

    if train:
        # Map inner ids back to raw ids with the trainset that was passed
        # in, rather than relying on a global named `trainset`.
        df['uid'] = df['uid'].map(ds.to_raw_uid)
        df['iid'] = df['iid'].map(ds.to_raw_iid)

    # NOTE(review): assumes raw ids are integer-like (true for numeric
    # string ids); the cast raises ValueError otherwise.
    df['uid'] = df['uid'].astype(int)
    df['iid'] = df['iid'].astype(int)

    # Join on the item id explicitly instead of on "all shared columns".
    return pd.merge(df, items, how='left', on='iid')
def datasets2dfs(trainset, testset):
    """Build the train and test DataFrames for a trainset/testset pair.

    Both inputs are handed to ``dataset2df``: ``trainset`` with the default
    ``train=True`` and ``testset`` with ``train=False``.

    Returns:
        A ``(df_train, df_test)`` tuple.
    """
    return dataset2df(trainset), dataset2df(testset, train=False)
# Usage examples
#
# To get a test set of all unrated items:
#     trainset = data.build_full_trainset()
#     testset = trainset.build_anti_testset()
#     df_train, df_test = datasets2dfs(trainset, testset)
#
# To get a test set of some rated items:
#     trainset, testset = surprise.model_selection.train_test_split(data, test_size=.25)
#     df_train, df_test = datasets2dfs(trainset, testset)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment