Created
November 7, 2022 01:24
-
-
Save reachsumit/c6a8037f4596a8181376313fdba33ffd to your computer and use it in GitHub Desktop.
Field-Aware Factorization Machines
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# %% Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from scipy.stats import rankdata
from sklearn.preprocessing import LabelEncoder

# %% Configuration
device = 'cuda' if torch.cuda.is_available() else 'cpu'
PAD_IDX = 0
SEED = 42
# Seed torch so that parameter initialisation (and therefore the training curve) is repeatable.
torch.manual_seed(SEED)


# %% Helpers
def idx_to_sparse(idx, sparse_dim):
    """Return a one-hot integer ``pd.Series`` of length ``sparse_dim`` with a 1 at position ``idx``.

    Used to turn a target movie index into a row that can mask a matrix of per-class ranks.
    """
    one_hot = np.zeros(sparse_dim)
    one_hot[int(idx)] = 1
    return pd.Series(one_hot, dtype=int)


def accuracy_fn(y_true, y_pred):
    """Return the percentage (0-100) of positions where ``y_true`` equals ``y_pred``."""
    n_correct = torch.eq(y_true, y_pred).sum().item()
    return (n_correct / len(y_pred)) * 100


# %% Data loading, model and training loop
def load_and_process_data_ffm(data_dir='../input/movielens-100k-dataset/ml-100k',
                              cold_start_thresh=5,
                              test_size=0.2):
    """Load MovieLens-100k files and build next-movie-prediction features and targets.

    Args:
        data_dir: directory containing ``u.data``, ``u.user``, ``u.item`` and ``u.genre``.
        cold_start_thresh: keep only rows where the user has rated at least this many
            movies so far (the current row included).
        test_size: fraction of the (timestamp-ordered) rows held out as the test set.

    Returns:
        Tuple of
        ``(X_train_continuous, X_test_continuous, X_train_categorical, X_test_categorical,
        target_train, target_test, target_test_sparse, field_dims, n_classes)``.
        The ``X_*`` entries are float tensors on ``device`` (the categorical ones hold
        label-encoded codes and are cast with ``.long()`` by the training loop), the
        targets are long tensors of movie ids, ``target_test_sparse`` is a one-hot
        DataFrame of the test targets, ``field_dims`` lists the number of classes per
        categorical field, and ``n_classes`` is the number of distinct movie ids plus one.
    """
    # Ratings
    ratings = pd.read_csv(f'{data_dir}/u.data', sep="\t", header=None)
    ratings.columns = ['user id', 'movie id', 'rating', 'timestamp']
    # Users
    users = pd.read_csv(f'{data_dir}/u.user', sep="|", encoding='latin-1', header=None)
    users.columns = ['user id', 'age', 'gender', 'occupation', 'zip code']
    # Movies
    items = pd.read_csv(f'{data_dir}/u.item', sep="|", encoding='latin-1', header=None)
    items.columns = ['movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
                     'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
                     'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
                     'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
    GENRES = pd.read_csv(f'{data_dir}/u.genre', sep="|", header=None, usecols=[0])[0].tolist()
    n_classes = items['movie id'].nunique() + 1

    # Order each user's ratings chronologically.
    dataset = ratings.sort_values(['user id', 'timestamp']).reset_index(drop=True)
    by_user = dataset.groupby('user id')
    # 1-based position of the row within its user's history.
    dataset['sample_num'] = by_user.cumcount() + 1
    # Target = id of the next movie the same user rates (NaN on the user's last row).
    dataset['target'] = by_user['movie id'].shift(-1)
    # Running mean of the user's ratings up to and including this row.
    dataset['mean_rate'] = by_user['rating'].cumsum() / dataset['sample_num']

    # Bring in the 0/1 genre indicator columns for each rated movie.
    dataset = dataset.merge(items[['movie id'] + GENRES], on='movie id', how='left')

    # For every genre: `<genre>` becomes the running count of movies of that genre the
    # user has rated, and `<genre>_rate` the user's running mean rating in that genre
    # (0/0 -> NaN while the count is still zero; NaNs are filled with 0 further down).
    for genre in GENRES:
        dataset[f'{genre}_rate'] = dataset[genre] * dataset['rating']
        dataset[genre] = dataset.groupby('user id')[genre].cumsum()
        dataset[f'{genre}_rate'] = dataset.groupby('user id')[f'{genre}_rate'].cumsum() / dataset[genre]

    # Turn genre counts into the share of the user's rated movies so far.
    dataset[GENRES] = dataset[GENRES].div(dataset['sample_num'], axis=0)
    dataset = dataset.merge(users, on='user id', how='left')

    gender_encoder = LabelEncoder()
    occupations_encoder = LabelEncoder()
    dataset['gender'] = gender_encoder.fit_transform(dataset['gender'])
    dataset['occupation'] = occupations_encoder.fit_transform(dataset['occupation'])
    dataset = dataset.drop(columns='zip code')

    # Drop cold-start rows and rows without a next movie, then order globally by time
    # so that the train/test split below is chronological.
    keep = (dataset['sample_num'] >= cold_start_thresh) & dataset['target'].notna()
    filtered_data = dataset[keep].sort_values('timestamp')

    continuous_cols = ['age', 'mean_rate'] + GENRES + [gen + "_rate" for gen in GENRES]
    categoricals = ['gender', 'occupation']  # label encoded
    field_dims = [len(gender_encoder.classes_), len(occupations_encoder.classes_)]

    split_at = int(len(filtered_data) * (1 - test_size))
    train_df, test_df = filtered_data[:split_at], filtered_data[split_at:]
    y_train, y_test = train_df['target'], test_df['target']

    def _to_float_tensor(frame):
        # NaNs (e.g. genre means before the first movie of that genre) become 0.
        return torch.tensor(frame.fillna(0).to_numpy(), dtype=torch.float32, device=device)

    target_train = torch.tensor(y_train.to_numpy(), dtype=torch.long, device=device)
    target_test = torch.tensor(y_test.to_numpy(), dtype=torch.long, device=device)
    # One-hot test targets, used to pick the rank of the true movie during evaluation.
    target_test_sparse = y_test.apply(lambda x: idx_to_sparse(x, n_classes))

    X_train_continuous_tensor = _to_float_tensor(train_df[continuous_cols])
    X_test_continuous_tensor = _to_float_tensor(test_df[continuous_cols])
    X_train_categorical_tensor = _to_float_tensor(train_df[categoricals])
    X_test_categorical_tensor = _to_float_tensor(test_df[categoricals])

    return (X_train_continuous_tensor, X_test_continuous_tensor,
            X_train_categorical_tensor, X_test_categorical_tensor,
            target_train, target_test, target_test_sparse, field_dims, n_classes)


class FFM(nn.Module):
    """Field-aware factorization machine producing one logit per class.

    The output is the sum of a linear layer over the continuous features, a per-class
    embedding of every categorical value (plus bias), and the pairwise field-aware
    interaction term.

    Args:
        continuous_dim: number of continuous input features.
        field_dims: number of distinct codes in each categorical field; codes are
            expected to be 0-based within their field.
        n_class: number of output classes.
        embed_dim: size of the field-aware interaction embeddings.
        pad_idx: row of the embedding tables reserved as padding (kept at zero).
    """

    def __init__(self, continuous_dim, field_dims, n_class, embed_dim=16, pad_idx=0):
        super().__init__()
        self.num_fields = len(field_dims)

        # Every field gets its own contiguous slice of the shared tables; without these
        # offsets code k of one field would reuse the row of code k of another field.
        offsets = torch.tensor([0, *field_dims[:-1]], dtype=torch.long, device=device).cumsum(dim=0)
        self.register_buffer('offsets', offsets.unsqueeze(0))
        # One extra row so the padding row never has to double as a real category.
        num_embeddings = sum(field_dims) + 1

        self.bias = nn.Parameter(torch.zeros(n_class, device=device))
        self.embeddings = nn.Embedding(num_embeddings, n_class, padding_idx=pad_idx, device=device)
        self.embeddings_interaction = nn.ModuleList([
            nn.Embedding(num_embeddings, embed_dim, padding_idx=pad_idx, device=device)
            for _ in range(self.num_fields)
        ])
        self.linear_layer = nn.Linear(continuous_dim, n_class, device=device)

    def _global_index(self, categorical_X):
        """Map per-field codes to rows of the shared tables, skipping the padding row."""
        idx = categorical_X.long() + self.offsets
        pad_row = self.embeddings.padding_idx  # already normalised to a non-negative row
        if pad_row is not None:
            # Shift everything at or after the padding row by one so no real category uses it.
            idx = idx + (idx >= pad_row).long()
        return idx

    def forward(self, continuous_X, categorical_X):
        """Return logits of shape ``(batch, n_class)``.

        ``categorical_X`` is expected to be ``(batch, num_fields)`` with one code per field.
        """
        idx = self._global_index(categorical_X)
        embeds_out = self.embeddings(idx).sum(dim=1) + self.bias

        field_wise = [table(idx) for table in self.embeddings_interaction]
        pair_products = [
            field_wise[j][:, i] * field_wise[i][:, j]
            for i in range(self.num_fields - 1)
            for j in range(i + 1, self.num_fields)
        ]
        if pair_products:
            # NOTE(review): this term is one scalar per row broadcast over all classes, so
            # it shifts every logit of a row equally and cannot change a softmax over classes.
            ffm_interaction_term = torch.stack(pair_products, dim=1).sum(dim=(1, 2)).unsqueeze(1)
        else:
            # A single field has no pairs to interact.
            ffm_interaction_term = embeds_out.new_zeros((embeds_out.shape[0], 1))

        return self.linear_layer(continuous_X) + embeds_out + ffm_interaction_term


def run_gradient_descent_ffm(model,
                             learning_rate=1e-3,
                             weight_decay=0.01,
                             num_epochs=10,
                             log_every=100):
    """Train ``model`` with full-batch Adam and plot loss / mean-rank curves.

    Reads the module-level tensors produced by ``load_and_process_data_ffm``
    (``X_train_*``, ``X_test_*``, ``target_train``, ``target_test``,
    ``target_test_sparse``), so that cell must have been run first.

    Args:
        model: module called as ``model(continuous_X, categorical_X)`` returning logits.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight decay.
        num_epochs: number of full-batch updates.
        log_every: evaluate on the test set and log every this many epochs.

    Returns:
        ``(model, iters, train_losses, test_losses)`` where the three lists hold one
        entry per logged epoch.
    """
    loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_IDX)  # the model doesn't need to predict padding index
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Cast once instead of on every epoch.
    train_categorical = X_train_categorical_tensor.long().to(device)
    test_categorical = X_test_categorical_tensor.long().to(device)
    target_mask = target_test_sparse.to_numpy()

    iters, train_losses, test_losses, mean_test_ranks = [], [], [], []

    for epoch in range(num_epochs):
        model.train()
        y_logits = model(X_train_continuous_tensor, train_categorical)
        loss_train = loss_fn(y_logits, target_train)

        optimizer.zero_grad()
        loss_train.backward()
        optimizer.step()

        if epoch % log_every != 0:
            continue

        # argmax of the logits equals argmax of their softmax.
        acc = accuracy_fn(y_true=target_train, y_pred=y_logits.argmax(dim=1))

        model.eval()
        with torch.inference_mode():
            test_logits = model(X_test_continuous_tensor, test_categorical)
            loss_test = loss_fn(test_logits, target_test)
            test_acc = accuracy_fn(y_true=target_test, y_pred=test_logits.argmax(dim=1))

        # Rank classes within each test row (1 = highest score) and read off the rank of
        # the true movie. Ranking the logits directly gives the per-row class ordering;
        # normalising over dim=0 (the batch axis) first would rescale every class column
        # differently and change that ordering.
        ranks = rankdata(-test_logits.float().cpu().detach().numpy(), axis=1)
        mean_rank_wnd = float((ranks * target_mask).sum(axis=1).mean())

        print(f"Epoch: {epoch} | Loss: {loss_train:.5f}, Acc: {acc:.2f}% | Test Loss: {loss_test:.5f}, Test Acc: {test_acc:.2f}% Test mean rank: {mean_rank_wnd:.0f}")

        iters.append(epoch)
        train_losses.append(float(loss_train))
        test_losses.append(float(loss_test))
        mean_test_ranks.append(mean_rank_wnd)

    # Loss curves
    fig, ax = plt.subplots(figsize=(12, 8), dpi=100)
    ax.plot(iters, train_losses, label="Train Loss")
    ax.plot(iters, test_losses, label="Test Loss")
    ax.set(title=f"Training Curve (lr={learning_rate})", xlabel="Iterations", ylabel="Loss")
    ax.legend(loc='best')
    plt.show()

    # Mean rank of the true next movie on the test set
    fig, ax = plt.subplots(figsize=(12, 8), dpi=100)
    ax.plot(iters, mean_test_ranks, label="Test Rank")
    ax.set(xlabel="Iterations", ylabel="Mean Rank on testset")
    ax.legend(loc='best')
    plt.show()

    return model, iters, train_losses, test_losses


# %% Load data
(X_train_continuous_tensor, X_test_continuous_tensor,
 X_train_categorical_tensor, X_test_categorical_tensor,
 target_train, target_test, target_test_sparse,
 field_dims, n_classes) = load_and_process_data_ffm()

# %% Build model
ffm_model = FFM(continuous_dim=X_train_continuous_tensor.shape[1], field_dims=field_dims, n_class=n_classes)

# %% Train
ffm_model_trained, iters, train_losses, test_losses = run_gradient_descent_ffm(ffm_model, num_epochs=1000, weight_decay=0, learning_rate=0.03)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment