Skip to content

Instantly share code, notes, and snippets.

@reachsumit
Created November 7, 2022 02:12
Show Gist options
  • Save reachsumit/a6ab97ed6bc053aaf3d73320b4b31b97 to your computer and use it in GitHub Desktop.
Save reachsumit/a6ab97ed6bc053aaf3d73320b4b31b97 to your computer and use it in GitHub Desktop.
Wide & Deep Learning
Display the source blob
Display the rendered blob
Raw
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.7.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"import torch\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nimport torch.nn as nn\n\nfrom scipy.sparse import coo_matrix\nfrom scipy.stats import rankdata\nfrom sklearn.preprocessing import StandardScaler","metadata":{"execution":{"iopub.status.busy":"2022-11-07T01:41:06.702429Z","iopub.execute_input":"2022-11-07T01:41:06.702914Z","iopub.status.idle":"2022-11-07T01:41:09.006235Z","shell.execute_reply.started":"2022-11-07T01:41:06.702822Z","shell.execute_reply":"2022-11-07T01:41:09.005224Z"},"trusted":true},"execution_count":1,"outputs":[]},{"cell_type":"code","source":"device = 'cuda' if torch.cuda.is_available() else 'cpu'\nPAD_IDX = 0","metadata":{"execution":{"iopub.status.busy":"2022-11-07T01:41:09.011809Z","iopub.execute_input":"2022-11-07T01:41:09.014401Z","iopub.status.idle":"2022-11-07T01:41:09.090847Z","shell.execute_reply.started":"2022-11-07T01:41:09.014358Z","shell.execute_reply":"2022-11-07T01:41:09.089107Z"},"trusted":true},"execution_count":2,"outputs":[]},{"cell_type":"code","source":"# purpose: convert target with index of movie to series of all zeros and one in place of index\n# We will use this to compute the expected output of the model to be compared with actual output\ndef idx_to_sparse(idx, sparse_dim):\n sparse = np.zeros(sparse_dim) # vector of 1683 zeroes\n sparse[int(idx)] = 1 # set a given index to 1\n return pd.Series(sparse, dtype=int) # make a pandas series of 0s and 1s\n\n\n# Calculate accuracy (a classification metric)\ndef accuracy_fn(y_true, y_pred):\n correct = torch.eq(y_true, y_pred).sum().item() # torch.eq() calculates where two tensors are equal\n acc = (correct / len(y_pred)) * 100 \n return acc","metadata":{"execution":{"iopub.status.busy":"2022-11-07T01:41:09.096682Z","iopub.execute_input":"2022-11-07T01:41:09.099400Z","iopub.status.idle":"2022-11-07T01:41:09.108898Z","shell.execute_reply.started":"2022-11-07T01:41:09.099206Z","shell.execute_reply":"2022-11-07T01:41:09.107778Z"},"trusted":true},"execution_count":3,"outputs":[]},{"cell_type":"code","source":"# r,c = get_coo_indexes(dataset['prev movies'].tolist())\n# print(len(r), len(c))\n# 10150406 10150406\n# print(r[:11], c[:11])\n# [0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4] ['168', '168', '172', '168', '172', '165', '168', '172', '165', '156', '168']\n# basically the information that row0 has 168, row1 has 168 and 172, row2 has 168, 172, 165 and so on..\n# note that the length of first list represents number of \"1s\", while zip(first,second) gives row, col indices that should be one\ndef get_coo_indexes(lil):\n rows = []\n cols = []\n for i, el in enumerate(lil):\n if type(el)!=list:\n el = [el]\n for j in el:\n rows.append(i)\n cols.append(j)\n return rows, cols\n\n\n# This function creates a sparse matrix given the \"prev movies\" column\ndef get_sparse_features(series, shape):\n # get row, column pairs such that column value represents the watched movie\n coo_indexes = get_coo_indexes(series.tolist())\n # Create a matrix of 0s and 1s of size orignal dataset rows and number of movies as columns; then convert it into coord based sparse matrix\n # sparse matrix would be of the size tuple (original rows count x number of movies); matrix starts with 1; we keep one extra column because movie id starts with 1 in the dataset\n # In the tuple, first argument specifies the number of 1s to be put in the sparse matrix, the second item (another tuple) specified row and column indexes for the positions where corresponding each value ie. 1 should be placed in the sparse matrix\n sparse_df = coo_matrix((np.ones(len(coo_indexes[0])), (coo_indexes[0], coo_indexes[1])), shape=shape)\n return sparse_df\n\n\n# purpose: convert indexes of previous watched movies to series of films indexes\n# given a sparse matrix input, this function returns a corresponding padded 2D matrix\n# We use this to make binary features for the model training and testing\ndef sparse_to_idx(data, pad_idx=-1):\n # Returns a tuple of arrays (row,col) containing the indices of the non-zero elements of the matrix.\n indexes = data.nonzero()\n # for prev_movies_train, this dataset will be 7957390 rows × 2 columns because of repeating values of rows\n indexes_df = pd.DataFrame()\n indexes_df['rows'] = indexes[0]\n indexes_df['cols'] = indexes[1]\n \n # group by the rows, and make a list of all the corresponding columns\n # rows\n # 0 [255, 286, 298, 185, 173]\n # 1 [255, 286, 298, 185, 173, 772, 108]\n # 2 [255, 286, 298, 185, 173, 772]\n # 3 [255, 286, 298, 185, 173, 772, 108, 288]\n mdf = indexes_df.groupby('rows').apply(lambda x: x['cols'].tolist())\n max_len = mdf.apply(lambda x: len(x)).max() # longest list is 736 sized\n return mdf.apply(lambda x: pd.Series(x + [pad_idx] * (max_len - len(x)))).values # pad zeroes in the list upto 736 values; this result is (76228, 736) shaped\n\n\ndef load_and_process_data_wnd():\n #Load the Ratings data\n data = pd.read_csv('../input/movielens-100k-dataset/ml-100k/u.data', sep=\"\\t\", header=None)\n data.columns = ['user id', 'movie id', 'rating', 'timestamp']\n #Load the User data\n users = pd.read_csv('../input/movielens-100k-dataset/ml-100k/u.user', sep=\"|\", encoding='latin-1', header=None)\n users.columns = ['user id', 'age', 'gender', 'occupation', 'zip code']\n #Load movie data\n items = pd.read_csv('../input/movielens-100k-dataset/ml-100k/u.item', \n sep=\"|\", encoding='latin-1', header=None)\n items.columns = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', \n 'unknown', 'Action', 'Adventure', 'Animation', 'Children\\'s', 'Comedy', \n 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', \n 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']\n GENRES = pd.read_csv('../input/movielens-100k-dataset/ml-100k/u.genre', \n sep=\"|\", header=None, usecols=[0])[0].tolist()\n \n # Sort the dataset by user-id and time\n dataset = data.sort_values(['user id', 'timestamp']).reset_index(drop=True)\n dataset['one'] = 1 # add a column containing all 1s\n dataset['sample_num'] = dataset.groupby('user id')['one'].cumsum() # use the 1s column to create a sample number for each user\n # Create a target column by shifting movie-id for each user-id one step back, effectively this means that we have a column that has id for the next movie the user is going to watch \n # (it is NaN for the row representing the last movie the user watches). We will predict this column.\n dataset['target'] = dataset.groupby('user id')['movie id'].shift(-1)\n # create a column that represents average movie rating given by user till that time (represented by row)\n dataset['mean_rate'] = dataset.groupby('user id')['rating'].cumsum() / dataset['sample_num']\n \n # Create a column that has a list of movies that the user has watched so far. We will create sparse vector and embedding vectors from this later on.\n dataset['prev movies'] = dataset['movie id'].apply(lambda x: str(x))\n dataset['prev movies'] = dataset.groupby('user id')['prev movies'].apply(lambda x: (x + ' ').cumsum().str.strip())\n dataset['prev movies'] = dataset['prev movies'].apply(lambda x: x.split())\n \n # do a left join with movies dataframe and bring all the genre representations (0/1 binary values for each movie representing its category) here.\n dataset = dataset.merge(items[['movie id'] + GENRES], on='movie id', how='left')\n \n # For each genre column (19 total) creates another column (total 19 more). This column represents a given user's mean score (float value) for a given genre till that time (represented by row).\n # Note that we also update the genre columns such that each column now has cumulative sum, i.e. the corresponding number of movies that the user has watched in that genre so far.\n for genre in GENRES:\n dataset[f'{genre}_rate'] = dataset[genre]*dataset['rating']\n dataset[genre] = dataset.groupby('user id')[genre].cumsum()\n dataset[f'{genre}_rate'] = dataset.groupby('user id')[f'{genre}_rate'].cumsum() / dataset[genre]\n \n # Next we normalize the scores for movies in each genre such that we divide it by the number of movies that the user has watched so far.\n dataset[GENRES] = dataset[GENRES].apply(lambda x: x / dataset['sample_num'])\n # do a left-join on users data and get more information on users\n dataset = dataset.merge(users, on='user id', how='left')\n \n crossed_col_names = (['gender', 'occupation'], ['age', 'gender'])\n # Build the crossed columns\n crossed_columns = []\n for cols in crossed_col_names:\n colname = '_'.join(cols)\n dataset[colname] = dataset[cols].apply(lambda x: '-'.join(x.astype(str)), axis=1)\n crossed_columns.append(colname)\n \n occupations_categoricals = dataset['occupation'].unique().tolist()\n dummy_crossed_columns = []\n for col in crossed_columns:\n dummy_crossed_columns.extend(dataset[col].unique().tolist())\n\n dataset['gender'] = (dataset['gender'] == 'M').astype(int) # change gender to 0/1 integer\n dataset = pd.concat([dataset.drop(['occupation']+crossed_columns, axis=1), pd.get_dummies(dataset[['occupation']+crossed_columns], prefix=\"\", prefix_sep=\"\")], axis=1) # get occupation dummy variables and drop occupation column\n dataset.drop('zip code', axis=1, inplace=True)\n \n COLD_START_TRESH = 5 # take the rows AFTER each user has watched at least 4 movies\n # filter using threshold and remove null target rows\n filtred_data = dataset[(dataset['sample_num'] >= COLD_START_TRESH) &\n ~(dataset['target'].isna())].sort_values('timestamp')\n \n continuous_cols = ['age', 'gender', 'mean_rate'] + GENRES + [gen+\"_rate\" for gen in GENRES] # 41\n categoricals = occupations_categoricals# already dummy encoded\n wide_data_column_names = continuous_cols + categoricals + dummy_crossed_columns\n df_wide = filtred_data[wide_data_column_names]\n df_wide_without_cross = filtred_data[continuous_cols + categoricals]\n \n scaler = StandardScaler()\n pd.options.mode.chained_assignment = None\n \n TEST_SIZE = 0.2 # size of test set\n X_train_wide, X_test_wide = df_wide[:int(len(df_wide)*(1-TEST_SIZE))], df_wide[int(len(df_wide)*(1-TEST_SIZE)):]\n X_train_wide_wo_cross, X_test_wide_wo_cross = df_wide_without_cross[:int(len(df_wide_without_cross)*(1-TEST_SIZE))], df_wide_without_cross[int(len(df_wide_without_cross)*(1-TEST_SIZE)):]\n\n filtered_train_data, filtered_test_data = filtred_data[:int(len(filtred_data)*(1-TEST_SIZE))], filtred_data[int(len(filtred_data)*(1-TEST_SIZE)):]\n y_train, y_test = filtered_train_data['target'], filtered_test_data['target']\n \n # create sparse matrix out of prev_movies column for both train and test sets\n prev_movies_train = get_sparse_features(filtered_train_data['prev movies'], (len(filtered_train_data), filtred_data['movie id'].max()+1))\n prev_movies_test = get_sparse_features(filtered_test_data['prev movies'], (len(filtered_test_data), filtred_data['movie id'].max()+1))\n\n # tensor with sequence of indexes\n movies_train_tensor = torch.sparse_coo_tensor(\n indices=prev_movies_train.nonzero(), # The indices are the coordinates of the non-zero values in the matrix (7957390,7957390)\n values=[1]*len(prev_movies_train.nonzero()[0]), # Initial values for the tensor, 7957390 1s\n size=prev_movies_train.shape # Size of the sparse tensor (76228, 1683)\n ).to_dense().to(device)\n \n movies_test_tensor = torch.sparse_coo_tensor(\n indices=prev_movies_test.nonzero(), \n values=[1]*len(prev_movies_test.nonzero()[0]),\n size=prev_movies_test.shape\n ).to_dense().to(device)\n \n # Train part\n # tensor with binary features\n # to get embeddings for sequence of indexes\n movies_train_idx = torch.Tensor(\n sparse_to_idx(prev_movies_train, pad_idx=PAD_IDX),\n ).long().to(device)\n \n movies_test_idx = torch.Tensor(\n sparse_to_idx(prev_movies_test, pad_idx=PAD_IDX),\n ).long().to(device)\n \n # target\n target_train = torch.Tensor(y_train.values).long().to(device)\n target_test = torch.Tensor(y_test.values).long().to(device)\n target_test_sparse = y_test.apply(lambda x: idx_to_sparse(x, items['movie id'].nunique() + 1)) # to calculate mean rank over test set during training\n \n # tensor with continuous features\n X_train_wide_tensor = torch.Tensor(X_train_wide.fillna(0).values).to(device)\n X_train_wide_wo_cross_tensor = torch.Tensor(X_train_wide_wo_cross.fillna(0).values).to(device)\n X_test_wide_tensor = torch.Tensor(X_test_wide.fillna(0).values).to(device)\n X_test_wide_wo_cross_tensor = torch.Tensor(X_test_wide_wo_cross.fillna(0).values).to(device)\n \n return X_train_wide_tensor, X_train_wide_wo_cross_tensor, X_test_wide_tensor, X_test_wide_wo_cross_tensor, movies_train_tensor, movies_test_tensor, movies_train_idx, movies_test_idx, target_train, target_test, target_test_sparse, items['movie id'].nunique() + 1","metadata":{"execution":{"iopub.status.busy":"2022-11-07T01:41:09.115599Z","iopub.execute_input":"2022-11-07T01:41:09.117760Z","iopub.status.idle":"2022-11-07T01:41:09.177199Z","shell.execute_reply.started":"2022-11-07T01:41:09.117727Z","shell.execute_reply":"2022-11-07T01:41:09.176250Z"},"trusted":true},"execution_count":4,"outputs":[]},{"cell_type":"code","source":"class WideDeep(nn.Module):\n def __init__(self, wide_dim, n_class, embed_dim, embed_size, pad_idx=0):\n super(WideDeep, self).__init__()\n self.embed = nn.Embedding(embed_dim, embed_size, padding_idx=pad_idx, device=device)\n self.linear_relu_stack = nn.Sequential(\n nn.Linear(embed_size, 1024, device=device),\n nn.ReLU(),\n nn.Linear(1024, 512, device=device),\n nn.ReLU(),\n nn.Linear(512, 256, device=device),\n nn.ReLU()\n )\n self.output = nn.Linear(256+wide_dim, n_class, device=device)\n \n def forward(self, X_w, X_sparse_idx):\n embed = self.embed(X_sparse_idx) # movies_train_idx\n embed = torch.mean(embed, dim=1)\n deep_logits = self.linear_relu_stack(embed)\n total_logits = self.output(torch.cat((deep_logits, X_w), dim=1))\n return total_logits\n\ndef run_gradient_descent_wnd(model,\n learning_rate=1e-3,\n weight_decay=0.01,\n num_epochs=10):\n loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_IDX) # the model doesn't need to predict padding index\n optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)\n \n iters, train_losses, test_losses, mean_test_ranks = [], [], [], []\n \n # training\n n = 0 # the number of iterations\n for epoch in range(num_epochs):\n model.train()\n y_logits = model(torch.cat([X_train_wide_tensor, movies_train_tensor], dim=1), movies_train_idx)\n loss_train = loss_fn(y_logits, target_train)\n\n # Backpropagation\n optimizer.zero_grad() # a clean up step for PyTorch\n loss_train.backward() # compute updates for each parameter\n optimizer.step() # make the updates for each parameter\n\n # save the current training information\n if n%100 == 0:\n pred_train = torch.softmax(y_logits, dim=1).argmax(dim=1)\n acc = accuracy_fn(y_true=target_train, y_pred=pred_train)\n \n model.eval()\n with torch.inference_mode():\n test_logits = model(torch.cat([X_test_wide_tensor, movies_test_tensor], dim=1), movies_test_idx)\n test_pred = torch.softmax(test_logits, dim=1).argmax(dim=1)\n loss_test = loss_fn(test_logits, target_test)\n test_acc = accuracy_fn(y_true=target_test,y_pred=test_pred)\n \n # calculate mean rank on test set\n softmax = nn.Softmax(dim=0)\n preds_wnd = softmax(test_logits.float()).cpu().detach().numpy()\n ranks_wnd = pd.DataFrame(preds_wnd).apply(lambda x: pd.Series(rankdata(-x)), axis=1)\n ranks_target_wnd = (ranks_wnd.values * target_test_sparse).sum(axis=1)\n mean_rank_wnd = ranks_target_wnd.mean()\n \n print(f\"Epoch: {epoch} | Loss: {loss_train:.5f}, Acc: {acc:.2f}% | Test Loss: {loss_test:.5f}, Test Acc: {test_acc:.2f}% Test mean rank: {mean_rank_wnd:.0f}\")\n \n iters.append(n)\n train_losses.append(float(loss_train))\n test_losses.append(float(loss_test))\n mean_test_ranks.append(mean_rank_wnd)\n \n # increment the iteration number\n n += 1\n \n # plotting\n plt.figure(figsize=(12, 8), dpi=100)\n plt.title(f\"Training Curve (lr={learning_rate})\")\n plt.plot(iters, train_losses, label=\"Train Loss\")\n plt.plot(iters, test_losses, label=\"Test Loss\")\n plt.xlabel(\"Iterations\")\n plt.ylabel(\"Loss\")\n plt.legend(loc='best')\n plt.show()\n \n plt.figure(figsize=(12, 8), dpi=100)\n plt.plot(iters, mean_test_ranks, label=\"Test Rank\")\n plt.xlabel(\"Iterations\")\n plt.ylabel(\"Mean Rank on testset\")\n plt.legend(loc='best')\n plt.show()\n \n return model, iters, train_losses, test_losses","metadata":{"execution":{"iopub.status.busy":"2022-11-07T01:41:09.181664Z","iopub.execute_input":"2022-11-07T01:41:09.183830Z","iopub.status.idle":"2022-11-07T01:41:09.223478Z","shell.execute_reply.started":"2022-11-07T01:41:09.183796Z","shell.execute_reply":"2022-11-07T01:41:09.222451Z"},"trusted":true},"execution_count":5,"outputs":[]},{"cell_type":"code","source":"X_train_wide_tensor, X_train_wide_wo_cross_tensor, X_test_wide_tensor, X_test_wide_wo_cross_tensor, movies_train_tensor, movies_test_tensor, movies_train_idx, movies_test_idx, target_train, target_test, target_test_sparse, n_classes = load_and_process_data_wnd()","metadata":{"execution":{"iopub.status.busy":"2022-11-07T01:41:09.238530Z","iopub.execute_input":"2022-11-07T01:41:09.240874Z","iopub.status.idle":"2022-11-07T01:42:36.851253Z","shell.execute_reply.started":"2022-11-07T01:41:09.240834Z","shell.execute_reply":"2022-11-07T01:42:36.850173Z"},"trusted":true},"execution_count":6,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:150: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at /usr/local/src/pytorch/torch/csrc/utils/tensor_new.cpp:207.)\n","output_type":"stream"}]},{"cell_type":"code","source":"wide_and_deep_model = WideDeep(wide_dim=torch.cat([X_train_wide_tensor, movies_train_tensor], dim=1).shape[1],\n n_class=n_classes,\n embed_dim=n_classes,\n embed_size=16,) # randomly chosen","metadata":{"execution":{"iopub.status.busy":"2022-11-07T01:42:36.852863Z","iopub.execute_input":"2022-11-07T01:42:36.853262Z","iopub.status.idle":"2022-11-07T01:42:36.874171Z","shell.execute_reply.started":"2022-11-07T01:42:36.853225Z","shell.execute_reply":"2022-11-07T01:42:36.873359Z"},"trusted":true},"execution_count":7,"outputs":[]},{"cell_type":"code","source":"wide_and_deep_model_trained, iters, train_losses, test_losses = run_gradient_descent_wnd(wide_and_deep_model, num_epochs=1000, weight_decay=0, learning_rate=0.03)","metadata":{"execution":{"iopub.status.busy":"2022-11-07T01:42:36.875385Z","iopub.execute_input":"2022-11-07T01:42:36.876090Z","iopub.status.idle":"2022-11-07T01:55:52.856349Z","shell.execute_reply.started":"2022-11-07T01:42:36.876052Z","shell.execute_reply":"2022-11-07T01:55:52.855294Z"},"trusted":true},"execution_count":8,"outputs":[{"name":"stdout","text":"Epoch: 0 | Loss: 7.53350, Acc: 0.04% | Test Loss: 16.95257, Test Acc: 0.20% Test mean rank: 960\nEpoch: 100 | Loss: 2.46722, Acc: 45.85% | Test Loss: 11.89806, Test Acc: 1.30% Test mean rank: 908\nEpoch: 200 | Loss: 1.60657, Acc: 67.28% | Test Loss: 15.36075, Test Acc: 1.17% Test mean rank: 857\nEpoch: 300 | Loss: 1.18522, Acc: 79.23% | Test Loss: 18.48282, Test Acc: 1.09% Test mean rank: 822\nEpoch: 400 | Loss: 0.96231, Acc: 84.40% | Test Loss: 21.25227, Test Acc: 0.99% Test mean rank: 797\nEpoch: 500 | Loss: 0.76772, Acc: 89.98% | Test Loss: 23.71025, Test Acc: 0.95% Test mean rank: 782\nEpoch: 600 | Loss: 0.64600, Acc: 92.51% | Test Loss: 25.95594, Test Acc: 0.96% Test mean rank: 775\nEpoch: 700 | Loss: 0.55119, Acc: 95.08% | Test Loss: 27.99042, Test Acc: 0.93% Test mean rank: 770\nEpoch: 800 | Loss: 0.48001, Acc: 96.26% | Test Loss: 29.95028, Test Acc: 0.92% Test mean rank: 766\nEpoch: 900 | Loss: 0.42119, Acc: 97.14% | Test Loss: 31.84937, Test Acc: 0.92% Test mean rank: 768\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"<Figure size 1200x800 with 1 Axes>","image/png":"\n"},"metadata":{"needs_background":"light"}},{"output_type":"display_data","data":{"text/plain":"<Figure size 1200x800 with 1 Axes>","image/png":"\n"},"metadata":{"needs_background":"light"}}]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment