Last active
February 21, 2018 08:46
-
-
Save hiepph/b1505fd8206d942331f12937e8955ba5 to your computer and use it in GitHub Desktop.
Cinnamon assignment 1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Load data from csv" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>URI</th>\n", | |
" <th>name</th>\n", | |
" <th>text</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td><http://dbpedia.org/resource/Digby_Morrell></td>\n", | |
" <td>Digby Morrell</td>\n", | |
" <td>digby morrell born 10 october 1979 is a former...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td><http://dbpedia.org/resource/Alfred_J._Lewy></td>\n", | |
" <td>Alfred J. Lewy</td>\n", | |
" <td>alfred j lewy aka sandy lewy graduated from un...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td><http://dbpedia.org/resource/Harpdog_Brown></td>\n", | |
" <td>Harpdog Brown</td>\n", | |
" <td>harpdog brown is a singer and harmonica player...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td><http://dbpedia.org/resource/Franz_Rottensteiner></td>\n", | |
" <td>Franz Rottensteiner</td>\n", | |
" <td>franz rottensteiner born in waidmannsfeld lowe...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td><http://dbpedia.org/resource/G-Enka></td>\n", | |
" <td>G-Enka</td>\n", | |
" <td>henry krvits born 30 december 1974 in tallinn ...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" URI name \\\n", | |
"0 <http://dbpedia.org/resource/Digby_Morrell> Digby Morrell \n", | |
"1 <http://dbpedia.org/resource/Alfred_J._Lewy> Alfred J. Lewy \n", | |
"2 <http://dbpedia.org/resource/Harpdog_Brown> Harpdog Brown \n", | |
"3 <http://dbpedia.org/resource/Franz_Rottensteiner> Franz Rottensteiner \n", | |
"4 <http://dbpedia.org/resource/G-Enka> G-Enka \n", | |
"\n", | |
" text \n", | |
"0 digby morrell born 10 october 1979 is a former... \n", | |
"1 alfred j lewy aka sandy lewy graduated from un... \n", | |
"2 harpdog brown is a singer and harmonica player... \n", | |
"3 franz rottensteiner born in waidmannsfeld lowe... \n", | |
"4 henry krvits born 30 december 1974 in tallinn ... " | |
] | |
}, | |
"execution_count": 1, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import pandas as pd\n", | |
"\n", | |
"df = pd.read_csv('data/people_wiki.csv')\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>URI</th>\n", | |
" <th>name</th>\n", | |
" <th>text</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" <td>59071</td>\n", | |
" <td>59071</td>\n", | |
" <td>59071</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>unique</th>\n", | |
" <td>59071</td>\n", | |
" <td>59070</td>\n", | |
" <td>59071</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>top</th>\n", | |
" <td><http://dbpedia.org/resource/Josaia_Waqabaca></td>\n", | |
" <td>author)</td>\n", | |
" <td>jos francisco cardenal born 1940 was a nicarag...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>freq</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" URI name \\\n", | |
"count 59071 59071 \n", | |
"unique 59071 59070 \n", | |
"top <http://dbpedia.org/resource/Josaia_Waqabaca> author) \n", | |
"freq 1 2 \n", | |
"\n", | |
" text \n", | |
"count 59071 \n", | |
"unique 59071 \n", | |
"top jos francisco cardenal born 1940 was a nicarag... \n", | |
"freq 1 " | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.describe()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Build the TF-IDF matrix from the `text` column" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 2min 14s, sys: 1.9 s, total: 2min 16s\n", | |
"Wall time: 2min 16s\n", | |
"TFIDF shape (n_samples, n_features): (59071, 15780383)\n" | |
] | |
} | |
], | |
"source": [ | |
"from sklearn.feature_extraction.text import TfidfVectorizer\n", | |
" \n", | |
"tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df=0, stop_words='english')\n", | |
"%time tfidf_matrix = tf.fit_transform(df['text'])\n", | |
"\n", | |
"print('TFIDF shape (n_samples, n_features):', tfidf_matrix.shape)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Problem 1" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Calculate distance using cosine distance" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 83, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.metrics.pairwise import cosine_distances\n", | |
"\n", | |
"def find_id(name):\n", | |
" return df.loc[df['name'] == name].index[0]\n", | |
"\n", | |
"\n", | |
"def distance(name1, name2):\n", | |
" id1 = find_id(name1)\n", | |
" id2 = find_id(name2)\n", | |
" return cosine_distances(tfidf_matrix[id1], tfidf_matrix[id2])[0][0]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Test with some test cases" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 84, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"3.552713678800501e-15" | |
] | |
}, | |
"execution_count": 84, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"distance('Barack Obama', 'Barack Obama')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 85, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.9579495140804771" | |
] | |
}, | |
"execution_count": 85, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"distance('Barack Obama', 'George W. Bush')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 86, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.9689909088513265" | |
] | |
}, | |
"execution_count": 86, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"distance('George W. Bush', 'Joe Biden')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 88, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.8914688165783309" | |
] | |
}, | |
"execution_count": 88, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"distance('Barack Obama', 'Joe Biden')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Problem 2" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Recommend using cosine similarity rather than cosine distance: the higher the similarity, the more likely a person is related to the one currently being read about" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 97, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.metrics.pairwise import linear_kernel\n", | |
"import numpy as np\n", | |
"\n", | |
"def recommend(name, k):\n", | |
" \"\"\"\n", | |
" Args:\n", | |
" name: Name of reading person\n", | |
" k: Number of people to recommend\n", | |
" \n", | |
" Returns:\n", | |
" DataFrame contains k list of people to recommend for reading person\n", | |
" \"\"\"\n", | |
" # Get id of current reading person\n", | |
" target_id = find_id(name)\n", | |
" \n", | |
" # calculate cosine similarity with other people\n", | |
" cosine_similarities = linear_kernel(tfidf_matrix[target_id], tfidf_matrix).flatten()\n", | |
"\n", | |
" # get list of ids, sorted by similarity (desc)\n", | |
" # trick: cut off head 'coz head is always reading person\n", | |
" ids = np.flip(\n", | |
" np.argsort(cosine_similarities),\n", | |
" axis=0\n", | |
" )[1:]\n", | |
" \n", | |
" # return dataframe corresponding to list of ids\n", | |
" # limited by k\n", | |
" return df.iloc[ids].head(k)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Test for recommending 10 people while reading `Barack Obama`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 98, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>URI</th>\n", | |
" <th>name</th>\n", | |
" <th>text</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>24478</th>\n", | |
" <td><http://dbpedia.org/resource/Joe_Biden></td>\n", | |
" <td>Joe Biden</td>\n", | |
" <td>joseph robinette joe biden jr dosf rbnt badn b...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>46811</th>\n", | |
" <td><http://dbpedia.org/resource/Jeff_Sessions></td>\n", | |
" <td>Jeff Sessions</td>\n", | |
" <td>jefferson beauregard jeff sessions iii born de...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4408</th>\n", | |
" <td><http://dbpedia.org/resource/Joe_Lieberman></td>\n", | |
" <td>Joe Lieberman</td>\n", | |
" <td>joseph isadore joe lieberman born february 24 ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>48693</th>\n", | |
" <td><http://dbpedia.org/resource/Artur_Davis></td>\n", | |
" <td>Artur Davis</td>\n", | |
" <td>artur genestre davis born october 9 1967 is an...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18827</th>\n", | |
" <td><http://dbpedia.org/resource/Henry_Waxman></td>\n", | |
" <td>Henry Waxman</td>\n", | |
" <td>henry arnold waxman born september 12 1939 is ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>57108</th>\n", | |
" <td><http://dbpedia.org/resource/Hillary_Rodham_Cl...</td>\n", | |
" <td>Hillary Rodham Clinton</td>\n", | |
" <td>hillary diane rodham clinton hlri dan rdm klnt...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>30804</th>\n", | |
" <td><http://dbpedia.org/resource/Richard_Pildes></td>\n", | |
" <td>Richard Pildes</td>\n", | |
" <td>richard h pildes is a law professor at the new...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>38376</th>\n", | |
" <td><http://dbpedia.org/resource/Samantha_Power></td>\n", | |
" <td>Samantha Power</td>\n", | |
" <td>samantha power born september 21 1970 is an ir...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>46140</th>\n", | |
" <td><http://dbpedia.org/resource/Robert_Gibbs></td>\n", | |
" <td>Robert Gibbs</td>\n", | |
" <td>robert lane gibbs born march 29 1971 is an ame...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>38714</th>\n", | |
" <td><http://dbpedia.org/resource/Eric_Stern_(polit...</td>\n", | |
" <td>Eric Stern (politician)</td>\n", | |
" <td>eric stern is the director of operations for t...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" URI \\\n", | |
"24478 <http://dbpedia.org/resource/Joe_Biden> \n", | |
"46811 <http://dbpedia.org/resource/Jeff_Sessions> \n", | |
"4408 <http://dbpedia.org/resource/Joe_Lieberman> \n", | |
"48693 <http://dbpedia.org/resource/Artur_Davis> \n", | |
"18827 <http://dbpedia.org/resource/Henry_Waxman> \n", | |
"57108 <http://dbpedia.org/resource/Hillary_Rodham_Cl... \n", | |
"30804 <http://dbpedia.org/resource/Richard_Pildes> \n", | |
"38376 <http://dbpedia.org/resource/Samantha_Power> \n", | |
"46140 <http://dbpedia.org/resource/Robert_Gibbs> \n", | |
"38714 <http://dbpedia.org/resource/Eric_Stern_(polit... \n", | |
"\n", | |
" name \\\n", | |
"24478 Joe Biden \n", | |
"46811 Jeff Sessions \n", | |
"4408 Joe Lieberman \n", | |
"48693 Artur Davis \n", | |
"18827 Henry Waxman \n", | |
"57108 Hillary Rodham Clinton \n", | |
"30804 Richard Pildes \n", | |
"38376 Samantha Power \n", | |
"46140 Robert Gibbs \n", | |
"38714 Eric Stern (politician) \n", | |
"\n", | |
" text \n", | |
"24478 joseph robinette joe biden jr dosf rbnt badn b... \n", | |
"46811 jefferson beauregard jeff sessions iii born de... \n", | |
"4408 joseph isadore joe lieberman born february 24 ... \n", | |
"48693 artur genestre davis born october 9 1967 is an... \n", | |
"18827 henry arnold waxman born september 12 1939 is ... \n", | |
"57108 hillary diane rodham clinton hlri dan rdm klnt... \n", | |
"30804 richard h pildes is a law professor at the new... \n", | |
"38376 samantha power born september 21 1970 is an ir... \n", | |
"46140 robert lane gibbs born march 29 1971 is an ame... \n", | |
"38714 eric stern is the director of operations for t... " | |
] | |
}, | |
"execution_count": 98, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"recommend('Barack Obama', 10)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Calculate executing time" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 105, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import random\n", | |
"from tqdm import tqdm\n", | |
"\n", | |
"def execute_recommend(n, k):\n", | |
" \"\"\"\n", | |
" Args:\n", | |
" n: Number of people\n", | |
" k: Number of people to recommend to corresponding person\n", | |
" \"\"\"\n", | |
" # random n people\n", | |
" people = random.sample(list(df['name']), n)\n", | |
" \n", | |
" for name in tqdm(people):\n", | |
" recommend(name, k)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"1 person, k = 200" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 106, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 1/1 [00:01<00:00, 1.82s/it]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1.81 s, sys: 23.3 ms, total: 1.83 s\n", | |
"Wall time: 1.83 s\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"%time execute_recommend(1, 200)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"50 people, k = 200" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 107, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 50/50 [01:26<00:00, 1.72s/it]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1min 24s, sys: 1.56 s, total: 1min 26s\n", | |
"Wall time: 1min 26s\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"%time execute_recommend(50, 200)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"500 people, k = 200" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 108, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 500/500 [14:13<00:00, 1.71s/it]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 13min 58s, sys: 15.2 s, total: 14min 14s\n", | |
"Wall time: 14min 13s\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"%time execute_recommend(500, 200)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"By extrapolation (about 1.7 s per person), recommending for 5000 people with k = 200 would take roughly 2.4 hours." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Problem 3" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"First, cluster the TF-IDF matrix into 100 clusters with mini-batch K-Means." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Init 1/1 with method: k-means++\n", | |
"Inertia for init 1/1: 2861.899365\n", | |
"Minibatch iteration 1/6000: mean batch inertia: 1.001967, ewa inertia: 1.001967 \n", | |
"Minibatch iteration 2/6000: mean batch inertia: 0.994862, ewa inertia: 1.001726 \n", | |
"Minibatch iteration 3/6000: mean batch inertia: 0.992571, ewa inertia: 1.001416 \n", | |
"Minibatch iteration 4/6000: mean batch inertia: 0.992247, ewa inertia: 1.001106 \n", | |
"Minibatch iteration 5/6000: mean batch inertia: 0.991494, ewa inertia: 1.000780 \n", | |
"Minibatch iteration 6/6000: mean batch inertia: 0.990495, ewa inertia: 1.000432 \n", | |
"Minibatch iteration 7/6000: mean batch inertia: 0.988632, ewa inertia: 1.000033 \n", | |
"Minibatch iteration 8/6000: mean batch inertia: 0.991173, ewa inertia: 0.999733 \n", | |
"Minibatch iteration 9/6000: mean batch inertia: 0.990889, ewa inertia: 0.999433 \n", | |
"Minibatch iteration 10/6000: mean batch inertia: 0.990143, ewa inertia: 0.999119 \n", | |
"Minibatch iteration 11/6000: mean batch inertia: 0.990426, ewa inertia: 0.998825 \n", | |
"[MiniBatchKMeans] Reassigning 72 cluster centers.\n", | |
"Minibatch iteration 12/6000: mean batch inertia: 0.989320, ewa inertia: 0.998503 \n", | |
"Minibatch iteration 13/6000: mean batch inertia: 0.991861, ewa inertia: 0.998278 \n", | |
"Minibatch iteration 14/6000: mean batch inertia: 0.990187, ewa inertia: 0.998004 \n", | |
"Minibatch iteration 15/6000: mean batch inertia: 0.990044, ewa inertia: 0.997734 \n", | |
"Minibatch iteration 16/6000: mean batch inertia: 0.989320, ewa inertia: 0.997449 \n", | |
"Minibatch iteration 17/6000: mean batch inertia: 0.988875, ewa inertia: 0.997159 \n", | |
"Minibatch iteration 18/6000: mean batch inertia: 0.989355, ewa inertia: 0.996895 \n", | |
"Minibatch iteration 19/6000: mean batch inertia: 0.990982, ewa inertia: 0.996695 \n", | |
"Minibatch iteration 20/6000: mean batch inertia: 0.990432, ewa inertia: 0.996483 \n", | |
"Minibatch iteration 21/6000: mean batch inertia: 0.991172, ewa inertia: 0.996303 \n", | |
"Minibatch iteration 22/6000: mean batch inertia: 0.990418, ewa inertia: 0.996104 \n", | |
"Minibatch iteration 23/6000: mean batch inertia: 0.987428, ewa inertia: 0.995810 \n", | |
"Minibatch iteration 24/6000: mean batch inertia: 0.989901, ewa inertia: 0.995610 \n", | |
"Minibatch iteration 25/6000: mean batch inertia: 0.989562, ewa inertia: 0.995405 \n", | |
"Minibatch iteration 26/6000: mean batch inertia: 0.990112, ewa inertia: 0.995226 \n", | |
"Minibatch iteration 27/6000: mean batch inertia: 0.990702, ewa inertia: 0.995073 \n", | |
"Minibatch iteration 28/6000: mean batch inertia: 0.988111, ewa inertia: 0.994837 \n", | |
"Minibatch iteration 29/6000: mean batch inertia: 0.990247, ewa inertia: 0.994682 \n", | |
"Minibatch iteration 30/6000: mean batch inertia: 0.989248, ewa inertia: 0.994498 \n", | |
"Minibatch iteration 31/6000: mean batch inertia: 0.989790, ewa inertia: 0.994338 \n", | |
"Minibatch iteration 32/6000: mean batch inertia: 0.990520, ewa inertia: 0.994209 \n", | |
"Minibatch iteration 33/6000: mean batch inertia: 0.990912, ewa inertia: 0.994097 \n", | |
"Minibatch iteration 34/6000: mean batch inertia: 0.988415, ewa inertia: 0.993905 \n", | |
"Minibatch iteration 35/6000: mean batch inertia: 0.988008, ewa inertia: 0.993705 \n", | |
"Minibatch iteration 36/6000: mean batch inertia: 0.988288, ewa inertia: 0.993522 \n", | |
"Minibatch iteration 37/6000: mean batch inertia: 0.988747, ewa inertia: 0.993360 \n", | |
"Minibatch iteration 38/6000: mean batch inertia: 0.988941, ewa inertia: 0.993211 \n", | |
"Minibatch iteration 39/6000: mean batch inertia: 0.989247, ewa inertia: 0.993076 \n", | |
"Minibatch iteration 40/6000: mean batch inertia: 0.990554, ewa inertia: 0.992991 \n", | |
"Minibatch iteration 41/6000: mean batch inertia: 0.988648, ewa inertia: 0.992844 \n", | |
"Minibatch iteration 42/6000: mean batch inertia: 0.989584, ewa inertia: 0.992734 \n", | |
"Minibatch iteration 43/6000: mean batch inertia: 0.989854, ewa inertia: 0.992636 \n", | |
"Minibatch iteration 44/6000: mean batch inertia: 0.990015, ewa inertia: 0.992547 \n", | |
"Minibatch iteration 45/6000: mean batch inertia: 0.989750, ewa inertia: 0.992453 \n", | |
"Minibatch iteration 46/6000: mean batch inertia: 0.989349, ewa inertia: 0.992348 \n", | |
"Minibatch iteration 47/6000: mean batch inertia: 0.988047, ewa inertia: 0.992202 \n", | |
"Minibatch iteration 48/6000: mean batch inertia: 0.985259, ewa inertia: 0.991967 \n", | |
"Minibatch iteration 49/6000: mean batch inertia: 0.987224, ewa inertia: 0.991806 \n", | |
"Minibatch iteration 50/6000: mean batch inertia: 0.988472, ewa inertia: 0.991693 \n", | |
"Minibatch iteration 51/6000: mean batch inertia: 0.988400, ewa inertia: 0.991582 \n", | |
"Minibatch iteration 52/6000: mean batch inertia: 0.990279, ewa inertia: 0.991538 \n", | |
"Minibatch iteration 53/6000: mean batch inertia: 0.988436, ewa inertia: 0.991433 \n", | |
"Minibatch iteration 54/6000: mean batch inertia: 0.988311, ewa inertia: 0.991327 \n", | |
"Minibatch iteration 55/6000: mean batch inertia: 0.990053, ewa inertia: 0.991284 \n", | |
"Minibatch iteration 56/6000: mean batch inertia: 0.989915, ewa inertia: 0.991238 \n", | |
"Minibatch iteration 57/6000: mean batch inertia: 0.989532, ewa inertia: 0.991180 \n", | |
"Minibatch iteration 58/6000: mean batch inertia: 0.988709, ewa inertia: 0.991096 \n", | |
"Minibatch iteration 59/6000: mean batch inertia: 0.987885, ewa inertia: 0.990988 \n", | |
"Minibatch iteration 60/6000: mean batch inertia: 0.987077, ewa inertia: 0.990855 \n", | |
"Minibatch iteration 61/6000: mean batch inertia: 0.987957, ewa inertia: 0.990757 \n", | |
"Minibatch iteration 62/6000: mean batch inertia: 0.987912, ewa inertia: 0.990661 \n", | |
"Minibatch iteration 63/6000: mean batch inertia: 0.985600, ewa inertia: 0.990489 \n", | |
"Minibatch iteration 64/6000: mean batch inertia: 0.988168, ewa inertia: 0.990411 \n", | |
"Minibatch iteration 65/6000: mean batch inertia: 0.989560, ewa inertia: 0.990382 \n", | |
"Minibatch iteration 66/6000: mean batch inertia: 0.988193, ewa inertia: 0.990308 \n", | |
"Minibatch iteration 67/6000: mean batch inertia: 0.990052, ewa inertia: 0.990299 \n", | |
"Minibatch iteration 68/6000: mean batch inertia: 0.989917, ewa inertia: 0.990286 \n", | |
"Minibatch iteration 69/6000: mean batch inertia: 0.988681, ewa inertia: 0.990232 \n", | |
"Minibatch iteration 70/6000: mean batch inertia: 0.990569, ewa inertia: 0.990243 \n", | |
"Minibatch iteration 71/6000: mean batch inertia: 0.987367, ewa inertia: 0.990146 \n", | |
"[MiniBatchKMeans] Reassigning 79 cluster centers.\n", | |
"Minibatch iteration 72/6000: mean batch inertia: 0.989707, ewa inertia: 0.990131 \n", | |
"Minibatch iteration 73/6000: mean batch inertia: 0.990318, ewa inertia: 0.990137 \n", | |
"Minibatch iteration 74/6000: mean batch inertia: 0.987705, ewa inertia: 0.990055 \n", | |
"Minibatch iteration 75/6000: mean batch inertia: 0.989487, ewa inertia: 0.990036 \n", | |
"Minibatch iteration 76/6000: mean batch inertia: 0.987645, ewa inertia: 0.989955 \n", | |
"Minibatch iteration 77/6000: mean batch inertia: 0.990472, ewa inertia: 0.989972 \n", | |
"Minibatch iteration 78/6000: mean batch inertia: 0.990040, ewa inertia: 0.989975 \n", | |
"Minibatch iteration 79/6000: mean batch inertia: 0.989171, ewa inertia: 0.989947 \n", | |
"Minibatch iteration 80/6000: mean batch inertia: 0.989946, ewa inertia: 0.989947 \n", | |
"Minibatch iteration 81/6000: mean batch inertia: 0.989624, ewa inertia: 0.989936 \n", | |
"Minibatch iteration 82/6000: mean batch inertia: 0.987063, ewa inertia: 0.989839 \n", | |
"Minibatch iteration 83/6000: mean batch inertia: 0.989501, ewa inertia: 0.989828 \n", | |
"Minibatch iteration 84/6000: mean batch inertia: 0.989636, ewa inertia: 0.989821 \n", | |
"Minibatch iteration 85/6000: mean batch inertia: 0.989926, ewa inertia: 0.989825 \n", | |
"Minibatch iteration 86/6000: mean batch inertia: 0.989663, ewa inertia: 0.989819 \n", | |
"Minibatch iteration 87/6000: mean batch inertia: 0.989854, ewa inertia: 0.989820 \n", | |
"Minibatch iteration 88/6000: mean batch inertia: 0.990595, ewa inertia: 0.989847 \n", | |
"Minibatch iteration 89/6000: mean batch inertia: 0.986981, ewa inertia: 0.989750 \n", | |
"Minibatch iteration 90/6000: mean batch inertia: 0.988309, ewa inertia: 0.989701 \n", | |
"Minibatch iteration 91/6000: mean batch inertia: 0.990275, ewa inertia: 0.989720 \n", | |
"Minibatch iteration 92/6000: mean batch inertia: 0.987583, ewa inertia: 0.989648 \n", | |
"Minibatch iteration 93/6000: mean batch inertia: 0.988446, ewa inertia: 0.989607 \n", | |
"Minibatch iteration 94/6000: mean batch inertia: 0.990568, ewa inertia: 0.989640 \n", | |
"Minibatch iteration 95/6000: mean batch inertia: 0.989911, ewa inertia: 0.989649 \n", | |
"Minibatch iteration 96/6000: mean batch inertia: 0.988477, ewa inertia: 0.989609 \n", | |
"Minibatch iteration 97/6000: mean batch inertia: 0.990238, ewa inertia: 0.989631 \n", | |
"Minibatch iteration 98/6000: mean batch inertia: 0.987676, ewa inertia: 0.989564 \n", | |
"Minibatch iteration 99/6000: mean batch inertia: 0.989683, ewa inertia: 0.989568 \n", | |
"Minibatch iteration 100/6000: mean batch inertia: 0.990201, ewa inertia: 0.989590 \n", | |
"Minibatch iteration 101/6000: mean batch inertia: 0.986591, ewa inertia: 0.989488 \n", | |
"Minibatch iteration 102/6000: mean batch inertia: 0.988818, ewa inertia: 0.989466 \n", | |
"Minibatch iteration 103/6000: mean batch inertia: 0.988694, ewa inertia: 0.989439 \n", | |
"Minibatch iteration 104/6000: mean batch inertia: 0.986483, ewa inertia: 0.989339 \n", | |
"Minibatch iteration 105/6000: mean batch inertia: 0.989257, ewa inertia: 0.989337 \n", | |
"Minibatch iteration 106/6000: mean batch inertia: 0.988572, ewa inertia: 0.989311 \n", | |
"Minibatch iteration 107/6000: mean batch inertia: 0.988316, ewa inertia: 0.989277 \n", | |
"Minibatch iteration 108/6000: mean batch inertia: 0.988854, ewa inertia: 0.989263 \n", | |
"Minibatch iteration 109/6000: mean batch inertia: 0.989032, ewa inertia: 0.989255 \n", | |
"Minibatch iteration 110/6000: mean batch inertia: 0.987968, ewa inertia: 0.989211 \n", | |
"Minibatch iteration 111/6000: mean batch inertia: 0.990702, ewa inertia: 0.989262 \n", | |
"Minibatch iteration 112/6000: mean batch inertia: 0.990762, ewa inertia: 0.989313 \n", | |
"Minibatch iteration 113/6000: mean batch inertia: 0.985904, ewa inertia: 0.989197 \n", | |
"Minibatch iteration 114/6000: mean batch inertia: 0.988785, ewa inertia: 0.989183 \n", | |
"Minibatch iteration 115/6000: mean batch inertia: 0.988447, ewa inertia: 0.989158 \n", | |
"Minibatch iteration 116/6000: mean batch inertia: 0.989680, ewa inertia: 0.989176 \n", | |
"Minibatch iteration 117/6000: mean batch inertia: 0.989286, ewa inertia: 0.989180 \n", | |
"Minibatch iteration 118/6000: mean batch inertia: 0.990281, ewa inertia: 0.989217 \n", | |
"Minibatch iteration 119/6000: mean batch inertia: 0.989576, ewa inertia: 0.989229 \n", | |
"Minibatch iteration 120/6000: mean batch inertia: 0.987615, ewa inertia: 0.989174 \n", | |
"Minibatch iteration 121/6000: mean batch inertia: 0.990094, ewa inertia: 0.989206 \n", | |
"Minibatch iteration 122/6000: mean batch inertia: 0.987440, ewa inertia: 0.989146 \n", | |
"Minibatch iteration 123/6000: mean batch inertia: 0.988054, ewa inertia: 0.989109 \n", | |
"Minibatch iteration 124/6000: mean batch inertia: 0.987681, ewa inertia: 0.989060 \n", | |
"Minibatch iteration 125/6000: mean batch inertia: 0.990558, ewa inertia: 0.989111 \n", | |
"Minibatch iteration 126/6000: mean batch inertia: 0.990396, ewa inertia: 0.989155 \n", | |
"Minibatch iteration 127/6000: mean batch inertia: 0.985824, ewa inertia: 0.989042 \n", | |
"Minibatch iteration 128/6000: mean batch inertia: 0.989975, ewa inertia: 0.989073 \n", | |
"Minibatch iteration 129/6000: mean batch inertia: 0.987519, ewa inertia: 0.989021 \n", | |
"Minibatch iteration 130/6000: mean batch inertia: 0.987937, ewa inertia: 0.988984 \n", | |
"Minibatch iteration 131/6000: mean batch inertia: 0.989074, ewa inertia: 0.988987 \n", | |
"Minibatch iteration 132/6000: mean batch inertia: 0.987295, ewa inertia: 0.988930 \n", | |
"Minibatch iteration 133/6000: mean batch inertia: 0.989153, ewa inertia: 0.988937 \n", | |
"Minibatch iteration 134/6000: mean batch inertia: 0.990368, ewa inertia: 0.988986 \n", | |
"Minibatch iteration 135/6000: mean batch inertia: 0.986665, ewa inertia: 0.988907 \n", | |
"Minibatch iteration 136/6000: mean batch inertia: 0.988844, ewa inertia: 0.988905 \n", | |
"Minibatch iteration 137/6000: mean batch inertia: 0.988336, ewa inertia: 0.988886 \n", | |
"Minibatch iteration 138/6000: mean batch inertia: 0.990123, ewa inertia: 0.988928 \n", | |
"Minibatch iteration 139/6000: mean batch inertia: 0.987487, ewa inertia: 0.988879 \n", | |
"Minibatch iteration 140/6000: mean batch inertia: 0.987411, ewa inertia: 0.988829 \n", | |
"Minibatch iteration 141/6000: mean batch inertia: 0.986050, ewa inertia: 0.988735 \n", | |
"Minibatch iteration 142/6000: mean batch inertia: 0.989150, ewa inertia: 0.988749 \n", | |
"Minibatch iteration 143/6000: mean batch inertia: 0.989601, ewa inertia: 0.988778 \n", | |
"Minibatch iteration 144/6000: mean batch inertia: 0.990991, ewa inertia: 0.988853 \n", | |
"Minibatch iteration 145/6000: mean batch inertia: 0.985968, ewa inertia: 0.988755 \n", | |
"Minibatch iteration 146/6000: mean batch inertia: 0.990663, ewa inertia: 0.988820 \n", | |
"Minibatch iteration 147/6000: mean batch inertia: 0.987778, ewa inertia: 0.988785 \n", | |
"Minibatch iteration 148/6000: mean batch inertia: 0.990022, ewa inertia: 0.988827 \n", | |
"Minibatch iteration 149/6000: mean batch inertia: 0.989714, ewa inertia: 0.988857 \n", | |
"Minibatch iteration 150/6000: mean batch inertia: 0.987734, ewa inertia: 0.988819 \n", | |
"Minibatch iteration 151/6000: mean batch inertia: 0.988657, ewa inertia: 0.988813 \n", | |
"Converged (lack of improvement in inertia) at iteration 151/6000\n", | |
"Computing label assignment and total inertia\n", | |
"CPU times: user 16min 18s, sys: 29.2 s, total: 16min 47s\n", | |
"Wall time: 8min\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',\n", | |
" init_size=None, max_iter=100, max_no_improvement=10,\n", | |
" n_clusters=100, n_init=1, random_state=None,\n", | |
" reassignment_ratio=0.01, tol=0.0, verbose=1)" | |
] | |
}, | |
"execution_count": 22, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from sklearn.cluster import MiniBatchKMeans\n", | |
"\n", | |
"km = MiniBatchKMeans(n_clusters=100, init='k-means++', \n", | |
" n_init=1, \n", | |
" # init_size=1000, \n", | |
" batch_size=1000,\n", | |
" verbose=1)\n", | |
"%time km.fit(tfidf_matrix)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Assign clustered labels to corresponding records." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>URI</th>\n", | |
" <th>name</th>\n", | |
" <th>text</th>\n", | |
" <th>label</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td><http://dbpedia.org/resource/Digby_Morrell></td>\n", | |
" <td>Digby Morrell</td>\n", | |
" <td>digby morrell born 10 october 1979 is a former...</td>\n", | |
" <td>91</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td><http://dbpedia.org/resource/Alfred_J._Lewy></td>\n", | |
" <td>Alfred J. Lewy</td>\n", | |
" <td>alfred j lewy aka sandy lewy graduated from un...</td>\n", | |
" <td>19</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td><http://dbpedia.org/resource/Harpdog_Brown></td>\n", | |
" <td>Harpdog Brown</td>\n", | |
" <td>harpdog brown is a singer and harmonica player...</td>\n", | |
" <td>42</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td><http://dbpedia.org/resource/Franz_Rottensteiner></td>\n", | |
" <td>Franz Rottensteiner</td>\n", | |
" <td>franz rottensteiner born in waidmannsfeld lowe...</td>\n", | |
" <td>13</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td><http://dbpedia.org/resource/G-Enka></td>\n", | |
" <td>G-Enka</td>\n", | |
" <td>henry krvits born 30 december 1974 in tallinn ...</td>\n", | |
" <td>61</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" URI name \\\n", | |
"0 <http://dbpedia.org/resource/Digby_Morrell> Digby Morrell \n", | |
"1 <http://dbpedia.org/resource/Alfred_J._Lewy> Alfred J. Lewy \n", | |
"2 <http://dbpedia.org/resource/Harpdog_Brown> Harpdog Brown \n", | |
"3 <http://dbpedia.org/resource/Franz_Rottensteiner> Franz Rottensteiner \n", | |
"4 <http://dbpedia.org/resource/G-Enka> G-Enka \n", | |
"\n", | |
" text label \n", | |
"0 digby morrell born 10 october 1979 is a former... 91 \n", | |
"1 alfred j lewy aka sandy lewy graduated from un... 19 \n", | |
"2 harpdog brown is a singer and harmonica player... 42 \n", | |
"3 franz rottensteiner born in waidmannsfeld lowe... 13 \n", | |
"4 henry krvits born 30 december 1974 in tallinn ... 61 " | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df['label'] = km.labels_\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Plot portion of records corresponding to each label" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 74, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Original fig_size [6.0, 4.0]\n", | |
"New fig_size [10.0, 10.0]\n" | |
] | |
} | |
], | |
"source": [ | |
"import matplotlib.pyplot as plt\n", | |
"%matplotlib inline\n", | |
"\n", | |
"# Get current size\n", | |
"fig_size = plt.rcParams[\"figure.figsize\"]\n", | |
"print(\"Original fig_size\", fig_size)\n", | |
"\n", | |
"# Re-set figure width & height\n", | |
"new_fig = [10.0, 10.0]\n", | |
"plt.rcParams[\"figure.figsize\"] = new_fig\n", | |
"print(\"New fig_size\", new_fig)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 77, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<matplotlib.figure.Figure at 0x7f617a8a2b70>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"label_df = df.groupby('label').agg(['count'])\n", | |
"labels = label_df['name'].index.values\n", | |
"counts = label_df['name']['count'].values\n", | |
"\n", | |
"plt.pie(counts, labels=labels)\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Improved recommend function with only searching in same label cluster." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.metrics.pairwise import linear_kernel\n", | |
"import numpy as np\n", | |
"\n", | |
"def recommend(name, k):\n", | |
" \"\"\"\n", | |
" Args:\n", | |
" name: Name of reading person\n", | |
" k: Number of people to recommend\n", | |
" \n", | |
" Returns:\n", | |
" DataFrame contains k list of people to recommend for reading person\n", | |
" \"\"\"\n", | |
" # Find DataFrame corresponding to name\n", | |
" target = df[df['name'] == name]\n", | |
" \n", | |
" # Get target TFIDF value\n", | |
" target_id = target.index[0]\n", | |
" target_tfidf = tfidf_matrix[target_id]\n", | |
" \n", | |
" # Get TFIDF matrix with same label as target\n", | |
" label = target['label'].values[0]\n", | |
" same_label_df = df[df['label'] == label]\n", | |
" same_label_ids = same_label_df.index.values\n", | |
" same_label_tfidf_matrix = tfidf_matrix[same_label_ids]\n", | |
" \n", | |
" # calculate cosine similarity with other people\n", | |
" cosine_similarities = linear_kernel(target_tfidf, same_label_tfidf_matrix).flatten()\n", | |
"\n", | |
" # get list of ids, sorted by similarity (desc)\n", | |
" # trick: cut off head 'coz head is always reading person\n", | |
" ids = np.flip(\n", | |
" np.argsort(cosine_similarities),\n", | |
" axis=0\n", | |
" )[1:]\n", | |
" \n", | |
" # return dataframe corresponding to list of ids\n", | |
" # limited by k\n", | |
" return same_label_df.reset_index(drop=True).iloc[ids].head(k)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Test recommend 10 people while reading Barack Obama:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>URI</th>\n", | |
" <th>name</th>\n", | |
" <th>text</th>\n", | |
" <th>label</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>2834</th>\n", | |
" <td><http://dbpedia.org/resource/Joe_Biden></td>\n", | |
" <td>Joe Biden</td>\n", | |
" <td>joseph robinette joe biden jr dosf rbnt badn b...</td>\n", | |
" <td>44</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5464</th>\n", | |
" <td><http://dbpedia.org/resource/Jeff_Sessions></td>\n", | |
" <td>Jeff Sessions</td>\n", | |
" <td>jefferson beauregard jeff sessions iii born de...</td>\n", | |
" <td>44</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>493</th>\n", | |
" <td><http://dbpedia.org/resource/Joe_Lieberman></td>\n", | |
" <td>Joe Lieberman</td>\n", | |
" <td>joseph isadore joe lieberman born february 24 ...</td>\n", | |
" <td>44</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5675</th>\n", | |
" <td><http://dbpedia.org/resource/Artur_Davis></td>\n", | |
" <td>Artur Davis</td>\n", | |
" <td>artur genestre davis born october 9 1967 is an...</td>\n", | |
" <td>44</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2192</th>\n", | |
" <td><http://dbpedia.org/resource/Henry_Waxman></td>\n", | |
" <td>Henry Waxman</td>\n", | |
" <td>henry arnold waxman born september 12 1939 is ...</td>\n", | |
" <td>44</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6655</th>\n", | |
" <td><http://dbpedia.org/resource/Hillary_Rodham_Cl...</td>\n", | |
" <td>Hillary Rodham Clinton</td>\n", | |
" <td>hillary diane rodham clinton hlri dan rdm klnt...</td>\n", | |
" <td>44</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3592</th>\n", | |
" <td><http://dbpedia.org/resource/Richard_Pildes></td>\n", | |
" <td>Richard Pildes</td>\n", | |
" <td>richard h pildes is a law professor at the new...</td>\n", | |
" <td>44</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4498</th>\n", | |
" <td><http://dbpedia.org/resource/Samantha_Power></td>\n", | |
" <td>Samantha Power</td>\n", | |
" <td>samantha power born september 21 1970 is an ir...</td>\n", | |
" <td>44</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5386</th>\n", | |
" <td><http://dbpedia.org/resource/Robert_Gibbs></td>\n", | |
" <td>Robert Gibbs</td>\n", | |
" <td>robert lane gibbs born march 29 1971 is an ame...</td>\n", | |
" <td>44</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4537</th>\n", | |
" <td><http://dbpedia.org/resource/Eric_Stern_(polit...</td>\n", | |
" <td>Eric Stern (politician)</td>\n", | |
" <td>eric stern is the director of operations for t...</td>\n", | |
" <td>44</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" URI \\\n", | |
"2834 <http://dbpedia.org/resource/Joe_Biden> \n", | |
"5464 <http://dbpedia.org/resource/Jeff_Sessions> \n", | |
"493 <http://dbpedia.org/resource/Joe_Lieberman> \n", | |
"5675 <http://dbpedia.org/resource/Artur_Davis> \n", | |
"2192 <http://dbpedia.org/resource/Henry_Waxman> \n", | |
"6655 <http://dbpedia.org/resource/Hillary_Rodham_Cl... \n", | |
"3592 <http://dbpedia.org/resource/Richard_Pildes> \n", | |
"4498 <http://dbpedia.org/resource/Samantha_Power> \n", | |
"5386 <http://dbpedia.org/resource/Robert_Gibbs> \n", | |
"4537 <http://dbpedia.org/resource/Eric_Stern_(polit... \n", | |
"\n", | |
" name \\\n", | |
"2834 Joe Biden \n", | |
"5464 Jeff Sessions \n", | |
"493 Joe Lieberman \n", | |
"5675 Artur Davis \n", | |
"2192 Henry Waxman \n", | |
"6655 Hillary Rodham Clinton \n", | |
"3592 Richard Pildes \n", | |
"4498 Samantha Power \n", | |
"5386 Robert Gibbs \n", | |
"4537 Eric Stern (politician) \n", | |
"\n", | |
" text label \n", | |
"2834 joseph robinette joe biden jr dosf rbnt badn b... 44 \n", | |
"5464 jefferson beauregard jeff sessions iii born de... 44 \n", | |
"493 joseph isadore joe lieberman born february 24 ... 44 \n", | |
"5675 artur genestre davis born october 9 1967 is an... 44 \n", | |
"2192 henry arnold waxman born september 12 1939 is ... 44 \n", | |
"6655 hillary diane rodham clinton hlri dan rdm klnt... 44 \n", | |
"3592 richard h pildes is a law professor at the new... 44 \n", | |
"4498 samantha power born september 21 1970 is an ir... 44 \n", | |
"5386 robert lane gibbs born march 29 1971 is an ame... 44 \n", | |
"4537 eric stern is the director of operations for t... 44 " | |
] | |
}, | |
"execution_count": 30, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"recommend('Barack Obama', 10)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Calculate executing time, again:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import random\n", | |
"from tqdm import tqdm\n", | |
"\n", | |
"def execute_recommend(n, k):\n", | |
" \"\"\"\n", | |
" Args:\n", | |
" n: Number of people\n", | |
" k: Number of people to recommend to corresponding person\n", | |
" \"\"\"\n", | |
" # random n people\n", | |
" people = random.sample(list(df['name']), n)\n", | |
" \n", | |
" for name in tqdm(people):\n", | |
" recommend(name, k)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"1 person, k = 200: **268ms** vs 1.83s (old `recommend` function)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 1/1 [00:00<00:00, 4.16it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 117 ms, sys: 124 ms, total: 240 ms\n", | |
"Wall time: 268 ms\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"%time execute_recommend(1, 200)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"50 people, k = 200: **44.6s** vs 1m26s" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 50/50 [00:44<00:00, 1.12it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 36 s, sys: 8.4 s, total: 44.5 s\n", | |
"Wall time: 44.6 s\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"%time execute_recommend(50, 200)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"500 people, k = 200: **5m58s** vs 14m14s" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 500/500 [05:58<00:00, 1.40it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 4min 35s, sys: 1min 22s, total: 5min 58s\n", | |
"Wall time: 5min 58s\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"%time execute_recommend(500, 200)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"As estimation, 5000 people k = 200 would take around **1 hour** vs 2.5 hours of old `recommend` function." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"As results, cluster with 100 clusters before recommending is a boost in speed." | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment