Created
February 2, 2014 08:55
-
-
Save mickaellegal/8764952 to your computer and use it in GitHub Desktop.
iPython Notebook: Blog book recommendation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Loading the data and exploration" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Importing the libraries\n", | |
"import json\n", | |
"import os\n", | |
"\n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"from scipy.stats import pearsonr" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Read the ratings file; header=None because the column names are supplied below\n", | |
"data = pd.read_csv(\n", | |
"    \"../../Downloads/data_books.csv\",\n", | |
"    sep=\",\",\n", | |
"    header=None,\n", | |
"    names=['Reviewer', 'Book', 'Rating'],\n", | |
")\n", | |
"\n", | |
"print(\"There are %d rows in this dataframe\" % len(data))\n", | |
"\n", | |
"# Preview the first rows of the frame\n", | |
"data.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"There are 383852 rows in this dataframe\n" | |
] | |
}, | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Reviewer</th>\n", | |
" <th>Book</th>\n", | |
" <th>Rating</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 276726</td>\n", | |
" <td> Rites of Passage</td>\n", | |
" <td> 5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 276729</td>\n", | |
" <td> Help!: Level 1</td>\n", | |
" <td> 3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 276729</td>\n", | |
" <td> The Amsterdam Connection : Level 4 (Cambridge ...</td>\n", | |
" <td> 6</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 276744</td>\n", | |
" <td> A Painted House</td>\n", | |
" <td> 7</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 276747</td>\n", | |
" <td> Little Altars Everywhere</td>\n", | |
" <td> 9</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 3, | |
"text": [ | |
" Reviewer Book Rating\n", | |
"0 276726 Rites of Passage 5\n", | |
"1 276729 Help!: Level 1 3\n", | |
"2 276729 The Amsterdam Connection : Level 4 (Cambridge ... 6\n", | |
"3 276744 A Painted House 7\n", | |
"4 276747 Little Altars Everywhere 9" | |
] | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Number of reviews per title; show the 20 most reviewed books\n", | |
"top_books = data.Book.value_counts()\n", | |
"top_books.head(20)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 4, | |
"text": [ | |
"The Lovely Bones: A Novel 707\n", | |
"Wild Animus 581\n", | |
"The Da Vinci Code 494\n", | |
"The Secret Life of Bees 406\n", | |
"The Nanny Diaries: A Novel 393\n", | |
"The Red Tent (Bestselling Backlist) 383\n", | |
"Bridget Jones's Diary 377\n", | |
"A Painted House 366\n", | |
"Life of Pi 336\n", | |
"Harry Potter and the Chamber of Secrets (Book 2) 326\n", | |
"Divine Secrets of the Ya-Ya Sisterhood: A Novel 323\n", | |
"Angels & 317\n", | |
"Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback)) 315\n", | |
"The Summons 309\n", | |
"Where the Heart Is (Oprah's Book Club (Paperback)) 295\n", | |
"The Notebook 293\n", | |
"Girl with a Pearl Earring 278\n", | |
"Harry Potter and the Prisoner of Azkaban (Book 3) 277\n", | |
"Snow Falling on Cedars 275\n", | |
"The Pilot's Wife : A Novel 272\n", | |
"dtype: int64" | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Number of reviews per reviewer; show the 20 most active reviewers.\n", | |
"# The top reviewer has almost 7,000 reviews, which is a lot of books to read.\n", | |
"top_reviewers = data.Reviewer.value_counts()\n", | |
"top_reviewers.head(20)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 5, | |
"text": [ | |
"11676 6943\n", | |
"98391 5691\n", | |
"189835 1899\n", | |
"153662 1845\n", | |
"23902 1180\n", | |
"235105 1020\n", | |
"76499 1012\n", | |
"171118 962\n", | |
"16795 959\n", | |
"248718 941\n", | |
"56399 838\n", | |
"197659 781\n", | |
"35859 777\n", | |
"185233 698\n", | |
"95359 606\n", | |
"114368 603\n", | |
"158295 567\n", | |
"101851 563\n", | |
"177458 524\n", | |
"204864 504\n", | |
"dtype: int64" | |
] | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
"Wow! Some people have reviewed almost 7,000 books. That's a lot of books for one person... I'm a bit suspicious about the data now." | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Let's start with only 2 books" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# The two titles to compare\n", | |
"book_1 = \"Harry Potter and the Chamber of Secrets (Book 2)\"\n", | |
"book_2 = \"Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))\"\n", | |
"\n", | |
"# Reviewer ids of everyone who rated each title\n", | |
"book_1_reviewers = data.loc[data.Book == book_1, 'Reviewer']\n", | |
"book_2_reviewers = data.loc[data.Book == book_2, 'Reviewer']\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Check if there are any common reviewers\n", | |
"common_reviewers = set(book_1_reviewers).intersection(book_2_reviewers)\n", | |
"\n", | |
"# Parenthesised single-argument print behaves the same on Python 2 and 3\n", | |
"print(\"%d people have reviewed these 2 books\" % len(common_reviewers))\n", | |
"\n", | |
"# Materialise the set as a list directly instead of appending element by element\n", | |
"list_common_reviewers = list(common_reviewers)\n", | |
"\n", | |
"# Every review (of any book) written by one of the common reviewers\n", | |
"common_reviewers_only = data[data.Reviewer.isin(list_common_reviewers)]\n", | |
"\n", | |
"common_reviewers_only.head(10)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"92 people have reviewed these 2 books\n" | |
] | |
}, | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Reviewer</th>\n", | |
" <th>Book</th>\n", | |
" <th>Rating</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>14299</th>\n", | |
" <td> 10560</td>\n", | |
" <td> The Death of Vishnu: A Novel</td>\n", | |
" <td> 8</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14300</th>\n", | |
" <td> 10560</td>\n", | |
" <td> The Last Report on the Miracles at Little No H...</td>\n", | |
" <td> 8</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14301</th>\n", | |
" <td> 10560</td>\n", | |
" <td> Mirror Mirror: A Novel</td>\n", | |
" <td> 9</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14302</th>\n", | |
" <td> 10560</td>\n", | |
" <td> The Hidden Life of Otto Frank</td>\n", | |
" <td> 10</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14303</th>\n", | |
" <td> 10560</td>\n", | |
" <td> The Lady, the Chef, and the Courtesan</td>\n", | |
" <td> 5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14304</th>\n", | |
" <td> 10560</td>\n", | |
" <td> Tracks RI</td>\n", | |
" <td> 6</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14305</th>\n", | |
" <td> 10560</td>\n", | |
" <td> My Antonia (Twentieth-Century Classics)</td>\n", | |
" <td> 6</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14306</th>\n", | |
" <td> 10560</td>\n", | |
" <td> A Mind of Its Own: A Cultural History of the P...</td>\n", | |
" <td> 7</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14307</th>\n", | |
" <td> 10560</td>\n", | |
" <td> Mary, Called Magdalene</td>\n", | |
" <td> 7</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14308</th>\n", | |
" <td> 10560</td>\n", | |
" <td> The Red Tent (Bestselling Backlist)</td>\n", | |
" <td> 9</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 7, | |
"text": [ | |
" Reviewer Book Rating\n", | |
"14299 10560 The Death of Vishnu: A Novel 8\n", | |
"14300 10560 The Last Report on the Miracles at Little No H... 8\n", | |
"14301 10560 Mirror Mirror: A Novel 9\n", | |
"14302 10560 The Hidden Life of Otto Frank 10\n", | |
"14303 10560 The Lady, the Chef, and the Courtesan 5\n", | |
"14304 10560 Tracks RI 6\n", | |
"14305 10560 My Antonia (Twentieth-Century Classics) 6\n", | |
"14306 10560 A Mind of Its Own: A Cultural History of the P... 7\n", | |
"14307 10560 Mary, Called Magdalene 7\n", | |
"14308 10560 The Red Tent (Bestselling Backlist) 9" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Let's create a function that collects the reviews of our common reviewers\n", | |
"def get_book_reviews(title, common_reviewers):\n", | |
"    \"\"\"Return one review of `title` per reviewer in `common_reviewers`.\n", | |
"\n", | |
"    Reads the notebook-level `data` frame.\n", | |
"\n", | |
"    title            -- exact Book value to select\n", | |
"    common_reviewers -- collection of Reviewer ids to keep\n", | |
"\n", | |
"    Returns the matching rows of `data`, one per reviewer, ordered by\n", | |
"    Reviewer so that the results for two titles built from the same set of\n", | |
"    reviewers line up row by row.\n", | |
"    \"\"\"\n", | |
"    mask = data.Reviewer.isin(common_reviewers) & (data.Book == title)\n", | |
"    reviews = data[mask]\n", | |
"    # De-duplicate before ordering so the row kept for a reviewer is always\n", | |
"    # their first one in file order, not whichever a sort happened to put first.\n", | |
"    reviews = reviews[~reviews.Reviewer.duplicated()]\n", | |
"    # argsort + iloc instead of DataFrame.sort(), which newer pandas releases\n", | |
"    # no longer provide; reviewer ids are unique here, so the order is fixed.\n", | |
"    order = reviews.Reviewer.values.argsort()\n", | |
"    return reviews.iloc[order]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Calculating the correlation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# pearsonr is imported in the first cell (from scipy.stats, not the deprecated scipy.stats.stats path)\n", | |
"\n", | |
"def calculate_correlation(book1, book2):\n", | |
"    \"\"\"Pearson correlation between the ratings two books got from shared reviewers.\n", | |
"\n", | |
"    Reads the notebook-level `data` frame and uses get_book_reviews().\n", | |
"\n", | |
"    book1, book2 -- exact Book values to compare\n", | |
"\n", | |
"    Returns the correlation coefficient, or NaN when fewer than two reviewers\n", | |
"    rated both books (the coefficient is undefined in that case).\n", | |
"    \"\"\"\n", | |
"    # We start by finding the common reviewers\n", | |
"    book_1_reviewers = data[data.Book == book1].Reviewer\n", | |
"    book_2_reviewers = data[data.Book == book2].Reviewer\n", | |
"    common_reviewers = set(book_1_reviewers).intersection(book_2_reviewers)\n", | |
"\n", | |
"    # Pearson's r needs at least two paired ratings\n", | |
"    if len(common_reviewers) < 2:\n", | |
"        return float('nan')\n", | |
"\n", | |
"    # Then we look for the reviews given by common reviewers\n", | |
"    book_1_reviews = get_book_reviews(book1, common_reviewers)\n", | |
"    book_2_reviews = get_book_reviews(book2, common_reviewers)\n", | |
"\n", | |
"    # pearsonr returns (coefficient, p-value); only the coefficient is used\n", | |
"    return pearsonr(book_1_reviews.Rating, book_2_reviews.Rating)[0]\n", | |
"\n", | |
"# Print the correlation score\n", | |
"calculate_correlation(book_1, book_2)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 10, | |
"text": [ | |
"0.86964736645336571" | |
] | |
} | |
], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Let's expand the idea to a larger set of books" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# As I want to avoid dealing with a huge sparse matrix, I will only select the most reviewed books for our example\n", | |
"\n", | |
"# One row per title with its number of reviews\n", | |
"most_reviewed_books = pd.DataFrame({'count': data.groupby([\"Book\"]).size()}).reset_index()\n", | |
"\n", | |
"# Descending by count through a stable argsort + iloc: DataFrame.sort() is not\n", | |
"# available in newer pandas releases, while this spelling runs on old and new alike\n", | |
"most_reviewed_books = most_reviewed_books.iloc[\n", | |
"    (-most_reviewed_books['count'].values).argsort(kind='mergesort')\n", | |
"]\n", | |
"\n", | |
"most_reviewed_books.head(20)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Book</th>\n", | |
" <th>count</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>110079</th>\n", | |
" <td> The Lovely Bones: A Novel</td>\n", | |
" <td> 707</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>132075</th>\n", | |
" <td> Wild Animus</td>\n", | |
" <td> 581</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>102555</th>\n", | |
" <td> The Da Vinci Code</td>\n", | |
" <td> 494</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>116044</th>\n", | |
" <td> The Secret Life of Bees</td>\n", | |
" <td> 406</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>111800</th>\n", | |
" <td> The Nanny Diaries: A Novel</td>\n", | |
" <td> 393</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>114808</th>\n", | |
" <td> The Red Tent (Bestselling Backlist)</td>\n", | |
" <td> 383</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15736 </th>\n", | |
" <td> Bridget Jones's Diary</td>\n", | |
" <td> 377</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3064 </th>\n", | |
" <td> A Painted House</td>\n", | |
" <td> 366</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>60614 </th>\n", | |
" <td> Life of Pi</td>\n", | |
" <td> 336</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>45326 </th>\n", | |
" <td> Harry Potter and the Chamber of Secrets (Book 2)</td>\n", | |
" <td> 326</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>29909 </th>\n", | |
" <td> Divine Secrets of the Ya-Ya Sisterhood: A Novel</td>\n", | |
" <td> 323</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8132 </th>\n", | |
" <td> Angels &amp</td>\n", | |
" <td> 317</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>45355 </th>\n", | |
" <td> Harry Potter and the Sorcerer's Stone (Harry P...</td>\n", | |
" <td> 315</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>117679</th>\n", | |
" <td> The Summons</td>\n", | |
" <td> 309</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>131173</th>\n", | |
" <td> Where the Heart Is (Oprah's Book Club (Paperba...</td>\n", | |
" <td> 295</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>112407</th>\n", | |
" <td> The Notebook</td>\n", | |
" <td> 293</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>42252 </th>\n", | |
" <td> Girl with a Pearl Earring</td>\n", | |
" <td> 278</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>45345 </th>\n", | |
" <td> Harry Potter and the Prisoner of Azkaban (Book 3)</td>\n", | |
" <td> 277</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>90767 </th>\n", | |
" <td> Snow Falling on Cedars</td>\n", | |
" <td> 275</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>113578</th>\n", | |
" <td> The Pilot's Wife : A Novel</td>\n", | |
" <td> 272</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 11, | |
"text": [ | |
" Book count\n", | |
"110079 The Lovely Bones: A Novel 707\n", | |
"132075 Wild Animus 581\n", | |
"102555 The Da Vinci Code 494\n", | |
"116044 The Secret Life of Bees 406\n", | |
"111800 The Nanny Diaries: A Novel 393\n", | |
"114808 The Red Tent (Bestselling Backlist) 383\n", | |
"15736 Bridget Jones's Diary 377\n", | |
"3064 A Painted House 366\n", | |
"60614 Life of Pi 336\n", | |
"45326 Harry Potter and the Chamber of Secrets (Book 2) 326\n", | |
"29909 Divine Secrets of the Ya-Ya Sisterhood: A Novel 323\n", | |
"8132 Angels & 317\n", | |
"45355 Harry Potter and the Sorcerer's Stone (Harry P... 315\n", | |
"117679 The Summons 309\n", | |
"131173 Where the Heart Is (Oprah's Book Club (Paperba... 295\n", | |
"112407 The Notebook 293\n", | |
"42252 Girl with a Pearl Earring 278\n", | |
"45345 Harry Potter and the Prisoner of Azkaban (Book 3) 277\n", | |
"90767 Snow Falling on Cedars 275\n", | |
"113578 The Pilot's Wife : A Novel 272" | |
] | |
} | |
], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Getting the list of the most reviewed books\n", | |
"N_TOP_BOOKS = 13  # how many of the most reviewed titles to compare pairwise\n", | |
"top_books = list(most_reviewed_books.Book.head(N_TOP_BOOKS))\n", | |
"\n", | |
"# Calculate the correlation for every ordered pair of top books\n", | |
"correlation_coefficient = []\n", | |
"\n", | |
"# The correlation does not depend on the order of the two books, so each\n", | |
"# unordered pair is computed once and reused for the mirrored row\n", | |
"pair_scores = {}\n", | |
"\n", | |
"for book1 in top_books:\n", | |
"    print(\"Calculating the correlations for: %s\" % book1)\n", | |
"    for book2 in top_books:\n", | |
"        if book1 != book2:\n", | |
"            pair = frozenset((book1, book2))\n", | |
"            if pair not in pair_scores:\n", | |
"                pair_scores[pair] = calculate_correlation(book1, book2)\n", | |
"            correlation_coefficient.append([book1, book2, pair_scores[pair]])" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Calculating the correlations for: The Lovely Bones: A Novel\n", | |
"Calculating the correlations for:" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" Wild Animus\n", | |
"Calculating the correlations for:" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" The Da Vinci Code\n", | |
"Calculating the correlations for:" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" The Secret Life of Bees\n", | |
"Calculating the correlations for:" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" The Nanny Diaries: A Novel\n", | |
"Calculating the correlations for:" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" The Red Tent (Bestselling Backlist)\n", | |
"Calculating the correlations for:" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" Bridget Jones's Diary\n", | |
"Calculating the correlations for:" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" A Painted House\n", | |
"Calculating the correlations for:" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" Life of Pi\n", | |
"Calculating the correlations for:" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" Harry Potter and the Chamber of Secrets (Book 2)\n", | |
"Calculating the correlations for:" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" Divine Secrets of the Ya-Ya Sisterhood: A Novel\n", | |
"Calculating the correlations for:" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" Angels &\n", | |
"Calculating the correlations for:" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))\n" | |
] | |
} | |
], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Let's look at what the table of correlation looks like\n", | |
"cols = [\"Book_1\", \"Book_2\", \"Correlation\"]\n", | |
"correlation_coefficient = pd.DataFrame(correlation_coefficient, columns=cols)\n", | |
"\n", | |
"# Ascending by correlation through a stable argsort + iloc (NaN values end up last);\n", | |
"# DataFrame.sort() is not available in newer pandas releases\n", | |
"correlation_coefficient = correlation_coefficient.iloc[\n", | |
"    correlation_coefficient['Correlation'].values.argsort(kind='mergesort')\n", | |
"]\n", | |
"correlation_coefficient.head(10)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Book_1</th>\n", | |
" <th>Book_2</th>\n", | |
" <th>Correlation</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>25 </th>\n", | |
" <td> The Da Vinci Code</td>\n", | |
" <td> Wild Animus</td>\n", | |
" <td>-0.580228</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13 </th>\n", | |
" <td> Wild Animus</td>\n", | |
" <td> The Da Vinci Code</td>\n", | |
" <td>-0.580228</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>61 </th>\n", | |
" <td> The Red Tent (Bestselling Backlist)</td>\n", | |
" <td> Wild Animus</td>\n", | |
" <td>-0.487857</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16 </th>\n", | |
" <td> Wild Animus</td>\n", | |
" <td> The Red Tent (Bestselling Backlist)</td>\n", | |
" <td>-0.487857</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>49 </th>\n", | |
" <td> The Nanny Diaries: A Novel</td>\n", | |
" <td> Wild Animus</td>\n", | |
" <td>-0.424865</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15 </th>\n", | |
" <td> Wild Animus</td>\n", | |
" <td> The Nanny Diaries: A Novel</td>\n", | |
" <td>-0.424865</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>97 </th>\n", | |
" <td> Life of Pi</td>\n", | |
" <td> Wild Animus</td>\n", | |
" <td>-0.409176</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19 </th>\n", | |
" <td> Wild Animus</td>\n", | |
" <td> Life of Pi</td>\n", | |
" <td>-0.409176</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>21 </th>\n", | |
" <td> Wild Animus</td>\n", | |
" <td> Divine Secrets of the Ya-Ya Sisterhood: A Novel</td>\n", | |
" <td>-0.322301</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>121</th>\n", | |
" <td> Divine Secrets of the Ya-Ya Sisterhood: A Novel</td>\n", | |
" <td> Wild Animus</td>\n", | |
" <td>-0.322301</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 13, | |
"text": [ | |
" Book_1 Book_2 Correlation\n", | |
"25 The Da Vinci Code Wild Animus -0.580228\n", | |
"13 Wild Animus The Da Vinci Code -0.580228\n", | |
"61 The Red Tent (Bestselling Backlist) Wild Animus -0.487857\n", | |
"16 Wild Animus The Red Tent (Bestselling Backlist) -0.487857\n", | |
"49 The Nanny Diaries: A Novel Wild Animus -0.424865\n", | |
"15 Wild Animus The Nanny Diaries: A Novel -0.424865\n", | |
"97 Life of Pi Wild Animus -0.409176\n", | |
"19 Wild Animus Life of Pi -0.409176\n", | |
"21 Wild Animus Divine Secrets of the Ya-Ya Sisterhood: A Novel -0.322301\n", | |
"121 Divine Secrets of the Ya-Ya Sisterhood: A Novel Wild Animus -0.322301" | |
] | |
} | |
], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Swap the raw catalogue titles for cleaner display titles\n", | |
"title_fixes = [\n", | |
"    (\"The Red Tent (Bestselling Backlist)\", \"The Red Tent\"),\n", | |
"    (\"Harry Potter and the Chamber of Secrets (Book 2)\", \"Harry Potter and the Chamber of Secrets\"),\n", | |
"    (\"Angels &\", \"Angels & Demons\"),\n", | |
"    (\"Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))\", \"Harry Potter and the Sorcerer's Stone\"),\n", | |
"]\n", | |
"correlation_coefficient = correlation_coefficient.replace(\n", | |
"    [fix[0] for fix in title_fixes],\n", | |
"    [fix[1] for fix in title_fixes],\n", | |
")" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 14 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def calc_correlation(corr, book1, book2):\n", | |
"    \"\"\"Look up the stored correlation for the ordered pair (book1, book2).\n", | |
"\n", | |
"    corr  -- frame with Book_1, Book_2 and Correlation columns\n", | |
"    book1 -- title to match against Book_1\n", | |
"    book2 -- title to match against Book_2\n", | |
"\n", | |
"    Returns the Correlation of the first matching row as a float; a missing\n", | |
"    (NaN) correlation is returned as 0.0. Raises IndexError if no row matches.\n", | |
"    \"\"\"\n", | |
"    mask = (corr.Book_1 == book1) & (corr.Book_2 == book2)\n", | |
"    # Read the Correlation column directly rather than summing the whole row,\n", | |
"    # which depends on pandas leaving the two title columns out of the sum.\n", | |
"    matches = corr.loc[mask, 'Correlation']\n", | |
"    if len(matches) == 0:\n", | |
"        raise IndexError(\"no correlation stored for %r / %r\" % (book1, book2))\n", | |
"    value = float(matches.iloc[0])\n", | |
"    # The old row sum skipped NaN, so an undefined correlation came back as 0.0;\n", | |
"    # keep that result explicit (NaN is the only value not equal to itself).\n", | |
"    if value != value:\n", | |
"        return 0.0\n", | |
"    return value" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 15 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"calc_correlation(correlation_coefficient,\"Harry Potter and the Sorcerer's Stone\", \"The Red Tent\")" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 16, | |
"text": [ | |
"0.3532613528144701" | |
] | |
} | |
], | |
"prompt_number": 16 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Finding similar books to Harry Potter and the Sorcerer's Stone" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Making sure that the top books list is also updated with the new titles.\n", | |
"# Titles are looked up by name rather than by position, so this keeps working\n", | |
"# if the list changes length or order, and re-running the cell is harmless.\n", | |
"clean_titles = {\n", | |
"    \"The Red Tent (Bestselling Backlist)\": \"The Red Tent\",\n", | |
"    \"Harry Potter and the Chamber of Secrets (Book 2)\": \"Harry Potter and the Chamber of Secrets\",\n", | |
"    \"Angels &\": \"Angels & Demons\",\n", | |
"    \"Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))\": \"Harry Potter and the Sorcerer's Stone\",\n", | |
"}\n", | |
"# Slice assignment updates the existing list object in place\n", | |
"top_books[:] = [clean_titles.get(title, title) for title in top_books]\n", | |
"\n", | |
"print(top_books)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"['The Lovely Bones: A Novel', 'Wild Animus', 'The Da Vinci Code', 'The Secret Life of Bees', 'The Nanny Diaries: A Novel', 'The Red Tent', \"Bridget Jones's Diary\", 'A Painted House', 'Life of Pi', 'Harry Potter and the Chamber of Secrets', 'Divine Secrets of the Ya-Ya Sisterhood: A Novel', 'Angels & Demons', \"Harry Potter and the Sorcerer's Stone\"]\n" | |
] | |
} | |
], | |
"prompt_number": 17 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Finding the top recommendation for Harry Potter\n", | |
"my_book = \"Harry Potter and the Sorcerer's Stone\"\n", | |
"\n", | |
"# One (my_book, other title, correlation) tuple for every other top book\n", | |
"results = [\n", | |
"    (my_book, b, calc_correlation(correlation_coefficient, my_book, b))\n", | |
"    for b in top_books\n", | |
"    if my_book != b\n", | |
"]\n", | |
"\n", | |
"# Highest correlation first\n", | |
"sorted(results, key=lambda x: x[2], reverse=True)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 18, | |
"text": [ | |
"[(\"Harry Potter and the Sorcerer's Stone\",\n", | |
" 'Harry Potter and the Chamber of Secrets',\n", | |
" 0.8696473664533657),\n", | |
" (\"Harry Potter and the Sorcerer's Stone\",\n", | |
" 'The Nanny Diaries: A Novel',\n", | |
" 0.5375134737269157),\n", | |
" (\"Harry Potter and the Sorcerer's Stone\", 'Wild Animus', 0.48412291827592707),\n", | |
" (\"Harry Potter and the Sorcerer's Stone\",\n", | |
" 'Angels & Demons',\n", | |
" 0.45363235436327737),\n", | |
" (\"Harry Potter and the Sorcerer's Stone\", 'The Red Tent', 0.3532613528144701),\n", | |
" (\"Harry Potter and the Sorcerer's Stone\",\n", | |
" 'Divine Secrets of the Ya-Ya Sisterhood: A Novel',\n", | |
" 0.2896094261253908),\n", | |
" (\"Harry Potter and the Sorcerer's Stone\",\n", | |
" 'A Painted House',\n", | |
" 0.2636352520041483),\n", | |
" (\"Harry Potter and the Sorcerer's Stone\",\n", | |
" 'The Lovely Bones: A Novel',\n", | |
" 0.18557232038403582),\n", | |
" (\"Harry Potter and the Sorcerer's Stone\",\n", | |
" \"Bridget Jones's Diary\",\n", | |
" 0.02651948272397066),\n", | |
" (\"Harry Potter and the Sorcerer's Stone\", 'The Secret Life of Bees', 0.0),\n", | |
" (\"Harry Potter and the Sorcerer's Stone\",\n", | |
" 'The Da Vinci Code',\n", | |
" -0.10387310073624213),\n", | |
" (\"Harry Potter and the Sorcerer's Stone\", 'Life of Pi', -0.2909286827258562)]" | |
] | |
} | |
], | |
"prompt_number": 18 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Deploying the model on yhat" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from yhat import Yhat, BaseModel\n", | |
"\n", | |
"class Book_Recommender(BaseModel):\n", | |
"    \"\"\"Ranks the other top books by their stored correlation with a given title.\n", | |
"\n", | |
"    predict() reads `self.top_books` and `self.correlation_coefficient`.\n", | |
"    \"\"\"\n", | |
"\n", | |
"    def predict(self, my_book):\n", | |
"        \"\"\"Return (my_book, other_book, correlation) tuples, highest correlation first.\"\"\"\n", | |
"        ranked = [\n", | |
"            (my_book, candidate,\n", | |
"             calc_correlation(self.correlation_coefficient, my_book, candidate))\n", | |
"            for candidate in self.top_books\n", | |
"            if my_book != candidate\n", | |
"        ]\n", | |
"        ranked.sort(key=lambda entry: entry[2], reverse=True)\n", | |
"        return ranked" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 38 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Credentials are read from the environment when set, so real values never\n", | |
"# need to be typed into the notebook; the original placeholders stay as fallback\n", | |
"yh = Yhat(os.environ.get(\"YHAT_USERNAME\", \"YHAT_USERNAME\"),\n", | |
"          os.environ.get(\"YHAT_APIKEY\", \"YHAT_APIKEY\"))\n", | |
"\n", | |
"# The model carries the correlation table, the title list and the lookup helper\n", | |
"my_model = Book_Recommender(correlation_coefficient=correlation_coefficient, top_books=top_books,\n", | |
"                            udfs=[calc_correlation])\n", | |
"yh.deploy(\"BookReco\", my_model)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"uploading... " | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"done!\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 39, | |
"text": [ | |
"{u'modelname': u'BookReco', u'status': u'success', u'version': 3}" | |
] | |
} | |
], | |
"prompt_number": 39 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Testing it out" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"yh.predict(\"BookReco\", 3, \"Harry Potter and the Sorcerer's Stone\")" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 40, | |
"text": [ | |
"[[u\"Harry Potter and the Sorcerer's Stone\",\n", | |
" u'Harry Potter and the Chamber of Secrets',\n", | |
" 0.86965],\n", | |
" [u\"Harry Potter and the Sorcerer's Stone\",\n", | |
" u'The Nanny Diaries: A Novel',\n", | |
" 0.53751],\n", | |
" [u\"Harry Potter and the Sorcerer's Stone\", u'Wild Animus', 0.48412],\n", | |
" [u\"Harry Potter and the Sorcerer's Stone\", u'Angels & Demons', 0.45363],\n", | |
" [u\"Harry Potter and the Sorcerer's Stone\", u'The Red Tent', 0.35326],\n", | |
" [u\"Harry Potter and the Sorcerer's Stone\",\n", | |
" u'Divine Secrets of the Ya-Ya Sisterhood: A Novel',\n", | |
" 0.28961],\n", | |
" [u\"Harry Potter and the Sorcerer's Stone\", u'A Painted House', 0.26364],\n", | |
" [u\"Harry Potter and the Sorcerer's Stone\",\n", | |
" u'The Lovely Bones: A Novel',\n", | |
" 0.18557],\n", | |
" [u\"Harry Potter and the Sorcerer's Stone\", u\"Bridget Jones's Diary\", 0.02652],\n", | |
" [u\"Harry Potter and the Sorcerer's Stone\", u'The Secret Life of Bees', 0.0],\n", | |
" [u\"Harry Potter and the Sorcerer's Stone\", u'The Da Vinci Code', -0.10387],\n", | |
" [u\"Harry Potter and the Sorcerer's Stone\", u'Life of Pi', -0.29093]]" | |
] | |
} | |
], | |
"prompt_number": 40 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment