Script to filter down Goodreads Listopia items and order them based on what I'm most likely to want to read
"""Filter down a Goodreads Listopia list to books I'm most likely to like. | |
Fetches the raw HTML with Selenium (requires chromedriver), extracts book data | |
to a pandas data frame using BeautifulSoup, does some massaging, then prints | |
it out nicely. | |
TODO: | |
- more data points (year, contents of top upvoted reviews, shelves, | |
page count, read status, etc.) | |
- support multi-page Listopia lists | |
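
Example invocation (script name and list id are illustrative):
    python listopia.py --url https://www.goodreads.com/list/show/<list-id> -n 20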
""" | |
import argparse | |
import logging | |
import os | |
import re | |
import sys | |
import numpy as np | |
import pandas as pd | |
from bs4 import BeautifulSoup | |
from selenium import webdriver | |
from tabulate import tabulate | |
logging.basicConfig() | |
logger = logging.getLogger(__name__) | |
logger.setLevel(logging.DEBUG) | |


def make_book(book) -> dict:
    """Extract key fields from HTML for a book in a Goodreads webpage table
    to a dict.

    Works for Listopia lists at e.g. https://www.goodreads.com/list/show/*
    """
# title & author | |
title = book.find("a", attrs={"class": "bookTitle"}).find("span").getText() | |
author = book.find("a", attrs={"class": "authorName"}).find("span").getText() | |
# id (from which url can easily be built) | |
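    # e.g. an href like "/book/show/12345.Some_Title" or
    # "/book/show/12345-some-title" both yield id_ == "12345"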
    href = book.find("a", attrs={"class": "bookTitle"})["href"]
    id_ = os.path.basename(href).split(".", 1)[0]
    id_ = id_.split("-", 1)[0]

    # avg & total ratings
    rating_detail = book.find("span", attrs={"class": "minirating"}).getText()
    RATING_REGEX = r"^.+\s?(\d\.\d\d) avg rating — ([\d,]+) ratings$"
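    # the minirating text typically carries a rating phrase before the numbers,
    # e.g. "really liked it 4.11 avg rating — 12,345 ratings" (values made up);
    # the leading .+ skips that prefix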
    m = re.match(RATING_REGEX, rating_detail)

    if m is None:
        logger.warning(f"Failed to pull ratings from detail: {rating_detail}")
        avg_rating = np.nan
        total_ratings = np.nan
    else:
        avg_rating = float(m.group(1))
        total_ratings = int(m.group(2).replace(",", ""))

    # listopia score
    score_detail = book.find("a", onclick=re.compile("score_explanation")).getText()
    SCORE_REGEX = r"^score: ([\d,]+)$"
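    # matches link text of the form "score: 1,234"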
    m = re.match(SCORE_REGEX, score_detail)

    if m is None:
        logger.warning(f"Failed to pull scoring from detail: {score_detail}")
        score = np.nan
    else:
        score = int(m.group(1).replace(",", ""))

    # final dict
    return {
        "title": title,
        "author": author,
        "id": id_,
        "avg_rating": avg_rating,
        "total_ratings": total_ratings,
        "score": score,
        "url": f"https://www.goodreads.com/book/show/{id_}",
    }


def load_to_df(url):
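    """Fetch a Listopia page with Selenium and parse each book row into a
    DataFrame row, sorted by popularity then average rating.
    """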
    driver = webdriver.Chrome("/usr/bin/chromedriver")
    driver.get(url)
    html_source = driver.page_source
    driver.close()

    soup = BeautifulSoup(html_source, "html.parser")
    books = soup.find_all("tr", attrs={"itemtype": "http://schema.org/Book"})
    book_dicts = list(map(make_book, books))

    df = pd.DataFrame.from_dict(book_dicts)
    return df.sort_values(by=["total_ratings", "avg_rating"], ascending=False)


def find_my_favorites(df):
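    """Filter out books I'm unlikely to enjoy and rank the rest by a
    hand-tuned aggregate score (see comments below).
    """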
    df2 = df.sort_values(by=["total_ratings", "avg_rating", "score"], ascending=False)
    # .copy() avoids pandas' SettingWithCopyWarning when agg_score is added below
    df3 = df2[["title", "id", "total_ratings", "avg_rating", "score"]].copy()

    # ignore anything below a 3.7, since historically I haven't liked those much anyways
    df3 = df3[df3["avg_rating"] > 3.7]

    # ignore anything rated fewer than some number of times, since that could be
    # an indicator that the book was so bad, people just didn't finish it
    df3 = df3[df3["total_ratings"] > 500]

    # TODO: filter out books I've already read

    # create an aggregate score from total/avg rating
    #
    # weigh total ratings down b/c sometimes things get a lot of ratings due to
    # hype/cult status etc. which doesn't correspond to my enjoyment;
    # typically I don't enjoy books with many more ratings than ~2k all that much more
    #
    # weigh average rating way up
    #
    # the results for a couple sample lists have some of my favorites from the
    # list within the top 15 results, which seems legit
    #
    # TODO: incorporate year since I often don't like very old books
    # TODO: incorporate page count, since I'm more likely to pick up a short
    #       book to get closer to my reading challenge for the year
    # TODO: incorporate tags and shelves, b/c some genres I really don't care for
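    #
    # Worked example (made-up numbers): a book with 2,000 ratings, a 4.2
    # average, and a list score of 5,000 gets
    #   1000 * (log2(2000) + e**4.2) + 5000
    #   ≈ 1000 * (10.97 + 66.69) + 5000 ≈ 82,652
    # Doubling total_ratings adds only ~1,000, while +0.1 on avg_rating adds
    # ~7,000, so average rating dominates as intended.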
df3["agg_score"] = 1000 * (np.log2(df3["total_ratings"]) + (np.e ** df3["avg_rating"])) + df3['score'] | |
df3 = df3.sort_values(by=["agg_score"], ascending=False) | |
return df3.reset_index(drop=True) | |


def get_parser():
    parser = argparse.ArgumentParser(
        description="Search a Goodreads Listopia list for books I may want to read"
    )
    parser.add_argument(
        "--refresh",
        action="store_true",
        help="Re-fetch the list even if a cached file exists on disk",
    )
    parser.add_argument(
        "-n", type=int, help="How many of the top recommendations to print", default=30
    )
    parser.add_argument(
        "--url",
        type=str,
        help="URL of the Listopia list on Goodreads to filter down",
        required=True,
    )
    return parser


def main(limit, url, refresh):
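    # The cache file name is the list slug, e.g. a URL ending in
    # "264.Books_That_Everyone_Should_Read" (illustrative) caches to
    # "Books_That_Everyone_Should_Read.csv"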
    title = os.path.basename(url).split(".", 1)[1]
    cache_file = f"{title}.csv"

    if not os.path.exists(cache_file) or refresh:
        logger.info(f"Fetching list and caching to '{cache_file}'")
        df = load_to_df(url)
        df.to_csv(cache_file, index=False)
    else:
        logger.info(f"Reusing local '{cache_file}'")
        df = pd.read_csv(cache_file)

    df2 = find_my_favorites(df)
    print(tabulate(df2.head(limit), headers="keys", tablefmt="psql"))


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    main(limit=args.n, refresh=args.refresh, url=args.url)