Skip to content

Instantly share code, notes, and snippets.

@aok1425
Last active December 26, 2016 05:09
Show Gist options
  • Save aok1425/5910df38e4caff20f5bab6f634a2c4cb to your computer and use it in GitHub Desktop.
Save aok1425/5910df38e4caff20f5bab6f634a2c4cb to your computer and use it in GitHub Desktop.
# adapted from https://github.com/alaw1290/CS591B1/blob/master/RTscrapers/moviescraper.py
from bs4 import BeautifulSoup
from selenium import webdriver
import locale
# Set the process-wide locale to en_US.UTF-8 for every category.
# NOTE(review): nothing in the visible code calls locale-aware parsing
# (thousands separators are removed with str.replace) -- confirm this
# call is still needed.
locale.setlocale( locale.LC_ALL, 'en_US.UTF-8' )
import datetime
# Module-level Chrome WebDriver used by get_rotten_tomatoes_result().
# It is created when this module is imported/run and is never closed in
# the visible code; the argument is a hard-coded chromedriver path.
driver = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver")
# driver = webdriver.PhantomJS() # doesn't work, for whatever reason
# driver.set_window_size(1120, 550)
def _parse_critic_scores(section):
    """Return the score dictionary for one critics section of the page.

    ``section`` is the BeautifulSoup tag for either the
    ``all-critics-numbers`` or the ``top-critics-numbers`` div; both are
    parsed the same way.

    Returns a dict with the keys ``tomatometer`` (int), ``average_rating``
    (float), ``reviews_counted``, ``fresh`` and ``rotten`` (ints).

    Raises AttributeError if an expected element is missing, and
    IndexError/ValueError if the text is not laid out as expected.
    """
    score_stats = section.find("div", {"id": "scoreStats"})
    # The spans with an empty class hold, in order: reviews counted,
    # fresh count, rotten count.
    counts = score_stats.find_all("span", {"class": ''})
    meter_text = section.find("a", {"id": 'tomato_meter_link'}).text
    rating_text = score_stats.find("div", {"class": 'superPageFontColor'}).text
    return {
        'tomatometer': int(meter_text.strip().replace("%", "")),
        # The rating is read from the third line of the block, e.g. "7.5/10".
        'average_rating': float(rating_text.split("\n")[2].strip().replace("/10", "")),
        'reviews_counted': int(counts[0].text),
        'fresh': int(counts[1].text),
        'rotten': int(counts[2].text),
    }


def get_rotten_tomatoes_result(url):
    """Input URL of Rotten Tomatoes movie, and get dictionary of its ratings.

    Loads ``url`` in the module-level Selenium ``driver`` and parses the
    rendered page source with BeautifulSoup (lxml parser).

    Returns a dict with these entries:
      * ``"all critics"``  -- tomatometer, average_rating, reviews_counted,
        fresh, rotten.
      * ``"top critics"``  -- same keys; omitted when the top-critics
        section is absent or contains "Not Available".
      * ``"audience"``     -- score, average_rating, number_of_ratings.

    Raises AttributeError if a required element is not found on the page,
    and IndexError/ValueError if an element's text is not laid out as
    expected (the parsing relies on fixed line positions in the markup).
    """
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    results = {}

    # all critics
    results["all critics"] = _parse_critic_scores(
        soup.find("div", {"id": "all-critics-numbers"})
    )

    # top critics: a missing section is treated the same as "Not Available"
    top_critics = soup.find("div", {"id": "top-critics-numbers"})
    if top_critics is not None and "Not Available" not in top_critics.text:
        results["top critics"] = _parse_critic_scores(top_critics)

    # audience
    # TODO: consider using a strip() first before split
    audience_info = soup.find(
        "div", {"class": 'audience-info hidden-xs superPageFontColor'}
    ).text.split("\n")
    meter_text = soup.find("div", {"class": "meter-value"}).text
    results["audience"] = {
        'score': int(meter_text.split("\n")[1].replace("%", "")),
        # Lines 3 and 7 of the audience block hold the "x/5" rating and the
        # comma-separated number of ratings respectively.
        'average_rating': float(audience_info[3].strip().replace("/5", "")),
        'number_of_ratings': int(audience_info[7].strip().replace(",", "")),
    }
    return results
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment