Get Amazon Prime information on Roger Ebert's Great Movies
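"""
Scrape the full list of Roger Ebert "Great Movies" reviews from
rogerebert.com, then look each title up on Amazon's Prime Video search to
record whether it is available and whether it streams at no extra cost with
Prime. Each results page is cached as JSON under data/, and everything is
combined into data/results.csv at the end.
"""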
import csv
import json
import os
import re
import urllib.parse

from random import choice

import mechanize
import requests
from bs4 import BeautifulSoup

user_agents = [
    'Mozilla/5.0 (X11; U; Linux; i686; en-US; rv:1.6) Gecko Debian/1.6-7',
    'Konqueror/3.0-rc4; (Konqueror/3.0-rc4; i686 Linux;;datecode)',
    'Opera/9.52 (X11; Linux i686; U; en)',
]
random_user_agent = choice(user_agents)

def all_great_movies():
    """Page through the Great Movies index, saving each page's titles as JSON."""
    ebert_url = "https://www.rogerebert.com/great-movies?utf8=%E2%9C%93&filters%5Btitle%5D=&sort%5Border%5D=newest&filters%5Byears%5D%5B%5D=1914&filters%5Byears%5D%5B%5D=2020&filters%5Bstar_rating%5D%5B%5D=0.0&filters%5Bstar_rating%5D%5B%5D=4.0&filters%5Bno_stars%5D=1&page={}"
    curr_page = 1
    headers = {'accept': 'application/json'}
    os.makedirs("data", exist_ok=True)  # the per-page JSON files land here
    while True:
        print("Parsing page {}".format(curr_page))
        data = []
        response = requests.get(ebert_url.format(curr_page), headers=headers)
        # The endpoint returns JSON with the rendered list markup under the 'html' key.
        data_soup = BeautifulSoup(response.json()['html'], features="html.parser")
        reviews = data_soup.find_all("div", class_="review-stack")
        for review in reviews:
            title = review.find("h5", class_="review-stack--title")
            title_anchor = title.find("a")
            data.append({
                "review_url": title_anchor['href'],
                "title": title_anchor.text,
            })
        if data:
            print("{} movies saved".format(len(data)))
            with open("data/{}.json".format(curr_page), 'w') as f:
                f.write(json.dumps(data))
            curr_page += 1
        else:
            # An empty page means we've walked past the last page of results.
            break

def amazon_search(movie_title):
    """Search Amazon's Prime Video catalog for a title and check Prime availability."""
    print("Searching movie: {}".format(movie_title))
    url = "https://www.amazon.com/s?k={}&i=instant-video"
    search_key = urllib.parse.quote_plus(movie_title.lower())
    end = url.format(search_key)
    br = mechanize.Browser()
    br.addheaders = [('User-Agent', random_user_agent)]
    response = br.open(end)
    data_soup = BeautifulSoup(response.get_data(), features="html.parser")
    txt = str(data_soup).replace("\n", "")
    # Each search result's title lives in one of these h2 blocks.
    rgex = re.compile(r'<h2 class="a-size-mini a-spacing-none a-color-base s-line-clamp-2">.*?</h2>')
    matches = rgex.findall(txt)
    available_on_amazon = False
    included_with_prime = False
    amazon_url = None
    for match in matches:
        soup = BeautifulSoup(match, features="html.parser")
        anchor = soup.find("a")
        search_title = anchor.text.strip().lower()
        # Loose match: treat the result as a hit if its title appears within ours.
        if search_title in movie_title.lower():
            print("Available on amazon!")
            curl = "https://www.amazon.com{}".format(anchor['href'])
            available_on_amazon = True
            amazon_url = curl
            try:
                res = br.open(curl)
                if "Watch for $0.00 with Prime" in str(res.get_data()):
                    included_with_prime = True
                break
            except Exception:
                print("Unable to get prime information for {}".format(movie_title))
                break
    return {
        "available_on_amazon": available_on_amazon,
        "included_with_prime": included_with_prime,
        "url": amazon_url,
    }
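
# Illustrative standalone call (hypothetical title; a real run hits the network):
#   amazon_search("Citizen Kane")
#   -> {"available_on_amazon": ..., "included_with_prime": ..., "url": ...}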

def add_amazon_data_to_ebert_movies(name):
    """Read one page's JSON file, enrich each movie with Amazon data, write it back."""
    updated_data = []
    ebert_data = []
    with open(name, 'r') as f:
        contents = f.read()
        if contents:
            ebert_data = json.loads(contents)
    for movie_info in ebert_data:
        amazon_info = amazon_search(movie_info["title"])
        updated_data.append({**movie_info, **amazon_info})
    if ebert_data:
        with open(name, 'w') as f:
            f.write(json.dumps(updated_data))

def run():
    all_great_movies()
    for myfile in os.listdir('data'):
        # Skip non-JSON files (e.g. results.csv left over from a previous run).
        if not myfile.endswith(".json"):
            continue
        print("------Page {}------".format(myfile))
        add_amazon_data_to_ebert_movies("data/{}".format(myfile))
    combined_movie_data = []
    for myfile in os.listdir('data'):
        if not myfile.endswith(".json"):
            continue
        with open("data/{}".format(myfile), 'r') as f:
            combined_movie_data += json.loads(f.read())
    with open('data/results.csv', 'w') as csvfile:
        # '~' as delimiter keeps titles containing commas from breaking the CSV.
        filewriter = csv.writer(csvfile, delimiter='~')
        filewriter.writerow(['Title', 'Review URL', 'Available on Amazon', 'Included With Prime', 'Prime URL'])
        for row in combined_movie_data:
            filewriter.writerow([
                row['title'],
                "www.rogerebert.com{}".format(row['review_url']),
                row['available_on_amazon'],
                row['included_with_prime'],
                row['url'],
            ])


if __name__ == '__main__':
    run()
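
For reference, a minimal sketch of reading the combined output back, assuming run() completed and data/results.csv exists; the field names match the header row run() writes:

import csv

with open("data/results.csv", newline="") as csvfile:
    # The '~' delimiter must match the one used when the file was written.
    reader = csv.DictReader(csvfile, delimiter="~")
    for row in reader:
        # Boolean columns were serialized by csv.writer as the strings "True"/"False".
        if row["Included With Prime"] == "True":
            print("{} -> {}".format(row["Title"], row["Prime URL"]))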