Get Amazon Prime information on Roger Ebert's Great Movies
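"""
Scrape the full list of Roger Ebert "Great Movies" reviews from
rogerebert.com, then look each title up on Amazon's Prime Video search to
record whether it is available and whether it streams at no extra cost with
Prime. Each results page is cached as JSON under data/, and everything is
combined into data/results.csv at the end.
"""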
import csv
import json
import os
import re
import urllib.parse

from random import choice

import mechanize
import requests
from bs4 import BeautifulSoup

user_agents = [
    'Mozilla/5.0 (X11; U; Linux; i686; en-US; rv:1.6) Gecko Debian/1.6-7',
    'Konqueror/3.0-rc4; (Konqueror/3.0-rc4; i686 Linux;;datecode)',
    'Opera/9.52 (X11; Linux i686; U; en)',
]
random_user_agent = choice(user_agents)

def all_great_movies():
    """Page through the Great Movies index, saving each page's titles as JSON."""
    ebert_url = "https://www.rogerebert.com/great-movies?utf8=%E2%9C%93&filters%5Btitle%5D=&sort%5Border%5D=newest&filters%5Byears%5D%5B%5D=1914&filters%5Byears%5D%5B%5D=2020&filters%5Bstar_rating%5D%5B%5D=0.0&filters%5Bstar_rating%5D%5B%5D=4.0&filters%5Bno_stars%5D=1&page={}"
    curr_page = 1
    headers = {'accept': 'application/json'}
    os.makedirs("data", exist_ok=True)  # the per-page JSON files land here
    while True:
        print("Parsing page {}".format(curr_page))
        data = []
        response = requests.get(ebert_url.format(curr_page), headers=headers)
        # The endpoint returns JSON with the rendered list markup under the 'html' key.
        data_soup = BeautifulSoup(response.json()['html'], features="html.parser")
        reviews = data_soup.find_all("div", class_="review-stack")
        for review in reviews:
            title = review.find("h5", class_="review-stack--title")
            title_anchor = title.find("a")
            data.append({
                "review_url": title_anchor['href'],
                "title": title_anchor.text,
            })
        if data:
            print("{} movies saved".format(len(data)))
            with open("data/{}.json".format(curr_page), 'w') as f:
                f.write(json.dumps(data))
            curr_page += 1
        else:
            # An empty page means we've walked past the last page of results.
            break

def amazon_search(movie_title):
    """Search Amazon's Prime Video catalog for a title and check Prime availability."""
    print("Searching movie: {}".format(movie_title))
    url = "https://www.amazon.com/s?k={}&i=instant-video"
    search_key = urllib.parse.quote_plus(movie_title.lower())
    end = url.format(search_key)
    br = mechanize.Browser()
    br.addheaders = [('User-Agent', random_user_agent)]
    response = br.open(end)
    data_soup = BeautifulSoup(response.get_data(), features="html.parser")
    txt = str(data_soup).replace("\n", "")
    # Each search result's title lives in one of these h2 blocks.
    rgex = re.compile(r'<h2 class="a-size-mini a-spacing-none a-color-base s-line-clamp-2">.*?</h2>')
    matches = rgex.findall(txt)
    available_on_amazon = False
    included_with_prime = False
    amazon_url = None
    for match in matches:
        soup = BeautifulSoup(match, features="html.parser")
        anchor = soup.find("a")
        search_title = anchor.text.strip().lower()
        # Loose match: treat the result as a hit if its title appears within ours.
        if search_title in movie_title.lower():
            print("Available on amazon!")
            curl = "https://www.amazon.com{}".format(anchor['href'])
            available_on_amazon = True
            amazon_url = curl
            try:
                res = br.open(curl)
                if "Watch for $0.00 with Prime" in str(res.get_data()):
                    included_with_prime = True
                break
            except Exception:
                print("Unable to get prime information for {}".format(movie_title))
                break
    return {
        "available_on_amazon": available_on_amazon,
        "included_with_prime": included_with_prime,
        "url": amazon_url,
    }
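
# Illustrative standalone call (hypothetical title; a real run hits the network):
#   amazon_search("Citizen Kane")
#   -> {"available_on_amazon": ..., "included_with_prime": ..., "url": ...}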

def add_amazon_data_to_ebert_movies(name):
    """Read one page's JSON file, enrich each movie with Amazon data, write it back."""
    updated_data = []
    ebert_data = []
    with open(name, 'r') as f:
        contents = f.read()
        if contents:
            ebert_data = json.loads(contents)
    for movie_info in ebert_data:
        amazon_info = amazon_search(movie_info["title"])
        updated_data.append({**movie_info, **amazon_info})
    if ebert_data:
        with open(name, 'w') as f:
            f.write(json.dumps(updated_data))

def run():
    all_great_movies()
    for myfile in os.listdir('data'):
        # Skip non-JSON files (e.g. results.csv left over from a previous run).
        if not myfile.endswith(".json"):
            continue
        print("------Page {}------".format(myfile))
        add_amazon_data_to_ebert_movies("data/{}".format(myfile))
    combined_movie_data = []
    for myfile in os.listdir('data'):
        if not myfile.endswith(".json"):
            continue
        with open("data/{}".format(myfile), 'r') as f:
            combined_movie_data += json.loads(f.read())
    with open('data/results.csv', 'w') as csvfile:
        # '~' as delimiter keeps titles containing commas from breaking the CSV.
        filewriter = csv.writer(csvfile, delimiter='~')
        filewriter.writerow(['Title', 'Review URL', 'Available on Amazon', 'Included With Prime', 'Prime URL'])
        for row in combined_movie_data:
            filewriter.writerow([
                row['title'],
                "www.rogerebert.com{}".format(row['review_url']),
                row['available_on_amazon'],
                row['included_with_prime'],
                row['url'],
            ])


if __name__ == '__main__':
    run()
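
For reference, a minimal sketch of reading the combined output back, assuming run() completed and data/results.csv exists; the field names match the header row run() writes:

import csv

with open("data/results.csv", newline="") as csvfile:
    # The '~' delimiter must match the one used when the file was written.
    reader = csv.DictReader(csvfile, delimiter="~")
    for row in reader:
        # Boolean columns were serialized by csv.writer as the strings "True"/"False".
        if row["Included With Prime"] == "True":
            print("{} -> {}".format(row["Title"], row["Prime URL"]))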