Created
June 13, 2017 08:14
-
-
Save aliostad/3f91e55376c53489608f4dc8050ee71e to your computer and use it in GitHub Desktop.
A simple script that finds Amazon Prime (UK) films and their Rotten Tomatoes scores.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import codecs | |
import requests | |
def get_film_names(doc):
    """Collect the film entries found in one search-results HTML page.

    Every ``h2`` matched by ``ul li .s-item-container h2`` yields one entry:
    its text with tabs replaced by spaces. When the heading's grandparent has
    a direct ``span.a-color-secondary`` child, that span's text (presumably
    the release year) is appended after a tab separator.

    Args:
        doc: HTML markup of the page.

    Returns:
        A list of ``name`` or ``name<TAB>extra`` strings in document order.
    """
    soup = BeautifulSoup(doc, 'html.parser')
    entries = []
    for heading in soup.select('ul li .s-item-container h2'):
        entry = heading.text.replace('\t', ' ')
        year_spans = heading.parent.parent.select('> span.a-color-secondary')
        if year_spans:
            # Tab-separate the extra text so it can be split off again later.
            entry = entry + '\t' + year_spans[0].text
        entries.append(entry)
    return entries
def get_rotten_tomatoes_scorex(filmName):
    """Fetch one Rotten Tomatoes film page and pull the score text out of it.

    Requests ``https://www.rottentomatoes.com/m/<filmName>`` with a mobile
    Firefox User-Agent header and parses the response body as HTML.

    Args:
        filmName: Path segment appended to the ``/m/`` URL as-is.

    Returns:
        The text of the first element matching
        ``#all-critics-numbers span.meter-value span``, or ``None`` when the
        page contains no such element.
    """
    user_agent = 'Mozilla/5.0 (Mobile; rv:26.0) Gecko/26.0 Firefox/26.'
    response = requests.get('https://www.rottentomatoes.com/m/' + filmName,
                            headers={'User-Agent': user_agent})
    soup = BeautifulSoup(response.text, 'html.parser')
    matches = soup.select('#all-critics-numbers span.meter-value span')
    if not matches:
        return None
    return matches[0].text
def get_rotten_tomatoes_score(filmName, filmYear):
    """Look up a film's score, retrying with the year appended to the name.

    The name is turned into a URL slug by replacing spaces with underscores
    and dropping colons and apostrophes. The plain slug is tried first; only
    if that lookup returns ``None`` is ``<slug>_<filmYear>`` tried.

    Args:
        filmName: Film title as a string.
        filmYear: Year as a string, used only for the second attempt.

    Returns:
        Whatever ``get_rotten_tomatoes_scorex`` returns for the first attempt
        that is not ``None``, otherwise the result of the second attempt
        (which may itself be ``None``).
    """
    slug = filmName
    for old, new in ((' ', '_'), (':', ''), ("'", '')):
        slug = slug.replace(old, new)

    first_try = get_rotten_tomatoes_scorex(slug)
    if first_try is not None:
        return first_try
    return get_rotten_tomatoes_scorex(slug + '_' + filmYear)
def iterate(frompg=1, pgcount=1000):
    """Scrape Amazon (UK) search-result pages and save the film entries.

    Fetches the result pages numbered ``frompg`` up to, but not including,
    ``pgcount`` and writes every entry returned by ``get_film_names`` to
    ``amaz-films.txt`` (UTF-8, one entry per line). The file is truncated on
    every call. Each requested URL is printed to stdout as progress output.

    The function returns normally rather than terminating the interpreter,
    so a caller can go on to run further steps after it.

    Args:
        frompg: Number of the first page to fetch.
        pgcount: Page number at which to stop (exclusive, as in ``range``).

    Returns:
        None.
    """
    templ = 'https://www.amazon.co.uk/s/ref=sr_pg_{}?fst=as%3Aoff&rh=n%3A3010085031%2Cn%3A%213010086031%2Cn%3A3046737031%2Cp_85%3A3282143031&bbn=3046737031&ie=UTF8&qid=1497108228&page={}'
    headers = {'User-Agent': 'Mozilla/5.0 (Mobile; rv:26.0) Gecko/26.0 Firefox/26.'}

    # ``with`` guarantees the output file is flushed and closed even when a
    # request or the HTML parsing raises part-way through the page range.
    with codecs.open('amaz-films.txt', mode='w', encoding='utf-8') as f:
        for pg in range(frompg, pgcount):
            # The page number appears twice in the URL: in the ref path
            # segment and in the ``page`` query parameter.
            url = templ.format(pg, pg)
            print(url)
            r = requests.get(url, headers=headers)
            for film in get_film_names(r.text):
                f.write(film + '\n')
def _score_value(score):
    """Return a score string such as ``'85'`` or ``'85%'`` as an integer.

    Scores that cannot be parsed map to -1 so they sort below every real
    score when ordering from highest to lowest.
    """
    try:
        return int(score.strip().rstrip('%'))
    except ValueError:
        return -1


def find_score(fileName='amaz-films.txt'):
    """Look up a score for every film listed in ``fileName`` and rank them.

    Each non-blank line of the input is either ``name`` or ``name<TAB>year``;
    when the year is missing, ``'2010'`` is used. Every successful lookup is
    echoed to stdout as ``name => score``. Afterwards the films are written
    to ``fileName + '.score'`` (UTF-8) as ``score - name`` lines, ordered
    from the highest numeric score to the lowest, and echoed to stdout.

    A lookup that raises is reported on stdout and skipped so that a single
    failure does not abort the whole run.

    Args:
        fileName: Path of the UTF-8 input file, one film per line.

    Returns:
        None.
    """
    films = {}
    with codecs.open(filename=fileName, mode='r', encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\r\n')
            if not line.strip():
                # Nothing to look up on a blank line.
                continue
            # Split on the first tab only, so an unexpected extra tab cannot
            # make the unpacking fail.
            filmName, _, filmYear = line.partition('\t')
            if not filmYear:
                filmYear = '2010'
            try:
                score = get_rotten_tomatoes_score(filmName, filmYear)
            except Exception as e:
                # Best-effort scraping: report the failure and move on.
                print(e)
                continue
            if score is not None:
                films[filmName] = score
                # Unicode format strings keep non-ASCII titles from raising
                # on Python 2, where the scraped text is ``unicode``.
                print(u'{} => {}'.format(filmName, score))

    # Rank numerically; comparing the raw strings would put '9' above '85'.
    sortedFilms = sorted(films, key=lambda name: _score_value(films[name]),
                         reverse=True)
    with codecs.open(filename=fileName + '.score', mode='w',
                     encoding='utf-8') as fr:
        for film in sortedFilms:
            txt = u'{} - {}'.format(films[film], film)
            print(txt)
            fr.write(txt + '\n')
# Run the two steps only when executed as a script, so that importing this
# module does not start a scrape.
if __name__ == '__main__':
    # Step 1: find the Amazon films and store them in a file.
    iterate(1, 400)
    # Step 2: find the score for the films loaded from that file.
    find_score()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment