|
from collections import OrderedDict |
|
from functools import partial |
|
import logging |
|
import sys |
|
|
|
from bs4 import BeautifulSoup |
|
import dataset |
|
import requests |
|
from thready import threaded |
|
import sqlalchemy |
|
|
|
# Root logger (``getLogger()`` is called without a name). main() attaches a
# stdout handler to it and scrape_title() reports each stored genre through it.
logger = logging.getLogger()
|
|
|
|
|
def netflix_url_generator(start=0, stop=90000):
    """Yield candidate Netflix micro-genre listing URLs.

    Each URL addresses the ``WiAltGenre`` page for one numeric genre id
    (the ``agid`` query parameter). URLs are produced lazily, in increasing
    id order.

    Args:
        start: First genre id to generate (inclusive). Defaults to 0.
        stop: Genre id at which generation stops (exclusive). Defaults to
            90000.

    Yields:
        str: The listing URL for every id in ``[start, stop)``.
    """
    # A plain counter keeps the generator lazy without depending on
    # ``xrange``, which exists only on Python 2.
    agid = start
    while agid < stop:
        yield 'http://movies.netflix.com/WiAltGenre?agid=%d' % agid
        agid += 1
|
|
|
|
|
def scrape_title(r, db, url):
    """Fetch one genre page and record the micro-genre name found on it.

    Args:
        r: Object whose ``get(url)`` returns the HTTP response for the page
            (main() binds the session returned by setup_env()).
        db: Database handle; the result is upserted into
            ``db['micro_genres']``.
        url: Address of the genre page to fetch.

    Returns:
        None. Nothing is stored when the page has no breadcrumb link.

    Raises:
        Whatever ``raise_for_status()`` raises for an unsuccessful response.
    """
    response = r.get(url)
    response.raise_for_status()
    page = BeautifulSoup(response.content)

    try:
        crumb_link = page.select('h1 .crumb a')[0]
    except IndexError:
        # No breadcrumb link was found. This can happen when the site forces
        # you through the profile screen instead of showing the genre page.
        return

    micro_genre = crumb_link.text
    row = {'source_url': url, 'micro_genre': micro_genre}

    # Keyed on source_url so that re-scraping a URL updates its row.
    db['micro_genres'].upsert(row, ['source_url'])
    logger.info('Found: %s', micro_genre)
|
|
|
|
|
def get_database():
    """Open the SQLite database file used by this script.

    Returns:
        The object returned by ``dataset.connect`` for
        ``missed_connections.db`` (a path relative to the working directory).
    """
    database_url = 'sqlite:///missed_connections.db'
    return dataset.connect(database_url)
|
|
|
|
|
def _login(email, password):
    """Return a requests session that has submitted the Netflix login form.

    Raises an HTTP error (via ``raise_for_status``) if either the login page
    or the login submission answers with an error status.
    """
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36',
    })

    login_url = 'https://signup.netflix.com/Login'

    resp = session.get(login_url)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content)

    # The login page embeds an ``authURL`` input whose value is posted back
    # together with the credentials, in this field order.
    auth_data = OrderedDict([
        ('authURL', soup.select('input[name=authURL]')[0]['value']),
        ('email', email),
        ('password', password),
    ])

    resp = session.post(login_url, data=auth_data, headers={
        'Accept-Language': 'en-US,en;q=0.8',
        'Origin': 'https://signup.netflix.com',
        'Pragma': 'no-cache',
        'Referer': 'https://signup.netflix.com/Login?nextpage=http%3A%2F%2Fmovies.netflix.com%2FWiHome%3Flocale%3Den-US%26ref%3Dec'
    })
    # Surface HTTP-level failures instead of silently scraping while logged
    # out. NOTE(review): a rejected login that still answers with a success
    # status is not detected here.
    resp.raise_for_status()

    return session


def _ensure_micro_genres_table(db):
    """Create the ``micro_genres`` table and its columns if it is missing."""
    table_name = 'micro_genres'
    try:
        # Called only for its exception: it raises when the table is absent.
        db.load_table(table_name)
    except sqlalchemy.exc.NoSuchTableError:
        table = db.get_table(table_name)
        table.create_column('source_url', sqlalchemy.String)
        table.create_column('micro_genre', sqlalchemy.String)
        db.commit()


def setup_env(email, password):
    """Log in to Netflix and prepare the results database.

    Args:
        email: Account e-mail submitted in the login form.
        password: Account password submitted in the login form.

    Returns:
        tuple: ``(session, db)`` where ``session`` is the ``requests.Session``
        used for the login and ``db`` is the handle from ``get_database()``
        with the ``micro_genres`` table present.

    Raises:
        requests.exceptions.HTTPError: If the login page or the login
            submission returns an error status.
        IndexError: If the login page contains no ``authURL`` input.
    """
    session = _login(email, password)

    db = get_database()
    _ensure_micro_genres_table(db)

    return session, db
|
|
|
|
|
def main(email, password):
    """Configure logging, log in, and scrape every candidate genre URL.

    Args:
        email: Netflix account e-mail, forwarded to ``setup_env``.
        password: Netflix account password, forwarded to ``setup_env``.
    """
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    # A handler level only filters records the logger has already accepted.
    # The logger's effective level defaults to WARNING, so without this call
    # every ``logger.info(...)`` is dropped before it reaches the handler.
    # NOTE(review): other loggers that propagate to this one may become more
    # verbose as well.
    logger.setLevel(logging.INFO)

    r, db = setup_env(email, password)

    # Pre-bind the session and database so the worker only needs the URL.
    handler = partial(scrape_title, r, db)

    logger.info('About to start scraping micro genres')
    # Hands the URL generator and the worker to thready with 200 threads.
    threaded(netflix_url_generator(), handler, num_threads=200)
|
|
|
|
|
def print_data():
    """Print every stored micro-genre row, followed by the row count."""
    micro_genres = get_database()['micro_genres']

    # The parenthesised single-argument form prints the same text under the
    # Python 2 ``print`` statement and the Python 3 ``print`` function.
    for row in micro_genres:
        print(row)

    print(len(micro_genres))
|
|
|
|
|
if __name__ == '__main__':
    # Both credentials are required positional arguments. Exit with a usage
    # message rather than an IndexError traceback when they are missing;
    # any extra arguments are ignored, as before.
    if len(sys.argv) < 3:
        sys.exit('Usage: %s EMAIL PASSWORD' % sys.argv[0])

    username = sys.argv[1]
    password = sys.argv[2]

    main(username, password)
    print_data()