Scrape IMDb to get non-duplicate movies of the given genres.
import urllib2
from BeautifulSoup import BeautifulSoup
import pdb
import os
import htmlentitydefs
from BeautifulSoup import BeautifulStoneSoup
import HTMLParser
import cgi
def search_in_dict(DICT, query):
    # If the title is already listed under any genre, drop it from that
    # genre and report a duplicate so the caller skips it as well.
    for key in DICT.keys():
        if query in DICT[key]:
            DICT[key].remove(query)
            return True
    return False
def HTMLEntitiesToUnicode(text):
    """Converts HTML entities to unicode. For example '&amp;' becomes '&'."""
    text = unicode(BeautifulStoneSoup(text, convertEntities=BeautifulStoneSoup.ALL_ENTITIES))
    return text
def write_dict_to_files(DICT):
    # Write one '<genre>.txt' file per genre, one title per line.
    OUTPUT_FOLDER = 'outputs'
    if not os.path.exists(OUTPUT_FOLDER):
        os.makedirs(OUTPUT_FOLDER)
    for key in DICT.keys():
        f = open(OUTPUT_FOLDER + '/' + key + '.txt', 'w')
        for title in DICT[key]:
            # Encode explicitly; plain Python 2 file objects reject non-ASCII unicode.
            f.write(HTMLEntitiesToUnicode(title).encode('utf-8') + '\n')
        f.close()
num_page = 15
address = ""
h = HTMLParser.HTMLParser()
GENRE_LIST = ['action', 'comedy', 'animation', 'drama', 'horror']
#GENRE_LIST = ['animation', 'romance', 'war']
GENRE_DICT = {}

for GENRE in GENRE_LIST:
    GENRE_DICT[GENRE] = []
    # Advanced-search listing for the genre: 100 released feature films per page.
    address = '/search/title?count=100&genres=' + GENRE + '&production_status=released&title_type=feature'
    for i in range(num_page):
        print address
        soup = BeautifulSoup(urllib2.urlopen('http://www.imdb.com' + address).read())
        for row in soup('td', {'class': 'title'}):
            film_name = h.unescape(row('a')[0].string)
            # Keep a title only if it has not been seen under another genre;
            # a title that shows up twice is dropped from both lists.
            if not search_in_dict(GENRE_DICT, film_name):
                GENRE_DICT[GENRE].append(film_name)
        # Follow the last link of the pagination block to reach the next page.
        next_link = soup.find('span', {'class': 'pagination'})
        link = next_link.findChildren('a')
        address = link[-1]['href']
    print('\n\n')

write_dict_to_files(GENRE_DICT)
print 'Finish'
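
For reference, the same approach can be sketched on Python 3, where urllib2, BeautifulSoup 3, and HTMLParser no longer exist. The sketch below is a minimal, illustrative port that assumes the third-party packages requests and beautifulsoup4; the search URL and the 'td.title' selector are reused from the script above, reflect the old IMDb layout, and may not match the current site.

# Python 3 sketch of the same scraping loop (assumes requests + beautifulsoup4;
# URL and selectors copied from the script above, old IMDb layout).
import html
import os

import requests
from bs4 import BeautifulSoup

BASE = 'http://www.imdb.com'
GENRES = ['action', 'comedy']   # trimmed list for illustration
NUM_PAGES = 2


def scrape_genre(genre, num_pages):
    """Collect film titles from up to num_pages listing pages of one genre."""
    titles = []
    address = '/search/title?count=100&genres=' + genre + '&title_type=feature'
    for _ in range(num_pages):
        soup = BeautifulSoup(requests.get(BASE + address).text, 'html.parser')
        for cell in soup.select('td.title'):
            link = cell.find('a')
            if link is not None and link.string:
                titles.append(html.unescape(link.string))
        # Follow the last link in the pagination block, as the script above does.
        page_links = soup.select('span.pagination a')
        if not page_links:
            break
        address = page_links[-1]['href']
    return titles


if __name__ == '__main__':
    # Assign each title to the first genre it was seen in; a title that shows up
    # under a second genre is dropped entirely, mirroring search_in_dict above.
    owner = {}
    for genre in GENRES:
        for title in scrape_genre(genre, NUM_PAGES):
            owner[title] = None if title in owner else genre
    os.makedirs('outputs', exist_ok=True)
    for genre in GENRES:
        with open('outputs/' + genre + '.txt', 'w', encoding='utf-8') as f:
            for title, g in owner.items():
                if g == genre:
                    f.write(title + '\n')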