-
-
Save jayrambhia/1678382 to your computer and use it in GitHub Desktop.
'''
Author : Jay Rambhia
Git : https://github.com/jayrambhia
gist : https://gist.github.com/jayrambhia
'''
import urllib2 | |
from BeautifulSoup import BeautifulSoup | |
from mechanize import Browser | |
import re | |
def getunicode(soup):
    '''Return the text held by a parsed HTML node as a single string.

    soup -- either a text node (a string instance) or a tag-like object
            exposing a ``contents`` list of child nodes; ``None`` is
            accepted and yields an empty string, so the result of a failed
            ``find()`` can be passed straight in.

    Text nodes have the entities ``&#39;``, ``&quot;`` and ``&nbsp;``
    replaced by the characters they stand for. Tag-like nodes are walked
    recursively and the text of all descendants is concatenated in
    document order.
    '''
    if soup is None:
        return ''
    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere.
    try:
        text_types = basestring
    except NameError:
        text_types = str
    if isinstance(soup, text_types):
        return (soup.replace('&#39;', "'")
                    .replace('&quot;', '"')
                    .replace('&nbsp;', ' '))
    children = soup.contents
    if not children:
        return ''
    return ''.join(getunicode(child) for child in children)
def main(proxies=None):
    '''Look up a movie on IMDb and print its details to stdout.

    Prompts for a movie name, opens the IMDb search page for it, follows
    the first link whose URL matches ``/title/tt`` and prints the title,
    rating, certificate, genres, release date, actors and description
    scraped from that page.

    proxies -- optional mapping of URL scheme to proxy URL, for example
               {'http': 'http://user:password@host:3128'}. It is passed to
               Browser.set_proxies(). When omitted, this function does not
               configure a proxy; the former hard-coded placeholder
               ('...@proxy:port') made every request fail with
               "InvalidURL: nonnumeric port".
    '''
    from urllib import quote_plus  # Python 2 stdlib, like the rest of the file

    def text_of(node):
        # find() returns None when the element is absent from the page.
        return getunicode(node) if node is not None else ''

    movie = str(raw_input('Movie Name: '))
    # Collapse runs of whitespace, then percent-encode so that characters
    # such as '&' or '#' in a title cannot corrupt the query string.
    query = quote_plus(' '.join(movie.split()))
    url = 'http://www.imdb.com/find?q=' + query + '&s=all'

    br = Browser()
    if proxies:
        br.set_proxies(proxies)
    br.open(url)
    link = br.find_link(url_regex=re.compile(r'/title/tt.*'))
    res = br.follow_link(link)
    soup = BeautifulSoup(res.read())

    movie_title = text_of(soup.find('title'))
    rating = text_of(soup.find('span', itemprop='ratingValue'))
    actors = [getunicode(a) for a in soup.findAll('a', itemprop='actors')]

    meta = soup.find('meta', {'name': 'description'})
    des = meta['content'] if meta is not None else ''

    r = ''
    genre = []
    release_date = ''
    infobar = soup.find('div', {'class': 'infobar'})
    if infobar is not None:
        rated = infobar.find('', {'title': True})
        if rated is not None:
            r = rated['title']
        links = infobar.findAll('a', {'href': True})
        if links:
            # Every link but the last is a genre; the last is the release date.
            genre = [getunicode(a) for a in links[:-1]]
            release_date = getunicode(links[-1])

    # Single-argument print calls produce the same output as the original
    # print statements on Python 2.
    print(movie_title + ' ' + (rating + '/10.0' if rating else 'unrated'))
    print('Release Date: ' + release_date)
    print('Rated ' + r)
    print('')
    print('Genre: ' + ', '.join(genre))
    print('\nActors: ' + ', '.join(actors))
    print('\nDescription:')
    print(des)


if __name__ == '__main__':
    main()
''' | |
Author : Jay Rambhia | |
Git : https://github.com/jayrambhia | |
gist : https://gist.github.com/jayrambhia | |
''' | |
import urllib2 | |
from BeautifulSoup import BeautifulSoup | |
from mechanize import Browser | |
import re | |
def getunicode(soup):
    '''Return the text held by a parsed HTML node as a single string.

    soup -- either a text node (a string instance) or a tag-like object
            exposing a ``contents`` list of child nodes; ``None`` is
            accepted and yields an empty string, so the result of a failed
            ``find()`` can be passed straight in.

    Text nodes have the entities ``&#39;``, ``&quot;`` and ``&nbsp;``
    replaced by the characters they stand for. Tag-like nodes are walked
    recursively and the text of all descendants is concatenated in
    document order.
    '''
    if soup is None:
        return ''
    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere.
    try:
        text_types = basestring
    except NameError:
        text_types = str
    if isinstance(soup, text_types):
        return (soup.replace('&#39;', "'")
                    .replace('&quot;', '"')
                    .replace('&nbsp;', ' '))
    children = soup.contents
    if not children:
        return ''
    return ''.join(getunicode(child) for child in children)
def main(proxies=None):
    '''Look up a movie on IMDb and print its details to stdout.

    Prompts for a movie name, opens the IMDb search page for it, follows
    the first link whose URL matches ``/title/tt`` and prints the title,
    rating, certificate, genres, release date, actors and description
    scraped from that page.

    proxies -- optional mapping of URL scheme to proxy URL, for example
               {'http': 'http://user:password@host:3128'}. It is passed to
               Browser.set_proxies(). When omitted, this function does not
               configure a proxy; the former hard-coded placeholder
               ('...@proxy:port') made every request fail with
               "InvalidURL: nonnumeric port".
    '''
    from urllib import quote_plus  # Python 2 stdlib, like the rest of the file

    def text_of(node):
        # find() returns None when the element is absent from the page.
        return getunicode(node) if node is not None else ''

    movie = str(raw_input('Movie Name: '))
    # Collapse runs of whitespace, then percent-encode so that characters
    # such as '&' or '#' in a title cannot corrupt the query string.
    query = quote_plus(' '.join(movie.split()))
    url = 'http://www.imdb.com/find?q=' + query + '&s=all'

    br = Browser()
    if proxies:
        br.set_proxies(proxies)
    br.open(url)
    link = br.find_link(url_regex=re.compile(r'/title/tt.*'))
    res = br.follow_link(link)
    soup = BeautifulSoup(res.read())

    movie_title = text_of(soup.find('title'))
    rating = text_of(soup.find('span', itemprop='ratingValue'))
    actors = [getunicode(a) for a in soup.findAll('a', itemprop='actors')]

    meta = soup.find('meta', {'name': 'description'})
    des = meta['content'] if meta is not None else ''

    r = ''
    genre = []
    release_date = ''
    infobar = soup.find('div', {'class': 'infobar'})
    if infobar is not None:
        rated = infobar.find('', {'title': True})
        if rated is not None:
            r = rated['title']
        links = infobar.findAll('a', {'href': True})
        if links:
            # Every link but the last is a genre; the last is the release date.
            genre = [getunicode(a) for a in links[:-1]]
            release_date = getunicode(links[-1])

    # Single-argument print calls produce the same output as the original
    # print statements on Python 2.
    print(movie_title + ' ' + (rating + '/10.0' if rating else 'unrated'))
    print('Release Date: ' + release_date)
    print('Rated ' + r)
    print('')
    print('Genre: ' + ', '.join(genre))
    print('\nActors: ' + ', '.join(actors))
    print('\nDescription:')
    print(des)


if __name__ == '__main__':
    main()
It's giving the following error:
`Movie Name: avengers
Traceback (most recent call last):
File "/home/shubham/PycharmProjects/TV_Adro/Excel/search.py", line 17, in <module>
br.open(url)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_mechanize.py", line 254, in open
return self._mech_open(url_or_request, data, timeout=timeout)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_mechanize.py", line 284, in _mech_open
response = UserAgentBase.open(self, request, data)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_opener.py", line 190, in open
req = meth(req)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_http.py", line 253, in http_request
self.rfp.read()
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_http.py", line 174, in read
f = self._opener.open(req)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_mechanize.py", line 254, in open
return self._mech_open(url_or_request, data, timeout=timeout)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_mechanize.py", line 284, in _mech_open
response = UserAgentBase.open(self, request, data)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_opener.py", line 195, in open
response = urlopen(self, req, data)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_urllib2_fork.py", line 352, in _open
'_open', req)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_urllib2_fork.py", line 340, in _call_chain
result = func(*args)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_urllib2_fork.py", line 1185, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_urllib2_fork.py", line 1127, in do_open
h = http_class(host_port, timeout=req.timeout)
File "/usr/lib/python2.7/httplib.py", line 751, in __init__
(self.host, self.port) = self._get_hostport(host, port)
File "/usr/lib/python2.7/httplib.py", line 792, in _get_hostport
raise InvalidURL("nonnumeric port: '%s'" % host[i+1:])
httplib.InvalidURL: nonnumeric port: 'port'
Process finished with exit code 1`
I had the same problem as shubhamjanhere
+1 ... same here!
@shubhamjanhere @ricardorqr this does the same thing but works for me:
https://gist.github.com/rpryzant/cb4fe2c4d676262d667a68fcbf4e4c91
@shubhamjanhere @ricardorqr you have to replace the placeholder in the br.set_proxies call with your own proxy address (or remove the call if you are not behind a proxy).
Remade after IMDb changed the way their page works
https://gist.github.com/Dob-The-Duilder/98f0765ce6dd9c1e11c3a649619654ac
it works for me