-
-
Save jayrambhia/1678382 to your computer and use it in GitHub Desktop.
| ''' | |
| Author : Jay Rambhia | |
| Git : https://github.com/jayrambhia | |
| gist : https://gist.github.com/jayrambhia | |
| ''' | |
| import urllib2 | |
| from BeautifulSoup import BeautifulSoup | |
| from mechanize import Browser | |
| import re | |
| def getunicode(soup): | |
| body='' | |
| if isinstance(soup, unicode): | |
| soup = soup.replace(''',"'") | |
| soup = soup.replace('"','"') | |
| soup = soup.replace(' ',' ') | |
| body = body + soup | |
| else: | |
| if not soup.contents: | |
| return '' | |
| con_list = soup.contents | |
| for con in con_list: | |
| body = body + getunicode(con) | |
| return body | |
| def main(): | |
| movie = str(raw_input('Movie Name: ')) | |
| movie_search = '+'.join(movie.split()) | |
| base_url = 'http://www.imdb.com/find?q=' | |
| url = base_url+movie_search+'&s=all' | |
| title_search = re.compile('/title/tt\d+') | |
| br = Browser() | |
| br.set_proxies({'http':'http://username:password@proxy:port', | |
| 'https':'https://username:password@proxy:port'}) | |
| br.open(url) | |
| link = br.find_link(url_regex = re.compile(r'/title/tt.*')) | |
| res = br.follow_link(link) | |
| soup = BeautifulSoup(res.read()) | |
| movie_title = getunicode(soup.find('title')) | |
| rate = soup.find('span',itemprop='ratingValue') | |
| rating = getunicode(rate) | |
| actors=[] | |
| actors_soup = soup.findAll('a',itemprop='actors') | |
| for i in range(len(actors_soup)): | |
| actors.append(getunicode(actors_soup[i])) | |
| des = soup.find('meta',{'name':'description'})['content'] | |
| genre=[] | |
| infobar = soup.find('div',{'class':'infobar'}) | |
| r = infobar.find('',{'title':True})['title'] | |
| genrelist = infobar.findAll('a',{'href':True}) | |
| for i in range(len(genrelist)-1): | |
| genre.append(getunicode(genrelist[i])) | |
| release_date = getunicode(genrelist[-1]) | |
| print movie_title,rating+'/10.0' | |
| print 'Relase Date:',release_date | |
| print 'Rated',r | |
| print '' | |
| print 'Genre:', | |
| print ', '.join(genre) | |
| print '\nActors:', | |
| print ', '.join(actors) | |
| print '\nDescription:' | |
| print des | |
| if __name__ == '__main__': | |
| main() | |
| ''' | |
| Author : Jay Rambhia | |
| Git : https://github.com/jayrambhia | |
| gist : https://gist.github.com/jayrambhia | |
| ''' | |
| import urllib2 | |
| from BeautifulSoup import BeautifulSoup | |
| from mechanize import Browser | |
| import re | |
| def getunicode(soup): | |
| body='' | |
| if isinstance(soup, unicode): | |
| soup = soup.replace(''',"'") | |
| soup = soup.replace('"','"') | |
| soup = soup.replace(' ',' ') | |
| body = body + soup | |
| else: | |
| if not soup.contents: | |
| return '' | |
| con_list = soup.contents | |
| for con in con_list: | |
| body = body + getunicode(con) | |
| return body | |
| def main(): | |
| movie = str(raw_input('Movie Name: ')) | |
| movie_search = '+'.join(movie.split()) | |
| base_url = 'http://www.imdb.com/find?q=' | |
| url = base_url+movie_search+'&s=all' | |
| title_search = re.compile('/title/tt\d+') | |
| br = Browser() | |
| br.set_proxies({'http':'http://username:password@proxy:port', | |
| 'https':'https://username:password@proxy:port'}) | |
| br.open(url) | |
| link = br.find_link(url_regex = re.compile(r'/title/tt.*')) | |
| res = br.follow_link(link) | |
| soup = BeautifulSoup(res.read()) | |
| movie_title = getunicode(soup.find('title')) | |
| rate = soup.find('span',itemprop='ratingValue') | |
| rating = getunicode(rate) | |
| actors=[] | |
| actors_soup = soup.findAll('a',itemprop='actors') | |
| for i in range(len(actors_soup)): | |
| actors.append(getunicode(actors_soup[i])) | |
| des = soup.find('meta',{'name':'description'})['content'] | |
| genre=[] | |
| infobar = soup.find('div',{'class':'infobar'}) | |
| r = infobar.find('',{'title':True})['title'] | |
| genrelist = infobar.findAll('a',{'href':True}) | |
| for i in range(len(genrelist)-1): | |
| genre.append(getunicode(genrelist[i])) | |
| release_date = getunicode(genrelist[-1]) | |
| print movie_title,rating+'/10.0' | |
| print 'Relase Date:',release_date | |
| print 'Rated',r | |
| print '' | |
| print 'Genre:', | |
| print ', '.join(genre) | |
| print '\nActors:', | |
| print ', '.join(actors) | |
| print '\nDescription:' | |
| print des | |
| if __name__ == '__main__': | |
| main() | |
Its giving the following error -
`Movie Name: avengers
Traceback (most recent call last):
File "/home/shubham/PycharmProjects/TV_Adro/Excel/search.py", line 17, in
br.open(url)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_mechanize.py", line 254, in open
return self._mech_open(url_or_request, data, timeout=timeout)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_mechanize.py", line 284, in _mech_open
response = UserAgentBase.open(self, request, data)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_opener.py", line 190, in open
req = meth(req)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_http.py", line 253, in http_request
self.rfp.read()
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_http.py", line 174, in read
f = self._opener.open(req)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_mechanize.py", line 254, in open
return self._mech_open(url_or_request, data, timeout=timeout)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_mechanize.py", line 284, in _mech_open
response = UserAgentBase.open(self, request, data)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_opener.py", line 195, in open
response = urlopen(self, req, data)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_urllib2_fork.py", line 352, in _open
'_open', req)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_urllib2_fork.py", line 340, in _call_chain
result = func(*args)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_urllib2_fork.py", line 1185, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_urllib2_fork.py", line 1127, in do_open
h = http_class(host_port, timeout=req.timeout)
File "/usr/lib/python2.7/httplib.py", line 751, in init
(self.host, self.port) = self._get_hostport(host, port)
File "/usr/lib/python2.7/httplib.py", line 792, in _get_hostport
raise InvalidURL("nonnumeric port: '%s'" % host[i+1:])
httplib.InvalidURL: nonnumeric port: 'port'
Process finished with exit code 1`
I had the same problem as shubhamjanhere
+1 ... same here!
@shubhamjanhere @ricardorqr this does the same thing but works for me:
https://gist.github.com/rpryzant/cb4fe2c4d676262d667a68fcbf4e4c91
@shubhamjanhere @ricardorqr you have to give your proxy address for br.set_proxies function.
Remade after IMDb changed the way their page works
https://gist.github.com/Dob-The-Duilder/98f0765ce6dd9c1e11c3a649619654ac
it works for me