Skip to content

Instantly share code, notes, and snippets.

@timtan
Created May 6, 2013 23:46
Show Gist options
  • Select an option

  • Save timtan/5529222 to your computer and use it in GitHub Desktop.

Select an option

Save timtan/5529222 to your computer and use it in GitHub Desktop.
def urlinformation():
    """Scrape title, genre and rating text for a fixed set of IMDb titles.

    For each hard-coded IMDb id the title page is downloaded and parsed
    with BeautifulSoup. The text of every ``<title>`` tag, every
    ``<span itemprop="genre">`` and every ``<span itemprop="ratingValue">``
    is collected; multiple matches of one kind are joined with ``","``.

    Returns:
        The result of ``zip(titles, genres, ratings)``: one
        ``(title, genres, rating)`` tuple of strings per id, in the order
        the ids are listed.

    Raises:
        urllib2.URLError: if a page cannot be fetched (propagated from
            ``urllib2.urlopen``).
    """
    titlelist = []
    genrelist = []
    valuelist = []
    lists = (
        titlelist,
        genrelist,
        valuelist,
    )
    idlist = ['0091209', '0111161']

    # Loop-invariant: the same browser-like User-Agent is sent for every id.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

    def first_text(tag):
        # First text node found inside the tag.
        return tag.find(text=True)

    for imdb_id in idlist:
        req = urllib2.Request(
            "http://www.imdb.com/title/tt" + imdb_id + "/", headers=headers)
        # Open the Request object, not the bare URL string; otherwise the
        # custom User-Agent header above is never sent.
        response = urllib2.urlopen(req, timeout=30)
        try:
            html = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()

        soup = BeautifulSoup(html)
        tags = (
            soup.findAll("title"),
            soup.findAll("span", {"itemprop": "genre"}),
            soup.findAll("span", {"itemprop": "ratingValue"}),
        )

        # `tags` and `lists` are parallel: title, genre, rating.
        for tag_list, result in zip(tags, lists):
            result.append(",".join(map(first_text, tag_list)))

    # Transpose the three per-field lists into per-title tuples.
    return zip(*lists)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment