Simple Huxiu Spider
#!/usr/bin/python3
import requests
from bs4 import BeautifulSoup

try:
    from urllib.parse import urljoin   # Python 3.x
except ImportError:
    from urlparse import urljoin       # Python 2.x

class Spider:

    def __init__(self, URL):
        self.URL = URL  # e.g. http://www.huxiu.com/

    def start_crawl(self):
        """Mission start :)
        """
        for t_name, t_url in self.get_tag_list():
            if not t_name:
                continue
            print('\n', t_name)
            self.get_article_from_tag(t_url)
    def get_tag_list(self):
        """Yield every tag and its corresponding URL.
        """
        target = "tagslist/all.html"  # this URL contains all of the tags :)
        url = urljoin(self.URL, target)
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        for each in soup.find_all('dl'):
            for t in each('li'):
                yield t('a')[0].string, t('a')[0]['href']  # tag_name, tag_url
    def get_article_from_tag(self, tag_url):
        """Print every article title under the given tag.
        """
        url = urljoin(self.URL, tag_url)
        index = 1
        while True:
            try:
                r = requests.get(url)
            except requests.RequestException:
                print('Failed to fetch URL: {}'.format(url))
                return
            # output the article titles
            soup = BeautifulSoup(r.text, 'html.parser')
            for each in soup.find_all('dl'):
                print(index, '. ', each('h3')[0].string)  # article title
                index += 1
            # get the next page's URL
            current = soup.find('div', 'pull-right pgs')
            if current:  # some pages are special, so ensure current is not None
                marker = current.find('b')  # <b> marks the current page
                next_url = marker.find_next_sibling('a') if marker else None
                if next_url:
                    url = urljoin(self.URL, next_url['href'])
                    continue
            break  # stop once every article has been indexed
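
# NOTE (assumption): the pager is parsed as a <div class="pull-right pgs">
# whose children look like <a>1</a> <b>2</b> <a>3</a> ..., with <b> marking
# the current page, so the "next page" link is taken to be the first <a>
# sibling after the <b>. huxiu.com's real markup may differ; adjust the
# selectors above if so.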

if __name__ == '__main__':
    sp = Spider('http://www.huxiu.com/')
    sp.start_crawl()
This code finds all of the tags on www.huxiu.com and the articles under each tag.
Just for fun :)
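
A minimal usage sketch, assuming the gist above is saved as huxiu_spider.py (the module name is hypothetical); it lists the first few tags without crawling every article:

from itertools import islice

from huxiu_spider import Spider  # hypothetical module name

sp = Spider('http://www.huxiu.com/')
for name, url in islice(sp.get_tag_list(), 5):
    print(name, '->', url)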