csdn_blog.py — a Python script that downloads CSDN blog posts as PDF files.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Usage:
    pip install requests
    pip install beautifulsoup4

    python csdn_blog.py
'''
import os
import re
import time
import json
import requests
from bs4 import BeautifulSoup

import huzhifeng
def download_pdf(author, url, title, post_date):
    # Save the article at `url` as pdf/<author>/<author>-<post_date>-<title>.pdf.
    filename = huzhifeng.format_filename(author + '-' + post_date + '-' + title + '.pdf')
    dir_name = 'pdf'
    sub_dir = dir_name + os.sep + author
    file_path = sub_dir + os.sep + filename

    try:
        print '%s: %s' % (filename, url)
    except:
        # The console may not be able to print the title; fall back to gbk.
        print 'Exception: %s: %s' % (filename.encode('gbk', 'ignore'), url)

    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    if not os.path.exists(sub_dir):
        os.mkdir(sub_dir)

    if os.path.exists(file_path):
        if os.path.getsize(file_path) > 0x10000:
            # A previous download larger than 64 KB is treated as complete.
            print 'File exists, do nothing'
            return
        else:
            print 'File too small, download it again'

    huzhifeng.pdfmyurl(url, file_path)
def main():
    print 'Started at %s' % (time.strftime('%Y/%m/%d %H:%M:%S', time.localtime()))

    headers = {
        'Host': 'blog.csdn.net',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36'
    }
    homepage = 'http://blog.csdn.net/'
    authors = ['feixiaoxing', 'v_JULY_v']

    for author in authors:
        url = homepage + author

        # Walk the author's article list page by page until there is no "next page" link.
        while url:
            print 'Current page %s' % (url)

            try:
                r = requests.get(url, headers=headers)
            except:
                print 'Exception: requests.get(%s)' % (url)
                break
            if not r or r.status_code != 200:
                print 'requests.get(%s) failed' % (url)
                break

            soup = BeautifulSoup(r.content)

            # The pager ("papelist") contains the link to the next page, labelled u'下一页' ("next page").
            papelist = soup.find(id='papelist').find_all('a')
            if not papelist:
                print 'Can not find papelist'
                break

            for a in papelist:
                if a.get_text() == u'下一页':
                    href = a.get('href')
                    url = homepage + href if href[0:1] == '/' else href
                    break
            else:
                print 'This is the last page'
                url = ''

            # CSDN has used two different class names for article entries; try both.
            article_list = (soup.find(id="article_list").find_all('div', class_='list_item list_view')
                            or soup.find(id="article_list").find_all('div', class_='list_item article_item'))
            if not article_list:
                print 'Can not find article_list'
                continue

            for article in article_list:
                link_title = article.find('div', class_='article_title').find('h1').find('span', class_='link_title').find('a')
                link_postdate = article.find('div', class_='article_manage').find('span', class_='link_postdate')
                if not link_title or not link_postdate:
                    print 'Can not find link_title or link_postdate'
                    continue

                # Remove any <font> tag inside the title link so only the plain title text remains.
                font = link_title.find('font')
                if font:
                    link_title.font.decompose()

                article_title = link_title.get_text().strip()
                href = link_title.get('href')
                article_url = homepage + href if href[0:1] == '/' else href
                article_post_time = link_postdate.get_text()
                if not article_title or not article_url or not article_post_time:
                    print 'Invalid title, url or time, skip this article'
                    continue

                match = re.search(r'(\d{4})-(\d{1,2})-(\d{1,2})', article_post_time.encode('utf-8', 'ignore'))
                if not match:
                    print 'Extract date failed: %s' % (article_post_time)
                    continue
                post_date = '%04d-%02d-%02d' % (int(match.group(1)), int(match.group(2)), int(match.group(3)))

                download_pdf(author, article_url, article_title, post_date)
                time.sleep(3)

    print 'Stopped at %s' % (time.strftime('%Y/%m/%d %H:%M:%S', time.localtime()))


if __name__ == '__main__':
    main()
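
The script imports a companion module named huzhifeng that is not shown here; it must provide format_filename() and pdfmyurl(). The sketch below is only a rough idea of what that module needs to do, assuming format_filename() strips characters that are illegal in file names and pdfmyurl() fetches a PDF rendering of the page over HTTP. The conversion endpoint below is a hypothetical placeholder, not the author's actual implementation.

# huzhifeng.py -- minimal sketch of the helper module this script expects.
# Everything here is an assumption; in particular PDF_SERVICE is a
# placeholder URL, not the service the author actually used.
# -*- coding: utf-8 -*-
import re
import requests

PDF_SERVICE = 'http://example.com/convert'  # hypothetical PDF conversion endpoint


def format_filename(name):
    # Replace characters that are not allowed in file names on common systems.
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip()


def pdfmyurl(url, file_path):
    # Ask the (placeholder) conversion service to render `url` as a PDF and
    # stream the response body to `file_path`.
    r = requests.get(PDF_SERVICE, params={'url': url}, stream=True, timeout=60)
    if r.status_code != 200:
        print 'pdfmyurl(%s) failed with status %d' % (url, r.status_code)
        return
    with open(file_path, 'wb') as f:
        for chunk in r.iter_content(chunk_size=4096):
            f.write(chunk)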