csdn_blog.py — a Python script that downloads CSDN blog posts as PDF files.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Usage:
    pip install requests
    pip install beautifulsoup4

    python csdn_blog.py
'''
import os
import re
import time
import json
import requests
from bs4 import BeautifulSoup

import huzhifeng
def download_pdf(author, url, title, post_date):
    # Save the article at `url` as pdf/<author>/<author>-<post_date>-<title>.pdf.
    filename = huzhifeng.format_filename(author + '-' + post_date + '-' + title + '.pdf')
    dir_name = 'pdf'
    sub_dir = dir_name + os.sep + author
    file_path = sub_dir + os.sep + filename

    try:
        print '%s: %s' % (filename, url)
    except:
        # The console may not be able to print the title; fall back to gbk.
        print 'Exception: %s: %s' % (filename.encode('gbk', 'ignore'), url)

    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    if not os.path.exists(sub_dir):
        os.mkdir(sub_dir)

    if os.path.exists(file_path):
        if os.path.getsize(file_path) > 0x10000:
            # A previous download larger than 64 KB is treated as complete.
            print 'File exists, do nothing'
            return
        else:
            print 'File too small, download it again'

    huzhifeng.pdfmyurl(url, file_path)
def main():
    print 'Started at %s' % (time.strftime('%Y/%m/%d %H:%M:%S', time.localtime()))

    headers = {
        'Host': 'blog.csdn.net',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36'
    }
    homepage = 'http://blog.csdn.net/'
    authors = ['feixiaoxing', 'v_JULY_v']

    for author in authors:
        url = homepage + author

        # Walk the author's article list page by page until there is no "next page" link.
        while url:
            print 'Current page %s' % (url)

            try:
                r = requests.get(url, headers=headers)
            except:
                print 'Exception: requests.get(%s)' % (url)
                break
            if not r or r.status_code != 200:
                print 'requests.get(%s) failed' % (url)
                break

            soup = BeautifulSoup(r.content)

            # The pager ("papelist") contains the link to the next page, labelled u'下一页' ("next page").
            papelist = soup.find(id='papelist').find_all('a')
            if not papelist:
                print 'Can not find papelist'
                break

            for a in papelist:
                if a.get_text() == u'下一页':
                    href = a.get('href')
                    url = homepage + href if href[0:1] == '/' else href
                    break
            else:
                print 'This is the last page'
                url = ''

            # CSDN has used two different class names for article entries; try both.
            article_list = (soup.find(id="article_list").find_all('div', class_='list_item list_view')
                            or soup.find(id="article_list").find_all('div', class_='list_item article_item'))
            if not article_list:
                print 'Can not find article_list'
                continue

            for article in article_list:
                link_title = article.find('div', class_='article_title').find('h1').find('span', class_='link_title').find('a')
                link_postdate = article.find('div', class_='article_manage').find('span', class_='link_postdate')
                if not link_title or not link_postdate:
                    print 'Can not find link_title or link_postdate'
                    continue

                # Remove any <font> tag inside the title link so only the plain title text remains.
                font = link_title.find('font')
                if font:
                    link_title.font.decompose()

                article_title = link_title.get_text().strip()
                href = link_title.get('href')
                article_url = homepage + href if href[0:1] == '/' else href
                article_post_time = link_postdate.get_text()
                if not article_title or not article_url or not article_post_time:
                    print 'Invalid title, url or time, skip this article'
                    continue

                match = re.search(r'(\d{4})-(\d{1,2})-(\d{1,2})', article_post_time.encode('utf-8', 'ignore'))
                if not match:
                    print 'Extract date failed: %s' % (article_post_time)
                    continue
                post_date = '%04d-%02d-%02d' % (int(match.group(1)), int(match.group(2)), int(match.group(3)))

                download_pdf(author, article_url, article_title, post_date)
                time.sleep(3)

    print 'Stopped at %s' % (time.strftime('%Y/%m/%d %H:%M:%S', time.localtime()))


if __name__ == '__main__':
    main()
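
The script imports a companion module named huzhifeng that is not shown here; it must provide format_filename() and pdfmyurl(). The sketch below is only a rough idea of what that module needs to do, assuming format_filename() strips characters that are illegal in file names and pdfmyurl() fetches a PDF rendering of the page over HTTP. The conversion endpoint below is a hypothetical placeholder, not the author's actual implementation.

# huzhifeng.py -- minimal sketch of the helper module this script expects.
# Everything here is an assumption; in particular PDF_SERVICE is a
# placeholder URL, not the service the author actually used.
# -*- coding: utf-8 -*-
import re
import requests

PDF_SERVICE = 'http://example.com/convert'  # hypothetical PDF conversion endpoint


def format_filename(name):
    # Replace characters that are not allowed in file names on common systems.
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip()


def pdfmyurl(url, file_path):
    # Ask the (placeholder) conversion service to render `url` as a PDF and
    # stream the response body to `file_path`.
    r = requests.get(PDF_SERVICE, params={'url': url}, stream=True, timeout=60)
    if r.status_code != 200:
        print 'pdfmyurl(%s) failed with status %d' % (url, r.status_code)
        return
    with open(file_path, 'wb') as f:
        for chunk in r.iter_content(chunk_size=4096):
            f.write(chunk)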