Skip to content

Instantly share code, notes, and snippets.

@huzhifeng
Created November 2, 2014 11:52
Show Gist options
  • Save huzhifeng/323dd17e9de3c8a01f4d to your computer and use it in GitHub Desktop.
A python script used to download CSDN blog as PDF files
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Usage:
pip install requests
pip install beautifulsoup4
python csdn_blog.py
'''
import os
import re
import time
import json
import requests
from bs4 import BeautifulSoup
import huzhifeng
def download_pdf(author, url, title, post_date):
filename = huzhifeng.format_filename(author + '-' + post_date + '-' + title + '.pdf')
dir_name = 'pdf'
sub_dir = dir_name + os.sep + author
file_path = sub_dir + os.sep + filename
try:
print '%s: %s' % (filename, url)
except:
print 'Exception: %s: %s' % (filename.encode('gbk', 'ignore'), url)
if not os.path.exists(dir_name):
os.mkdir(dir_name)
if not os.path.exists(sub_dir):
os.mkdir(sub_dir)
if os.path.exists(file_path):
if os.path.getsize(file_path) > 0x10000:
print 'File exist, do nothing'
return
else:
print 'File too small, delete it'
huzhifeng.pdfmyurl(url, file_path)
def main():
print 'Started at %s' % (time.strftime('%Y/%m/%d %H:%M:%S', time.localtime()))
headers = {
'Host': 'blog.csdn.net',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36'
}
homepage = 'http://blog.csdn.net/'
authors = ['feixiaoxing', 'v_JULY_v']
for author in authors:
url = homepage + author
while url:
print 'Current page %s' % (url)
try:
r = requests.get(url, headers=headers)
except:
print 'Exception: requests.get(%s)' % (url)
break
if not r or r.status_code != 200:
print 'requests.get(%s) failed' % (url)
break
soup = BeautifulSoup(r.content)
papelist = soup.find(id='papelist').find_all('a')
if not papelist:
print 'Can not find papelist'
break
for a in papelist:
if a.get_text() == u'下一页':
href = a.get('href')
url = homepage + href if href[0:1] == '/' else href
break
else:
print 'This is the last page'
url = ''
article_list = (soup.find(id="article_list").find_all('div', class_='list_item list_view')
or soup.find(id="article_list").find_all('div', class_='list_item article_item'))
if not article_list:
print 'Can not find article_list'
continue
for article in article_list:
link_title = article.find('div', class_='article_title').find('h1').find('span', class_='link_title').find('a')
link_postdate = article.find('div', class_='article_manage').find('span', class_='link_postdate')
if not link_title or not link_postdate:
print 'Can not find link_title or link_postdate'
continue
font = link_title.find('font')
if font:
link_title.font.decompose()
article_title = link_title.get_text().strip()
href = link_title.get('href')
article_url = homepage + href if href[0:1] == '/' else href
article_post_time = link_postdate.get_text()
if not article_title or not article_url or not article_post_time:
print 'Invalid title, url or time, continue next page'
continue
match = re.search(r'(\d{4})-(\d{1,2})-(\d{1,2})', article_post_time.encode('utf-8', 'ignore'))
if not match:
print 'Extract date failed: %s' (article_post_time)
continue
post_date = '%04d-%02d-%02d' % (int(match.group(1)), int(match.group(2)), int(match.group(3)))
download_pdf(author, article_url, article_title, post_date)
time.sleep(3)
print 'Stopped at %s' % (time.strftime('%Y/%m/%d %H:%M:%S', time.localtime()))
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment