A Python script that downloads all articles on http://www.ruanyifeng.com/blog/ and saves them as PDF files.
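The gist contains two files: a small helper library (imported by the main script as huzhifeng) and the crawler itself, ruanyifeng.py.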
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# huzhifeng's Python Lib
import os
import re
import time
import json
import requests
import wget
def format_filename(fname):
    # Replace characters that are not allowed in (Windows) file names with '-'
    invalid_chars = '/\\:*?"<>|'
    filename = ''
    for c in fname:
        filename = filename + ('-' if c in invalid_chars else c)
    return filename
def pdfcrowd(url, file_path):
    # Convert a web page to PDF via the pdfcrowd.com web form. Two steps:
    # POST the source URL, then download the generated PDF from the URI in
    # the JSON response.
    post_url = 'http://pdfcrowd.com/form/json/convert/uri/'
    headers = {
        'Host': 'pdfcrowd.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'
    }
    payload = {
        'noCache': '1414481239318',
        'src': url,
        'conversion_source': 'uri'
    }

    try:
        r = requests.post(post_url, data=payload, headers=headers)
    except requests.RequestException:
        print 'Exception: requests.post(%s), url=%s' % (post_url, url)
        return False
    if not r or r.status_code != 200:
        print 'requests.post(%s) failed, url=%s' % (post_url, url)
        return False

    data = r.text
    obj = json.loads(data)
    if not all(key in obj for key in ('status', 'uri')):
        print 'Invalid response obj=%s' % (obj)
        return False
    status = obj['status']
    if status != 'ok':
        print 'status=%s' % (status)
        return False

    download_url = 'http://pdfcrowd.com%s' % (obj['uri'])
    if wget.download(download_url, out=file_path):
        print ''  # wget.download prints a progress bar without a trailing newline
        return True
    else:
        return False
def pdfmyurl(url, file_path):
    # Convert a web page to PDF via pdfmyurl.com; the response body is the
    # PDF itself, so it can be written straight to file_path.
    post_url = 'http://pdfmyurl.com/index.php'
    headers = {
        'Host': 'pdfmyurl.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36',
        'Cookie': 'NB_SRVID=srv29695; _ga=GA1.2.826873587.1414482384; _gat=1',
        'Cache-Control': 'max-age=0',
        'Origin': 'http://pdfmyurl.com',
        'Referer': 'http://pdfmyurl.com/',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }
    payload = {
        'url': url
    }

    try:
        r = requests.post(post_url, data=payload, headers=headers)
    except requests.RequestException:
        print 'Exception: requests.post(%s), url=%s' % (post_url, url)
        return False
    if not r or r.status_code != 200:
        print 'requests.post(%s) failed, url=%s' % (post_url, url)
        return False

    with open(file_path, 'wb') as f:
        f.write(r.content)
    return True
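# -------- Second file: ruanyifeng.py, the main script. It assumes the helper
# library above is saved alongside it as huzhifeng.py (the module name used
# in the import below). --------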
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Usage:
pip install requests
pip install beautifulsoup4
pip install wget
python ruanyifeng.py
'''
import os
import re
import time
import json
import requests
from bs4 import BeautifulSoup
import huzhifeng
def download_pdf(url, title, post_date):
    try:
        print '%s-%s: %s' % (post_date, title, url)
    except Exception:
        # The console may not be able to encode the (Chinese) title
        print 'Exception: %s-%s: %s' % (post_date, title.encode('gbk', 'ignore'), url)

    dir_name = 'pdf'
    filename = huzhifeng.format_filename('ryf-' + post_date + '-' + title + '.pdf')
    file_path = dir_name + os.sep + filename
    if not os.path.isdir(dir_name):
        os.mkdir(dir_name)
    if os.path.exists(file_path):
        if os.path.getsize(file_path) > 0x10000:  # > 64 KB: assume a complete PDF
            print 'File exists, do nothing'
            return
        else:
            print 'File too small, delete it'
            os.remove(file_path)

    huzhifeng.pdfmyurl(url, file_path)
def main():
    print 'Started at %s' % (time.strftime('%Y/%m/%d %H:%M:%S', time.localtime()))

    sort = 'asc'
    #sort = 'desc'
    headers = {
        'Host': 'www.ruanyifeng.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'
    }
    if sort == 'desc':
        # Newest first: start from the current month's archive page
        url = 'http://www.ruanyifeng.com/blog/%s' % (time.strftime('%Y/%m/', time.localtime()))
    else:
        # Oldest first: start from the first archive page (December 2003)
        url = 'http://www.ruanyifeng.com/blog/2003/12/'

    while url:
        print 'Current page %s' % (url)
        try:
            r = requests.get(url, headers=headers)
        except requests.RequestException:
            print 'Exception: requests.get(%s)' % (url)
            break
        if not r or r.status_code != 200:
            print 'requests.get(%s) failed' % (url)
            break

        soup = BeautifulSoup(r.content, 'html.parser')

        # Find the previous/next month link in the navigation bar to decide
        # which archive page to visit on the next iteration
        nav_tag = soup.find('div', class_='content-nav').find('p')
        if not nav_tag:
            print 'Can not find tag p for previous month'
            break
        a_tag = nav_tag.find_all('a')
        if not a_tag or len(a_tag) < 1:
            print 'Can not find tag a for previous month'
            break
        if len(a_tag) == 1:
            # Only one link: this is either the first or the last archive page
            text = a_tag[0].get_text()
            if text == u'« 上月' and sort == 'desc':    # '« previous month'
                url = a_tag[0].get('href')
            elif text == u'下月 »' and sort == 'asc':   # 'next month »'
                url = a_tag[0].get('href')
            else:
                print 'This is the last page'
                url = ''
        else:
            url = a_tag[0].get('href') if sort == 'desc' else a_tag[1].get('href')
        if url == 'http://www.ruanyifeng.com/blog/2008/03/':
            print 'Skip page http://www.ruanyifeng.com/blog/2008/03, it does not respond'
            url = 'http://www.ruanyifeng.com/blog/2008/02/' if sort == 'desc' else 'http://www.ruanyifeng.com/blog/2008/04/'
        # The newest post of the month appears as a full article on the page
        article_div_tag = soup.find('div', class_='entry-asset asset hentry')
        a_tag = article_div_tag.find('div', class_='asset-header').find('h2', class_='asset-name entry-title').find('a')
        article_time_tag = article_div_tag.find('div', class_='asset-footer').find('div', class_='asset-meta').find('p').find('span', class_='byline').find('abbr', class_='published')
        if not a_tag:
            print 'Can not find tag a for latest post of this month'
            continue
        if not article_time_tag:
            print 'Can not find tag abbr'
            continue
        article_title = a_tag.get_text()
        article_url = a_tag.get('href')
        article_post_time = article_time_tag.get_text()
        if not article_title or not article_url or not article_post_time:
            print 'Invalid title, url or time, continue next page'
            continue
        # Dates look like '2014年10月28日' ('YYYY year MM month DD day'),
        # occasionally with spaces between the parts
        match = re.search(r'(\d{4})年 ?(\d{1,2})月 ?(\d{1,2})日', article_post_time.encode('utf-8', 'ignore'))
        if not match:
            print 'Extract date failed: %s' % (article_post_time)
            continue
        post_date = '%04d-%02d-%02d' % (int(match.group(1)), int(match.group(2)), int(match.group(3)))
        download_pdf(article_url, article_title, post_date)
        # The rest of the month's posts appear in the sidebar list
        article_list_tag = soup.find('div', class_='module-categories module').find('div', class_='module-content').find('ul', class_='module-list').find_all('li', class_='module-list-item')
        if not article_list_tag:
            print 'Can not find tag li'
            continue
        for item in article_list_tag:
            a_tag = item.find('a')
            if not a_tag:
                print 'Can not find tag a for post list of this month'
                continue
            article_title = a_tag.get_text()
            article_url = a_tag.get('href')
            # Strip the <span> and <a> children so only the date text remains
            item.span.decompose()
            item.a.decompose()
            article_post_time = item.get_text()
            if not article_title or not article_url or not article_post_time:
                print 'Invalid title, url or time, continue next li'
                continue
            match = re.search(r'(\d{4})\.(\d{1,2})\.(\d{1,2})', article_post_time.encode('utf-8', 'ignore'))
            if not match:
                print 'Extract date failed: %s' % (article_post_time)
                continue
            post_date = '%04d-%02d-%02d' % (int(match.group(1)), int(match.group(2)), int(match.group(3)))
            download_pdf(article_url, article_title, post_date)

        time.sleep(3)  # Be polite: pause between archive pages

    print 'Stopped at %s' % (time.strftime('%Y/%m/%d %H:%M:%S', time.localtime()))
if __name__ == '__main__':
    main()
else:
    print 'Run as a module'
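A minimal sketch of using the helper library on its own (interactive session; the input string and output file name here are hypothetical, and pdfmyurl returns True on success):

import huzhifeng

print huzhifeng.format_filename('a/b:c*d.pdf')   # -> 'a-b-c-d.pdf'
huzhifeng.pdfmyurl('http://www.ruanyifeng.com/blog/', 'blog-index.pdf')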