A Python script that downloads all articles on http://www.ruanyifeng.com/blog/ and saves them as PDF files.
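The gist consists of two files: huzhifeng.py, a small helper library (a filename sanitizer plus two HTML-to-PDF converters), and ruanyifeng.py, the crawler itself. A minimal run, per the usage notes in the script's docstring (the code is Python 2):

    pip install requests beautifulsoup4 wget
    python ruanyifeng.py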
huzhifeng.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# huzhifeng's Python Lib

import json

import requests
import wget


def format_filename(fname):
    # Replace characters that are invalid in Windows filenames with '-'.
    invalid_chars = '/\\:*?"<>|'
    filename = ''
    for c in fname:
        filename = filename + ('-' if c in invalid_chars else c)
    return filename
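
# Illustrative example (not part of the original gist):
#   format_filename('a/b:c?.pdf')  ->  'a-b-c-.pdf'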
def pdfcrowd(url, file_path):
    post_url = 'http://pdfcrowd.com/form/json/convert/uri/'
    headers = {
        'Host': 'pdfcrowd.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'
    }
    payload = {
        'noCache': '1414481239318',
        'src': url,
        'conversion_source': 'uri'
    }
    try:
        r = requests.post(post_url, data=payload, headers=headers)
    except requests.RequestException:
        print 'Exception: requests.post(%s), url=%s' % (post_url, url)
        return False
    if not r or r.status_code != 200:
        print 'requests.post(%s) failed, url=%s' % (post_url, url)
        return False
    obj = json.loads(r.text)
    if not all(key in obj for key in ('status', 'uri')):
        print 'Invalid response obj=%s' % (obj)
        return False
    if obj['status'] != 'ok':
        print 'status=%s' % (obj['status'])
        return False
    download_url = 'http://pdfcrowd.com%s' % (obj['uri'])
    if wget.download(download_url, out=file_path):
        print ''  # wget's progress bar does not print a trailing newline
        return True
    return False
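
# Illustrative call (the URL is a placeholder): pdfcrowd('http://example.com/', 'example.pdf')
# posts the page to pdfcrowd.com and, on success, fetches the converted PDF with wget.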
def pdfmyurl(url, file_path):
    post_url = 'http://pdfmyurl.com/index.php'
    headers = {
        'Host': 'pdfmyurl.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36',
        'Cookie': 'NB_SRVID=srv29695; _ga=GA1.2.826873587.1414482384; _gat=1',
        'Cache-Control': 'max-age=0',
        'Origin': 'http://pdfmyurl.com',
        'Referer': 'http://pdfmyurl.com/',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }
    payload = {
        'url': url
    }
    try:
        r = requests.post(post_url, data=payload, headers=headers)
    except requests.RequestException:
        print 'Exception: requests.post(%s), url=%s' % (post_url, url)
        return False
    if not r or r.status_code != 200:
        print 'requests.post(%s) failed, url=%s' % (post_url, url)
        return False
    # The response body is the PDF itself; write it out as-is.
    with open(file_path, 'wb') as f:
        f.write(r.content)
    return True
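
# Standalone usage sketch (illustrative; URL and filename are placeholders):
#   >>> import huzhifeng
#   >>> huzhifeng.pdfmyurl('http://example.com/', 'example.pdf')
#   True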
ruanyifeng.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Usage:
    pip install requests
    pip install beautifulsoup4
    pip install wget

    python ruanyifeng.py
'''
import os
import re
import time

import requests
from bs4 import BeautifulSoup

import huzhifeng
def download_pdf(url, title, post_date):
    try:
        print '%s-%s: %s' % (post_date, title, url)
    except UnicodeEncodeError:
        # Some console encodings (e.g. gbk on Windows) cannot print every title.
        print 'Exception: %s-%s: %s' % (post_date, title.encode('gbk', 'ignore'), url)
    dir_name = 'pdf'
    filename = huzhifeng.format_filename('ryf-' + post_date + '-' + title + '.pdf')
    file_path = dir_name + os.sep + filename
    if not os.path.isdir(dir_name):
        os.mkdir(dir_name)
    if os.path.exists(file_path):
        if os.path.getsize(file_path) > 0x10000:
            print 'File exists, do nothing'
            return
        else:
            # Anything under 64 KB is treated as a failed conversion; overwrite it.
            print 'File too small, download it again'
    huzhifeng.pdfmyurl(url, file_path)
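
# Illustrative call (hypothetical arguments):
#   download_pdf('http://www.ruanyifeng.com/blog/2003/12/example.html',
#                u'example', '2003-12-01')
# would save the article as pdf/ryf-2003-12-01-example.pdf.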
def main():
    print 'Started at %s' % (time.strftime('%Y/%m/%d %H:%M:%S', time.localtime()))
    sort = 'asc'    # oldest first; set to 'desc' to crawl newest first
    #sort = 'desc'
    headers = {
        'Host': 'www.ruanyifeng.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'
    }
    if sort == 'desc':
        url = 'http://www.ruanyifeng.com/blog/%s' % (time.strftime('%Y/%m/', time.localtime()))
    else:
        url = 'http://www.ruanyifeng.com/blog/2003/12/'
    while url:
        print 'Current page %s' % (url)
        try:
            r = requests.get(url, headers=headers)
        except requests.RequestException:
            print 'Exception: requests.get(%s)' % (url)
            break
        if not r or r.status_code != 200:
            print 'requests.get(%s) failed' % (url)
            break
        soup = BeautifulSoup(r.content, 'html.parser')
        # Locate the previous/next month links in the archive navigation bar.
        content_nav = soup.find('div', class_='content-nav')
        nav_tag = content_nav.find('p') if content_nav else None
        if not nav_tag:
            print 'Can not find tag p for previous month'
            break
        a_tag = nav_tag.find_all('a')
        if not a_tag or len(a_tag) < 1:
            print 'Can not find tag a for previous month'
            break
        if len(a_tag) == 1:
            # A single link means this is the first or the last archive page.
            # u'« 上月' is "previous month", u'下月 »' is "next month".
            text = a_tag[0].get_text()
            if text == u'« 上月' and sort == 'desc':
                url = a_tag[0].get('href')
            elif text == u'下月 »' and sort == 'asc':
                url = a_tag[0].get('href')
            else:
                print 'This is the last page'
                url = ''
        else:
            url = a_tag[0].get('href') if sort == 'desc' else a_tag[1].get('href')
        if url == 'http://www.ruanyifeng.com/blog/2008/03/':
            print 'Skip page http://www.ruanyifeng.com/blog/2008/03, it does not respond'
            url = 'http://www.ruanyifeng.com/blog/2008/02/' if sort == 'desc' else 'http://www.ruanyifeng.com/blog/2008/04/'
        # The newest post of the month is rendered in full at the top of the page.
        article_div_tag = soup.find('div', class_='entry-asset asset hentry')
        a_tag = article_div_tag.find('div', class_='asset-header').find('h2', class_='asset-name entry-title').find('a')
        article_time_tag = article_div_tag.find('div', class_='asset-footer').find('div', class_='asset-meta').find('p').find('span', class_='byline').find('abbr', class_='published')
        if not a_tag:
            print 'Can not find tag a for latest post of this month'
            continue
        if not article_time_tag:
            print 'Can not find tag abbr'
            continue
        article_title = a_tag.get_text()
        article_url = a_tag.get('href')
        article_post_time = article_time_tag.get_text()
        if not article_title or not article_url or not article_post_time:
            print 'Invalid title, url or time, continue next page'
            continue
        # Dates look like u'2012年 5月 4日' (year/month/day, optional spaces).
        match = re.search(r'(\d{4})年 ?(\d{1,2})月 ?(\d{1,2})日', article_post_time.encode('utf-8', 'ignore'))
        if not match:
            print 'Extract date failed: %s' % (article_post_time)
            continue
        post_date = '%04d-%02d-%02d' % (int(match.group(1)), int(match.group(2)), int(match.group(3)))
        download_pdf(article_url, article_title, post_date)
        # The rest of the month's posts are listed as links in a sidebar module.
        article_list_tag = soup.find('div', class_='module-categories module').find('div', class_='module-content').find('ul', class_='module-list').find_all('li', class_='module-list-item')
        if not article_list_tag:
            print 'Can not find tag li'
            continue
        for item in article_list_tag:
            a_tag = item.find('a')
            if not a_tag:
                print 'Can not find tag a for post list of this month'
                continue
            article_title = a_tag.get_text()
            article_url = a_tag.get('href')
            # Remove the <span> and <a> children so only the date text is left.
            item.span.decompose()
            item.a.decompose()
            article_post_time = item.get_text()
            if not article_title or not article_url or not article_post_time:
                print 'Invalid title, url or time, continue next li'
                continue
            match = re.search(r'(\d{4})\.(\d{1,2})\.(\d{1,2})', article_post_time.encode('utf-8', 'ignore'))
            if not match:
                print 'Extract date failed: %s' % (article_post_time)
                continue
            post_date = '%04d-%02d-%02d' % (int(match.group(1)), int(match.group(2)), int(match.group(3)))
            download_pdf(article_url, article_title, post_date)
        time.sleep(3)    # be polite to the server between archive pages
    print 'Stopped at %s' % (time.strftime('%Y/%m/%d %H:%M:%S', time.localtime()))


if __name__ == '__main__':
    main()
else:
    print 'Run as a module'
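After a full run the PDFs end up under ./pdf/, named ryf-YYYY-MM-DD-&lt;title&gt;.pdf, with any characters that are invalid in filenames replaced by '-'. Files smaller than 64 KB are treated as failed conversions and fetched again on the next run.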