Skip to content

Instantly share code, notes, and snippets.

@huzhifeng
Last active August 29, 2015 14:08
Show Gist options
  • Save huzhifeng/c7b6d382b430b8a492c6 to your computer and use it in GitHub Desktop.
Save huzhifeng/c7b6d382b430b8a492c6 to your computer and use it in GitHub Desktop.
A Python script that downloads all HTML articles from http://coolshell.cn and saves them as PDF files
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Usage:
pip install requests
pip install beautifulsoup4
python coolshell.py
'''
import os
import re
import time
import requests
from bs4 import BeautifulSoup
# Shared request headers for coolshell.cn; the desktop-Chrome User-Agent
# presumably makes the crawler look like a regular browser — not verified.
headers = {
'Host': 'coolshell.cn',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'
}
def format_filename(title):
    """Return *title* with characters that are invalid in file names replaced by '-'.

    Covers the characters Windows forbids in file names: / \\ : * ? " < > |.
    All other characters are passed through unchanged.
    """
    # Raw string so the backslash is unambiguously a literal character
    # (the original '\:' only worked because \: is not a recognized escape).
    invalid_chars = r'/\:*?"<>|'
    # ''.join over a generator avoids the quadratic cost of repeated
    # string concatenation in the original loop.
    return ''.join('-' if c in invalid_chars else c for c in title)
def download_pdf(url, title, post_date):
    """Render the article at *url* to PDF via pdfmyurl.com and save it locally.

    The file is written to pdf/CoolShell-<post_date>-<title>.pdf. An existing
    file larger than 64 KiB is assumed to be a complete download and is kept;
    a smaller one is deleted and re-fetched.
    """
    try:
        print('%s-%s: %s' % (post_date, title, url))
    except UnicodeEncodeError:
        # The console may not be able to encode the (Chinese) title;
        # degrade to a lossy gbk encoding instead of crashing.
        print('Exception: %s-%s: %s' % (post_date, title.encode('gbk', 'ignore'), url))
    dir_name = 'pdf'
    filename = format_filename('CoolShell-' + post_date + '-' + title + '.pdf')
    file_path = dir_name + os.sep + filename
    if not os.path.isdir(dir_name):
        os.mkdir(dir_name)
    if os.path.exists(file_path):
        # 0x10000 = 64 KiB: heuristic threshold for "this PDF looks complete".
        if os.path.getsize(file_path) > 0x10000:
            print('File exist, do nothing')
            return
        else:
            print('File too small, delete it')
            # Fixed: the original printed "delete it" but never deleted.
            os.remove(file_path)
    pdfmyurl = 'http://pdfmyurl.com/index.php'
    # Per-request headers for the pdfmyurl.com service (shadows the
    # module-level coolshell.cn headers on purpose).
    headers = {
        'Host': 'pdfmyurl.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36',
        'Cookie': 'NB_SRVID=srv29695; _ga=GA1.2.826873587.1414482384; _gat=1',
        'Cache-Control': 'max-age=0',
        'Origin': 'http://pdfmyurl.com',
        'Referer': 'http://pdfmyurl.com/',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }
    payload = {
        'url': url
    }
    r = requests.post(pdfmyurl, data=payload, headers=headers)
    if not r or r.status_code != 200:
        # Fixed: the original format string had no %s placeholder, so the
        # `%` operation itself raised TypeError instead of logging.
        print('requests.post(%s) failed' % url)
        return None
    with open(file_path, 'wb') as f:
        f.write(r.content)
def main():
    """Crawl coolshell.cn page by page and submit each article for PDF download.

    Walks the article index starting at page 1, parses each post's link,
    title and date, and hands them to download_pdf(). The real last page
    number is parsed from the pagination widget on the first fetched page.
    """
    print('Started at %s' % time.strftime('%Y/%m/%d %H:%M:%S', time.localtime()))
    current_page = 1
    last_page = 999  # sentinel until the real page count is parsed below
    while current_page <= last_page:
        url = 'http://coolshell.cn' if current_page == 1 else 'http://coolshell.cn/page/%d' % current_page
        print('Current page %d' % current_page)
        # Fixed: `r` was unbound when requests.get raised, so the check
        # below crashed with NameError instead of reporting the failure.
        r = None
        try:
            r = requests.get(url, headers=headers)
        except requests.exceptions.ConnectionError as e:
            print('ConnectionError: e=%s' % e)
        except requests.exceptions.Timeout as e:
            print('Timeout: e=%s' % e)
        except requests.exceptions.TooManyRedirects as e:
            print('TooManyRedirects: e=%s' % e)
        except requests.exceptions.HTTPError as e:
            print('HTTPError: e=%s' % e)
        except requests.exceptions.RequestException as e:
            print('RequestException: e=%s' % e)
        except Exception as e:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # still propagate; also fixed the "Unkonwn" typo.
            print('Unknown exception: url=%s, e=%s' % (url, e))
        if not r or r.status_code != 200:
            print('requests.get(%s) failed' % url)
            return
        href = ''
        # NOTE(review): no explicit parser is given, so bs4 picks the best
        # one installed; results can vary between environments.
        soup = BeautifulSoup(r.content)
        title = soup.title.string
        posts = soup.find_all('div', class_="post")
        for post in posts:
            a = post.find('a', class_='title')
            span_date = post.find('span', class_='date')
            if not a or not span_date:
                print('Invalid post')
                continue
            href = a.get('href')
            title = a.get_text()
            post_date = span_date.get_text()
            # Dates appear as e.g. "2014年10月28日"; match against the
            # utf-8 byte form as the original did.
            match = re.search(r'(\d{4})年(\d{1,2})月(\d{1,2})日', post_date.encode('utf-8', 'ignore'))
            if not match:
                # Fixed: the original was missing the % operator, so this
                # line raised "TypeError: 'str' object is not callable".
                print('Invalid date: %s' % post_date)
                continue
            post_date = '%04d-%02d-%02d' % (int(match.group(1)), int(match.group(2)), int(match.group(3)))
            download_pdf(href, title, post_date)
        if last_page == 999:
            # First successful page: parse the real page count from the
            # pagination text, e.g. u'第 1 / 67 页' -> 67.
            pages = soup.find('span', class_='pages')
            text = pages.get_text()
            match = re.search(r'(\d{1,3})[^\d]+(\d{1,3})', text.encode('utf-8', 'ignore'))
            if match:
                last_page = int(match.group(2))
                print('Last page is %d' % last_page)
        current_page = current_page + 1
    print('Stopped at %s' % time.strftime('%Y/%m/%d %H:%M:%S', time.localtime()))
# Entry point: run the crawler when executed directly, otherwise announce
# that the file was imported as a module.
if __name__ != "__main__":
    print('Run as a module')
else:
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment