Skip to content

Instantly share code, notes, and snippets.

@huzhifeng
Last active August 29, 2015 14:08
Show Gist options
  • Save huzhifeng/823c14170a85cc7b1464 to your computer and use it in GitHub Desktop.
A python script used to download PDFs from http://www.crifan.com/files/doc/docbook/
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Usage:
pip install requests
pip install pyquery
pip install beautifulsoup4
pip install wget
python crifan.py
'''
import os
import requests
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import wget
# HTTP headers sent with every request: pin the Host and present a
# desktop Chrome User-Agent so the site serves its normal pages.
headers = {
'Host': 'www.crifan.com',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'
}
def format_filename(title):
    """Return *title* with filesystem-unsafe characters replaced by '-'.

    Each character that is invalid in Windows filenames
    (/ \\ : * ? " < > |) becomes a single dash; all other characters
    are kept as-is.
    """
    # Explicit double backslash: the original '\:' only worked because
    # Python keeps unrecognized escapes literally.
    invalid_chars = '/\\:*?"<>|'
    # Single pass with join instead of quadratic string concatenation.
    return ''.join('-' if c in invalid_chars else c for c in title)
def download_pdf(url, title):
    """Download the PDF at *url* into the local 'pdf' directory.

    The file is saved as 'Crifan-<title>.pdf' with unsafe characters
    sanitized by format_filename(). If a file with that name already
    exists the download is skipped.
    """
    print('%s' % (url))
    dir_name = 'pdf'
    filename = format_filename('Crifan-' + title + '.pdf')
    # os.path.join instead of manual os.sep concatenation (portable).
    file_path = os.path.join(dir_name, filename)
    if not os.path.isdir(dir_name):
        os.mkdir(dir_name)
    if os.path.exists(file_path):
        print('File %s already exist' % (filename))
        return
    # Download with the wget module. (The commented-out os.system and
    # requests alternatives were dead code and have been removed.)
    ret = wget.download(url, out=file_path)
    print('\nDownload ret = %s' % (ret))
def get_doc(url):
    """Fetch a docbook index page and download its first PDF link.

    Parses the page with PyQuery, looks inside the 'informaltable' div,
    and hands the first anchor whose href ends in '.pdf' to
    download_pdf() together with the page title. Network errors are
    reported and the function returns without raising.
    """
    try:
        r = requests.get(url, headers=headers)
    except requests.exceptions.ConnectionError as e:
        print('ConnectionError: e=%s' % e)
        return ''
    except requests.exceptions.Timeout as e:
        print('Timeout: e=%s' % e)
        return ''
    except requests.exceptions.TooManyRedirects as e:
        print('TooManyRedirects: e=%s' % e)
        return ''
    except requests.exceptions.HTTPError as e:
        print('HTTPError: e=%s' % e)
        return ''
    except requests.exceptions.RequestException as e:
        print('RequestException: e=%s' % e)
        return ''
    except Exception:
        # Original fell through here with 'r' unbound, crashing below
        # with NameError; now we bail out explicitly.
        print('Unknown exception: url=%s' % url)
        return ''
    if r.status_code != 200:
        print('requests.get failed, status_code=%d' % (r.status_code))
        return ''
    d = pq(r.content)
    title = d('title').text()
    informaltable = d('div.informaltable')
    for a in informaltable.items('a'):
        href = a.attr('href')
        # attr() may return None for anchors without an href.
        if href and href[-4:] == '.pdf':
            download_pdf(href, title)
            break
def main():
    """Crawl the docbook index and download a PDF for every book listed.

    Fetches the index page, then for each link under
    /files/doc/docbook/<name>/ visits the generated HTML page at
    .../release/html/<name>.html via get_doc(), which performs the
    actual PDF download.
    """
    url = 'http://www.crifan.com/files/doc/docbook/'
    try:
        r = requests.get(url, headers=headers)
    except requests.exceptions.ConnectionError as e:
        print('ConnectionError: e=%s' % e)
        return ''
    except requests.exceptions.Timeout as e:
        print('Timeout: e=%s' % e)
        return ''
    except requests.exceptions.TooManyRedirects as e:
        print('TooManyRedirects: e=%s' % e)
        return ''
    except requests.exceptions.HTTPError as e:
        print('HTTPError: e=%s' % e)
        return ''
    except requests.exceptions.RequestException as e:
        print('RequestException: e=%s' % e)
        return ''
    except Exception:
        # Original fell through here with 'r' unbound, crashing below
        # with NameError; now we bail out explicitly.
        print('Unknown exception: url=%s' % url)
        return ''
    if r.status_code != 200:
        print('requests.get failed, status_code=%d' % (r.status_code))
        return ''
    d = pq(r.content)
    for a in d.items('a'):
        href = a.attr('href')
        # Book directories look like /files/doc/docbook/<name>/ ;
        # strip the 19-char prefix and trailing '/' to get <name>.
        if href and href[0:19] == '/files/doc/docbook/':
            get_doc('http://www.crifan.com' + href + 'release/html/' + href[19:-1] + '.html')
# Entry point: crawl when executed as a script; announce module import
# otherwise. (Parenthesized single-arg print behaves identically in Py2.)
if __name__ == "__main__":
    main()
else:
    print('Run as a module')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment