huzhifeng · August 29, 2015 14:08
diff --git a/crifan.py b/crifan.py
 #!/usr/bin/python
 # -*- coding: utf-8 -*-

 '''
 Usage:
    pip install requests
    pip install pyquery
    pip install beautifulsoup4
    pip install wget
    python crifan.py
 '''

 import os
 import requests
 from pyquery import PyQuery as pq
 from bs4 import BeautifulSoup
 import wget

 # HTTP request headers sent with every requests.get() call made by get_doc()
 # and main(): a fixed Host for www.crifan.com and a desktop Chrome User-Agent.
 # NOTE(review): presumably the browser-like User-Agent is there so the site
 # serves the script the same pages a browser gets -- confirm.
 headers = {
    'Host': 'www.crifan.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'
 }

 def format_filename(title):
    '''Return *title* made safe for use as a file name.

    Each of the nine characters slash, backslash, colon, asterisk, question
    mark, double quote, less-than, greater-than and pipe is replaced by '-'.
    Every other character is copied through unchanged.

    Args:
        title: the text to sanitise (may be empty).

    Returns:
        The sanitised string; '' for an empty *title*.
    '''
    # Raw string: the previous literal relied on the invalid escape '\:',
    # which newer Python versions warn about.  The character set is the same.
    invalid_chars = r'/\:*?"<>|'
    # Build the result in one pass instead of repeated string concatenation.
    return ''.join('-' if c in invalid_chars else c for c in title)

 def download_pdf(url, title):
    '''Download the PDF at *url* into the local ``pdf`` directory.

    The target name is ``Crifan-<title>.pdf`` after it has been passed
    through format_filename().  The ``pdf`` directory is created when it is
    missing, and nothing is downloaded when the target file already exists.

    Args:
        url: address of the PDF to fetch.
        title: document title used to build the local file name.

    Returns:
        None.
    '''
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('%s' % (url))
    dir_name = 'pdf'
    filename = format_filename('Crifan-' + title + '.pdf')
    file_path = os.path.join(dir_name, filename)
    if not os.path.isdir(dir_name):
        os.mkdir(dir_name)
    if os.path.exists(file_path):
        print('File %s already exist' % (filename))
        return

    # 1. Download with wget module (Default)
    ret = wget.download(url, out=file_path)
    print('\nDownload ret = %s' % (ret))

    # The two alternatives below were previously kept as string literals that
    # were evaluated on every call; they are plain comments now.
    #
    # 2. Download with wget command
    #     download_cmd = 'wget %s -O %s' % (url, file_path)
    #     ret = os.system(download_cmd)
    #     if ret == 0:
    #         print 'Download %s successful' % (url)
    #     else:
    #         print 'Download %s failed, error code=%d' % (url, ret)
    #
    # 3. Download with requests
    #     r = requests.get(url)
    #     with open(file_path, 'wb') as f:
    #         f.write(r.content)

 def get_doc(url):
    '''Fetch one document page and download the first PDF it links to.

    The page at *url* is requested with the module-level ``headers``.  Its
    ``<title>`` text becomes the title handed to download_pdf().  Only links
    inside ``div.informaltable`` are examined, and the search stops at the
    first href that ends in ``.pdf``.

    Args:
        url: address of the document's HTML page.

    Returns:
        '' when the request raises or the status code is not 200,
        otherwise None.
    '''
    try:
        r = requests.get(url, headers=headers)
    except requests.exceptions.RequestException as e:
        # ConnectionError, Timeout, TooManyRedirects and HTTPError all derive
        # from RequestException, so one handler covers them.  Returning here
        # is essential: falling through would read ``r`` before it is bound.
        print('%s: e=%s' % (type(e).__name__, e))
        return ''

    if r.status_code != 200:
        print('requests.get failed, status_code=%d' % (r.status_code))
        return ''

    d = pq(r.content)
    title = d('title').text()
    informaltable = d('div.informaltable')
    for a in informaltable.items('a'):
        href = a.attr('href')
        # attr() gives None for an anchor without href; skip it rather than
        # slicing None.
        if href and href.endswith('.pdf'):
            download_pdf(href, title)
            break
 # Disabled BeautifulSoup variant of the parsing above, kept as a string literal.
 '''
    soup = BeautifulSoup(r.content)
    title = soup.title.string
    informaltable = soup.find('div', class_="informaltable")
    for a in informaltable.find_all('a'):
        href = a.get('href')
        if href[-4:] == '.pdf':
            download_pdf(href, title)
            break
 '''

 def main():
    '''Crawl the docbook index page and process every document it lists.

    The index at http://www.crifan.com/files/doc/docbook/ is requested with
    the module-level ``headers``.  For every link whose href starts with
    ``/files/doc/docbook/`` the document name is taken from the remainder of
    the href (minus its last character) and get_doc() is called on
    ``<href>release/html/<name>.html``.

    Returns:
        '' when the request raises or the status code is not 200,
        otherwise None.
    '''
    url = 'http://www.crifan.com/files/doc/docbook/'
    try:
        r = requests.get(url, headers=headers)
    except requests.exceptions.RequestException as e:
        # ConnectionError, Timeout, TooManyRedirects and HTTPError all derive
        # from RequestException, so one handler covers them.  Returning here
        # is essential: falling through would read ``r`` before it is bound.
        print('%s: e=%s' % (type(e).__name__, e))
        return ''

    if r.status_code != 200:
        print('requests.get failed, status_code=%d' % (r.status_code))
        return ''

    prefix = '/files/doc/docbook/'
    d = pq(r.content)
    for a in d.items('a'):
        href = a.attr('href')
        # attr() gives None for an anchor without href; skip it rather than
        # slicing None.
        if not href or not href.startswith(prefix):
            continue
        # Drop the prefix and the final character (expected to be the
        # trailing '/') to get the document name.
        name = href[len(prefix):-1]
        if not name:
            # A link to the index directory itself has no document name and
            # would only produce a bogus '.../release/html/.html' request.
            continue
        get_doc('http://www.crifan.com' + href + 'release/html/' + name + '.html')
 # Disabled BeautifulSoup variant of the loop above, kept as a string literal.
 '''
    soup = BeautifulSoup(r.content)
    for a in soup.find_all('a'):
        href = a.get('href')
        if href[0:19] == '/files/doc/docbook/':
            get_doc('http://www.crifan.com' + href + 'release/html/' + href[19:-1] + '.html')
 '''

 # Entry point: importing the file only prints a notice, while running it as a
 # script starts the crawl via main().
 if __name__ != "__main__":
    print('Run as a module')
 else:
    main()
	#!/usr/bin/python
	# -*- coding: utf-8 -*-

	'''
	Usage:
	pip install requests
	pip install pyquery
	pip install beautifulsoup4
	pip install wget
	python crifan.py
	'''

	import os
	import requests
	from pyquery import PyQuery as pq
	from bs4 import BeautifulSoup
	import wget

	# HTTP request headers sent with every requests.get() call made by get_doc()
	# and main(): a fixed Host for www.crifan.com and a desktop Chrome User-Agent.
	# NOTE(review): presumably the browser-like User-Agent is there so the site
	# serves the script the same pages a browser gets -- confirm.
	headers = {
	'Host': 'www.crifan.com',
	'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'
	}

	def format_filename(title):
	    '''Return *title* made safe for use as a file name.

	    Each of the nine characters slash, backslash, colon, asterisk, question
	    mark, double quote, less-than, greater-than and pipe is replaced by '-'.
	    Every other character is copied through unchanged.

	    Args:
	        title: the text to sanitise (may be empty).

	    Returns:
	        The sanitised string; '' for an empty *title*.
	    '''
	    # Raw string: the previous literal relied on the invalid escapes '\:'
	    # and '\|', which newer Python versions warn about.  The character set
	    # is the same.
	    invalid_chars = r'/\:*?"<>|'
	    # Build the result in one pass instead of repeated string concatenation.
	    return ''.join('-' if c in invalid_chars else c for c in title)

	def download_pdf(url, title):
	    '''Download the PDF at *url* into the local ``pdf`` directory.

	    The target name is ``Crifan-<title>.pdf`` after it has been passed
	    through format_filename().  The ``pdf`` directory is created when it is
	    missing, and nothing is downloaded when the target file already exists.

	    Args:
	        url: address of the PDF to fetch.
	        title: document title used to build the local file name.

	    Returns:
	        None.
	    '''
	    # Single-argument print(...) behaves the same on Python 2 and Python 3.
	    print('%s' % (url))
	    dir_name = 'pdf'
	    filename = format_filename('Crifan-' + title + '.pdf')
	    file_path = os.path.join(dir_name, filename)
	    if not os.path.isdir(dir_name):
	        os.mkdir(dir_name)
	    if os.path.exists(file_path):
	        print('File %s already exist' % (filename))
	        return

	    # 1. Download with wget module (Default)
	    ret = wget.download(url, out=file_path)
	    print('\nDownload ret = %s' % (ret))

	    # The two alternatives below were previously kept as string literals;
	    # they are plain comments now.
	    #
	    # 2. Download with wget command
	    #     download_cmd = 'wget %s -O %s' % (url, file_path)
	    #     ret = os.system(download_cmd)
	    #     if ret == 0:
	    #         print 'Download %s successful' % (url)
	    #     else:
	    #         print 'Download %s failed, error code=%d' % (url, ret)
	    #
	    # 3. Download with requests
	    #     r = requests.get(url)
	    #     with open(file_path, 'wb') as f:
	    #         f.write(r.content)

	def get_doc(url):
	    '''Fetch one document page and download the first PDF it links to.

	    The page at *url* is requested with the module-level ``headers``.  Its
	    ``<title>`` text becomes the title handed to download_pdf().  Only links
	    inside ``div.informaltable`` are examined, and the search stops at the
	    first href that ends in ``.pdf``.

	    Args:
	        url: address of the document's HTML page.

	    Returns:
	        '' when the request raises or the status code is not 200,
	        otherwise None.
	    '''
	    try:
	        r = requests.get(url, headers=headers)
	    except requests.exceptions.RequestException as e:
	        # ConnectionError, Timeout, TooManyRedirects and HTTPError all derive
	        # from RequestException, so one handler covers them.  Returning here
	        # is essential: falling through would read ``r`` before it is bound.
	        print('%s: e=%s' % (type(e).__name__, e))
	        return ''

	    if r.status_code != 200:
	        print('requests.get failed, status_code=%d' % (r.status_code))
	        return ''

	    d = pq(r.content)
	    title = d('title').text()
	    informaltable = d('div.informaltable')
	    for a in informaltable.items('a'):
	        href = a.attr('href')
	        # attr() gives None for an anchor without href; skip it rather than
	        # slicing None.
	        if href and href.endswith('.pdf'):
	            download_pdf(href, title)
	            break
	# Disabled BeautifulSoup variant of the parsing above, kept as a string literal.
	'''
	    soup = BeautifulSoup(r.content)
	    title = soup.title.string
	    informaltable = soup.find('div', class_="informaltable")
	    for a in informaltable.find_all('a'):
	        href = a.get('href')
	        if href[-4:] == '.pdf':
	            download_pdf(href, title)
	            break
	'''

	def main():
	    '''Crawl the docbook index page and process every document it lists.

	    The index at http://www.crifan.com/files/doc/docbook/ is requested with
	    the module-level ``headers``.  For every link whose href starts with
	    ``/files/doc/docbook/`` the document name is taken from the remainder of
	    the href (minus its last character) and get_doc() is called on
	    ``<href>release/html/<name>.html``.

	    Returns:
	        '' when the request raises or the status code is not 200,
	        otherwise None.
	    '''
	    url = 'http://www.crifan.com/files/doc/docbook/'
	    try:
	        r = requests.get(url, headers=headers)
	    except requests.exceptions.RequestException as e:
	        # ConnectionError, Timeout, TooManyRedirects and HTTPError all derive
	        # from RequestException, so one handler covers them.  Returning here
	        # is essential: falling through would read ``r`` before it is bound.
	        print('%s: e=%s' % (type(e).__name__, e))
	        return ''

	    if r.status_code != 200:
	        print('requests.get failed, status_code=%d' % (r.status_code))
	        return ''

	    prefix = '/files/doc/docbook/'
	    d = pq(r.content)
	    for a in d.items('a'):
	        href = a.attr('href')
	        # attr() gives None for an anchor without href; skip it rather than
	        # slicing None.
	        if not href or not href.startswith(prefix):
	            continue
	        # Drop the prefix and the final character (expected to be the
	        # trailing '/') to get the document name.
	        name = href[len(prefix):-1]
	        if not name:
	            # A link to the index directory itself has no document name and
	            # would only produce a bogus '.../release/html/.html' request.
	            continue
	        get_doc('http://www.crifan.com' + href + 'release/html/' + name + '.html')
	# Disabled BeautifulSoup variant of the loop above, kept as a string literal.
	'''
	    soup = BeautifulSoup(r.content)
	    for a in soup.find_all('a'):
	        href = a.get('href')
	        if href[0:19] == '/files/doc/docbook/':
	            get_doc('http://www.crifan.com' + href + 'release/html/' + href[19:-1] + '.html')
	'''

	# Entry point: running the file as a script starts the crawl via main();
	# importing it only prints a notice.  The branch bodies are indented again
	# (the indentation had been lost) and the single-argument print(...) works
	# on both Python 2 and Python 3.
	if __name__ == "__main__":
	    main()
	else:
	    print('Run as a module')