huzhifeng · August 29, 2015 14:08
diff --git a/coolshell.py b/coolshell.py
 #!/usr/bin/python
 # -*- coding: utf-8 -*-

 '''
 Usage:
    pip install requests
    pip install beautifulsoup4
    python coolshell.py
 '''

 import os
 import re
 import time
 import requests
 from bs4 import BeautifulSoup

 # Request headers passed to requests.get() when fetching coolshell.cn pages.
 headers = dict([
    ('Host', 'coolshell.cn'),
    ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'),
 ])

 def format_filename(title):
    '''
    Return `title` with every character in the set / \\ : * ? " < > |
    replaced by '-'.

    All other characters are kept unchanged, so the result always has the
    same length as `title`. An empty `title` yields an empty string.
    '''
    # The backslash is written as an explicit escape; the previous literal
    # relied on the invalid escape sequence "\:" being passed through as-is.
    invalid_chars = '/\\:*?"<>|'
    # Build the result in one pass instead of repeated string concatenation.
    return ''.join('-' if c in invalid_chars else c for c in title)

 def download_pdf(url, title, post_date):
    '''
    Save the page at `url` as a PDF inside the local `pdf` directory.

    The conversion is done by POSTing `url` to pdfmyurl.com; the response
    body is written to pdf/CoolShell-<post_date>-<title>.pdf, with the
    filename passed through format_filename(). The `pdf` directory is
    created when missing. An existing file larger than 0x10000 bytes is
    treated as already downloaded and left untouched.

    Returns None in every case; progress and failures are printed.
    '''
    try:
        print('%s-%s: %s' % (post_date, title, url))
    except Exception:
        # Printing the title can fail on consoles that cannot encode it;
        # fall back to a GBK-encoded title with unencodable characters dropped.
        print('Exception: %s-%s: %s' % (post_date, title.encode('gbk', 'ignore'), url))
    dir_name = 'pdf'
    filename = format_filename('CoolShell-' + post_date + '-' + title + '.pdf')
    file_path = os.path.join(dir_name, filename)
    if not os.path.isdir(dir_name):
        os.mkdir(dir_name)
    if os.path.exists(file_path):
        if os.path.getsize(file_path) > 0x10000:
            print('File exist, do nothing')
            return
        # The undersized file is not removed here; it is overwritten by the
        # open(..., 'wb') below once a new response has been received.
        print('File too small, delete it')

    pdfmyurl = 'http://pdfmyurl.com/index.php'
    # Local headers for pdfmyurl.com; intentionally shadows the module-level
    # `headers` used for coolshell.cn requests.
    headers = {
        'Host': 'pdfmyurl.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36',
        'Cookie': 'NB_SRVID=srv29695; _ga=GA1.2.826873587.1414482384; _gat=1',
        'Cache-Control': 'max-age=0',
        'Origin': 'http://pdfmyurl.com',
        'Referer': 'http://pdfmyurl.com/',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }
    payload = {
        'url': url
    }
    try:
        r = requests.post(pdfmyurl, data=payload, headers=headers)
    except requests.exceptions.RequestException as e:
        # Treat a transport error like a non-200 response: report it and
        # skip this post instead of aborting the whole crawl.
        print('RequestException: e=%s' % e)
        return None
    if r.status_code != 200:
        # The format string previously had no placeholder, so this line
        # raised TypeError instead of reporting the failure.
        print('ruests.post(%s) failed' % (url))
        return None
    with open(file_path, 'wb') as f:
        f.write(r.content)

 def main():
    '''
    Walk the coolshell.cn index pages and save every listed post as a PDF.

    Pages are fetched in order starting from the home page. For each
    `div.post` element the link (`a.title`) and date (`span.date`) are
    extracted and handed to download_pdf(). The total number of pages is
    read from the `span.pages` pager; until it has been parsed the upper
    bound stays at the placeholder value 999.

    The crawl stops (returning None) as soon as an index page cannot be
    fetched with status 200.
    '''
    print('Started at %s' % (time.strftime('%Y/%m/%d %H:%M:%S', time.localtime())))

    current_page = 1
    last_page = 999
    while current_page <= last_page:
        url = 'http://coolshell.cn' if current_page == 1 else 'http://coolshell.cn/page/%d' % (current_page)
        print('Current page %d' % (current_page))
        # Reset for every page: if the request raises, `r` would otherwise be
        # unbound on the first page or still hold the previous page's response.
        r = None
        try:
            r = requests.get(url, headers=headers)
        except requests.exceptions.ConnectionError as e:
            print('ConnectionError: e=%s' % e)
        except requests.exceptions.Timeout as e:
            print('Timeout: e=%s' % e)
        except requests.exceptions.TooManyRedirects as e:
            print('TooManyRedirects: e=%s' % e)
        except requests.exceptions.HTTPError as e:
            print('HTTPError: e=%s' % e)
        except requests.exceptions.RequestException as e:
            print('RequestException: e=%s' % e)
        except Exception:
            # Exception rather than a bare except, so Ctrl-C still interrupts.
            print('Unkonwn exception: url=%s' % url)

        if r is None or r.status_code != 200:
            print('ruests.get(%s) failed' % (url))
            return

        soup = BeautifulSoup(r.content)
        for post in soup.find_all('div', class_="post"):
            a = post.find('a', class_='title')
            span_date = post.find('span', class_='date')
            if not a or not span_date:
                print('Invalid post')
                continue
            href = a.get('href')
            title = a.get_text()
            post_date = span_date.get_text()
            # Dates are matched as "<year>年<month>月<day>日" on the UTF-8 bytes.
            match = re.search(r'(\d{4})年(\d{1,2})月(\d{1,2})日', post_date.encode('utf-8', 'ignore'))
            if not match:
                # The '%' operator was missing here, which made this line
                # raise TypeError ("'str' object is not callable").
                print('Invalid date: %s' % (post_date))
                continue
            post_date = '%04d-%02d-%02d' % (int(match.group(1)), int(match.group(2)), int(match.group(3)))
            download_pdf(href, title, post_date)

        if last_page == 999:
            pages = soup.find('span', class_='pages')
            # A page without a pager leaves last_page at its placeholder
            # instead of raising AttributeError on None.
            if pages is not None:
                text = pages.get_text()  # e.g. u'第 1 / 67 页' (page 1 of 67)
                match = re.search(r'(\d{1,3})[^\d]+(\d{1,3})', text.encode('utf-8', 'ignore'))
                if match:
                    last_page = int(match.group(2))
                    print('Last page is %d' % (last_page))

        current_page = current_page + 1

    print('Stopped at %s' % (time.strftime('%Y/%m/%d %H:%M:%S', time.localtime())))

 # Entry point: start the crawl when executed as a script; when the file is
 # imported instead, only print a notice.
 if __name__ != "__main__":
    print('Run as a module')
 else:
    main()
	#!/usr/bin/python
	# -*- coding: utf-8 -*-

	'''
	Usage:
	pip install requests
	pip install beautifulsoup4
	python coolshell.py
	'''

	import os
	import re
	import time
	import requests
	from bs4 import BeautifulSoup

	# Request headers passed to requests.get() when fetching coolshell.cn pages.
	headers = dict([
	    ('Host', 'coolshell.cn'),
	    ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'),
	])

	def format_filename(title):
	    '''
	    Return `title` with every character in the set / \\ : * ? " < > |
	    replaced by '-'.

	    All other characters are kept unchanged, so the result always has the
	    same length as `title`. An empty `title` yields an empty string.
	    '''
	    # The backslash is written once as an explicit escape; the previous
	    # literal contained the invalid escape sequences "\:" and "\|".
	    invalid_chars = '/\\:*?"<>|'
	    # Build the result in one pass instead of repeated string concatenation.
	    return ''.join('-' if c in invalid_chars else c for c in title)

	def download_pdf(url, title, post_date):
	    '''
	    Save the page at `url` as a PDF inside the local `pdf` directory.

	    The conversion is done by POSTing `url` to pdfmyurl.com; the response
	    body is written to pdf/CoolShell-<post_date>-<title>.pdf, with the
	    filename passed through format_filename(). The `pdf` directory is
	    created when missing. An existing file larger than 0x10000 bytes is
	    treated as already downloaded and left untouched.

	    Returns None in every case; progress and failures are printed.
	    '''
	    try:
	        print('%s-%s: %s' % (post_date, title, url))
	    except Exception:
	        # Printing the title can fail on consoles that cannot encode it;
	        # fall back to a GBK-encoded title with unencodable characters dropped.
	        print('Exception: %s-%s: %s' % (post_date, title.encode('gbk', 'ignore'), url))
	    dir_name = 'pdf'
	    filename = format_filename('CoolShell-' + post_date + '-' + title + '.pdf')
	    file_path = os.path.join(dir_name, filename)
	    if not os.path.isdir(dir_name):
	        os.mkdir(dir_name)
	    if os.path.exists(file_path):
	        if os.path.getsize(file_path) > 0x10000:
	            print('File exist, do nothing')
	            return
	        # The undersized file is not removed here; it is overwritten by the
	        # open(..., 'wb') below once a new response has been received.
	        print('File too small, delete it')

	    pdfmyurl = 'http://pdfmyurl.com/index.php'
	    # Local headers for pdfmyurl.com; intentionally shadows the module-level
	    # `headers` used for coolshell.cn requests.
	    headers = {
	        'Host': 'pdfmyurl.com',
	        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36',
	        'Cookie': 'NB_SRVID=srv29695; _ga=GA1.2.826873587.1414482384; _gat=1',
	        'Cache-Control': 'max-age=0',
	        'Origin': 'http://pdfmyurl.com',
	        'Referer': 'http://pdfmyurl.com/',
	        'Accept-Language': 'zh-CN,zh;q=0.8'
	    }
	    payload = {
	        'url': url
	    }
	    try:
	        r = requests.post(pdfmyurl, data=payload, headers=headers)
	    except requests.exceptions.RequestException as e:
	        # Treat a transport error like a non-200 response: report it and
	        # skip this post instead of aborting the whole crawl.
	        print('RequestException: e=%s' % e)
	        return None
	    if r.status_code != 200:
	        # The format string previously had no placeholder, so this line
	        # raised TypeError instead of reporting the failure.
	        print('ruests.post(%s) failed' % (url))
	        return None
	    with open(file_path, 'wb') as f:
	        f.write(r.content)

	def main():
	    '''
	    Walk the coolshell.cn index pages and save every listed post as a PDF.

	    Pages are fetched in order starting from the home page. For each
	    `div.post` element the link (`a.title`) and date (`span.date`) are
	    extracted and handed to download_pdf(). The total number of pages is
	    read from the `span.pages` pager; until it has been parsed the upper
	    bound stays at the placeholder value 999.

	    The crawl stops (returning None) as soon as an index page cannot be
	    fetched with status 200.
	    '''
	    print('Started at %s' % (time.strftime('%Y/%m/%d %H:%M:%S', time.localtime())))

	    current_page = 1
	    last_page = 999
	    while current_page <= last_page:
	        url = 'http://coolshell.cn' if current_page == 1 else 'http://coolshell.cn/page/%d' % (current_page)
	        print('Current page %d' % (current_page))
	        # Reset for every page: if the request raises, `r` would otherwise be
	        # unbound on the first page or still hold the previous page's response.
	        r = None
	        try:
	            r = requests.get(url, headers=headers)
	        except requests.exceptions.ConnectionError as e:
	            print('ConnectionError: e=%s' % e)
	        except requests.exceptions.Timeout as e:
	            print('Timeout: e=%s' % e)
	        except requests.exceptions.TooManyRedirects as e:
	            print('TooManyRedirects: e=%s' % e)
	        except requests.exceptions.HTTPError as e:
	            print('HTTPError: e=%s' % e)
	        except requests.exceptions.RequestException as e:
	            print('RequestException: e=%s' % e)
	        except Exception:
	            # Exception rather than a bare except, so Ctrl-C still interrupts.
	            print('Unkonwn exception: url=%s' % url)

	        if r is None or r.status_code != 200:
	            print('ruests.get(%s) failed' % (url))
	            return

	        soup = BeautifulSoup(r.content)
	        for post in soup.find_all('div', class_="post"):
	            a = post.find('a', class_='title')
	            span_date = post.find('span', class_='date')
	            if not a or not span_date:
	                print('Invalid post')
	                continue
	            href = a.get('href')
	            title = a.get_text()
	            post_date = span_date.get_text()
	            # Dates are matched as "<year>年<month>月<day>日" on the UTF-8 bytes.
	            match = re.search(r'(\d{4})年(\d{1,2})月(\d{1,2})日', post_date.encode('utf-8', 'ignore'))
	            if not match:
	                # The '%' operator was missing here, which made this line
	                # raise TypeError ("'str' object is not callable").
	                print('Invalid date: %s' % (post_date))
	                continue
	            post_date = '%04d-%02d-%02d' % (int(match.group(1)), int(match.group(2)), int(match.group(3)))
	            download_pdf(href, title, post_date)

	        if last_page == 999:
	            pages = soup.find('span', class_='pages')
	            # A page without a pager leaves last_page at its placeholder
	            # instead of raising AttributeError on None.
	            if pages is not None:
	                text = pages.get_text()  # e.g. u'第 1 / 67 页' (page 1 of 67)
	                match = re.search(r'(\d{1,3})[^\d]+(\d{1,3})', text.encode('utf-8', 'ignore'))
	                if match:
	                    last_page = int(match.group(2))
	                    print('Last page is %d' % (last_page))

	        current_page = current_page + 1

	    print('Stopped at %s' % (time.strftime('%Y/%m/%d %H:%M:%S', time.localtime())))

	# Entry point: start the crawl when executed as a script; when the file is
	# imported instead, only print a notice. The branch bodies are indented
	# again here; without indentation the if/else does not parse.
	if __name__ == "__main__":
	    main()
	else:
	    print('Run as a module')