A Python script that downloads all articles on http://www.ruanyifeng.com/blog/ and saves them as PDF files.
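The gist contains two files: a small helper library (imported by the main script as huzhifeng) and the crawler itself, ruanyifeng.py.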
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# huzhifeng's Python Lib
import os
import re
import time
import json
import requests
import wget
def format_filename(fname):
    # Replace characters that are not allowed in (Windows) file names with '-'
    invalid_chars = '/\\:*?"<>|'
    filename = ''
    for c in fname:
        filename = filename + ('-' if c in invalid_chars else c)
    return filename
def pdfcrowd(url, file_path):
    # Convert a web page to PDF via the pdfcrowd.com web form. Two steps:
    # POST the source URL, then download the generated PDF from the URI in
    # the JSON response.
    post_url = 'http://pdfcrowd.com/form/json/convert/uri/'
    headers = {
        'Host': 'pdfcrowd.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'
    }
    payload = {
        'noCache': '1414481239318',
        'src': url,
        'conversion_source': 'uri'
    }

    try:
        r = requests.post(post_url, data=payload, headers=headers)
    except requests.RequestException:
        print 'Exception: requests.post(%s), url=%s' % (post_url, url)
        return False
    if not r or r.status_code != 200:
        print 'requests.post(%s) failed, url=%s' % (post_url, url)
        return False

    data = r.text
    obj = json.loads(data)
    if not all(key in obj for key in ('status', 'uri')):
        print 'Invalid response obj=%s' % (obj)
        return False
    status = obj['status']
    if status != 'ok':
        print 'status=%s' % (status)
        return False

    download_url = 'http://pdfcrowd.com%s' % (obj['uri'])
    if wget.download(download_url, out=file_path):
        print ''  # wget.download prints a progress bar without a trailing newline
        return True
    else:
        return False
def pdfmyurl(url, file_path):
    # Convert a web page to PDF via pdfmyurl.com; the response body is the
    # PDF itself, so it can be written straight to file_path.
    post_url = 'http://pdfmyurl.com/index.php'
    headers = {
        'Host': 'pdfmyurl.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36',
        'Cookie': 'NB_SRVID=srv29695; _ga=GA1.2.826873587.1414482384; _gat=1',
        'Cache-Control': 'max-age=0',
        'Origin': 'http://pdfmyurl.com',
        'Referer': 'http://pdfmyurl.com/',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }
    payload = {
        'url': url
    }

    try:
        r = requests.post(post_url, data=payload, headers=headers)
    except requests.RequestException:
        print 'Exception: requests.post(%s), url=%s' % (post_url, url)
        return False
    if not r or r.status_code != 200:
        print 'requests.post(%s) failed, url=%s' % (post_url, url)
        return False

    with open(file_path, 'wb') as f:
        f.write(r.content)
    return True
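# -------- Second file: ruanyifeng.py, the main script. It assumes the helper
# library above is saved alongside it as huzhifeng.py (the module name used
# in the import below). --------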
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Usage:
pip install requests
pip install beautifulsoup4
pip install wget
python ruanyifeng.py
'''
import os
import re
import time
import json
import requests
from bs4 import BeautifulSoup
import huzhifeng
def download_pdf(url, title, post_date):
    try:
        print '%s-%s: %s' % (post_date, title, url)
    except Exception:
        # The console may not be able to encode the (Chinese) title
        print 'Exception: %s-%s: %s' % (post_date, title.encode('gbk', 'ignore'), url)

    dir_name = 'pdf'
    filename = huzhifeng.format_filename('ryf-' + post_date + '-' + title + '.pdf')
    file_path = dir_name + os.sep + filename
    if not os.path.isdir(dir_name):
        os.mkdir(dir_name)
    if os.path.exists(file_path):
        if os.path.getsize(file_path) > 0x10000:  # > 64 KB: assume a complete PDF
            print 'File exists, do nothing'
            return
        else:
            print 'File too small, delete it'
            os.remove(file_path)

    huzhifeng.pdfmyurl(url, file_path)
def main():
    print 'Started at %s' % (time.strftime('%Y/%m/%d %H:%M:%S', time.localtime()))

    sort = 'asc'
    #sort = 'desc'
    headers = {
        'Host': 'www.ruanyifeng.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'
    }
    if sort == 'desc':
        # Newest first: start from the current month's archive page
        url = 'http://www.ruanyifeng.com/blog/%s' % (time.strftime('%Y/%m/', time.localtime()))
    else:
        # Oldest first: start from the first archive page (December 2003)
        url = 'http://www.ruanyifeng.com/blog/2003/12/'

    while url:
        print 'Current page %s' % (url)
        try:
            r = requests.get(url, headers=headers)
        except requests.RequestException:
            print 'Exception: requests.get(%s)' % (url)
            break
        if not r or r.status_code != 200:
            print 'requests.get(%s) failed' % (url)
            break

        soup = BeautifulSoup(r.content, 'html.parser')

        # Find the previous/next month link in the navigation bar to decide
        # which archive page to visit on the next iteration
        nav_tag = soup.find('div', class_='content-nav').find('p')
        if not nav_tag:
            print 'Can not find tag p for previous month'
            break
        a_tag = nav_tag.find_all('a')
        if not a_tag or len(a_tag) < 1:
            print 'Can not find tag a for previous month'
            break
        if len(a_tag) == 1:
            # Only one link: this is either the first or the last archive page
            text = a_tag[0].get_text()
            if text == u'« 上月' and sort == 'desc':    # '« previous month'
                url = a_tag[0].get('href')
            elif text == u'下月 »' and sort == 'asc':   # 'next month »'
                url = a_tag[0].get('href')
            else:
                print 'This is the last page'
                url = ''
        else:
            url = a_tag[0].get('href') if sort == 'desc' else a_tag[1].get('href')
        if url == 'http://www.ruanyifeng.com/blog/2008/03/':
            print 'Skip page http://www.ruanyifeng.com/blog/2008/03, it does not respond'
            url = 'http://www.ruanyifeng.com/blog/2008/02/' if sort == 'desc' else 'http://www.ruanyifeng.com/blog/2008/04/'
        # The newest post of the month appears as a full article on the page
        article_div_tag = soup.find('div', class_='entry-asset asset hentry')
        a_tag = article_div_tag.find('div', class_='asset-header').find('h2', class_='asset-name entry-title').find('a')
        article_time_tag = article_div_tag.find('div', class_='asset-footer').find('div', class_='asset-meta').find('p').find('span', class_='byline').find('abbr', class_='published')
        if not a_tag:
            print 'Can not find tag a for latest post of this month'
            continue
        if not article_time_tag:
            print 'Can not find tag abbr'
            continue
        article_title = a_tag.get_text()
        article_url = a_tag.get('href')
        article_post_time = article_time_tag.get_text()
        if not article_title or not article_url or not article_post_time:
            print 'Invalid title, url or time, continue next page'
            continue
        # Dates look like '2014年10月28日' ('YYYY year MM month DD day'),
        # occasionally with spaces between the parts
        match = re.search(r'(\d{4})年 ?(\d{1,2})月 ?(\d{1,2})日', article_post_time.encode('utf-8', 'ignore'))
        if not match:
            print 'Extract date failed: %s' % (article_post_time)
            continue
        post_date = '%04d-%02d-%02d' % (int(match.group(1)), int(match.group(2)), int(match.group(3)))
        download_pdf(article_url, article_title, post_date)
        # The rest of the month's posts appear in the sidebar list
        article_list_tag = soup.find('div', class_='module-categories module').find('div', class_='module-content').find('ul', class_='module-list').find_all('li', class_='module-list-item')
        if not article_list_tag:
            print 'Can not find tag li'
            continue
        for item in article_list_tag:
            a_tag = item.find('a')
            if not a_tag:
                print 'Can not find tag a for post list of this month'
                continue
            article_title = a_tag.get_text()
            article_url = a_tag.get('href')
            # Strip the <span> and <a> children so only the date text remains
            item.span.decompose()
            item.a.decompose()
            article_post_time = item.get_text()
            if not article_title or not article_url or not article_post_time:
                print 'Invalid title, url or time, continue next li'
                continue
            match = re.search(r'(\d{4})\.(\d{1,2})\.(\d{1,2})', article_post_time.encode('utf-8', 'ignore'))
            if not match:
                print 'Extract date failed: %s' % (article_post_time)
                continue
            post_date = '%04d-%02d-%02d' % (int(match.group(1)), int(match.group(2)), int(match.group(3)))
            download_pdf(article_url, article_title, post_date)

        time.sleep(3)  # Be polite: pause between archive pages

    print 'Stopped at %s' % (time.strftime('%Y/%m/%d %H:%M:%S', time.localtime()))
if __name__ == '__main__':
    main()
else:
    print 'Run as a module'
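A minimal sketch of using the helper library on its own (interactive session; the input string and output file name here are hypothetical, and pdfmyurl returns True on success):

import huzhifeng

print huzhifeng.format_filename('a/b:c*d.pdf')   # -> 'a-b-c-d.pdf'
huzhifeng.pdfmyurl('http://www.ruanyifeng.com/blog/', 'blog-index.pdf')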