YieldNull · October 26, 2015 12:26
diff --git a/readme.md b/readme.md
diff --git a/neteasy.py b/neteasy.py
 #!/usr/bin/env python
 # coding:utf-8

 """
 Back up your NetEase (163.com) blog entries to a sqlite3 database.

 Usage: neteasy.py <username>
 The username is the blog name that appears in your blog domain.
 For example, in "junjie.blog.163.com" the <username> is "junjie".

 Created on 2015.10.12
 Updated on 2015.10.26
 By hejunjie.net

 """

 import urllib
 import urllib2
 import gzip
 import cStringIO
 import time
 import re
 import sqlite3
 import sys

 # Blog name given on the command line; filled in by the __main__ block.
 USER = ''
 # URL templates; the __main__ block substitutes the blog name for %s.
 SITE_URL = 'http://%s.blog.163.com/'
 API_URL = 'http://api.blog.163.com/%s/dwr/call/plaincall/BlogBeanNew.getBlogs.dwr'

 # Headers shared by every request; the entry-list and entry-detail requests
 # append their own extra headers to these.
 base_headers = [
    ('Accept', '*/*'),
    # gzip only: the response body is read through gzip.GzipFile in do_http()
    ('Accept-Encoding', 'gzip'),
    ('Accept-Language', 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4'),
    ('Connection', 'keep-alive'),
    ('Content-Type', 'text/plain'),
    ('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/45.0.2454.101 Chrome/45.0.2454.101 Safari/537.36'),
 ]

 # Form fields of the entry-list query (the BlogBeanNew.getBlogs call).
 QUERY_STEP = 200  # number of entries requested per call
 params = {
    'callCount': '1',
    'scriptSessionId': '${scriptSessionId}187',
    'c0-scriptName': 'BlogBeanNew',
    'c0-methodName': 'getBlogs',
    'c0-id': '0',
    'c0-param0': 'number:',  # uid, completed by the __main__ block
    'c0-param1': 'number:0',  # offset: number of entries already fetched
    'c0-param2': 'number:%d' % QUERY_STEP,  # page size
    'batchId': '421865'
 }

 # URLs whose content could not be retrieved; listed at the end of the run.
 errors = []


 def do_http(url, headers=None, params=None):
    """
    Fetch ``url`` and return the response body re-encoded as UTF-8.

    headers -- optional list of (name, value) tuples appended to base_headers
    params  -- optional dict; when given it is url-encoded and sent as the
               request body (POST), otherwise a plain GET is issued

    The body is decompressed when the server sent it gzip- or
    deflate-encoded and is used as-is otherwise, then decoded as GBK.
    Returns None, after printing a message and recording the url in
    ``errors``, when the body is not valid GBK.  Errors raised by urllib2
    while opening the url are not caught here.
    """
    import zlib  # local import: only needed for deflate-encoded responses

    opener = urllib2.build_opener()
    opener.addheaders = base_headers + headers if headers is not None else base_headers
    data = None if params is None else urllib.urlencode(params)
    try:
        res = opener.open(url, data)
        try:
            raw = res.read()
            encoding = (res.info().get('Content-Encoding') or '').lower()
        finally:
            # release the connection even when reading fails
            res.close()
    finally:
        opener.close()

    if raw[:2] == '\x1f\x8b':
        # gzip magic number: only gunzip what really is gzip data, so an
        # uncompressed reply no longer raises IOError
        raw = gzip.GzipFile(fileobj=cStringIO.StringIO(raw)).read()
    elif 'deflate' in encoding:
        try:
            raw = zlib.decompress(raw)
        except zlib.error:
            # some servers send a raw deflate stream without the zlib header
            raw = zlib.decompress(raw, -zlib.MAX_WBITS)

    try:
        return raw.decode('gbk').encode('utf-8')
    except UnicodeDecodeError:
        print 'Error in retrieving entry: %s' % url
        errors.append(url)
        return None


 def get_info():
    """
    Read the blog's front page and extract the uid and the number of entries.

    Returns a dict ``{'uid': <string of digits>, 'sum': <int>}``.
    Raises ValueError when the page could not be decoded or does not contain
    the markers the uid and the category counters are read from.
    """
    page_url = SITE_URL + 'blog'
    content = do_http(page_url)
    if content is None:
        raise ValueError('Cannot read blog page: %s' % page_url)

    # the uid appears as the parentId argument of the captcha url
    uid_match = re.search(
        r"location\.vcd = 'http://api\.blog\.163\.com/cap/captcha\.jpgx\?parentId=(\d+)&r='",
        content)
    if uid_match is None:
        raise ValueError('Cannot find the uid in: %s' % page_url)

    # "c:[...]" holds the category list, each item carrying a "count:N" field
    cate_match = re.search(r'c:(\[.*\])', content)
    if cate_match is None:
        raise ValueError('Cannot find the category list in: %s' % page_url)

    counts = re.findall(r'count:(\d+)', cate_match.group(1))
    # skip the first two counters, which belong to the drafts and the
    # recycle-bin folders
    amount = sum(int(x) for x in counts[2:])
    return {'uid': uid_match.group(1), 'sum': amount}


 def get_list(amount):
    """
    get entry list, extract detail url
    """

    print 'Getting entry lists......'
    headers = [
        ('Host', 'api.blog.163.com'),
        ('Origin', 'http://api.blog.163.com'),
        ('Referer', 'http://api.blog.163.com/crossdomain.html?t=20100205')
    ]

    urls = []
    i = 0
    while i * QUERY_STEP < int(amount):
        params['c0-param1'] = 'number:%d' % (i * QUERY_STEP)
        res = do_http(API_URL, headers=headers, params=params)
        match = re.findall(r'permalink="(.*?)"', res, re.S)
        urls += match
        i += 1
        print ' Got %d' % (i * QUERY_STEP)
    print 'Getting entry lists......Done!'
    return urls


 def get_entry(url):
    """
    get entry detail
    """
    headers = [
        ('Host', '%s.blog.163.com' % USER),
        ('Referer', 'http://%s.blog.163.com/blog/' % USER),
        # ('Upgrade-Insecure-Requests', '1')
    ]
    res = do_http(url, headers=headers)
    if res:
        res = res.decode('utf-8')
    else:
        return None

    title_pat = r'<span class="tcnt">(.*?)</span>'
    content_pat = r'<div class="nbw-blog-start"></div>(.*?)<div class="nbw-blog-end"></div>'
    time_pat = r'<span class="blogsep">(.*?)</span>'
    title = re.search(title_pat, res, re.S).group(1)
    content = re.search(content_pat, res, re.S).group(1)
    timestamp = re.search(time_pat, res, re.S).group(1)

    timeArray = time.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
    timestamp = int(time.mktime(timeArray))
    return {'timestamp': timestamp, 'title': title, 'content': content.strip()}


 def store(entries):
    """
    store entries in db
    """
    print 'Storing entry details...'

    # create database
    conn = sqlite3.connect('%s.db' % USER)
    cursor = conn.cursor()
    cursor.execute('DROP TABLE IF EXISTS entry')
    cursor.execute('CREATE TABLE entry('
                   'id INTEGER PRIMARY KEY AUTOINCREMENT,'
                   'timestamp INTEGER,title TEXT,content TEXT)')

    i = 0
    for url in entries:
        i += 1
        entry = get_entry(SITE_URL + url)
        if not entry:
            continue
        cursor.execute('INSERT INTO entry(timestamp,title,content) values (?,?,?)',
                       [entry['timestamp'], entry['title'], entry['content']])
        print ' %d of %d. Storing entry: %s' % (i, info['sum'], entry['title'])
    conn.commit()
    conn.close()

 if __name__ == '__main__':
    if len(sys.argv) != 2:
        print 'Usage neteasy.py <username>'
        sys.exit(0)

    # config request headers
    USER = sys.argv[1]
    SITE_URL = SITE_URL % USER
    API_URL = API_URL % USER

    info = get_info()
    params['c0-param0'] = 'number:%s' % info['uid']

    print 'User:%s Uid:%s' % (USER, info['uid'])
    # store entry detail
    entries = get_list(info['sum'])
    store(entries)
    print 'Finish Backup. %d entries in all.' % info['sum']
    for url in errors:
        print 'Error in retrieving entry: %s' % url
	#!/usr/bin/env python
	# coding:utf-8

	"""
	Back up your NetEase (163.com) blog entries to a sqlite3 database.

	Usage: neteasy.py <username>
	The username is the blog name that appears in your blog domain.
	For example, in "junjie.blog.163.com" the <username> is "junjie".

	Created on 2015.10.12
	Updated on 2015.10.26
	By hejunjie.net

	"""

	import urllib
	import urllib2
	import gzip
	import cStringIO
	import time
	import re
	import sqlite3
	import sys

	# Blog name given on the command line; filled in by the __main__ block.
	USER = ''
	# URL templates; the __main__ block substitutes the blog name for %s.
	SITE_URL = 'http://%s.blog.163.com/'
	API_URL = 'http://api.blog.163.com/%s/dwr/call/plaincall/BlogBeanNew.getBlogs.dwr'

	# Headers shared by every request; the entry-list and entry-detail requests
	# append their own extra headers to these.
	base_headers = [
	    # accept any media type
	    ('Accept', '*/*'),
	    # gzip only: the response body is read through gzip.GzipFile in do_http()
	    ('Accept-Encoding', 'gzip'),
	    ('Accept-Language', 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4'),
	    ('Connection', 'keep-alive'),
	    ('Content-Type', 'text/plain'),
	    ('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/45.0.2454.101 Chrome/45.0.2454.101 Safari/537.36'),
	]

	# Form fields of the entry-list query (the BlogBeanNew.getBlogs call).
	QUERY_STEP = 200  # number of entries requested per call
	params = {
	    'callCount': '1',
	    'scriptSessionId': '${scriptSessionId}187',
	    'c0-scriptName': 'BlogBeanNew',
	    'c0-methodName': 'getBlogs',
	    'c0-id': '0',
	    'c0-param0': 'number:',  # uid, completed by the __main__ block
	    'c0-param1': 'number:0',  # offset: number of entries already fetched
	    'c0-param2': 'number:%d' % QUERY_STEP,  # page size
	    'batchId': '421865'
	}

	# URLs whose content could not be retrieved; listed at the end of the run.
	errors = []


	def do_http(url, headers=None, params=None):
	opener = urllib2.build_opener()
	opener.addheaders = base_headers + headers if headers is not None else base_headers
	if params is None:
	res = opener.open(url)
	else:
	res = opener.open(url, urllib.urlencode(params))
	unziped = gzip.GzipFile(fileobj=cStringIO.StringIO(res.read()))
	try:
	content = unziped.read().decode('gbk').encode('utf-8')
	except UnicodeDecodeError:
	print 'Error in retrieving entry: %s' % url
	errors.append(url)
	return None
	finally:
	opener.close()
	return content


	def get_info():
	    """
	    Read the blog's front page and extract the uid and the number of entries.

	    Returns a dict ``{'uid': <string of digits>, 'sum': <int>}``.
	    Raises ValueError when the page could not be decoded or does not contain
	    the markers the uid and the category counters are read from.
	    """
	    page_url = SITE_URL + 'blog'
	    content = do_http(page_url)
	    if content is None:
	        raise ValueError('Cannot read blog page: %s' % page_url)

	    # the uid appears as the parentId argument of the captcha url
	    uid_match = re.search(
	        r"location\.vcd = 'http://api\.blog\.163\.com/cap/captcha\.jpgx\?parentId=(\d+)&r='",
	        content)
	    if uid_match is None:
	        raise ValueError('Cannot find the uid in: %s' % page_url)

	    # "c:[...]" holds the category list, each item carrying a "count:N" field
	    cate_match = re.search(r'c:(\[.*\])', content)
	    if cate_match is None:
	        raise ValueError('Cannot find the category list in: %s' % page_url)

	    counts = re.findall(r'count:(\d+)', cate_match.group(1))
	    # skip the first two counters, which belong to the drafts and the
	    # recycle-bin folders
	    amount = sum(int(x) for x in counts[2:])
	    return {'uid': uid_match.group(1), 'sum': amount}


	def get_list(amount):
	"""
	get entry list, extract detail url
	"""

	print 'Getting entry lists......'
	headers = [
	('Host', 'api.blog.163.com'),
	('Origin', 'http://api.blog.163.com'),
	('Referer', 'http://api.blog.163.com/crossdomain.html?t=20100205')
	]

	urls = []
	i = 0
	while i * QUERY_STEP < int(amount):
	params['c0-param1'] = 'number:%d' % (i * QUERY_STEP)
	res = do_http(API_URL, headers=headers, params=params)
	match = re.findall(r'permalink="(.*?)"', res, re.S)
	urls += match
	i += 1
	print ' Got %d' % (i * QUERY_STEP)
	print 'Getting entry lists......Done!'
	return urls


	def get_entry(url):
	"""
	get entry detail
	"""
	headers = [
	('Host', '%s.blog.163.com' % USER),
	('Referer', 'http://%s.blog.163.com/blog/' % USER),
	# ('Upgrade-Insecure-Requests', '1')
	]
	res = do_http(url, headers=headers)
	if res:
	res = res.decode('utf-8')
	else:
	return None

	title_pat = r'<span class="tcnt">(.*?)</span>'
	content_pat = r'<div class="nbw-blog-start"></div>(.*?)<div class="nbw-blog-end"></div>'
	time_pat = r'<span class="blogsep">(.*?)</span>'
	title = re.search(title_pat, res, re.S).group(1)
	content = re.search(content_pat, res, re.S).group(1)
	timestamp = re.search(time_pat, res, re.S).group(1)

	timeArray = time.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
	timestamp = int(time.mktime(timeArray))
	return {'timestamp': timestamp, 'title': title, 'content': content.strip()}


	def store(entries):
	"""
	store entries in db
	"""
	print 'Storing entry details...'

	# create database
	conn = sqlite3.connect('%s.db' % USER)
	cursor = conn.cursor()
	cursor.execute('DROP TABLE IF EXISTS entry')
	cursor.execute('CREATE TABLE entry('
	'id INTEGER PRIMARY KEY AUTOINCREMENT,'
	'timestamp INTEGER,title TEXT,content TEXT)')

	i = 0
	for url in entries:
	i += 1
	entry = get_entry(SITE_URL + url)
	if not entry:
	continue
	cursor.execute('INSERT INTO entry(timestamp,title,content) values (?,?,?)',
	[entry['timestamp'], entry['title'], entry['content']])
	print ' %d of %d. Storing entry: %s' % (i, info['sum'], entry['title'])
	conn.commit()
	conn.close()

	if __name__ == '__main__':
	if len(sys.argv) != 2:
	print 'Usage neteasy.py <username>'
	sys.exit(0)

	# config request headers
	USER = sys.argv[1]
	SITE_URL = SITE_URL % USER
	API_URL = API_URL % USER

	info = get_info()
	params['c0-param0'] = 'number:%s' % info['uid']

	print 'User:%s Uid:%s' % (USER, info['uid'])
	# store entry detail
	entries = get_list(info['sum'])
	store(entries)
	print 'Finish Backup. %d entries in all.' % info['sum']
	for url in errors:
	print 'Error in retrieving entry: %s' % url