|
#!/usr/bin/env python |
|
# coding:utf-8 |
|
|
|
""" |
|
Backup your neteasy blog entries to a sqlite3 database |
|
|
|
Usage neteasy.py <username> |
|
The username is your blog name which appears in your blog domain. |
|
Just like "junjie.blog.163.com", in which "junjie" is the <username> |
|
|
|
Created on 2015.10.12 |
|
Updated on 2015.10.26 |
|
By hejunjie.net |
|
|
|
""" |
|
|
|
import urllib |
|
import urllib2 |
|
import gzip |
|
import cStringIO |
|
import time |
|
import re |
|
import sqlite3 |
|
import sys |
|
|
|
# Blog user name; overwritten from the command line by the script entry point.
USER = ''
# URL templates; the '%s' placeholder is filled with USER by the script entry
# point before any request is made.
SITE_URL = 'http://%s.blog.163.com/'
API_URL = 'http://api.blog.163.com/%s/dwr/call/plaincall/BlogBeanNew.getBlogs.dwr'

# Base headers sent with every request. The entry-list and entry-detail
# requests append their own, different headers to these.
base_headers = [
    ('Accept', '*/*'),
    ('Accept-Encoding', 'gzip, deflate'),
    ('Accept-Language', 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4'),
    ('Connection', 'keep-alive'),
    ('Content-Type', 'text/plain'),
    ('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/45.0.2454.101 Chrome/45.0.2454.101 Safari/537.36'),
]

# Entry-list query parameters.
# QUERY_STEP is the number of entries requested per call (sent as c0-param2).
QUERY_STEP = 200
params = {
    'callCount': '1',
    'scriptSessionId': '${scriptSessionId}187',
    'c0-scriptName': 'BlogBeanNew',
    'c0-methodName': 'getBlogs',
    'c0-id': '0',
    'c0-param0': 'number:',  # uid; completed by the script entry point
    'c0-param1': 'number:0',  # entries already fetched; updated by get_list
    'c0-param2': 'number:%d' % QUERY_STEP,  # query step
    'batchId': '421865'
}

# URLs of the requests that failed; appended to by do_http and reported
# at the end of the run.
errors = []
|
|
|
|
|
def do_http(url, headers=None, params=None):
    """
    Fetch `url` and return its body re-encoded as a UTF-8 byte string.

    url     -- address to request.
    headers -- optional list of (name, value) tuples appended to base_headers.
    params  -- optional dict; when given it is urlencoded and sent as the
               request body, otherwise the request carries no body.

    The body is decompressed according to the response's Content-Encoding
    header (gzip, deflate or none) and then transcoded from GBK to UTF-8.

    Returns the UTF-8 body, or None when the body is not valid GBK; in that
    case the URL is also appended to the module-level `errors` list.
    Network errors raised by urllib2 are not caught here.
    """
    import zlib  # local import: only needed for deflate-encoded responses

    opener = urllib2.build_opener()
    if headers is None:
        opener.addheaders = list(base_headers)
    else:
        opener.addheaders = base_headers + headers

    data = urllib.urlencode(params) if params is not None else None
    res = opener.open(url, data)
    try:
        raw = res.read()
        encoding = (res.info().get('Content-Encoding') or '').strip().lower()
    finally:
        # Release the connection even if reading the body failed.
        res.close()
        opener.close()

    # We advertise "gzip, deflate", so the server may answer with either of
    # them or with an uncompressed body; only decompress what was compressed.
    if encoding in ('gzip', 'x-gzip'):
        raw = gzip.GzipFile(fileobj=cStringIO.StringIO(raw)).read()
    elif encoding == 'deflate':
        try:
            raw = zlib.decompress(raw)
        except zlib.error:
            # Some servers send a raw deflate stream without the zlib header.
            raw = zlib.decompress(raw, -zlib.MAX_WBITS)

    try:
        return raw.decode('gbk').encode('utf-8')
    except UnicodeDecodeError:
        print('Error in retrieving entry: %s' % url)
        errors.append(url)
        return None
|
|
|
|
|
def get_info():
    """
    Read the blog's front page and extract the user id and the entry count.

    Returns a dict {'uid': <digits as a string>, 'sum': <int>} where 'sum' is
    the total of the per-category counts found on the page, skipping the
    first two categories.

    Raises ValueError when the page cannot be retrieved or does not contain
    the expected markers, instead of failing later on a None value.
    """
    page_url = SITE_URL + 'blog'
    content = do_http(page_url)
    if content is None:
        raise ValueError('Could not retrieve blog page: %s' % page_url)

    match = re.search(
        r"location.vcd = 'http://api.blog.163.com/cap/captcha.jpgx\?parentId=(\d+)&r='", content)
    if match is None:
        raise ValueError('Could not find the uid in blog page: %s' % page_url)
    uid = match.group(1)

    cate = re.search(r'c:(\[.*\])', content)
    if cate is None:
        raise ValueError('Could not find the category list in blog page: %s' % page_url)
    counts = re.findall(r'count:(\d+)', cate.group(1))
    # The first two categories are skipped; per the original author's note
    # they are the drafts box and the recycle bin.
    amount = sum(int(x) for x in counts[2:])
    return {'uid': uid, 'sum': amount}
|
|
|
|
|
def get_list(amount):
    """
    Fetch the entry list page by page and collect the entry permalinks.

    amount -- total number of entries to list (anything int() accepts).

    The list API is queried in steps of QUERY_STEP entries; the offset is
    written into the module-level `params` dict before each call.

    Returns a list with every permalink found. A page that do_http could not
    decode is skipped (do_http has already recorded it in `errors`) rather
    than aborting the whole run.
    """
    print('Getting entry lists......')
    headers = [
        ('Host', 'api.blog.163.com'),
        ('Origin', 'http://api.blog.163.com'),
        ('Referer', 'http://api.blog.163.com/crossdomain.html?t=20100205')
    ]

    total = int(amount)
    urls = []
    i = 0
    while i * QUERY_STEP < total:
        params['c0-param1'] = 'number:%d' % (i * QUERY_STEP)
        res = do_http(API_URL, headers=headers, params=params)
        if res is not None:
            urls.extend(re.findall(r'permalink="(.*?)"', res, re.S))
        i += 1
        print(' Got %d' % (i * QUERY_STEP))
    print('Getting entry lists......Done!')
    return urls
|
|
|
|
|
def get_entry(url):
    """
    Fetch one entry page and extract its title, body and publication time.

    url -- full address of the entry page.

    Returns a dict {'timestamp': <int, seconds from time.mktime>,
    'title': <unicode>, 'content': <unicode, stripped>}.

    Returns None when the page could not be retrieved, when any of the
    expected markers is missing, or when the date cannot be parsed. In the
    last two cases the URL is appended to the module-level `errors` list, so
    a single unexpected page does not abort the whole backup.
    """
    headers = [
        ('Host', '%s.blog.163.com' % USER),
        ('Referer', 'http://%s.blog.163.com/blog/' % USER),
    ]
    res = do_http(url, headers=headers)
    if not res:
        return None
    res = res.decode('utf-8')

    title_pat = r'<span class="tcnt">(.*?)</span>'
    content_pat = r'<div class="nbw-blog-start"></div>(.*?)<div class="nbw-blog-end"></div>'
    time_pat = r'<span class="blogsep">(.*?)</span>'
    title_match = re.search(title_pat, res, re.S)
    content_match = re.search(content_pat, res, re.S)
    time_match = re.search(time_pat, res, re.S)

    time_array = None
    if title_match and content_match and time_match:
        try:
            time_array = time.strptime(time_match.group(1).strip(),
                                       "%Y-%m-%d %H:%M:%S")
        except ValueError:
            # Date text is not in the expected format; handled below.
            time_array = None

    if time_array is None:
        # Same reporting convention as do_http: log, remember, return None.
        print('Error in retrieving entry: %s' % url)
        errors.append(url)
        return None

    return {
        'timestamp': int(time.mktime(time_array)),
        'title': title_match.group(1),
        'content': content_match.group(1).strip(),
    }
|
|
|
|
|
def store(entries):
    """
    Download every entry and store it in the sqlite3 database '<USER>.db'.

    entries -- iterable of entry URLs relative to SITE_URL.

    The `entry` table is dropped and recreated on every run. Entries for
    which get_entry returns None are skipped. The progress total is the
    number of URLs passed in, so the function does not depend on any global
    set up by the script entry point. The connection is closed even when an
    error interrupts the loop.
    """
    print('Storing entry details...')

    entries = list(entries)
    total = len(entries)

    # create database
    conn = sqlite3.connect('%s.db' % USER)
    try:
        cursor = conn.cursor()
        cursor.execute('DROP TABLE IF EXISTS entry')
        cursor.execute('CREATE TABLE entry('
                       'id INTEGER PRIMARY KEY AUTOINCREMENT,'
                       'timestamp INTEGER,title TEXT,content TEXT)')

        for i, url in enumerate(entries, 1):
            entry = get_entry(SITE_URL + url)
            if not entry:
                continue
            cursor.execute('INSERT INTO entry(timestamp,title,content) values (?,?,?)',
                           [entry['timestamp'], entry['title'], entry['content']])
            print(' %d of %d. Storing entry: %s' % (i, total, entry['title']))
        conn.commit()
    finally:
        conn.close()
|
|
|
# Script entry point: back up the blog of the user named on the command line.
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('Usage neteasy.py <username>')
        # A wrong invocation is an error, so report it through the exit status.
        sys.exit(1)

    # Fill the user name into the URL templates.
    USER = sys.argv[1]
    SITE_URL = SITE_URL % USER
    API_URL = API_URL % USER

    # The uid found on the blog page completes the list-query parameters.
    info = get_info()
    params['c0-param0'] = 'number:%s' % info['uid']

    print('User:%s Uid:%s' % (USER, info['uid']))
    # Collect the entry URLs, then download and store each entry.
    entries = get_list(info['sum'])
    store(entries)
    print('Finish Backup. %d entries in all.' % info['sum'])
    # Repeat every failed URL at the end so none is lost in the progress log.
    for url in errors:
        print('Error in retrieving entry: %s' % url)