Backup your NetEase (网易) blog entries to a sqlite3 database

What

Backup your NetEase blog entries to a sqlite3 database.

Only the creation time, title and content are stored.

Database Schema

CREATE TABLE entry(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    timestamp INTEGER,
    title TEXT,
    content TEXT
)
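
To check a finished backup, you can read the table back with the standard sqlite3 module. A minimal sketch, assuming the database file is named <username>.db (which is what the script creates) and using "junjie" only as an example username:

import sqlite3
import time

# read the backup back; the script names the file <username>.db
conn = sqlite3.connect('junjie.db')  # 'junjie' is just an example username
for ts, title in conn.execute('SELECT timestamp, title FROM entry ORDER BY timestamp'):
    # timestamp is stored as a Unix epoch integer (seconds)
    print('%s  %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts)), title))
conn.close()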

Usage:

neteasy.py <username>

The <username> is the blog name that appears in your blog domain. For example, in "junjie.blog.163.com", "junjie" is the <username>.
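
For reference, the only thing the script does with <username> is substitute it into two URL patterns (the constants near the top of the script); "junjie" below is purely illustrative:

# illustrative only: the script fills these patterns with sys.argv[1]
USER = 'junjie'
SITE_URL = 'http://%s.blog.163.com/' % USER  # blog pages and entry details
API_URL = 'http://api.blog.163.com/%s/dwr/call/plaincall/BlogBeanNew.getBlogs.dwr' % USER  # entry list API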

Demo

If everything goes well, the output looks like this:

normal.png

Otherwise, the entries that could not be retrieved are listed at the bottom:

abnormal.png

Customize

I got the uid and category information from the HTML source of <username>.blog.163.com/blog. The category info structure is shown below:

[{id:'-2',name:'草稿箱',count:0},
{id:'-3',name:'回收站',count:1},
{id:"fks_084067086080088071082083081095086094088075081085080069",name:"文风。",count:0},
{id:"fks_087070085080083067093084074065092095089070087080080",name:"默认分类",count:101}]

You can store more information for each blog entry. Below is the structure of the entry info returned in get_list; a sketch for pulling out extra fields follows the field list. I only use the permalink to build the entry detail URL, and extract the timestamp, title and content from the detail HTML page.

content="<abstract>";
title="<title>";
permalink="<URL>";
publishTime=<timestamp>;

abstractSysGen=1;
accessCount=11;
allowComment=-100;
allowView=-100;
blogAbstract="";
blogAttachments=null;
blogCount=s56;
blogExt=null;
circleCount=0;
circleIdList=s57;
circleIds=null;
classId="fks_087070085080083067093084074065092095089070087080080";
className="";
commentCount=0;
comments=null;
contentPlainText=null;
id="fks_095069092084086070080082086095086094088075081085080069";
ip="221.233.47.78";
isBlogAbstractComplete=false;
isPublished=1;
keyName="ID";
keyWordCheckedState=0;
lastAccessCountUpdateTime=1444659802159;
matchedKeyWord=false;
modifyTime=1311086816156;
moveFrom="NONE";
permaSerial="289853662011619104628824";
photoIds=null;
photoStoreTypes=null;
publishTimeStr="22:46:28";
publisherId=0;
publisherNickname=null;
publisherUsername=null;
rank=0;
recomBlogHome=false;
ref=false;
shortPublishDateStr="2011-7-19";
synchLofter=-1;
synchMiniBlog=-1;
tag="";
trackbackCount=0;
trackbackUrl="blog/289853662011619104628824.track";
userId=28985366;
userName="kuoaixq";
userNickname="Lolita\u3002";
valid=0;
zipContent=null;
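
For example, to also record the access count and tags for each entry, you could extend the permalink regular expression used in get_list to pick up other fields from the same DWR reply. A hedged sketch; it assumes every entry in the reply carries all three fields, in the same order:

import re

def parse_entries(dwr_response):
    # get_list currently only takes permalink; accessCount and tag live in the same blob
    permalinks = re.findall(r'permalink="(.*?)"', dwr_response, re.S)
    counts = re.findall(r'accessCount=(\d+)', dwr_response)
    tags = re.findall(r'tag="(.*?)"', dwr_response, re.S)
    # zip relies on the three lists lining up one-to-one per entry
    return zip(permalinks, counts, tags)
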
#!/usr/bin/env python
# coding:utf-8
"""
Backup your NetEase blog entries to a sqlite3 database
Usage: neteasy.py <username>
The username is the blog name that appears in your blog domain.
For example, in "junjie.blog.163.com", "junjie" is the <username>.
Created on 2015.10.12
Updated on 2015.10.26
By hejunjie.net
"""
import urllib
import urllib2
import gzip
import cStringIO
import time
import re
import sqlite3
import sys
USER = ''
SITE_URL = 'http://%s.blog.163.com/'
API_URL = 'http://api.blog.163.com/%s/dwr/call/plaincall/BlogBeanNew.getBlogs.dwr'
# base common headers; the entry list and entry detail requests add different extra headers
base_headers = [
    ('Accept', '*/*'),
    ('Accept-Encoding', 'gzip, deflate'),
    ('Accept-Language', 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4'),
    ('Connection', 'keep-alive'),
    ('Content-Type', 'text/plain'),
    ('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/45.0.2454.101 Chrome/45.0.2454.101 Safari/537.36'),
]
# entry list query params
QUERY_STEP = 200
params = {
    'callCount': '1',
    'scriptSessionId': '${scriptSessionId}187',
    'c0-scriptName': 'BlogBeanNew',
    'c0-methodName': 'getBlogs',
    'c0-id': '0',
    'c0-param0': 'number:',  # uid
    'c0-param1': 'number:0',  # already got
    'c0-param2': 'number:%d' % QUERY_STEP,  # query step
    'batchId': '421865'
}
# URLs of entries that could not be retrieved
errors = []
def do_http(url, headers=None, params=None):
    opener = urllib2.build_opener()
    opener.addheaders = base_headers + headers if headers is not None else base_headers
    if params is None:
        res = opener.open(url)
    else:
        res = opener.open(url, urllib.urlencode(params))
    # responses are gzip-compressed and GBK-encoded; re-encode to UTF-8
    unziped = gzip.GzipFile(fileobj=cStringIO.StringIO(res.read()))
    try:
        content = unziped.read().decode('gbk').encode('utf-8')
    except UnicodeDecodeError:
        print 'Error in retrieving entry: %s' % url
        errors.append(url)
        return None
    finally:
        opener.close()
    return content
def get_info():
    """
    get uid (as a number) and the total number of entries
    """
    content = do_http(SITE_URL + 'blog')
    match = re.search(
        r"location.vcd = 'http://api.blog.163.com/cap/captcha.jpgx\?parentId=(\d+)&r='", content)
    uid = match.group(1)
    cate = re.search(r'c:(\[.*\])', content).group(1)
    counts = re.findall(r'count:(\d+)', cate)
    amount = sum([int(x) for x in counts[2:]])  # exclude the drafts (草稿箱) and trash (回收站) categories
    return {'uid': uid, 'sum': amount}
def get_list(amount):
    """
    get entry list, extract detail url
    """
    print 'Getting entry lists......'
    headers = [
        ('Host', 'api.blog.163.com'),
        ('Origin', 'http://api.blog.163.com'),
        ('Referer', 'http://api.blog.163.com/crossdomain.html?t=20100205')
    ]
    urls = []
    i = 0
    while i * QUERY_STEP < int(amount):
        params['c0-param1'] = 'number:%d' % (i * QUERY_STEP)
        res = do_http(API_URL, headers=headers, params=params)
        match = re.findall(r'permalink="(.*?)"', res, re.S)
        urls += match
        i += 1
        print ' Got %d' % (i * QUERY_STEP)
    print 'Getting entry lists......Done!'
    return urls
def get_entry(url):
    """
    get entry detail
    """
    headers = [
        ('Host', '%s.blog.163.com' % USER),
        ('Referer', 'http://%s.blog.163.com/blog/' % USER),
        # ('Upgrade-Insecure-Requests', '1')
    ]
    res = do_http(url, headers=headers)
    if res:
        res = res.decode('utf-8')
    else:
        return None
    title_pat = r'<span class="tcnt">(.*?)</span>'
    content_pat = r'<div class="nbw-blog-start"></div>(.*?)<div class="nbw-blog-end"></div>'
    time_pat = r'<span class="blogsep">(.*?)</span>'
    title = re.search(title_pat, res, re.S).group(1)
    content = re.search(content_pat, res, re.S).group(1)
    timestamp = re.search(time_pat, res, re.S).group(1)
    timeArray = time.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
    timestamp = int(time.mktime(timeArray))
    return {'timestamp': timestamp, 'title': title, 'content': content.strip()}
def store(entries):
    """
    store entries in db
    """
    print 'Storing entry details...'
    # create database
    conn = sqlite3.connect('%s.db' % USER)
    cursor = conn.cursor()
    cursor.execute('DROP TABLE IF EXISTS entry')
    cursor.execute('CREATE TABLE entry('
                   'id INTEGER PRIMARY KEY AUTOINCREMENT,'
                   'timestamp INTEGER,title TEXT,content TEXT)')
    i = 0
    for url in entries:
        i += 1
        entry = get_entry(SITE_URL + url)
        if not entry:
            continue
        cursor.execute('INSERT INTO entry(timestamp,title,content) values (?,?,?)',
                       [entry['timestamp'], entry['title'], entry['content']])
        print ' %d of %d. Storing entry: %s' % (i, info['sum'], entry['title'])
    conn.commit()
    conn.close()
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print 'Usage: neteasy.py <username>'
        sys.exit(0)
    # configure the user-specific URLs
    USER = sys.argv[1]
    SITE_URL = SITE_URL % USER
    API_URL = API_URL % USER
    info = get_info()
    params['c0-param0'] = 'number:%s' % info['uid']
    print 'User:%s Uid:%s' % (USER, info['uid'])
    # store entry detail
    entries = get_list(info['sum'])
    store(entries)
    print 'Finished backup. %d entries in all.' % info['sum']
    for url in errors:
        print 'Error in retrieving entry: %s' % url