Created
August 26, 2012 14:07
-
-
Save moluapple/3479853 to your computer and use it in GitHub Desktop.
[python2]cchere user posts downloader
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding:utf-8 -*- | |
""" download all cchere posts of the author from pre-collected postlist | |
currently the list is collected manually
TODO: get the postlist from the cchere user home page automatically
NOTE: a proxy (fanqiang) is needed because of the GFW
code based on: https://code.google.com/p/cchere-thread-saver/ | |
""" | |
import urllib2 | |
from lxml import html | |
import cookielib | |
import re | |
def getAuthor(ele):
    """Return the author name of a post element.

    ele: element supporting ``xpath``; the first node matched by
        ``div/div/a`` is taken as the author link.
    Returns the ``text`` attribute of that first match.
    Raises IndexError when the xpath matches nothing.
    """
    author_links = ele.xpath('div/div/a')
    first_link = author_links[0]
    return first_link.text
def zJ_PE(alltext): | |
""" from the entire source, get the meaningful center part | |
and decoded it as in the javascript function zJ_PE | |
return a unicode version of the center part | |
""" | |
# get the center part of the page, encrypted | |
texts = alltext.partition('ls=\"') | |
alltext = texts[2][3:] | |
texts2 = alltext.partition('\";') | |
alltext = texts2[0] | |
# decode, re-implement of js zJ_PE function | |
list1 = ['~', '#', '<', '@', '>', '!', '&', '*', '(', ')', ':', ';', '=', ',', '|', '+'] | |
list2 = ['%1', '%2', '%3', '%4', '%5', '%6', '%7', '%8', '%9', '%A', '%B', '%C', '%D', '%E', 'e', '%20'] | |
for i in range(0, len(list1)): | |
alltext = alltext.replace(list1[i], list2[i]) | |
decodedtext = urllib2.unquote(alltext) # decoded text is in utf8, but still a str | |
decodedtext = unicode(decodedtext, 'utf8') # change it to unicode | |
return decodedtext | |
def parseHTML(url):
    """Fetch *url* with the login cookie attached and return the page source.

    The cookie value was captured with Firebug; replace the placeholder
    in the string below with your own cookie before running.

    url: address of the page to download.
    Returns the response body exactly as read from the connection.
    Raises whatever ``urllib2.urlopen`` raises when the request fails.
    """
    cookie = 'cchome=***** use your cookie here *****; expires=Sunday, 02-Sep-2012 09:28:55; path=/; domain=.cchere.com'
    # Installing the opener makes the module-level urlopen below use the cookie jar.
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
    urllib2.install_opener(opener)
    req = urllib2.Request(url)
    req.add_header('Cookie', cookie)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # urllib2 responses are not context managers in Python 2,
        # so release the connection explicitly.
        response.close()
def getPageContents(htmlstring, baseurl="http://www.cchere.com"):
    """Parse a page and return the list of its shown post elements.

    htmlstring: page source to parse.
    baseurl: base against which links in the page are made absolute.
    Returns the elements of class "pContent" that have at least one
    ``div/div/a`` match; recycled (not shown) posts lack it and are dropped.
    """
    page = html.fromstring(htmlstring)
    page.make_links_absolute(baseurl)
    shown_posts = []
    for post in page.find_class("pContent"):
        # keep only posts that still carry the div/div/a link
        if len(post.xpath('div/div/a')) > 0:
            shown_posts.append(post)
    return shown_posts
def generateHTML(allposts, outfilename):
    """Write all posts of the given list into one simple html file.

    allposts: post elements; each one is serialized with ``html.tostring``.
    outfilename: path of the file to create or overwrite.
    """
    serialized_posts = [html.tostring(post) for post in allposts]
    page = "<html><body>" + "".join(serialized_posts) + "</body></html>"
    with open(outfilename, "w") as outfile:
        outfile.write(page)
def downloadPostList(postlist, authorToPick):
    """Download every page of the listed threads and save one author's posts.

    postlist: list of [thread, page_count] entries; pages 1..page_count
        of each thread are fetched.
    authorToPick: only posts whose author equals this value are kept.

    Every page that holds at least one matching post is written to a
    file named "<thread>_<page>.html".
    """
    for entry in postlist:
        thread = entry[0]
        page_count = entry[1]
        for page_no in range(1, page_count + 1):
            outfilename = str(thread) + '_' + str(page_no) + '.html'
            htmlstring = parseHTML("https://www.ccthere.com/thread/%s/%s" % (thread, page_no))
            # NOTE(review): nesting below was reconstructed from a paste that
            # lost its indentation -- confirm against the original script.
            if 'var ls="' in htmlstring:
                # special javascript found: fetch the page from cchere
                # instead and decode its embedded payload
                htmlstring = parseHTML("https://www.cchere.com/thread/%s/%s" % (thread, page_no))
                htmlstring = zJ_PE(htmlstring)
            wanted = []
            for ele in getPageContents(htmlstring):
                if authorToPick == getAuthor(ele):
                    wanted.append(ele)
            # only pages holding the author's posts are saved
            if len(wanted):
                generateHTML(wanted, outfilename)
if __name__ == '__main__':
    # [thread, page_count] entries, collected by hand
    postlist = [
        [307479, 1],
        [307786, 4],
        [307863, 2],
        [308916, 2],
        [310336, 2],
        [312480, 3],
        [329532, 4],
    ]
    # the real ID was replaced with pinyin when this code was posted
    authorToPick = u"HeiDaoRen"
    downloadPostList(postlist, authorToPick)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment