Created
August 4, 2011 15:50
-
-
Save farseerfc/1125490 to your computer and use it in GitHub Desktop.
treeyssy.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from urllib.request import urlopen | |
#!/usr/bin/env python2 | |
# from urllib2 import urlopen | |
import re,sys,os,html | |
# Root of the bbs.sjtu.edu.cn web front end; every request URL is built on it.
URLBASE = "http://bbs.sjtu.edu.cn/"
# Thread-listing endpoint; the query string taken from an article page is appended.
URLTHREAD = URLBASE + "bbstfind0?"
# Single-article endpoint; the query string taken from a thread listing is appended.
URLARTICLE = URLBASE + "bbscon?"
class LineMsg:
    """A one-line console status message that can be rewritten in place.

    The text is printed without a trailing newline.  ``replace`` moves the
    cursor back to column 0 with a carriage return and prints the new text
    over the old one; ``close`` ends the line.
    """

    def __init__(self, msg=""):
        """Print *msg* immediately, leaving the cursor on the same line."""
        self.msg = msg
        print(self.msg, end="")
        sys.stdout.flush()

    def replace(self, msg):
        """Overwrite the current status line with *msg*."""
        # A shorter message would otherwise leave the tail of the previous
        # one visible, so blank out the characters it no longer covers.
        padding = " " * max(0, len(self.msg) - len(msg))
        self.msg = msg
        print("\r" + msg + padding, end="")
        sys.stdout.flush()

    def close(self):
        """Terminate the status line so later output starts on a new line."""
        print("\r")
        sys.stdout.flush()
class Article:
    """One BBS article, downloaded from its page and parsed with regexes.

    After ``parse`` the instance exposes:

    * ``content`` -- everything inside the page's ``<pre>`` element
    * ``lines``   -- ``content`` split on newlines
    * ``head``    -- the first three lines joined back together
    * ``author``, ``board`` -- extracted from the first line
    * ``title``   -- second line without its first 6 characters
    * ``date``    -- third line without its first 5 characters
    * ``article`` -- the remaining lines (the body)
    * ``reflines``  -- set of quoted lines, without the ``": "`` marker
    * ``mainlines`` -- set of body lines; quoted ones are stored as
      ``": " + text`` so that a quote of a quote can still be matched
    """

    # Quoted lines are rendered in grey and start with ": ".
    _QUOTE_RE = re.compile(r'<font color="808080">: (.*)$')

    def __init__(self, url):
        """Download *url*, decode it as GBK (ignoring bad bytes) and parse it."""
        self.url = url
        # Close the HTTP response as soon as the body has been read.
        with urlopen(url) as response:
            self.html = response.read().decode("gbk", "ignore")
        self.parse()

    def parse(self):
        """Fill in the article attributes from ``self.html``.

        Raises:
            IndexError: if the page lacks the expected ``<pre>`` block, author
                link or board field, or has fewer than three header lines.
        """
        # Greedy on purpose: take everything up to the last </pre>.
        self.content = re.findall(r"<pre>(.*)</pre>", self.html, re.M | re.S)[0]
        self.lines = self.content.split("\n")
        self.head = "\n".join(self.lines[0:3])
        # The slices skip the field label at the start of each header line
        # (assumed to be fixed width on the BBS page).
        self.title = self.lines[1][6:]
        self.author = re.findall(
            r'<a href="bbsqry\?userid=(\w+)">\1</a>', self.lines[0])[0]
        self.board = re.findall(r"\), 信区: (\w+)$", self.lines[0])[0]
        self.date = self.lines[2][5:]
        self.article = self.lines[3:]
        self.mainlines = set()
        self.reflines = set()
        for line in self.article:
            quoted = self._QUOTE_RE.search(line)
            if quoted:
                text = quoted.group(1)
                self.reflines.add(text)
                # Keep the quote marker so replies quoting this quote match.
                self.mainlines.add(": " + text)
            else:
                self.mainlines.add(line)

    def __str__(self):
        """Return ``author<TAB>date<TAB>title`` followed by a newline."""
        return "%s\t%s\t%s\n" % (self.author, self.date, self.title)

    def __repr__(self):
        """Same text as ``str(self)``."""
        return str(self)

    def getThread(self):
        """Download every article listed on this article's thread page.

        The thread page is located through the "同主题列表" link on this
        article's page.  Progress is reported on stdout while downloading.

        Returns:
            list[Article]: one ``Article`` per row of the thread listing, in
            listing order.

        Raises:
            IndexError: if the thread link, the listing table or a row's
                article link cannot be found.
        """
        msg = LineMsg("Building Threads")
        self.threadPageUrl = re.findall(
            r"\[<a href='bbstfind0\?(.+?)'>同主题列表</a>\]",
            self.html, re.M | re.S)[0]
        with urlopen(URLTHREAD + self.threadPageUrl) as response:
            self.threadPage = response.read().decode("gbk", "ignore")
        threadListHtml = re.findall(
            r"<table.*?>(.*?)</table>", self.threadPage, re.M | re.S)[0]
        # The chunk before the first <tr> is table preamble, not a row.
        threadList = threadListHtml.split("<tr>")[1:]
        threadUrlList = [
            URLARTICLE + re.findall(r"<a href=bbscon\?(.+?)>", sub)[0]
            for sub in threadList
        ]
        total = len(threadUrlList)
        result = []
        for count, url in enumerate(threadUrlList, 1):
            msg.replace("Building Threads:" + ("." * count)
                        + ("%d%%" % (100 * count // total)))
            result.append(Article(url))
        msg.close()
        return result
def calcRef(threads):
    """Set ``refer`` on every article to the article it most likely answers.

    An article's quoted lines (``reflines``) are intersected with the body
    lines (``mainlines``) of every *other* article; the one sharing the most
    lines wins, and on a tie the earliest candidate in *threads* is kept.
    Articles that share no line with any other get ``refer = 0``, the
    sentinel the tree builder compares against to find root articles.

    Args:
        threads: iterable of objects with ``reflines`` and ``mainlines`` sets.
    """
    for art in threads:
        best_score = 0
        best_match = 0
        for other in threads:
            # Nested quotes can make an article's quoted lines overlap its
            # own body lines; it must never be chosen as its own parent.
            if other is art:
                continue
            score = len(art.reflines.intersection(other.mainlines))
            if score > best_score:
                best_score = score
                best_match = other
        art.refer = best_match
def genRefTreeRoot(level, art, threads, out):
    """Write *art* and, recursively, all articles referring to it as table rows.

    Each row holds the date, the author and a link to the article whose
    tooltip is the article body with HTML tags stripped.  Replies are written
    directly after their parent, one level deeper.

    Args:
        level: nesting depth, rendered as that many leading non-breaking spaces.
        art: article to write; needs ``date``, ``author``, ``url``, ``title``
            and ``article`` (list of body lines).
        threads: all articles of the thread; those whose ``refer`` is *art*
            are written as its children.
        out: writable text stream receiving the HTML.
    """
    # Ordinary spaces collapse in HTML, which would flatten the tree; use
    # non-breaking spaces so the depth stays visible in the browser.
    indent = "&nbsp;" * level + "└─"
    # Strip tags from the body, then escape it for use inside an attribute.
    tooltip = html.escape(re.sub("<.*?>", "", "\n".join(art.article)))
    print(
        '<tr><td>%s</td><td>%s</td><td>%s<a href="%s" title="%s">%s</a></td></tr>'
        % (art.date, art.author, indent, art.url, tooltip, art.title),
        end="",
        file=out,
    )
    for child in threads:
        if child.refer is art:
            genRefTreeRoot(level + 1, child, threads, out)
def genRefTree(threads, out):
    """Write the whole reply tree of *threads* to *out* as an HTML table.

    Every article whose ``refer`` is the root sentinel ``0`` starts a subtree
    at depth 1; its replies are emitted recursively by ``genRefTreeRoot``.

    Args:
        threads: articles that already carry a ``refer`` attribute.
        out: writable text stream receiving the HTML document.
    """
    print("<html><body><table>", file=out)
    for art in threads:
        if art.refer == 0:
            genRefTreeRoot(1, art, threads, out)
    # Every row is already closed by the row writer, so only the table and
    # the document need closing here (no dangling </tr>).
    print("</table></body></html>", file=out)
if __name__ == "__main__":
    # Usage: treeyssy <article url> <output.html>
    # Both arguments are required, because sys.argv[2] is read below.
    if len(sys.argv) < 3:
        print("treeyssy url output.html")
        sys.exit(1)
    url = sys.argv[1]
    # Download and analyse first, so that a failed fetch does not leave a
    # truncated output file behind.
    threads = Article(url).getThread()
    calcRef(threads)
    # The context manager guarantees the report is flushed and closed.
    with open(sys.argv[2], "w") as out:
        genRefTree(threads, out)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment