Skip to content

Instantly share code, notes, and snippets.

@farseerfc
Created August 4, 2011 15:50
Show Gist options
  • Save farseerfc/1125490 to your computer and use it in GitHub Desktop.
Save farseerfc/1125490 to your computer and use it in GitHub Desktop.
treeyssy.py
#!/usr/bin/env python3
from urllib.request import urlopen
# Python 2 variant (interpreter: /usr/bin/env python2) of the import above:
# from urllib2 import urlopen
import re,sys,os,html
# Root address of the BBS web front-end; every request URL is built on it.
URLBASE = "http://bbs.sjtu.edu.cn/"
# Endpoint prefixes ending in "?": callers append the query string directly.
# URLTHREAD is used for the thread-listing page, URLARTICLE for one article.
URLTHREAD = "{}bbstfind0?".format(URLBASE)
URLARTICLE = "{}bbscon?".format(URLBASE)
class LineMsg:
    """A status message on standard output that can be rewritten in place.

    The text is printed without a trailing newline, so a later ``replace``
    can send a carriage return and print the new text from the start of the
    same line. The most recently shown text is kept in ``self.msg``.
    """

    def __init__(self, msg=""):
        """Show ``msg`` right away, leaving the cursor on the same line."""
        self.msg = msg
        self._emit(self.msg)

    def replace(self, msg):
        """Remember ``msg`` and print it after a carriage return."""
        self.msg = msg
        self._emit("\r" + self.msg)

    def close(self):
        """End the status line by printing a carriage return and a newline."""
        self._emit("\r", end="\n")

    @staticmethod
    def _emit(text, end=""):
        # The text usually carries no newline, so flush explicitly to make
        # it reach the terminal now rather than whenever the buffer fills.
        print(text, end=end, flush=True)
class Article:
    """One BBS article, downloaded from its page URL and parsed.

    Attributes set by ``__init__`` and ``parse``:
        url       -- address the article was fetched from
        html      -- page source decoded as GBK (undecodable bytes dropped)
        content   -- text between the page's ``<pre>`` and ``</pre>``
        lines     -- ``content`` split on newlines
        head      -- the first three lines of ``lines``, joined by newlines
        author    -- user id taken from the ``bbsqry`` link on line 0
        board     -- word following "信区: " at the end of line 0
        title     -- line 1 with its first 6 characters removed
        date      -- line 2 with its first 5 characters removed
        article   -- every line after the three header lines
        reflines  -- set of the texts of quoted (grey ``<font>``) lines
        mainlines -- set of the non-quoted lines, plus every quoted text
                     prefixed with ": "

    ``getThread`` additionally sets ``threadPageUrl`` and ``threadPage``.
    """

    # Raw strings keep the regex escapes (\?, \w, \1, ...) out of Python's
    # own string-escape processing, which warns about unknown escapes.
    _CONTENT_RE = re.compile(r"<pre>(.*)</pre>", re.M | re.S)
    _AUTHOR_RE = re.compile(r'<a href="bbsqry\?userid=(\w+)">\1</a>')
    _BOARD_RE = re.compile(r"\), 信区: (\w+)$")
    _QUOTE_RE = re.compile(r'<font color="808080">: (.*)$')
    _THREAD_LINK_RE = re.compile(
        r"\[<a href='bbstfind0\?(.+?)'>同主题列表</a>\]", re.M | re.S)
    _TABLE_RE = re.compile(r"<table.*?>(.*?)</table>", re.M | re.S)
    _ARTICLE_LINK_RE = re.compile(r"<a href=bbscon\?(.+?)>")

    def __init__(self, url):
        """Download the page at ``url`` and parse it immediately.

        Raises whatever ``urlopen`` raises on network failure, and
        ``IndexError`` if the page lacks the expected markup (see ``parse``).
        """
        self.url = url
        self.html = self._fetch(url)
        self.parse()

    @staticmethod
    def _fetch(url):
        """Return the body of ``url`` decoded as GBK, ignoring bad bytes."""
        # The with-block closes the HTTP response even when read() fails.
        with urlopen(url) as response:
            return response.read().decode("gbk", "ignore")

    def parse(self):
        """Fill the parsed attributes from ``self.html``.

        Raises ``IndexError`` when the ``<pre>`` block, the author link or
        the board name cannot be found, or when fewer than three header
        lines are present.
        """
        self.content = self._CONTENT_RE.findall(self.html)[0]
        self.lines = self.content.split("\n")
        self.head = "\n".join(self.lines[0:3])
        self.title = self.lines[1][6:]
        self.author = self._AUTHOR_RE.findall(self.lines[0])[0]
        self.board = self._BOARD_RE.findall(self.lines[0])[0]
        self.date = self.lines[2][5:]
        self.article = self.lines[3:]
        self.mainlines = set()
        self.reflines = set()
        for line in self.article:
            quoted = self._QUOTE_RE.findall(line)
            if quoted:
                self.reflines.add(quoted[0])
                # Stored with the ": " prefix so that an article quoting
                # this quote (one level deeper) can still match it.
                self.mainlines.add(": " + quoted[0])
            else:
                self.mainlines.add(line)

    def __str__(self):
        """Return "author<TAB>date<TAB>title" followed by a newline."""
        return "%s\t%s\t%s\n" % (self.author, self.date, self.title)

    def __repr__(self):
        return str(self)

    def getThread(self):
        """Download every article listed on this article's thread page.

        The thread page address is taken from the "同主题列表" link in
        ``self.html``. Progress is shown on standard output while the
        articles are fetched one by one.

        Returns a list of ``Article`` objects in the order their links
        appear on the thread page. Raises ``IndexError`` if the thread link,
        the listing table or a row's article link is missing.
        """
        msg = LineMsg("Building Threads")
        self.threadPageUrl = self._THREAD_LINK_RE.findall(self.html)[0]
        self.threadPage = self._fetch(URLTHREAD + self.threadPageUrl)
        threadListHtml = self._TABLE_RE.findall(self.threadPage)[0]
        # The chunk before the first "<tr>" holds no row, so it is dropped.
        rows = threadListHtml.split("<tr>")[1:]
        threadUrlList = [
            URLARTICLE + self._ARTICLE_LINK_RE.findall(row)[0] for row in rows
        ]
        total = len(threadUrlList)
        result = []
        for count, url in enumerate(threadUrlList, 1):
            msg.replace(
                "Building Threads:" + "." * count
                + "%d%%" % (100 * count // total))
            result.append(Article(url))
        msg.close()
        return result
def calcRef(threads):
    """Guess, for every article, which other article it is a reply to.

    For each ``art`` in ``threads`` the score of a candidate parent is the
    number of ``art.reflines`` (texts ``art`` quotes) that also occur in the
    candidate's ``mainlines``. The first candidate with the highest non-zero
    score is stored in ``art.refer``; when nothing scores above zero
    ``art.refer`` is set to ``0``, which marks the article as a root.

    ``threads`` is a sequence of objects with ``reflines`` and ``mainlines``
    sets. Nothing is returned; the articles are updated in place.
    """
    for art in threads:
        best_score = 0
        best_parent = 0  # 0 is the "no parent" marker
        for other in threads:
            if other is art:
                # An article cannot be a reply to itself. Without this guard
                # an article whose own text repeats one of its quoted lines
                # could become its own parent and never be treated as a root.
                continue
            score = len(art.reflines.intersection(other.mainlines))
            if score > best_score:
                best_score = score
                best_parent = other
        art.refer = best_parent
def genRefTreeRoot(level, art, threads, out):
    """Write the table row for ``art``, then the rows of its replies.

    The row has three cells: date, author, and a link to ``art.url`` whose
    text is the title. The link is preceded by an indent that grows with
    ``level``, and its ``title`` attribute holds the article body with tags
    removed and HTML-escaped. Every article in ``threads`` whose ``refer``
    equals ``art`` is then written recursively one level deeper.

    ``out`` is the writable text stream receiving the markup.
    """
    indent = "  " * level + "└─"
    body_text = re.sub("<.*?>", "", "\n".join(art.article))
    tooltip = html.escape(body_text)
    row = '<tr><td>%s</td><td>%s</td><td>%s<a href="%s" title="%s">%s</a></td></tr>' % (
        art.date, art.author, indent, art.url, tooltip, art.title)
    # No newline after a row: rows are written back to back.
    print(row, end="", file=out)
    replies = (candidate for candidate in threads if candidate.refer == art)
    for reply in replies:
        genRefTreeRoot(level + 1, reply, threads, out)
def genRefTree(threads, out):
    """Write the whole reply tree as an HTML table to ``out``.

    Every article in ``threads`` whose ``refer`` equals ``0`` (no parent)
    is written as a root through ``genRefTreeRoot``, which in turn writes
    its replies. ``out`` is a writable text stream.
    """
    print("<html><body><table>", file=out)
    for art in threads:
        if art.refer == 0:
            genRefTreeRoot(1, art, threads, out)
    # Each row closes its own <tr>, so the footer must not emit another
    # </tr>; doing so would leave an unmatched closing tag in the document.
    print("</table></body></html>", file=out)
def main(argv=None):
    """Build the reply tree of a thread and save it as an HTML file.

    ``argv`` is the argument list in ``sys.argv`` form (program name, article
    URL, output path); it defaults to ``sys.argv``. Returns the process exit
    status: ``1`` after printing the usage line when an argument is missing,
    ``0`` on success.
    """
    if argv is None:
        argv = sys.argv
    # Both the URL and the output path are required, i.e. three entries
    # including the program name.
    if len(argv) < 3:
        print("treeyssy url output.html")
        return 1
    url = argv[1]
    out_path = argv[2]
    threads = Article(url).getThread()
    calcRef(threads)
    # Open the file only once there is something to write, and let the
    # with-block close it. Characters the locale encoding cannot represent
    # are written as numeric HTML references instead of raising.
    with open(out_path, "w", errors="xmlcharrefreplace") as out:
        genRefTree(threads, out)
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment