farseerfc · August 4, 2011 15:50
diff --git a/gistfile1.py b/gistfile1.py
 #!/usr/bin/env python3
 from urllib.request import urlopen
 #!/usr/bin/env python2
 # from urllib2 import urlopen

 import re,sys,os,html

 # Root of the bbs.sjtu.edu.cn web interface; the endpoint prefixes below are built from it.
 URLBASE="http://bbs.sjtu.edu.cn/"
 # Prefix of the thread-listing page; the query string taken from an article page is appended.
 URLTHREAD=URLBASE+"bbstfind0?"
 # Prefix of a single-article page; the query string taken from the thread listing is appended.
 URLARTICLE=URLBASE+"bbscon?"

 class LineMsg:
    """Status message on stdout that can be rewritten in place.

    The most recent text is kept in ``self.msg``.  Every write is followed
    by an explicit flush of ``sys.stdout``.
    """

    def __init__(self, msg=""):
        """Remember *msg* and print it without a trailing newline."""
        self.msg = msg
        self._show(self.msg, "")

    def replace(self, msg):
        """Remember *msg* and print it after a carriage return, no newline."""
        self.msg = msg
        self._show("\r" + self.msg, "")

    def close(self):
        """Print a carriage return followed by the default line ending."""
        self._show("\r", "\n")

    @staticmethod
    def _show(text, end):
        """Print *text* terminated by *end*, then flush stdout."""
        print(text, end=end)
        sys.stdout.flush()

 class Article:
    """A single BBS article, downloaded from ``url`` and parsed on construction.

    ``parse`` fills in ``content`` (the text between ``<pre>`` and
    ``</pre>``), ``lines``, ``head``, ``title``, ``author``, ``board``,
    ``date``, ``article`` (every line after the first three) and the
    ``mainlines`` / ``reflines`` sets that ``calcRef`` compares.
    """

    # Raw-string patterns: the previous non-raw literals relied on invalid
    # escape sequences such as "\/" and "\w", which newer Pythons warn about.
    _PRE_RE = re.compile(r"<pre>(.*)</pre>", re.M | re.S)
    _AUTHOR_RE = re.compile(r'<a href="bbsqry\?userid=(\w+)">\1</a>')
    _BOARD_RE = re.compile(r"\), 信区: (\w+)$")
    _QUOTE_RE = re.compile(r'<font color="808080">: (.*)$')
    # The link text below means "same-topic list".
    _THREAD_LINK_RE = re.compile(
        r"\[<a href='bbstfind0\?(.+?)'>同主题列表</a>\]", re.M | re.S)
    _TABLE_RE = re.compile(r"<table.*?>(.*?)</table>", re.M | re.S)
    _ARTICLE_LINK_RE = re.compile(r"<a href=bbscon\?(.+?)>")

    def __init__(self, url):
        """Download *url*, decode it as GBK and parse the page."""
        self.url = url
        self.html = self._fetch(url)
        self.parse()

    @staticmethod
    def _fetch(url):
        """Return the page at *url* decoded as GBK, dropping undecodable bytes."""
        # The context manager closes the HTTP response once it has been read.
        with urlopen(url) as response:
            return response.read().decode("gbk", "ignore")

    def parse(self):
        """Extract header fields and body lines from ``self.html``.

        Raises:
            IndexError: if the page has no ``<pre>`` block, fewer than
                three lines, or a first line without the author link or
                board marker.
        """
        self.content = self._PRE_RE.findall(self.html)[0]
        self.lines = self.content.split("\n")
        self.head = "\n".join(self.lines[0:3])
        # NOTE(review): the fixed offsets 6 and 5 presumably skip the field
        # labels at the start of the title and date lines -- confirm.
        self.title = self.lines[1][6:]
        self.author = self._AUTHOR_RE.findall(self.lines[0])[0]
        self.board = self._BOARD_RE.findall(self.lines[0])[0]
        self.date = self.lines[2][5:]

        self.article = self.lines[3:]
        self.mainlines = set()
        self.reflines = set()
        for line in self.article:
            quoted = self._QUOTE_RE.findall(line)
            if quoted:
                # A grey ": text" line: remember the bare text as something
                # this article quotes, and the ": "-prefixed form as one of
                # its own lines.
                self.reflines.add(quoted[0])
                self.mainlines.add(": " + quoted[0])
            else:
                self.mainlines.add(line)

    def __str__(self):
        """Return ``author<TAB>date<TAB>title`` followed by a newline."""
        return "%s\t%s\t%s\n" % (self.author, self.date, self.title)

    def __repr__(self):
        """Same text as ``str(self)``."""
        return str(self)

    def getThread(self):
        """Download every article listed on this article's same-topic page.

        Stores the listing's query string in ``self.threadPageUrl`` and the
        listing HTML in ``self.threadPage``.  Progress is shown on stdout.

        Returns:
            list of ``Article``, in the order the links appear in the listing.

        Raises:
            IndexError: if the same-topic link, the listing table or an
                article link inside a row cannot be found.
        """
        msg = LineMsg("Building Threads")
        self.threadPageUrl = self._THREAD_LINK_RE.findall(self.html)[0]
        self.threadPage = self._fetch(URLTHREAD + self.threadPageUrl)
        threadListHtml = self._TABLE_RE.findall(self.threadPage)[0]
        # Everything before the first "<tr>" is not a row, hence the [1:].
        threadList = threadListHtml.split("<tr>")[1:]
        threadUrlList = [URLARTICLE + self._ARTICLE_LINK_RE.findall(sub)[0]
                         for sub in threadList]
        total = len(threadUrlList)
        result = []
        for count, url in enumerate(threadUrlList, 1):
            msg.replace("Building Threads:" + ("." * count)
                        + ("%d%%" % (100 * count // total)))
            result.append(Article(url))
        msg.close()
        return result

 def calcRef(threads):
    """Set ``art.refer`` on every article to the article it best matches.

    The score of a candidate is the number of entries shared between the
    article's ``reflines`` and the candidate's ``mainlines``.  The
    candidate with the highest non-zero score wins; on equal scores the
    earliest one in *threads* is kept.  ``art.refer`` is ``0`` when no
    candidate shares any line.

    Args:
        threads: iterable-and-reiterable collection of objects exposing
            ``reflines`` and ``mainlines`` sets.
    """
    for art in threads:
        max_score = 0
        max_refer = 0
        for other in threads:
            if other is art:
                # Nested quotes can put the same ": text" entry into both
                # sets of one article; without this guard the article would
                # be linked to itself and vanish from the generated tree.
                continue
            score = len(art.reflines.intersection(other.mainlines))
            if score > max_score:
                max_score = score
                max_refer = other
        art.refer = max_refer

 def genRefTreeRoot(level, art, threads, out):
    """Write the table row for *art* to *out*, then the rows of its replies.

    The link text is ``art.title``, prefixed with an indent that grows with
    *level*.  The link's ``title`` attribute carries the article body with
    tags removed and HTML-escaped.  Every entry of *threads* whose
    ``refer`` equals *art* is then written recursively one level deeper.
    """
    prefix = "　　" * level + "└─"
    plain_body = re.sub("<.*?>", "", "\n".join(art.article))
    tooltip = html.escape(plain_body)
    fields = (art.date, art.author, prefix, art.url, tooltip, art.title)
    row = "<tr><td>%s</td><td>%s</td><td>%s<a href=\"%s\" title=\"%s\">%s</a></td></tr>" % fields
    print(row, end="", file=out)
    # Lazy generator keeps the original interleaving of test and recursion.
    for reply in (cand for cand in threads if cand.refer == art):
        genRefTreeRoot(level + 1, reply, threads, out)
    
 def genRefTree(threads, out):
    """Write the reply tree of *threads* to *out* as an HTML table.

    Every article whose ``refer`` is ``0`` (no referenced article) starts
    a tree at level 1; its replies are written by ``genRefTreeRoot``.

    Args:
        threads: articles that already carry a ``refer`` attribute.
        out: writable text file object.
    """
    print("<html><body><table>", file=out)
    for art in threads:
        if art.refer == 0:
            genRefTreeRoot(1, art, threads, out)
    # Each row is already closed by genRefTreeRoot, so the footer must not
    # emit another "</tr>" (the previous footer produced unbalanced markup).
    print("</table></body></html>", file=out)
        
 if __name__ == "__main__":
    # Usage: treeyssy url output.html
    # Both arguments are read below, so require three argv entries; the
    # previous "<2" check let a missing output path fail with IndexError.
    if len(sys.argv) < 3:
        print("treeyssy url output.html")
        sys.exit(1)
    url = sys.argv[1]
    threads = Article(url).getThread()
    calcRef(threads)
    # Open the output only once the data is ready, and close it reliably.
    with open(sys.argv[2], "w") as out:
        genRefTree(threads, out)
	#!/usr/bin/env python3
	from urllib.request import urlopen
	#!/usr/bin/env python2
	# from urllib2 import urlopen

	import re,sys,os,html

	# Root of the bbs.sjtu.edu.cn web interface; the endpoint prefixes below are built from it.
	URLBASE="http://bbs.sjtu.edu.cn/"
	# Prefix of the thread-listing page; a query string is appended to it.
	URLTHREAD=URLBASE+"bbstfind0?"
	# Prefix of a single-article page; a query string is appended to it.
	URLARTICLE=URLBASE+"bbscon?"

	class LineMsg:
	    """Status message on stdout that can be rewritten in place.

	    The most recent text is kept in ``self.msg``.  Every write is
	    followed by an explicit flush of ``sys.stdout``.
	    """

	    def __init__(self, msg=""):
	        """Remember *msg* and print it without a trailing newline."""
	        self.msg = msg
	        print(self.msg, end="")
	        sys.stdout.flush()

	    def replace(self, msg):
	        """Remember *msg* and print it after a carriage return, no newline."""
	        self.msg = msg
	        print("\r" + self.msg, end="")
	        sys.stdout.flush()

	    def close(self):
	        """Print a carriage return followed by a newline."""
	        print("\r")
	        sys.stdout.flush()

	class Article:
	    """A single BBS article, downloaded from ``url`` and parsed on construction.

	    ``parse`` fills in ``content`` (the text between ``<pre>`` and
	    ``</pre>``), ``lines``, ``head``, ``title``, ``author``, ``board``,
	    ``date``, ``article`` (every line after the first three) and the
	    ``mainlines`` / ``reflines`` sets that ``calcRef`` compares.
	    """

	    # Raw-string patterns avoid invalid escape sequences such as "\/".
	    # The "*" quantifiers and "|" flag operators that were lost from this
	    # copy of the script are restored here.
	    _PRE_RE = re.compile(r"<pre>(.*)</pre>", re.M | re.S)
	    _AUTHOR_RE = re.compile(r'<a href="bbsqry\?userid=(\w+)">\1</a>')
	    _BOARD_RE = re.compile(r"\), 信区: (\w+)$")
	    _QUOTE_RE = re.compile(r'<font color="808080">: (.*)$')
	    # The link text below means "same-topic list".
	    _THREAD_LINK_RE = re.compile(
	        r"\[<a href='bbstfind0\?(.+?)'>同主题列表</a>\]", re.M | re.S)
	    _TABLE_RE = re.compile(r"<table.*?>(.*?)</table>", re.M | re.S)
	    _ARTICLE_LINK_RE = re.compile(r"<a href=bbscon\?(.+?)>")

	    def __init__(self, url):
	        """Download *url*, decode it as GBK and parse the page."""
	        self.url = url
	        self.html = self._fetch(url)
	        self.parse()

	    @staticmethod
	    def _fetch(url):
	        """Return the page at *url* decoded as GBK, dropping undecodable bytes."""
	        # The context manager closes the HTTP response once it has been read.
	        with urlopen(url) as response:
	            return response.read().decode("gbk", "ignore")

	    def parse(self):
	        """Extract header fields and body lines from ``self.html``.

	        Raises:
	            IndexError: if the page has no ``<pre>`` block, fewer than
	                three lines, or a first line without the author link or
	                board marker.
	        """
	        self.content = self._PRE_RE.findall(self.html)[0]
	        self.lines = self.content.split("\n")
	        self.head = "\n".join(self.lines[0:3])
	        # NOTE(review): the fixed offsets 6 and 5 presumably skip the field
	        # labels at the start of the title and date lines -- confirm.
	        self.title = self.lines[1][6:]
	        self.author = self._AUTHOR_RE.findall(self.lines[0])[0]
	        self.board = self._BOARD_RE.findall(self.lines[0])[0]
	        self.date = self.lines[2][5:]

	        self.article = self.lines[3:]
	        self.mainlines = set()
	        self.reflines = set()
	        for line in self.article:
	            quoted = self._QUOTE_RE.findall(line)
	            if quoted:
	                # A grey ": text" line: remember the bare text as something
	                # this article quotes, and the ": "-prefixed form as one
	                # of its own lines.
	                self.reflines.add(quoted[0])
	                self.mainlines.add(": " + quoted[0])
	            else:
	                self.mainlines.add(line)

	    def __str__(self):
	        """Return ``author<TAB>date<TAB>title`` followed by a newline."""
	        return "%s\t%s\t%s\n" % (self.author, self.date, self.title)

	    def __repr__(self):
	        """Same text as ``str(self)``."""
	        return str(self)

	    def getThread(self):
	        """Download every article listed on this article's same-topic page.

	        Stores the listing's query string in ``self.threadPageUrl`` and
	        the listing HTML in ``self.threadPage``.  Progress is shown on
	        stdout.

	        Returns:
	            list of ``Article``, in the order the links appear in the listing.

	        Raises:
	            IndexError: if the same-topic link, the listing table or an
	                article link inside a row cannot be found.
	        """
	        msg = LineMsg("Building Threads")
	        self.threadPageUrl = self._THREAD_LINK_RE.findall(self.html)[0]
	        self.threadPage = self._fetch(URLTHREAD + self.threadPageUrl)
	        threadListHtml = self._TABLE_RE.findall(self.threadPage)[0]
	        # Everything before the first "<tr>" is not a row, hence the [1:].
	        threadList = threadListHtml.split("<tr>")[1:]
	        threadUrlList = [URLARTICLE + self._ARTICLE_LINK_RE.findall(sub)[0]
	                         for sub in threadList]
	        total = len(threadUrlList)
	        result = []
	        for count, url in enumerate(threadUrlList, 1):
	            msg.replace("Building Threads:" + ("." * count)
	                        + ("%d%%" % (100 * count // total)))
	            result.append(Article(url))
	        msg.close()
	        return result

	def calcRef(threads):
	    """Set ``art.refer`` on every article to the article it best matches.

	    The score of a candidate is the number of entries shared between the
	    article's ``reflines`` and the candidate's ``mainlines``.  The
	    candidate with the highest non-zero score wins; on equal scores the
	    earliest one in *threads* is kept.  ``art.refer`` is ``0`` when no
	    candidate shares any line.
	    """
	    for art in threads:
	        max_score = 0
	        max_refer = 0
	        for other in threads:
	            if other is art:
	                # Nested quotes can put the same ": text" entry into both
	                # sets of one article; skip it so it cannot refer to itself.
	                continue
	            score = len(art.reflines.intersection(other.mainlines))
	            if score > max_score:
	                max_score = score
	                max_refer = other
	        art.refer = max_refer

	def genRefTreeRoot(level, art, threads, out):
	    """Write the table row for *art* to *out*, then the rows of its replies.

	    The link text is ``art.title``, prefixed with an indent that grows
	    with *level*.  The link's ``title`` attribute carries the article
	    body with tags removed and HTML-escaped.  Every entry of *threads*
	    whose ``refer`` equals *art* is then written one level deeper.
	    """
	    print("<tr><td>%s</td><td>%s</td><td>%s<a href=\"%s\" title=\"%s\">%s</a></td></tr>" % (
	        art.date, art.author, "　　" * level + "└─", art.url,
	        html.escape(re.sub("<.*?>", "", "\n".join(art.article))),
	        art.title), end="", file=out)
	    for child in threads:
	        if child.refer == art:
	            genRefTreeRoot(level + 1, child, threads, out)

	def genRefTree(threads, out):
	    """Write the reply tree of *threads* to *out* as an HTML table.

	    Every article whose ``refer`` is ``0`` (no referenced article) starts
	    a tree at level 1; its replies are written by ``genRefTreeRoot``.
	    """
	    print("<html><body><table>", file=out)
	    for art in threads:
	        if art.refer == 0:
	            genRefTreeRoot(1, art, threads, out)
	    # Each row is already closed by genRefTreeRoot, so the footer must
	    # not emit another "</tr>".
	    print("</table></body></html>", file=out)

	if __name__ == "__main__":
	    # Usage: treeyssy url output.html
	    # Both arguments are read below, so require three argv entries.
	    if len(sys.argv) < 3:
	        print("treeyssy url output.html")
	        sys.exit(1)
	    url = sys.argv[1]
	    threads = Article(url).getThread()
	    calcRef(threads)
	    # Open the output only once the data is ready, and close it reliably.
	    with open(sys.argv[2], "w") as out:
	        genRefTree(threads, out)