Scrape post content from cc98 threads and save it as PDF files. The list of threads is read from tempurl2.txt. Libraries used: urllib2 and BeautifulSoup.
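For reference, render() below splits each line of tempurl2.txt on whitespace: every token except the last becomes a folder name under pdffile/, and the last token is the thread's page-one URL (which must contain a '?'). A hypothetical example line (the folder name and thread URL are placeholders, not real values):

    行者无疆 http://www.cc98.org/dispbbs.asp?boardID=147&ID=1234567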
#-*- coding:utf-8 -*-
import urllib2
import time
import os
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3
from xhtml2pdf.default import DEFAULT_FONT
from xhtml2pdf.document import pisaDocument
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
# Strip [bracket] tags from a line of post text, keeping the text between them.
# Known problem: nested brackets such as [a[b]c] are not handled.
def deleteBracket(data):
    result = ''
    while '[' in data:
        if data.find('[') == 0:
            if not ']' in data:
                break
            data = data[data.find(']')+1:]
        else:
            result += data[:data.find('[')]
            data = data[data.find('['):]
    result += data
    return result
# Insert line breaks so long lines do not overflow the PDF page width.
def wrap_long_line(text, max_len=55):
    if len(text) <= max_len:
        return text
    parts = text.split("\n")
    parts_out = []
    for x in parts:
        parts_out.append(_wrap_long_line(x, max_len))
    return "\n".join(parts_out)

def _wrap_long_line(text, max_len):
    out_text = ""
    # number of max_len-sized chunks, rounded up
    times = len(text)*1.0 / max_len
    if times > int(times):
        times = int(times) + 1
    else:
        times = int(times)
    i = 0
    index = 0
    while i < times:
        s = text[index:index+max_len]
        out_text += s
        # never break a chunk that may contain an HTML tag
        if not ('<' in s or '>' in s):
            out_text += '\n'
        index += max_len
        i += 1
    return out_text
# Turn [upload...]path[...] image markup into <img> tags and strip the
# remaining bracket markup from the surrounding text.
def extractImagePath(lineData):
    result = ""
    while lineData.find('[upload') != -1:
        result += wrap_long_line(deleteBracket(lineData[:lineData.find('[upload')]))
        lineData = lineData[lineData.find('[upload'):]
        lineData = lineData[lineData.find(']')+1:]
        imagepath = lineData[:lineData.find('[')]
        lineData = lineData[lineData.find(']')+1:]
        result += '<br/><img src="' + imagepath + '"/><br/>'
    result += wrap_long_line(deleteBracket(lineData))
    return result
# Collect HTML for every post on one page that was written by the thread starter.
def grep88(soup, louzu):
    usersTag = soup('td', style='filter:glow(color=#9898BA,strength=2)')
    print "len(usersTag):%d"%len(usersTag)
    tdTag = soup('td', width="100%", style="font-size:9pt;line-height:12pt")
    timesTag = soup('td', valign='middle', align='center', width='175')
    print "len(tdTag):%d"%len(tdTag)
    _htmlCell = ""
    for i in range(len(tdTag)):
        cell = tdTag[i]  # was `str`, which shadowed the builtin
        username = usersTag[i].contents[1].contents[0].contents[0].string
        if username == louzu:  # only keep the thread starter's posts
            posttime = timesTag[i].contents[2]
            data = cell.contents[5]
            _htmlCell += '''<div class="time">''' + posttime[4:-3] + u'''发表:</div>'''  # '发表:' = 'posted:'
            if cell.contents[3].string:
                _htmlCell += '<div class="bar">' + cell.contents[3].string + '</div><br/>'
            for j in range(len(data)):
                if data.contents[j].string:
                    lineData = data.contents[j].string.replace('&nbsp;', ' ').strip()  # normalize non-breaking spaces
                    #print "lineData:"+lineData
                    if lineData and lineData != '\n':
                        if lineData[:8] == '[quotex]':  # start of a quoted block
                            lineData = lineData[11:-4].replace('[i]', ' ').replace('[/i]', ' ')
                            #lineData = deleteBracket(lineData)
                            _htmlCell += '<div class="quote"><span class="inq"><div class="tip"><span class="fade">' + extractImagePath(lineData) + '<br/>'
                        elif lineData[:9] == '[/quotex]':  # end of a quoted block
                            _htmlCell = _htmlCell[:-5] + '</span></div></span></div>'  # drop the trailing <br/>
                        else:
                            lineData = extractImagePath(lineData)
                            if lineData:
                                _htmlCell += lineData + '<br/>'
    return _htmlCell
# Walk every page of a thread and collect the thread starter's posts.
def helper(pageOneUrl, soup):
    spanTag = soup('span', id="topicPagesNavigation")
    usersTag = soup('td', style='filter:glow(color=#9898BA,strength=2)')
    titleTag = soup('title')
    louzu = usersTag[0].contents[1].contents[0].contents[0].string  # thread starter's username
    title = titleTag[0].string[:-15]
    postsNumber = int(spanTag[0].contents[1].string)
    # 10 posts per page, so round the page count up
    if postsNumber % 10 == 0:
        pageNumber = postsNumber / 10
    else:
        pageNumber = postsNumber / 10 + 1
    print 'pageNumber:%d'%pageNumber
    _htmlPosts = ""
    _htmlPosts += '''<div class="highlighted"><a href="''' + pageOneUrl + '''">Title: ''' + title + ' by: ' + louzu + '''</a></div>'''
    if pageNumber > 1:
        # the pager link's href gives the per-page URL; chop the page number off its end
        baseUrl = pageOneUrl[:pageOneUrl.find('?')] + soup('a', title=u'可以使用Alt+PageUp或Alt+PageDown快捷翻页')[0]['href'][:-1]
        print baseUrl
        for i in range(pageNumber):
            url = "%s%d"%(baseUrl, i+1)
            print url
            page = urllib2.urlopen(url)
            soup = BeautifulSoup(page)
            _htmlPosts += grep88(soup, louzu)
            #time.sleep(0.1)
    else:
        _htmlPosts += grep88(soup, louzu)
        time.sleep(0.1)
    return _htmlPosts
# Read thread URLs from urlfile and write one PDF per thread under pdffile/.
def render(urlfile):
    urlFileHandle = open(urlfile, 'r')
    css = open('static/css/pdf.css').read()
    for line in urlFileHandle.readlines():
        # all tokens except the last are folder names; the last is the URL
        filepath = '/'.join(line.split()[:-1])
        if filepath:
            filepath += '/'
        print filepath
        url = line.split()[-1]
        print url
        #url = line[:-1] + "%d"%1
        page = urllib2.urlopen(url)
        soup = BeautifulSoup(page)
        titleTag = soup('title')
        title = titleTag[0].string[:-15].replace('/', '')
        #print ('pdffile/%s/%s.pdf'%(filepath,title)).encode('utf-8')
        #print '%s'%filepath.decode('utf-8')
        try:
            result = open('pdffile/%s%s.pdf'%(filepath.decode('utf-8'), title), 'wb')
        except IOError, e:
            print e
            os.makedirs('pdffile/%s'%filepath)
            result = open('pdffile/%s%s.pdf'%(filepath.decode('utf-8'), title), 'wb')
        #htmlfile = open('%s.html'%title, 'wb')
        # page header: the board motto ("行者无疆 思者无域") and an attribution
        # notice crediting the maker and the cc98 行者无疆 board
        _html = u"""<html><body>
<div id="Top">
行者无疆 思者无域
<br/>
</div>
<div class="box">
<div class="bar">声明:本文档由<a href="http://weibo.com/shiboss">猴哥</a>制作,版权归
<a href="http://www.cc98.org/list.asp?boardid=147">cc98行者无疆</a>及楼主所有</div></div>
<br/><br/>
<div class="box">
"""
        _htmlPosts = helper(url, soup)
        _html += '''<div class="cell">''' + _htmlPosts + '''</div>'''
        _html += '''<br/><br/><br/><br/><br/>'''
        _html += """</div></body></html>"""
        print "render this post: success!"
        print "save pdf file...."
        _pdf = pisaDocument(_html, result, default_css=css, capacity=50*1024)
        #htmlfile.write(_html.encode('utf-8'))
        result.close()
        #htmlfile.close()
        if not _pdf.err:
            print "Correct!!!"
        else:
            print _pdf.err
if __name__ == '__main__':
    # register a CJK-capable font so Chinese text survives PDF rendering
    pdfmetrics.registerFont(TTFont('zhfont', 'static/font/code2000.ttf'))
    DEFAULT_FONT["helvetica"] = "zhfont"
    urlfile = 'tempurl2.txt'
    render(urlfile)
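To run it (a sketch of the setup implied by the code above): Python 2 with BeautifulSoup 3, xhtml2pdf, and reportlab installed, plus static/css/pdf.css, static/font/code2000.ttf, and tempurl2.txt alongside the script. Assuming the script is saved as grab98.py (a placeholder name):

    python grab98.py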