@shihongzhi · Created May 20, 2012
Scrapes post threads from cc98 and saves each one as a PDF. The list of posts to fetch is read from tempurl2.txt. Uses the urllib2 and BeautifulSoup libraries (Python 2; xhtml2pdf and ReportLab are also required).
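The format of tempurl2.txt is not documented in the gist; inferred from render(), each line holds optional output-subdirectory names followed by the post URL, separated by whitespace (the URL below is only a placeholder):

    travel http://www.cc98.org/dispbbs.asp?boardID=147&ID=1234567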
#-*- coding:utf-8 -*-
import urllib2
import time
import os
from BeautifulSoup import BeautifulSoup
from xhtml2pdf.default import DEFAULT_FONT
from xhtml2pdf.document import pisaDocument
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
# Strip [...] forum markup from a line of text. The original author flagged
# this function as having a problem: unbalanced brackets can leak through
# (e.g. the trailing ']' of '[[b]]' survives). See the sketch below.
def deleteBracket(data):
    result = ''
    while '[' in data:
        if data.find('[') == 0:
            # Line starts with a tag: drop everything up to the first ']'.
            if not ']' in data:
                break
            data = data[data.find(']')+1:]
        else:
            # Keep the text before the next '[' and continue from the tag.
            result += data[:data.find('[')]
            data = data[data.find('['):]
    result += data
    return result
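# A possible fix (my sketch, not part of the original gist): repeatedly strip
# innermost [...] tags with a regular expression until nothing changes, which
# also handles the nested/unbalanced cases noted above.
import re

def deleteBracketRe(data):
    prev = None
    while prev != data:
        prev = data
        data = re.sub(r'\[[^\[\]]*\]', '', data)  # drop innermost [...] tags
    return data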
def wrap_long_line(text, max_len=55):
    if len(text) <= max_len:
        return text
    parts = text.split("\n")
    parts_out = []
    for x in parts:
        parts_out.append(_wrap_long_line(x, max_len))
    return "\n".join(parts_out)
def _wrap_long_line(text, max_len):
    out_text = ""
    # Number of max_len-sized chunks, rounded up.
    times = len(text)*1.0 / max_len
    if times > int(times):
        times = int(times) + 1
    else:
        times = int(times)
    i = 0
    index = 0
    while i < times:
        s = text[index:index+max_len]
        out_text += s
        # Skip the line break when the chunk holds an HTML tag fragment,
        # so tags are never split across lines.
        if not ('<' in s or '>' in s):
            out_text += '\n'
        index += max_len
        i += 1
    return out_text
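# Example (for illustration): _wrap_long_line('a'*120, 55) returns three
# chunks of 55, 55 and 10 characters, each followed by '\n'; a chunk that
# contained '<' or '>' would get no break.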
def extractImagePath(lineData):
    # Turn [upload...]path[/upload] markup into <img> tags and strip any
    # remaining bracket markup from the surrounding text.
    result = ""
    while lineData.find('[upload') != -1:
        result += wrap_long_line(deleteBracket(lineData[:lineData.find('[upload')]))
        lineData = lineData[lineData.find('[upload'):]
        lineData = lineData[lineData.find(']')+1:]  # drop the opening [upload...] tag
        imagepath = lineData[:lineData.find('[')]
        lineData = lineData[lineData.find(']')+1:]  # drop the closing [/upload] tag
        result += '<br/><img src="' + imagepath + '"/><br/>'
    result += wrap_long_line(deleteBracket(lineData))
    return result
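# Example (the exact [upload...] markup format is my assumption, inferred
# from the parsing above):
#   extractImagePath('see [upload=jpg]uploadfile/pic.jpg[/upload] here')
#   -> 'see <br/><img src="uploadfile/pic.jpg"/><br/> here'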
def grep88(soup, louzu):
    # Collect the thread starter's (louzu's) posts from one page of a thread.
    usersTag = soup('td', style='filter:glow(color=#9898BA,strength=2)')
    print "len(usersTag):%d"%len(usersTag)
    tdTag = soup('td', width="100%",style="font-size:9pt;line-height:12pt")
    timesTag = soup('td', valign='middle', align='center', width='175')
    print "len(tdTag):%d"%len(tdTag)
    _htmlCell = ""
    for i in range(len(tdTag)):
        cell = tdTag[i]  # renamed from `str`, which shadowed the builtin
        username = usersTag[i].contents[1].contents[0].contents[0].string
        if username == louzu:  # only keep the thread starter's posts
            posttime = timesTag[i].contents[2]
            data = cell.contents[5]
            _htmlCell += '''<div class="time">''' + posttime[4:-3] + u'''发表:</div>'''  # u'发表' means "posted"
            if cell.contents[3].string:
                _htmlCell += '<div class="bar">' + cell.contents[3].string + '</div><br/>'
            for j in range(len(data)):
                if data.contents[j].string:
                    lineData = data.contents[j].string.replace('&nbsp;',' ').strip()
                    #print "lineData:"+lineData
                    if lineData and lineData!='\n':
                        if lineData[:8] == '[quotex]':  # start of a quote block
                            lineData = lineData[11:-4].replace('[i]', ' ').replace('[/i]', ' ')
                            #lineData = deleteBracket(lineData)
                            _htmlCell += '<div class="quote"><span class="inq"><div class="tip"><span class="fade">' + extractImagePath(lineData) + '<br/>'
                        elif lineData[:9] == '[/quotex]':  # end of a quote block
                            _htmlCell = _htmlCell[:-5] + '</span></div></span></div>'  # drop the trailing <br/>
                        else:
                            lineData = extractImagePath(lineData)
                            if lineData:
                                _htmlCell += lineData + '<br/>'
    return _htmlCell
def helper(pageOneUrl, soup):
    # Walk every page of the thread and return the starter's posts as HTML.
    spanTag = soup('span', id="topicPagesNavigation")
    usersTag = soup('td', style='filter:glow(color=#9898BA,strength=2)')
    titleTag = soup('title')
    louzu = usersTag[0].contents[1].contents[0].contents[0].string  # thread starter
    title = titleTag[0].string[:-15]
    postsNumber = int(spanTag[0].contents[1].string)
    if postsNumber%10 == 0:  # 10 posts per page
        pageNumber = postsNumber/10
    else:
        pageNumber = postsNumber/10 + 1
    print 'pageNumber:%d'%pageNumber
    _htmlPosts = ""
    _htmlPosts += '''<div class="highlighted"><a href="''' + pageOneUrl + '''">Title:&nbsp;''' + title + '&nbsp;&nbsp;by:&nbsp;' + louzu + '''</a></div>'''
    if pageNumber > 1:
        # Build the pager base URL from the "next page" link, located by its
        # title attribute ("use Alt+PageUp or Alt+PageDown to flip pages").
        baseUrl = pageOneUrl[:pageOneUrl.find('?')] + soup('a', title=u'可以使用Alt+PageUp或Alt+PageDown快捷翻页')[0]['href'][:-1]
        print baseUrl
        for i in range(pageNumber):
            url = "%s%d"%(baseUrl, i+1)
            print url
            page = urllib2.urlopen(url)
            soup = BeautifulSoup(page)
            _htmlPosts += grep88(soup, louzu)
            #time.sleep(0.1)
    else:
        _htmlPosts += grep88(soup, louzu)
        time.sleep(0.1)
    return _htmlPosts
def render(urlfile):
    urlFileHandle = open(urlfile, 'r')
    css = open('static/css/pdf.css').read()
    for line in urlFileHandle.readlines():
        filepath = '/'.join(line.split()[:-1])  # optional output subdirectory
        if filepath:
            filepath += '/'
        print filepath
        url = line.split()[-1]  # the post URL is the last token on the line
        print url
        #url = line[:-1] + "%d"%1
        page = urllib2.urlopen(url)
        soup = BeautifulSoup(page)
        titleTag = soup('title')
        title = titleTag[0].string[:-15].replace('/', '')
        #print ('pdffile/%s/%s.pdf'%(filepath,title)).encode('utf-8')
        #print '%s'%filepath.decode('utf-8')
        try:
            result = open('pdffile/%s%s.pdf'%(filepath.decode('utf-8'),title), 'wb')
        except IOError, e:
            # The output directory does not exist yet: create it and retry.
            print e
            os.makedirs('pdffile/%s'%filepath)
            result = open('pdffile/%s%s.pdf'%(filepath.decode('utf-8'),title), 'wb')  # filepath already ends with '/'
        #htmlfile = open('%s.html'%title, 'wb')
        # The Chinese strings in the template below are content for the
        # generated PDF (the board motto "行者无疆 思者无域" and a notice
        # crediting the maker and the cc98 board); they are left untranslated.
        _html = u"""<html><body>
        <div id="Top">
        行者无疆 &nbsp; &nbsp; 思者无域 &nbsp; &nbsp; &nbsp; &nbsp;
        <br/>
        </div>
        <div class="box">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; &nbsp; &nbsp; &nbsp;&nbsp; &nbsp; &nbsp; &nbsp;&nbsp; &nbsp; &nbsp; &nbsp;&nbsp; &nbsp;
        &nbsp; &nbsp;&nbsp; &nbsp; &nbsp; &nbsp;&nbsp; &nbsp; &nbsp; &nbsp;&nbsp; &nbsp; &nbsp; &nbsp;&nbsp; &nbsp; &nbsp; &nbsp;&nbsp; &nbsp;
        &nbsp;&nbsp; &nbsp;&nbsp; &nbsp; &nbsp; &nbsp;&nbsp; &nbsp; &nbsp; &nbsp;&nbsp; &nbsp; &nbsp; &nbsp;&nbsp; &nbsp; &nbsp; &nbsp;&nbsp; &nbsp; &nbsp; &nbsp;
        <div class="bar">声明:本文档由<a href="http://weibo.com/shiboss">猴哥</a>制作,版权归
        <a href="http://www.cc98.org/list.asp?boardid=147">cc98行者无疆</a>及楼主所有</div></div>
        <br/><br/>
        <div class="box">
        """
        _htmlPosts = helper(url, soup)
        _html += '''<div class="cell">''' + _htmlPosts + '''</div>'''
        _html += '''<br/><br/><br/><br/><br/>'''
        _html += """</div></body></html>"""
        print "rendered this post successfully!"
        print "saving pdf file..."
        _pdf = pisaDocument(_html, result, default_css=css, capacity=50*1024)
        #htmlfile.write(_html.encode('utf-8'))
        result.close()
        #htmlfile.close()
        if not _pdf.err:
            print "Correct!!!"
        else:
            print _pdf.err
if __name__ == '__main__':
    # Register a CJK-capable TrueType font and map it over Helvetica so
    # xhtml2pdf can render the Chinese text.
    pdfmetrics.registerFont(TTFont('zhfont', 'static/font/code2000.ttf'))
    DEFAULT_FONT["helvetica"] = "zhfont"
    urlfile = 'tempurl2.txt'
    render(urlfile)
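# Usage sketch (my assumptions, based on the paths used above): place
# tempurl2.txt next to this script along with static/css/pdf.css and
# static/font/code2000.ttf, then run the script with Python 2; the pdffile/
# output directory is created on demand by the IOError handler in render().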