Scrape post content from cc98 threads and save it as PDF files. The list of threads is read from tempurl2.txt. Libraries used: urllib2 and BeautifulSoup.
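For reference, render() below splits each line of tempurl2.txt on whitespace: every token except the last becomes a folder name under pdffile/, and the last token is the thread's page-one URL (which must contain a '?'). A hypothetical example line (the folder name and thread URL are placeholders, not real values):

    行者无疆 http://www.cc98.org/dispbbs.asp?boardID=147&ID=1234567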
#-*- coding:utf-8 -*-
import urllib2
import time
import os
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3
from xhtml2pdf.default import DEFAULT_FONT
from xhtml2pdf.document import pisaDocument
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
# Strip [bracket] tags from a line of post text, keeping the text between them.
# Known problem: nested brackets such as [a[b]c] are not handled.
def deleteBracket(data):
    result = ''
    while '[' in data:
        if data.find('[') == 0:
            if not ']' in data:
                break
            data = data[data.find(']')+1:]
        else:
            result += data[:data.find('[')]
            data = data[data.find('['):]
    result += data
    return result
# Insert line breaks so long lines do not overflow the PDF page width.
def wrap_long_line(text, max_len=55):
    if len(text) <= max_len:
        return text
    parts = text.split("\n")
    parts_out = []
    for x in parts:
        parts_out.append(_wrap_long_line(x, max_len))
    return "\n".join(parts_out)

def _wrap_long_line(text, max_len):
    out_text = ""
    # number of max_len-sized chunks, rounded up
    times = len(text)*1.0 / max_len
    if times > int(times):
        times = int(times) + 1
    else:
        times = int(times)
    i = 0
    index = 0
    while i < times:
        s = text[index:index+max_len]
        out_text += s
        # never break a chunk that may contain an HTML tag
        if not ('<' in s or '>' in s):
            out_text += '\n'
        index += max_len
        i += 1
    return out_text
# Turn [upload...]path[...] image markup into <img> tags and strip the
# remaining bracket markup from the surrounding text.
def extractImagePath(lineData):
    result = ""
    while lineData.find('[upload') != -1:
        result += wrap_long_line(deleteBracket(lineData[:lineData.find('[upload')]))
        lineData = lineData[lineData.find('[upload'):]
        lineData = lineData[lineData.find(']')+1:]
        imagepath = lineData[:lineData.find('[')]
        lineData = lineData[lineData.find(']')+1:]
        result += '<br/><img src="' + imagepath + '"/><br/>'
    result += wrap_long_line(deleteBracket(lineData))
    return result
# Collect HTML for every post on one page that was written by the thread starter.
def grep88(soup, louzu):
    usersTag = soup('td', style='filter:glow(color=#9898BA,strength=2)')
    print "len(usersTag):%d"%len(usersTag)
    tdTag = soup('td', width="100%", style="font-size:9pt;line-height:12pt")
    timesTag = soup('td', valign='middle', align='center', width='175')
    print "len(tdTag):%d"%len(tdTag)
    _htmlCell = ""
    for i in range(len(tdTag)):
        cell = tdTag[i]  # was `str`, which shadowed the builtin
        username = usersTag[i].contents[1].contents[0].contents[0].string
        if username == louzu:  # only keep the thread starter's posts
            posttime = timesTag[i].contents[2]
            data = cell.contents[5]
            _htmlCell += '''<div class="time">''' + posttime[4:-3] + u'''发表:</div>'''  # '发表:' = 'posted:'
            if cell.contents[3].string:
                _htmlCell += '<div class="bar">' + cell.contents[3].string + '</div><br/>'
            for j in range(len(data)):
                if data.contents[j].string:
                    lineData = data.contents[j].string.replace('&nbsp;', ' ').strip()  # normalize non-breaking spaces
                    #print "lineData:"+lineData
                    if lineData and lineData != '\n':
                        if lineData[:8] == '[quotex]':  # start of a quoted block
                            lineData = lineData[11:-4].replace('[i]', ' ').replace('[/i]', ' ')
                            #lineData = deleteBracket(lineData)
                            _htmlCell += '<div class="quote"><span class="inq"><div class="tip"><span class="fade">' + extractImagePath(lineData) + '<br/>'
                        elif lineData[:9] == '[/quotex]':  # end of a quoted block
                            _htmlCell = _htmlCell[:-5] + '</span></div></span></div>'  # drop the trailing <br/>
                        else:
                            lineData = extractImagePath(lineData)
                            if lineData:
                                _htmlCell += lineData + '<br/>'
    return _htmlCell
# Walk every page of a thread and collect the thread starter's posts.
def helper(pageOneUrl, soup):
    spanTag = soup('span', id="topicPagesNavigation")
    usersTag = soup('td', style='filter:glow(color=#9898BA,strength=2)')
    titleTag = soup('title')
    louzu = usersTag[0].contents[1].contents[0].contents[0].string  # thread starter's username
    title = titleTag[0].string[:-15]
    postsNumber = int(spanTag[0].contents[1].string)
    # 10 posts per page, so round the page count up
    if postsNumber % 10 == 0:
        pageNumber = postsNumber / 10
    else:
        pageNumber = postsNumber / 10 + 1
    print 'pageNumber:%d'%pageNumber
    _htmlPosts = ""
    _htmlPosts += '''<div class="highlighted"><a href="''' + pageOneUrl + '''">Title: ''' + title + ' by: ' + louzu + '''</a></div>'''
    if pageNumber > 1:
        # the pager link's href gives the per-page URL; chop the page number off its end
        baseUrl = pageOneUrl[:pageOneUrl.find('?')] + soup('a', title=u'可以使用Alt+PageUp或Alt+PageDown快捷翻页')[0]['href'][:-1]
        print baseUrl
        for i in range(pageNumber):
            url = "%s%d"%(baseUrl, i+1)
            print url
            page = urllib2.urlopen(url)
            soup = BeautifulSoup(page)
            _htmlPosts += grep88(soup, louzu)
            #time.sleep(0.1)
    else:
        _htmlPosts += grep88(soup, louzu)
        time.sleep(0.1)
    return _htmlPosts
# Read thread URLs from urlfile and write one PDF per thread under pdffile/.
def render(urlfile):
    urlFileHandle = open(urlfile, 'r')
    css = open('static/css/pdf.css').read()
    for line in urlFileHandle.readlines():
        # all tokens except the last are folder names; the last is the URL
        filepath = '/'.join(line.split()[:-1])
        if filepath:
            filepath += '/'
        print filepath
        url = line.split()[-1]
        print url
        #url = line[:-1] + "%d"%1
        page = urllib2.urlopen(url)
        soup = BeautifulSoup(page)
        titleTag = soup('title')
        title = titleTag[0].string[:-15].replace('/', '')
        #print ('pdffile/%s/%s.pdf'%(filepath,title)).encode('utf-8')
        #print '%s'%filepath.decode('utf-8')
        try:
            result = open('pdffile/%s%s.pdf'%(filepath.decode('utf-8'), title), 'wb')
        except IOError, e:
            print e
            os.makedirs('pdffile/%s'%filepath)
            result = open('pdffile/%s%s.pdf'%(filepath.decode('utf-8'), title), 'wb')
        #htmlfile = open('%s.html'%title, 'wb')
        # page header: the board motto ("行者无疆 思者无域") and an attribution
        # notice crediting the maker and the cc98 行者无疆 board
        _html = u"""<html><body>
<div id="Top">
行者无疆 思者无域
<br/>
</div>
<div class="box">
<div class="bar">声明:本文档由<a href="http://weibo.com/shiboss">猴哥</a>制作,版权归
<a href="http://www.cc98.org/list.asp?boardid=147">cc98行者无疆</a>及楼主所有</div></div>
<br/><br/>
<div class="box">
"""
        _htmlPosts = helper(url, soup)
        _html += '''<div class="cell">''' + _htmlPosts + '''</div>'''
        _html += '''<br/><br/><br/><br/><br/>'''
        _html += """</div></body></html>"""
        print "render this post: success!"
        print "save pdf file...."
        _pdf = pisaDocument(_html, result, default_css=css, capacity=50*1024)
        #htmlfile.write(_html.encode('utf-8'))
        result.close()
        #htmlfile.close()
        if not _pdf.err:
            print "Correct!!!"
        else:
            print _pdf.err
if __name__ == '__main__':
    # register a CJK-capable font so Chinese text survives PDF rendering
    pdfmetrics.registerFont(TTFont('zhfont', 'static/font/code2000.ttf'))
    DEFAULT_FONT["helvetica"] = "zhfont"
    urlfile = 'tempurl2.txt'
    render(urlfile)
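To run it (a sketch of the setup implied by the code above): Python 2 with BeautifulSoup 3, xhtml2pdf, and reportlab installed, plus static/css/pdf.css, static/font/code2000.ttf, and tempurl2.txt alongside the script. Assuming the script is saved as grab98.py (a placeholder name):

    python grab98.py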