Created
February 4, 2015 17:45
-
-
Save liyonghelpme/48bc1332ace8b5002bcd to your computer and use it in GitHub Desktop.
将oschina导出的html转化为 rss文件
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding:utf8 | |
# Convert the HTML exported from oschina into RSS format.
import codecs
import htmlentitydefs
import HTMLParser
import re
import sys
import types
import urllib
import urlparse
# Outer RSS 2.0 document. The single %s receives the rendered <channel>.
# The XML declaration must stay at the very start of the string.
rssheader = '''<?xml version="1.0" encoding="utf-8" ?>
<?xml-stylesheet type="text/xsl" title="XSL Formatting" href="/rss.xsl" media="all" ?>
<rss version="2.0">
%s
</rss>
'''

# Channel wrapper. The single %s receives all rendered <item> elements,
# already joined into one string.
# NOTE(review): the channel <pubDate> is a fixed timestamp and is not in the
# RFC 822 form the RSS 2.0 specification asks for -- confirm whether feed
# readers consuming this file care.
channel = '''
<channel>
<title>liyonghelpme</title>
<description>liyonghelpme</description>
<language>zh-cn</language>
<ttl>5</ttl>
<copyright><![CDATA[Copyright © liyonghelpme]]></copyright>
<pubDate>2015/2/4 23:13:49</pubDate>
%s
</channel>
'''

# One feed entry. Placeholders, in order: title, publication date, content.
# Title and content are placed inside CDATA sections; the date is inserted
# as plain element text.
item = '''
<item>
<title><![CDATA[%s]]></title>
<author>liyonghelpme</author>
<pubDate>%s</pubDate>
<description>
<![CDATA[
%s
]]>
</description>
</item>
'''
# Debug output file. Opening it in 'w' mode creates/truncates rss.xml as soon
# as this line runs; every write to it in this script is commented out, and
# the handle is closed once parsing is done.
saveRss = open('rss.xml', 'w')

# Captures the text between <div class='content'> and the first following
# </div>. DOTALL lets the capture span several lines.
pat = re.compile(r"<div class='content'>(.*?)</div>", re.MULTILINE | re.DOTALL)
class HTML2RSS(HTMLParser.HTMLParser):
    """Collect blog entries from an HTML page into ``self.outrss``.

    The parser is a small state machine driven by the tags it sees:

    ``<div class=blog>`` -> ``<a name=...blog...>`` (title text) ->
    ``<div class=date>`` (date text) -> ``<div class=content>`` (raw HTML).

    Each completed entry is appended to ``outrss`` as
    ``[title, time, inner_html]`` where ``inner_html`` is the raw source
    between the content ``<div>`` start tag and its matching ``</div>``,
    nested ``<div>`` elements included.

    Character offsets are derived from ``getpos()`` against the text given
    to ``feed()``, so the whole document must be passed in a single
    ``feed()`` call.
    """

    # Values taken by ``self.state``.
    _IDLE = 0          # outside any blog entry
    _IN_BLOG = 1       # inside <div class=blog>, waiting for the title anchor
    _IN_TITLE = 2      # inside the title anchor, reading its text
    _WAIT_DATE = 3     # title read, waiting for <div class=date>
    _IN_DATE = 4       # inside the date div, reading its text
    _WAIT_CONTENT = 5  # date read, waiting for <div class=content>
    _IN_CONTENT = 6    # inside the content div, tracking nested divs

    # Matches the closing </div> that ends the captured content element.
    _CLOSE_DIV = re.compile(r'</\s*div\s*>\s*\Z', re.IGNORECASE)

    def __init__(self, out=None, baseurl=''):
        """Create an idle parser.

        ``out`` and ``baseurl`` are accepted for call compatibility; neither
        is used.
        """
        HTMLParser.HTMLParser.__init__(self)
        self.outrss = []        # finished entries: [title, time, inner_html]
        self.htmlData = ''      # text handed to the last feed() call
        self.state = self._IDLE
        self.title = ''
        self.time = 0
        self.content = ''       # raw source of the current content element
        self.divNest = 0        # open <div> depth inside the content element
        self.conStartPos = -1   # offset of the content start tag's '<'
        self.conEndPos = -1     # offset just past the matching '</div>'
        self._lineStarts = [0]  # offset of the first character of each line
        self._conStartTag = ''  # source text of the content start tag

    def feed(self, data):
        """Remember ``data`` for slicing, index its lines, then parse it."""
        self.htmlData = data
        # getpos() reports (line, column); record where every line begins so
        # that pair can be converted back to an offset into ``data``.
        starts = [0]
        newline = data.find('\n')
        while newline != -1:
            starts.append(newline + 1)
            newline = data.find('\n', newline + 1)
        self._lineStarts = starts
        HTMLParser.HTMLParser.feed(self, data)

    def _currentOffset(self):
        """Return the offset in ``htmlData`` of the construct being handled."""
        lineno, column = self.getpos()
        starts = self._lineStarts
        # Clamp so an unexpected line number cannot raise IndexError.
        index = min(max(lineno - 1, 0), len(starts) - 1)
        return min(starts[index] + column, len(self.htmlData))

    def handle_starttag(self, tag, attrs, startPos=-1):
        """Advance the state machine on a start tag.

        ``startPos`` may carry the offset of the tag's ``<`` in the fed text.
        The stock ``HTMLParser`` never passes it, so when it is negative the
        offset is computed from ``getpos()``.
        """
        print('start tag %s %s' % (tag, attrs))
        if tag == 'div' and len(attrs) == 1:
            name, value = attrs[0]
            if name == 'class':
                if value == 'blog' and self.state == self._IDLE:
                    self.state = self._IN_BLOG
                elif value == 'date' and self.state == self._WAIT_DATE:
                    self.state = self._IN_DATE
                elif value == 'content' and self.state == self._WAIT_CONTENT:
                    self.state = self._IN_CONTENT
                    self._conStartTag = self.get_starttag_text() or ''
                    self.content = self._conStartTag
                    if startPos < 0:
                        startPos = self._currentOffset()
                    self.conStartPos = startPos
                    self.divNest = 0
        # Counts the content div itself as depth 1, nested divs above that.
        if tag == 'div' and self.state == self._IN_CONTENT:
            self.divNest += 1
        if tag == 'a' and len(attrs) == 2 and self.state == self._IN_BLOG:
            name, value = attrs[0]
            # ``value`` is None for an attribute written without a value.
            if name == 'name' and value and 'blog' in value:
                self.state = self._IN_TITLE

    def handle_endtag(self, tag, endPos=-1):
        """Advance the state machine on an end tag.

        ``endPos`` may carry the offset just past the end tag's ``>``. When
        it is negative the offset is located from ``getpos()``.
        """
        print('end tag %s' % (tag,))
        # Any end tag finishes the title, and likewise the date.
        if self.state == self._IN_TITLE:
            self.state = self._WAIT_DATE
        if self.state == self._IN_DATE:
            self.state = self._WAIT_CONTENT
        if self.state == self._IN_CONTENT and tag == 'div':
            self.divNest -= 1
            if self.divNest == 0:
                if endPos < 0:
                    closing = self.htmlData.find('>', self._currentOffset())
                    if closing == -1:
                        endPos = len(self.htmlData)
                    else:
                        endPos = closing + 1
                self.conEndPos = endPos
                self._finishEntry()

    def _finishEntry(self):
        """Slice the content element out of the source and store the entry."""
        print(self.conStartPos)
        print(self.conEndPos)
        self.content = self.htmlData[self.conStartPos:self.conEndPos]
        inner = self.content
        # Drop the element's own start tag and final </div>, keeping
        # everything in between, nested divs included.
        if self._conStartTag and inner.startswith(self._conStartTag):
            inner = inner[len(self._conStartTag):]
        closing = self._CLOSE_DIV.search(inner)
        if closing:
            inner = inner[:closing.start()]
        print('read blog ')
        self.state = self._IDLE
        print(self.content)
        self.outrss.append([self.title, self.time, inner])

    def handle_data(self, data):
        """Record text as the title or the date, depending on the state."""
        print('some data %s' % (data,))
        if self.state == self._IN_TITLE:
            self.title = data
        elif self.state == self._IN_DATE:
            self.time = data
# ---- Script body: parse the exported page, then write the RSS feed ----

# Control characters that XML 1.0 does not allow in a document. Tab, LF and
# CR are legal and are left in place so text and <pre> blocks keep their
# line structure.
_XML_ILLEGAL_CHARS = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')


def _cdata_safe(text):
    """Return ``text`` with each ``]]>`` split across two CDATA sections.

    A literal ``]]>`` would otherwise terminate the surrounding CDATA
    section early and leave the rest of the text as malformed XML.
    """
    return text.replace(']]>', ']]]]><![CDATA[>')


parser = HTML2RSS()
with open('blogs_20150204.html') as source:
    parser.feed(source.read())

# Nothing is written to this file in this script; close the handle that was
# opened near the top of the file.
saveRss.close()

items = []
for r in parser.outrss:
    # r is [title, time, content].
    # Drop the date label, then turn 'YYYY-MM-DD' into 'YYYY/M/D': '-0'
    # becomes '/' first so leading zeros disappear, then the remaining '-'.
    r[1] = r[1].replace('时间:', '').replace('-0', '/').replace('-', '/')
    print('r2')
    print(r[2])
    r[2] = _XML_ILLEGAL_CHARS.sub('', r[2])
    print(r[2])
    items.append(item % (_cdata_safe(r[0]), r[1], _cdata_safe(r[2])))

it = ''.join(items)
ch = channel % it
full = rssheader % (ch)
with open('out.xml', 'w') as feed_file:
    feed_file.write(full)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment