Skip to content

Instantly share code, notes, and snippets.

@liyonghelpme
Created February 4, 2015 17:45
Show Gist options
  • Save liyonghelpme/48bc1332ace8b5002bcd to your computer and use it in GitHub Desktop.
Save liyonghelpme/48bc1332ace8b5002bcd to your computer and use it in GitHub Desktop.
将oschina导出的html转化为 rss文件
#coding:utf8
#将oschina导出的html 转化为rss格式
import re, sys, codecs, types
import htmlentitydefs
import urlparse
import HTMLParser
import urllib
# Outermost RSS 2.0 envelope.  The single %s slot receives the rendered
# <channel> element.
rssheader = (
    '<?xml version="1.0" encoding="utf-8" ?>\n'
    '<?xml-stylesheet type="text/xsl" title="XSL Formatting" href="/rss.xsl" media="all" ?>\n'
    '<rss version="2.0">\n'
    '%s\n'
    '</rss>\n'
)

# Channel metadata (all values are fixed literals).  The single %s slot
# receives the concatenated <item> elements.
channel = (
    '\n'
    '<channel>\n'
    '<title>liyonghelpme</title>\n'
    '<description>liyonghelpme</description>\n'
    '<language>zh-cn</language>\n'
    '<ttl>5</ttl>\n'
    '<copyright><![CDATA[Copyright &copy; liyonghelpme]]></copyright>\n'
    '<pubDate>2015/2/4 23:13:49</pubDate>\n'
    '%s\n'
    '</channel>\n'
)

# One feed entry.  Slots, in order: title, publication date, body.
# Title and body are wrapped in CDATA sections.
item = (
    '\n'
    '<item>\n'
    '<title><![CDATA[%s]]></title>\n'
    '<author>liyonghelpme</author>\n'
    '<pubDate>%s</pubDate>\n'
    '<description>\n'
    '<![CDATA[\n'
    '%s\n'
    ']]>\n'
    '</description>\n'
    '</item>\n'
)
# Matches one single-quoted <div class='content'> element.  The group is
# lazy, so it stops at the first </div>; DOTALL lets '.' span newlines.
pat = re.compile("<div class='content'>(.*?)</div>", re.DOTALL | re.MULTILINE)
# Opening in 'w' mode creates/truncates rss.xml immediately.  The only
# writes to this handle in the file are commented out; it is closed after
# parsing.
saveRss = open('rss.xml', 'w')
class HTML2RSS(HTMLParser.HTMLParser):
    """Collect ``[title, date, content]`` entries from an exported blog page.

    The parser is a small state machine driven by the tag stream:

    ===== ==========================================================
    state meaning
    ===== ==========================================================
    0     outside a post; waiting for ``<div class="blog">``
    1     inside a post; waiting for ``<a name="...blog..." ...>``
    2     reading the title text (ends at the next end tag)
    3     waiting for ``<div class="date">``
    4     reading the date text (ends at the next end tag)
    5     waiting for ``<div class="content">``
    6     inside the content div; nested divs are counted
    ===== ==========================================================

    When the content div closes, ``[title, date, inner_html]`` is appended
    to ``outrss`` and the machine returns to state 0.  ``inner_html`` is the
    raw markup between the content div's start tag and its matching end
    tag, taken directly from the fed document.

    Offsets are derived from ``getpos()``, so the whole document must be
    passed in a single ``feed()`` call.
    """

    def __init__(self, out=None, baseurl='', debug=True):
        """Create a parser.

        ``out`` and ``baseurl`` are accepted but not used.  ``debug``
        controls the per-token trace written to stdout; it defaults to
        True, which keeps the trace output the class has always produced.
        """
        HTMLParser.HTMLParser.__init__(self)
        self.debug = debug
        self.outrss = []        # finished [title, date, content] entries
        self.htmlData = ''      # the document passed to feed()
        self.state = 0          # see the state table in the class docstring
        self.title = ''
        self.time = ''
        self.content = ''       # outer HTML of the last content div seen
        self.divNest = 0        # open <div> depth while in state 6
        self.conStartPos = -1   # offset of the content div's start tag
        self.conEndPos = -1     # offset just past the content div's end tag
        self._innerStart = -1   # offset just past the content div's start tag
        self._lineStarts = [0]  # offset of the first character of each line

    def _trace(self, *parts):
        """Write a space-separated debug line to stdout when tracing is on."""
        if self.debug:
            # Formatting with %s (rather than the print statement) keeps the
            # output identical while staying valid under the print function.
            sys.stdout.write(' '.join('%s' % (p,) for p in parts) + '\n')

    def _offset(self):
        """Return the absolute offset in ``htmlData`` of the current token."""
        lineno, column = self.getpos()
        return self._lineStarts[lineno - 1] + column

    def _appendText(self, text):
        """Add ``text`` to the title or date, depending on the state."""
        if self.state == 2:
            self.title += text
        elif self.state == 4:
            self.time += text

    def _finishBlog(self):
        """Record the post whose content div has just been closed."""
        endTagStart = self._offset()
        closing = self.htmlData.find('>', endTagStart)
        if closing == -1:
            self.conEndPos = len(self.htmlData)
        else:
            self.conEndPos = closing + 1
        self._trace(self.conStartPos)
        self._trace(self.conEndPos)
        self.content = self.htmlData[self.conStartPos:self.conEndPos]
        # Slice the inner HTML directly: unlike a lazy regex this neither
        # stops at the first nested </div> nor depends on the quote style
        # used for the class attribute.
        body = self.htmlData[self._innerStart:endTagStart]
        self._trace('read blog ')
        self.state = 0
        self._trace(self.content)
        self.outrss.append([self.title, self.time, body])

    def feed(self, data):
        """Parse ``data``, which must be the complete document."""
        self.htmlData = data
        # getpos() reports (line, column); remember where each line starts
        # so that pair can be turned into an index into htmlData.
        self._lineStarts = [0]
        newline = data.find('\n')
        while newline != -1:
            self._lineStarts.append(newline + 1)
            newline = data.find('\n', newline + 1)
        HTMLParser.HTMLParser.feed(self, data)

    def handle_starttag(self, tag, attrs, startPos=-1):
        """Advance the state machine on a start tag.

        ``startPos`` is accepted for callers that pass an offset, but it is
        ignored: the standard parser never supplies it, so the position is
        taken from ``getpos()`` instead.
        """
        self._trace('start tag', tag, attrs)
        if tag == 'div':
            if len(attrs) == 1 and len(attrs[0]) == 2:
                name, value = attrs[0]
                if name == 'class':
                    if value == 'blog' and self.state == 0:
                        self.state = 1
                    elif value == 'date' and self.state == 3:
                        self.state = 4
                        self.time = ''
                    elif value == 'content' and self.state == 5:
                        self.state = 6
                        self.content = self.get_starttag_text()
                        self.conStartPos = self._offset()
                        self._innerStart = self.conStartPos + len(self.content)
                        self.divNest = 0
            # Counts the content div itself as well as any div nested in it.
            if self.state == 6:
                self.divNest += 1
        elif tag == 'a' and len(attrs) == 2 and self.state == 1:
            name, value = attrs[0]
            # value is None for an attribute written without a value.
            if name == 'name' and value is not None and 'blog' in value:
                self.state = 2
                self.title = ''

    def handle_endtag(self, tag, endPos=-1):
        """Advance the state machine on an end tag.

        ``endPos`` is accepted for callers that pass an offset, but it is
        ignored for the same reason as ``startPos`` in ``handle_starttag``.
        """
        self._trace('end tag', tag)
        if self.state == 2:
            self.state = 3
        elif self.state == 4:
            self.state = 5
        elif self.state == 6 and tag == 'div':
            self.divNest -= 1
            if self.divNest == 0:
                self._finishBlog()

    def handle_data(self, data):
        """Accumulate title/date text; other text is only traced."""
        self._trace('some data', data)
        self._appendText(data)

    def handle_entityref(self, name):
        """Keep a named reference such as ``&amp;`` verbatim in title/date."""
        self._appendText('&%s;' % name)

    def handle_charref(self, name):
        """Keep a numeric reference such as ``&#38;`` verbatim in title/date."""
        self._appendText('&#%s;' % name)
# C0 control characters that XML 1.0 forbids.  Tab (\x09), LF (\x0a) and
# CR (\x0d) are legal and are deliberately kept so that line breaks inside
# preformatted blocks survive.
_INVALID_XML_CHARS = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')


def _cdataSafe(text):
    """Return ``text`` with any ``]]>`` split so it cannot end a CDATA section."""
    return text.replace(']]>', ']]]]><![CDATA[>')


def _normalizeDate(raw):
    """Drop the '时间:' label and turn e.g. ``2015-02-04`` into ``2015/2/4``."""
    return raw.replace('时间:', '').replace('-0', '/').replace('-', '/')


parser = HTML2RSS()
with open('blogs_20150204.html') as source:
    parser.feed(source.read())
# Nothing is written to rss.xml; the handle opened earlier is just released.
saveRss.close()

items = []
for title, date, body in parser.outrss:
    # Debug trace of each body before and after cleaning.  sys.stdout.write
    # emits the same bytes as the print statement did.
    sys.stdout.write('r2\n')
    sys.stdout.write(body + '\n')
    body = _INVALID_XML_CHARS.sub('', body)
    sys.stdout.write(body + '\n')
    items.append(item % (_cdataSafe(title), _normalizeDate(date), _cdataSafe(body)))

full = rssheader % (channel % ''.join(items))
with open('out.xml', 'w') as out:
    out.write(full)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment