liyonghelpme · February 4, 2015 17:45
diff --git a/convertToRss.py b/convertToRss.py
 #coding:utf8
 #将oschina导出的html 转化为rss格式
 import re, sys, codecs, types
 import htmlentitydefs
 import urlparse
 import HTMLParser
 import urllib

 # Outer RSS 2.0 document. The single %s is filled with the rendered
 # <channel> element (see `full = rssheader % (ch)` further down).
 rssheader = '''<?xml version="1.0" encoding="utf-8" ?>
 <?xml-stylesheet type="text/xsl" title="XSL Formatting" href="/rss.xsl" media="all" ?>
 <rss version="2.0">

 %s
 </rss>
 '''

 # Channel wrapper with hard-coded feed metadata. The single %s is filled
 # with the concatenated <item> elements (see `ch = channel % it`).
 # NOTE(review): "&copy;" sits inside a CDATA section, where entity
 # references are not expanded, so readers will likely see it literally.
 # NOTE(review): RSS 2.0 expects RFC 822 dates in <pubDate>; the value below
 # uses a Y/M/D form instead - confirm the consumers accept it.
 channel = '''
 <channel>
    <title>liyonghelpme</title>
    <description>liyonghelpme</description>
    <language>zh-cn</language>
    <ttl>5</ttl>
    <copyright><![CDATA[Copyright &copy; liyonghelpme]]></copyright> 
    <pubDate>2015/2/4 23:13:49</pubDate>

 %s
 </channel>
 '''

 # One feed entry. The three %s placeholders receive, in order, the post
 # title, its date and its HTML body (see `item % (r[0], r[1], r[2])`).
 # Title and body are wrapped in CDATA, so they are emitted without escaping.
 item = '''
 <item>
 <title><![CDATA[%s]]></title>
 <author>liyonghelpme</author>
 <pubDate>%s</pubDate>
 <description>
 <![CDATA[
 %s
 ]]>
 </description>
 </item>
 '''
 
 # Opening in 'w' mode creates/truncates rss.xml immediately.
 # NOTE(review): in the visible code this handle is only closed later and
 # never written to; the feed itself goes to out.xml.
 saveRss = open('rss.xml', 'w')

 # Captures the text between a single-quoted <div class='content'> tag and
 # the first following </div> (non-greedy). DOTALL lets '.' span newlines;
 # MULTILINE only affects ^ and $, which this pattern does not use.
 pat = re.compile("<div class='content'>(.*?)</div>", re.MULTILINE|re.DOTALL)

 class HTML2RSS(HTMLParser.HTMLParser):
     """Collect ``[title, date, content]`` entries from an exported blog page.

     The parser is a small state machine driven by the tag stream
     (``self.state``):

       0  waiting for ``<div class="blog">``
       1  inside a blog entry, waiting for ``<a name="...blog..." ...>``
       2  reading the title text; the next end tag moves on
       3  waiting for ``<div class="date">``
       4  reading the date text; the next end tag moves on
       5  waiting for ``<div class="content">``
       6  inside the content div, counting nested ``<div>`` depth

     A ``<div>`` only counts as a marker when ``class`` is its sole
     attribute; the anchor must have exactly two attributes, the first being
     ``name``. When the content div closes, its inner markup is sliced
     verbatim out of the fed text and ``[title, date, content]`` is appended
     to ``self.outrss``.

     The stock HTMLParser calls the handlers without any position argument,
     so offsets are derived from ``getpos()`` rather than from the optional
     ``startPos`` / ``endPos`` parameters.
     """

     def __init__(self, out=None, baseurl=''):
         """Create an idle parser.

         ``out`` and ``baseurl`` are accepted for call compatibility only;
         neither is used.
         """
         HTMLParser.HTMLParser.__init__(self)
         self.outrss = []        # finished [title, date, content] entries
         self.htmlData = ''      # all text fed since the last reset
         self.state = 0          # see the state table in the class docstring
         self.title = ''
         self.time = 0           # date text of the current entry (0 until seen)
         self.content = ''
         self.divNest = 0        # open <div> depth inside the content div
         self.conStartPos = -1   # offset in htmlData where the content begins
         self.conEndPos = -1     # offset in htmlData where the content ends

     def reset(self):
         """Reset the parser and forget the text fed so far.

         The base class restarts its line/column counters here, so the
         retained text must restart with them to keep offsets aligned.
         """
         HTMLParser.HTMLParser.reset(self)
         self.htmlData = ''

     def feed(self, data):
         """Feed markup to the parser, keeping a copy for content slicing."""
         # getpos() counts across every feed() since the last reset, so the
         # retained text has to cover the same span (append, not overwrite).
         self.htmlData += data
         HTMLParser.HTMLParser.feed(self, data)

     def _absolute_offset(self):
         """Return the index in ``htmlData`` of the construct being parsed.

         ``getpos()`` reports a 1-based line number and a 0-based column;
         lines are delimited by '\\n' only, which is what is walked here.
         """
         lineno, column = self.getpos()
         index = 0
         for _ in range(lineno - 1):
             index = self.htmlData.index('\n', index) + 1
         return index + column

     def handle_starttag(self, tag, attrs, startPos=-1):
         """Advance the state machine on an opening tag.

         ``startPos`` is never supplied by the stock parser. If a caller
         passes a non-negative value it is taken as the offset of the tag's
         '<' in the fed text; otherwise that offset is computed.
         """
         if tag == 'div':
             divClass = None
             if len(attrs) == 1 and attrs[0][0] == 'class':
                 divClass = attrs[0][1]

             if divClass == 'blog' and self.state == 0:
                 self.state = 1      # entered a blog entry
             elif divClass == 'date' and self.state == 3:
                 self.state = 4      # date text follows
             elif divClass == 'content' and self.state == 5:
                 self.state = 6
                 if startPos < 0:
                     startPos = self._absolute_offset()
                 # The content proper starts right after the opening tag.
                 self.conStartPos = startPos + len(self.get_starttag_text())
                 self.content = ''
                 self.divNest = 0

             # Counts the content div itself as well as any nested div.
             if self.state == 6:
                 self.divNest += 1

         elif tag == 'a' and len(attrs) == 2 and self.state == 1:
             attrName, attrValue = attrs[0]
             # attrValue is None for a valueless attribute (<a name ...>).
             if attrName == 'name' and attrValue is not None and 'blog' in attrValue:
                 self.state = 2      # title text follows

     def handle_endtag(self, tag, endPos=-1):
         """Advance the state machine on a closing tag.

         ``endPos`` mirrors ``startPos``: unused by the stock parser; a
         non-negative value is taken as the offset of the closing tag's '<'.
         """
         if self.state == 2:
             self.state = 3          # title done, wait for the date div
         elif self.state == 4:
             self.state = 5          # date done, wait for the content div
         elif self.state == 6 and tag == 'div':
             self.divNest -= 1
             if self.divNest == 0:
                 # This </div> closes the content div: everything between the
                 # opening tag and here is the post body, nested divs included.
                 if endPos < 0:
                     endPos = self._absolute_offset()
                 self.conEndPos = endPos
                 self.content = self.htmlData[self.conStartPos:self.conEndPos]
                 self.outrss.append([self.title, self.time, self.content])
                 self.state = 0

     def handle_data(self, data):
         """Record title or date text, depending on the current state."""
         if self.state == 2:
             self.title = data
         elif self.state == 4:
             self.time = data

            


 def _cdata_safe(text):
     """Return `text` with any ']]>' split so it cannot end a CDATA section."""
     return text.replace(']]>', ']]]]><![CDATA[>')


 # Parse the exported page; every finished entry lands in parser.outrss as
 # [title, date, content].
 parser = HTML2RSS()
 with open('blogs_20150204.html') as f:
     parser.feed(f.read())

 # saveRss is opened earlier in this script but never written to; it is only
 # closed here.
 saveRss.close()

 # Control characters that XML 1.0 forbids. Tab (9), LF (10) and CR (13) are
 # legal and are kept so that line breaks inside the post markup survive.
 a = ''.join(chr(n) for n in range(32) if n not in (9, 10, 13))
 illegalXmlChars = re.compile("[%s]" % a)

 items = []
 for r in parser.outrss:
     # Drop the '时间：' ("Time:") label and turn 'YYYY-0M-0D' into 'YYYY/M/D',
     # the same shape as the channel-level pubDate.
     r[1] = r[1].replace('时间：', '').replace('-0', '/').replace('-', '/')
     r[2] = illegalXmlChars.sub('', r[2])
     # Title and body are emitted inside CDATA, so only ']]>' needs guarding.
     items.append(item % (_cdata_safe(r[0]), r[1], _cdata_safe(r[2])))

 it = ''.join(items)
 ch = channel % it
 full = rssheader % ch

 # The context manager closes out.xml even if the write fails.
 with open('out.xml', 'w') as f:
     f.write(full)
	#coding:utf8
	#将oschina导出的html 转化为rss格式
	import re, sys, codecs, types
	import htmlentitydefs
	import urlparse
	import HTMLParser
	import urllib

	# Outer RSS 2.0 document. The single %s is filled with the rendered
	# <channel> element (see `full = rssheader % (ch)` further down).
	rssheader = '''<?xml version="1.0" encoding="utf-8" ?>
	<?xml-stylesheet type="text/xsl" title="XSL Formatting" href="/rss.xsl" media="all" ?>
	<rss version="2.0">

	%s
	</rss>
	'''

	# Channel wrapper with hard-coded feed metadata. The single %s is filled
	# with the concatenated <item> elements (see `ch = channel % it`).
	# NOTE(review): RSS 2.0 expects RFC 822 dates in <pubDate>; the value below
	# uses a Y/M/D form instead - confirm the consumers accept it.
	channel = '''
	<channel>
	<title>liyonghelpme</title>
	<description>liyonghelpme</description>
	<language>zh-cn</language>
	<ttl>5</ttl>
	<copyright><![CDATA[Copyright © liyonghelpme]]></copyright>
	<pubDate>2015/2/4 23:13:49</pubDate>

	%s
	</channel>
	'''

	# One feed entry. The three %s placeholders receive, in order, the post
	# title, its date and its HTML body (see `item % (r[0], r[1], r[2])`).
	item = '''
	<item>
	<title><![CDATA[%s]]></title>
	<author>liyonghelpme</author>
	<pubDate>%s</pubDate>
	<description>
	<![CDATA[
	%s
	]]>
	</description>
	</item>
	'''

	# Opening in 'w' mode creates/truncates rss.xml immediately.
	# NOTE(review): in the visible code this handle is only closed later and
	# never written to; the feed itself goes to out.xml.
	saveRss = open('rss.xml', 'w')

	# Captures the text between a single-quoted <div class='content'> tag and
	# the first following </div> (non-greedy). DOTALL lets '.' span newlines.
	# The flags are OR-ed with a plain '|'; the stray backslash before it was
	# an escaping artifact and made the line a syntax error.
	pat = re.compile("<div class='content'>(.*?)</div>", re.MULTILINE | re.DOTALL)

	class HTML2RSS(HTMLParser.HTMLParser):
		"""Collect ``[title, date, content]`` entries from an exported blog page.

		The parser is a small state machine driven by the tag stream
		(``self.state``):

		  0  waiting for ``<div class="blog">``
		  1  inside a blog entry, waiting for ``<a name="...blog..." ...>``
		  2  reading the title text; the next end tag moves on
		  3  waiting for ``<div class="date">``
		  4  reading the date text; the next end tag moves on
		  5  waiting for ``<div class="content">``
		  6  inside the content div, counting nested ``<div>`` depth

		A ``<div>`` only counts as a marker when ``class`` is its sole
		attribute; the anchor must have exactly two attributes, the first being
		``name``. When the content div closes, its inner markup is sliced
		verbatim out of the fed text and ``[title, date, content]`` is appended
		to ``self.outrss``.

		The stock HTMLParser calls the handlers without any position argument,
		so offsets are derived from ``getpos()`` rather than from the optional
		``startPos`` / ``endPos`` parameters.
		"""

		def __init__(self, out=None, baseurl=''):
			"""Create an idle parser.

			``out`` and ``baseurl`` are accepted for call compatibility only;
			neither is used.
			"""
			HTMLParser.HTMLParser.__init__(self)
			self.outrss = []        # finished [title, date, content] entries
			self.htmlData = ''      # all text fed since the last reset
			self.state = 0          # see the state table in the class docstring
			self.title = ''
			self.time = 0           # date text of the current entry (0 until seen)
			self.content = ''
			self.divNest = 0        # open <div> depth inside the content div
			self.conStartPos = -1   # offset in htmlData where the content begins
			self.conEndPos = -1     # offset in htmlData where the content ends

		def reset(self):
			"""Reset the parser and forget the text fed so far.

			The base class restarts its line/column counters here, so the
			retained text must restart with them to keep offsets aligned.
			"""
			HTMLParser.HTMLParser.reset(self)
			self.htmlData = ''

		def feed(self, data):
			"""Feed markup to the parser, keeping a copy for content slicing."""
			# getpos() counts across every feed() since the last reset, so the
			# retained text has to cover the same span (append, not overwrite).
			self.htmlData += data
			HTMLParser.HTMLParser.feed(self, data)

		def _absolute_offset(self):
			"""Return the index in ``htmlData`` of the construct being parsed.

			``getpos()`` reports a 1-based line number and a 0-based column;
			lines are delimited by '\\n' only, which is what is walked here.
			"""
			lineno, column = self.getpos()
			index = 0
			for _ in range(lineno - 1):
				index = self.htmlData.index('\n', index) + 1
			return index + column

		def handle_starttag(self, tag, attrs, startPos=-1):
			"""Advance the state machine on an opening tag.

			``startPos`` is never supplied by the stock parser. If a caller
			passes a non-negative value it is taken as the offset of the tag's
			'<' in the fed text; otherwise that offset is computed.
			"""
			if tag == 'div':
				divClass = None
				if len(attrs) == 1 and attrs[0][0] == 'class':
					divClass = attrs[0][1]

				if divClass == 'blog' and self.state == 0:
					self.state = 1      # entered a blog entry
				elif divClass == 'date' and self.state == 3:
					self.state = 4      # date text follows
				elif divClass == 'content' and self.state == 5:
					self.state = 6
					if startPos < 0:
						startPos = self._absolute_offset()
					# The content proper starts right after the opening tag.
					self.conStartPos = startPos + len(self.get_starttag_text())
					self.content = ''
					self.divNest = 0

				# Counts the content div itself as well as any nested div.
				if self.state == 6:
					self.divNest += 1

			elif tag == 'a' and len(attrs) == 2 and self.state == 1:
				attrName, attrValue = attrs[0]
				# attrValue is None for a valueless attribute (<a name ...>).
				if attrName == 'name' and attrValue is not None and 'blog' in attrValue:
					self.state = 2      # title text follows

		def handle_endtag(self, tag, endPos=-1):
			"""Advance the state machine on a closing tag.

			``endPos`` mirrors ``startPos``: unused by the stock parser; a
			non-negative value is taken as the offset of the closing tag's '<'.
			"""
			if self.state == 2:
				self.state = 3          # title done, wait for the date div
			elif self.state == 4:
				self.state = 5          # date done, wait for the content div
			elif self.state == 6 and tag == 'div':
				self.divNest -= 1
				if self.divNest == 0:
					# This </div> closes the content div: everything between the
					# opening tag and here is the post body, nested divs included.
					if endPos < 0:
						endPos = self._absolute_offset()
					self.conEndPos = endPos
					self.content = self.htmlData[self.conStartPos:self.conEndPos]
					self.outrss.append([self.title, self.time, self.content])
					self.state = 0

		def handle_data(self, data):
			"""Record title or date text, depending on the current state."""
			if self.state == 2:
				self.title = data
			elif self.state == 4:
				self.time = data




	def _cdata_safe(text):
		"""Return `text` with any ']]>' split so it cannot end a CDATA section."""
		return text.replace(']]>', ']]]]><![CDATA[>')


	# Parse the exported page; every finished entry lands in parser.outrss as
	# [title, date, content].
	parser = HTML2RSS()
	with open('blogs_20150204.html') as f:
		parser.feed(f.read())

	# saveRss is opened earlier in this script but never written to; it is only
	# closed here.
	saveRss.close()

	# Control characters that XML 1.0 forbids. Tab (9), LF (10) and CR (13) are
	# legal and are kept so that line breaks inside the post markup survive.
	a = ''.join(chr(n) for n in range(32) if n not in (9, 10, 13))
	illegalXmlChars = re.compile("[%s]" % a)

	items = []
	for r in parser.outrss:
		# Drop the '时间：' ("Time:") label and turn 'YYYY-0M-0D' into 'YYYY/M/D',
		# the same shape as the channel-level pubDate.
		r[1] = r[1].replace('时间：', '').replace('-0', '/').replace('-', '/')
		r[2] = illegalXmlChars.sub('', r[2])
		# Title and body are emitted inside CDATA, so only ']]>' needs guarding.
		items.append(item % (_cdata_safe(r[0]), r[1], _cdata_safe(r[2])))

	it = ''.join(items)
	ch = channel % it
	full = rssheader % ch

	# The context manager closes out.xml even if the write fails.
	with open('out.xml', 'w') as f:
		f.write(full)