pelletier · January 1, 2011 17:41
diff --git a/wp2rst.py b/wp2rst.py
 # -*- coding: UTF-8 -*-

 import sys
 import datetime
 import subprocess
 from xml.dom.minidom import parse, parseString



 def html2rst(html):
    """Convert an HTML fragment to reStructuredText by piping it through pandoc.

    ``html`` is coerced to ``str``, encoded as UTF-8 and fed to the standard
    input of a ``pandoc --from=html --to=rst`` child process.

    Returns whatever pandoc wrote to its standard output, as raw ``bytes``
    (the caller is responsible for decoding it).  The child's exit status is
    not inspected.
    """
    markup = str(html)
    command = ['pandoc', '--from=html', '--to=rst']
    pandoc = subprocess.Popen(
        command,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
    # communicate() returns (stdout, stderr); stderr is not piped, so only
    # the first element carries data.
    converted, _unused_stderr = pandoc.communicate(markup.encode('utf-8'))
    return converted


 def getChildText(node, name):
    """Return the text held directly by the first ``name`` element under ``node``.

    The first element returned by ``node.getElementsByTagName(name)`` is
    selected and the data of its immediate text children is concatenated.
    Both plain text nodes and ``<![CDATA[...]]>`` sections are included;
    text inside nested child elements is not.

    Returns an empty string when the element has no text children.

    Raises:
        IndexError: if no element called ``name`` exists under ``node``.
    """
    element = node.getElementsByTagName(name)[0]
    # minidom reports CDATA sections with their own node type, so checking
    # for TEXT_NODE alone silently drops values wrapped in <![CDATA[...]]>.
    text_types = (element.TEXT_NODE, element.CDATA_SECTION_NODE)
    return ''.join(child.data
                   for child in element.childNodes
                   if child.nodeType in text_types)


 # The path of the XML file to convert must be given as first argument.
 if len(sys.argv) < 2:
    print("Use: wp2rst <wp_xml_file_path>")
    # sys.exit() is always available; the bare exit() helper is only
    # injected by the site module and is missing under ``python -S``.
    sys.exit(1)


 # Load the whole XML document into a DOM tree.
 dom = parse(sys.argv[1])

 # Every <item> element in the document is a candidate for export.
 items = dom.getElementsByTagName('item')

 # Imported once, ahead of the loop, instead of on every iteration.
 import codecs

 # Write one reStructuredText file per published post found in ``items``.
 for item in items:
    # Only items whose post type is 'post' are exported.
    if getChildText(item, 'wp:post_type') != 'post':
        continue

    # Grab the title
    title = getChildText(item, 'title')
    print(title)

    # We do not need drafts
    if getChildText(item, 'wp:status') in ('draft', 'auto-draft'):
        continue

    # Grab the content and convert it to RST.  An empty <content:encoded>
    # element has no child node at all, so fall back to an empty string
    # instead of failing on ``None.data``.
    content_node = item.getElementsByTagName('content:encoded')[0].firstChild
    html_content = content_node.data if content_node is not None else ''
    rst_content = html2rst(html_content)

    # Grab the date and convert it to a correct date / time format.
    # Everything from the first '+' on is dropped; the trailing space in the
    # format matches the space left in front of it.
    # NOTE(review): assumes pubDate always carries a '+hhmm' offset - confirm.
    raw_date = getChildText(item, 'pubDate')
    date = datetime.datetime.strptime(raw_date.split('+')[0], "%a, %d %b %Y %H:%M:%S ")
    print(date)

    # Grab the slug: the second-to-last '/'-separated piece of the link.
    # NOTE(review): presumably links end with a trailing slash - confirm.
    slug = getChildText(item, 'link').split('/')[-2]
    print(slug)

    # Look for tags
    tags = [categ.getAttribute('nicename')
            for categ in item.getElementsByTagName('category')
            if categ.getAttribute('domain') == 'tag' and categ.hasAttribute('nicename')]
    print(tags)

    # Write the result in a file.  The context manager closes the file even
    # when one of the writes raises.  The "utf-8-sig" codec prepends a BOM.
    with codecs.open('data/blog/%s.rst' % slug, 'w+', "utf-8-sig") as file_descriptor:
        file_descriptor.write(title)
        # Underline the title with as many '#' as it has characters
        file_descriptor.write('\n%s\n\n' % (len(title)*"#"))
        file_descriptor.write(':date: %s\n' % date.strftime('%Y-%m-%d %H:%M'))
        file_descriptor.write(':tags: %s\n' % ", ".join(tags))
        file_descriptor.write(':author: Thomas Pelletier\n\n')
        # html2rst() returns bytes; decode before handing them to the text writer
        file_descriptor.write(rst_content.decode('utf-8'))
	# -- coding: UTF-8 --

	import sys
	import datetime
	import subprocess
	from xml.dom.minidom import parse, parseString



	def html2rst(html):
	    """Convert an HTML fragment to reStructuredText by piping it through pandoc.

	    ``html`` is coerced to ``str``, encoded as UTF-8 and fed to the standard
	    input of a ``pandoc --from=html --to=rst`` child process.

	    Returns whatever pandoc wrote to its standard output, as raw ``bytes``
	    (the caller is responsible for decoding it).  The child's exit status
	    is not inspected.
	    """
	    markup = str(html)
	    command = ['pandoc', '--from=html', '--to=rst']
	    pandoc = subprocess.Popen(
	        command,
	        stdin=subprocess.PIPE,
	        stdout=subprocess.PIPE,
	    )
	    # communicate() returns (stdout, stderr); stderr is not piped, so only
	    # the first element carries data.
	    converted, _unused_stderr = pandoc.communicate(markup.encode('utf-8'))
	    return converted


	def getChildText(node, name):
	    """Return the text held directly by the first ``name`` element under ``node``.

	    The first element returned by ``node.getElementsByTagName(name)`` is
	    selected and the data of its immediate text children is concatenated.
	    Both plain text nodes and ``<![CDATA[...]]>`` sections are included;
	    text inside nested child elements is not.

	    Returns an empty string when the element has no text children.

	    Raises:
	        IndexError: if no element called ``name`` exists under ``node``.
	    """
	    element = node.getElementsByTagName(name)[0]
	    # minidom reports CDATA sections with their own node type, so checking
	    # for TEXT_NODE alone silently drops values wrapped in <![CDATA[...]]>.
	    text_types = (element.TEXT_NODE, element.CDATA_SECTION_NODE)
	    return ''.join(child.data
	                   for child in element.childNodes
	                   if child.nodeType in text_types)


	# The path of the XML file to convert must be given as first argument.
	if len(sys.argv) < 2:
	    print("Use: wp2rst <wp_xml_file_path>")
	    # sys.exit() is always available; the bare exit() helper is only
	    # injected by the site module and is missing under ``python -S``.
	    sys.exit(1)


	# Load the whole XML document into a DOM tree.
	dom = parse(sys.argv[1])

	# Every <item> element in the document is a candidate for export.
	items = dom.getElementsByTagName('item')

	# Imported once, ahead of the loop, instead of on every iteration.
	import codecs

	# Write one reStructuredText file per published post found in ``items``.
	for item in items:
	    # Only items whose post type is 'post' are exported.
	    if getChildText(item, 'wp:post_type') != 'post':
	        continue

	    # Grab the title
	    title = getChildText(item, 'title')
	    print(title)

	    # We do not need drafts
	    if getChildText(item, 'wp:status') in ('draft', 'auto-draft'):
	        continue

	    # Grab the content and convert it to RST.  An empty <content:encoded>
	    # element has no child node at all, so fall back to an empty string
	    # instead of failing on ``None.data``.
	    content_node = item.getElementsByTagName('content:encoded')[0].firstChild
	    html_content = content_node.data if content_node is not None else ''
	    rst_content = html2rst(html_content)

	    # Grab the date and convert it to a correct date / time format.
	    # Everything from the first '+' on is dropped; the trailing space in
	    # the format matches the space left in front of it.
	    # NOTE(review): assumes pubDate always carries a '+hhmm' offset - confirm.
	    raw_date = getChildText(item, 'pubDate')
	    date = datetime.datetime.strptime(raw_date.split('+')[0], "%a, %d %b %Y %H:%M:%S ")
	    print(date)

	    # Grab the slug: the second-to-last '/'-separated piece of the link.
	    # NOTE(review): presumably links end with a trailing slash - confirm.
	    slug = getChildText(item, 'link').split('/')[-2]
	    print(slug)

	    # Look for tags
	    tags = [categ.getAttribute('nicename')
	            for categ in item.getElementsByTagName('category')
	            if categ.getAttribute('domain') == 'tag' and categ.hasAttribute('nicename')]
	    print(tags)

	    # Write the result in a file.  The context manager closes the file even
	    # when one of the writes raises.  The "utf-8-sig" codec prepends a BOM.
	    with codecs.open('data/blog/%s.rst' % slug, 'w+', "utf-8-sig") as file_descriptor:
	        file_descriptor.write(title)
	        # Underline the title with as many '#' as it has characters
	        file_descriptor.write('\n%s\n\n' % (len(title)*"#"))
	        file_descriptor.write(':date: %s\n' % date.strftime('%Y-%m-%d %H:%M'))
	        file_descriptor.write(':tags: %s\n' % ", ".join(tags))
	        file_descriptor.write(':author: Thomas Pelletier\n\n')
	        # html2rst() returns bytes; decode before handing them to the text writer
	        file_descriptor.write(rst_content.decode('utf-8'))