jeremyboggs · October 24, 2013 19:47
diff --git a/jsr-to-wp-xml.py b/jsr-to-wp-xml.py
 #!/usr/bin/env python

 import sys
 import os
 import fnmatch
 import random
 import re
 from bs4 import BeautifulSoup

 # Paths of HTML files in which no table with bgcolor="#FFFFF0" was found.
 skipped = list()
 # Paths of HTML files whose content cell held no h1-h6 heading.
 noheading = list()

 def directoryNumber(directoryname):
    """Return the first run of digits in *directoryname* as an int.

    Names containing no digits yield 0. The script uses this as the sort
    key when ordering sub-directories during the walk.
    """
    found = re.search(r'\d+', directoryname)
    return int(found.group()) if found is not None else 0

 # Output file for the generated WordPress eXtended RSS (WXR) document.
 # The handle stays open for the rest of the script: the statements that
 # follow write one <item> per converted page and then the closing tags.
 htmlfile = open('jsr-import.xml', 'w')

 # Fixed preamble: XML declaration, the importer instructions WordPress puts
 # in its own exports, the <rss> root with its namespaces, and the <channel>
 # metadata (title, site URLs, a single <wp:author> record, generator).
 # <channel> and <rss> are deliberately left open here.
 htmlfile.write('''<?xml version="1.0" encoding="UTF-8" ?>
 <!-- This is a WordPress eXtended RSS file generated by WordPress as an export of your site. -->
 <!-- It contains information about your site's posts, pages, comments, categories, and other content. -->
 <!-- You may use this file to transfer that content from one site to another. -->
 <!-- This file is not intended to serve as a complete backup of your site. -->

 <!-- To import this information into a WordPress site follow these steps: -->
 <!-- 1. Log in to that site as an administrator. -->
 <!-- 2. Go to Tools: Import in the WordPress admin panel. -->
 <!-- 3. Install the "WordPress" importer from the list. -->
 <!-- 4. Activate & Run Importer. -->
 <!-- 5. Upload this file using the form provided on that page. -->
 <!-- 6. You will first be asked to map the authors in this export file to users -->
 <!--    on the site. For each author, you may choose to map to an -->
 <!--    existing user on the site or to create a new user. -->
 <!-- 7. WordPress will then import each of the posts, pages, comments, categories, etc. -->
 <!--    contained in this file into your site. -->

 <!-- generator="WordPress/3.5.2" created="2013-10-22 18:44" -->
 <rss version="2.0"
 	xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"
 	xmlns:content="http://purl.org/rss/1.0/modules/content/"
 	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
 	xmlns:dc="http://purl.org/dc/elements/1.1/"
 	xmlns:wp="http://wordpress.org/export/1.2/"
 >

 <channel>
 	<title>JSR</title>
    <link>http://example.com</link>
 	<description>The Journal of Scriptural Reasoning</description>
 	<pubDate>Tue, 22 Oct 2013 18:44:04 +0000</pubDate>
 	<language>en-US</language>
 	<wp:wxr_version>1.2</wp:wxr_version>
 	<wp:base_site_url>http://example.com</wp:base_site_url>
 	<wp:base_blog_url>http://example.com</wp:base_blog_url>

 	<wp:author><wp:author_id>1877</wp:author_id><wp:author_login>rag9b</wp:author_login><wp:author_email>[email protected]</wp:author_email><wp:author_display_name><![CDATA[rag9b]]></wp:author_display_name><wp:author_first_name><![CDATA[Ronda]]></wp:author_first_name><wp:author_last_name><![CDATA[Grizzle]]></wp:author_last_name></wp:author>


 	<generator>http://wordpress.org/?v=3.5.2</generator>
 ''')

 # Local import: XML escaping is only needed while emitting <item> elements.
 from xml.sax.saxutils import escape

 # Titles that get the source file path appended so the generated page
 # titles (and the links derived from them) can be told apart.
 multipletitles = ['The Rules of Scriptural Reasoning','Editor\'s Preface','PROLOGUE','Editors\' Introduction', 'Introduction', 'INTRODUCTION', 'LEVITICUS 25', 'Departure, but Not Yet Arrival: Performance in Exodus 15:22-26']

 # Walk the tree below the current directory and emit one WXR <item> (a
 # WordPress page) for every usable *.html file found.
 for (indexnumber, (dirpath, dirs, files)) in enumerate(os.walk('.')):

    # Editing dirs in place prunes the walk, so .svn is never descended into.
    if ".svn" in dirs:
        dirs.remove(".svn")

    # Sub-directories are visited from the highest embedded number down.
    dirs.sort(key = directoryNumber, reverse=True)

    # Sort once per directory and BEFORE filtering. Sorting inside the inner
    # loop came too late: the filtered list being enumerated already existed,
    # so postnumber (and post_id / menu_order) followed raw listing order.
    files.sort()

    # Per-directory id: the path with slashes, dots and the words
    # "volume"/"number" removed, followed by this directory's walk index.
    index_id = str(dirpath.replace('/','').replace('.', '').replace('volume','').replace('number','')) + str(indexnumber)

    for (postnumber, filename) in enumerate(fnmatch.filter(files, '*.html')):

        if filename == 'template.html':
            continue

        if filename == 'index.html':
            # The directory's index page takes the directory id itself.
            post_id = index_id
            # NOTE(review): 41 is presumably the id of an existing parent
            # page on the target site -- confirm before importing.
            post_parent = 41
        else:
            # Every other page becomes a child of its directory's index page.
            post_id = index_id + str(postnumber)
            post_parent = index_id

        filepath = os.path.join(dirpath, filename)

        with open(filepath) as f:
            soup = BeautifulSoup(f.read())

        # The page body is taken from the first cell of the table with this
        # background colour; files without such a table are recorded and skipped.
        table = soup.find("table", bgcolor="#FFFFF0")

        if table is None:
            skipped.append(filepath)
            continue

        container = table.find("td")

        if container is None:
            # A matching table without any cell has nothing to import;
            # record it instead of crashing on container.find below.
            skipped.append(filepath)
            continue

        heading = container.find(['h1','h2','h3','h4', 'h5', 'h6'])

        if heading is None:
            noheading.append(filepath)
            continue

        # First heading becomes the title, with whitespace runs collapsed.
        article_title = unicode(heading.text).encode('utf-8')
        article_title = " ".join(article_title.split())

        if article_title in multipletitles:
            article_title = article_title + " " + filepath

        article_title = article_title.replace('The Journal of Scriptural Reasoning', '')

        if not article_title:
            article_title = filepath

        # Drop the heading from the tree so it is not repeated in the body.
        heading.decompose()

        # Slug: lower-cased title with each run of non-word characters
        # replaced by a hyphen, placed under the directory's relative path.
        article_link = re.sub(r'\W+','-',article_title.lower())
        article_link = 'http://example.com/' + str(dirpath.replace('./', '')) + '/' + article_link + '/'

        if filename == 'index.html':
            # Index pages are imported empty and ordered by walk position.
            article_content = ''
            menu_order = indexnumber + 1
        else:
            article_content = container.prettify()
            article_content = unicode(article_content).encode('utf-8')
            # split() already discards all whitespace, so joining the
            # tokens is enough to flatten the markup onto one line.
            article_content = " ".join(article_content.split())
            # A literal "]]>" would end the CDATA section early; split it
            # across two adjacent CDATA sections instead.
            article_content = article_content.replace(']]>', ']]]]><![CDATA[>')
            menu_order = postnumber + 1

        htmlfile.write('<item>\n')
        htmlfile.write('<!-- '+filepath+ ' -->\n')
        # Title and link are plain element text, so &, < and > must be escaped
        # or the export stops being well-formed XML.
        htmlfile.write('<title>'+escape(article_title)+'</title>\n')
        htmlfile.write('<link>'+escape(article_link)+'</link>')
        htmlfile.write('<content:encoded><![CDATA['+article_content+']]></content:encoded>\n')
        htmlfile.write('<excerpt:encoded><![CDATA[]]></excerpt:encoded>\n')
        htmlfile.write('<wp:post_id>'+post_id+'</wp:post_id>\n')
        htmlfile.write('<wp:post_parent>'+str(post_parent)+'</wp:post_parent>\n')
        htmlfile.write('<wp:menu_order>'+str(menu_order)+'</wp:menu_order>\n')
        htmlfile.write('''<wp:comment_status>closed</wp:comment_status>
 <wp:ping_status>closed</wp:ping_status>
 <wp:status>publish</wp:status>
 <wp:post_type>page</wp:post_type>
 <wp:post_password></wp:post_password>
 <wp:is_sticky>0</wp:is_sticky>
 <wp:postmeta>
    <wp:meta_key>_wp_page_template</wp:meta_key>
    <wp:meta_value><![CDATA[default]]></wp:meta_value>
 </wp:postmeta>
 ''')

        htmlfile.write('</item>\n')

 # Close the <channel> and <rss> elements that the preamble left open.
 htmlfile.write('''
    </channel>
 </rss>''')

 # Close explicitly so buffered output is flushed and the handle released
 # now, rather than whenever the interpreter happens to tear it down.
 htmlfile.close()
	#!/usr/bin/env python

	import sys
	import os
	import fnmatch
	import random
	import re
	from bs4 import BeautifulSoup

	# Paths of HTML files in which no table with bgcolor="#FFFFF0" was found.
	skipped = list()
	# Paths of HTML files whose content cell held no h1-h6 heading.
	noheading = list()

	def directoryNumber(directoryname):
	    """Return the first run of digits in *directoryname* as an int.

	    Names containing no digits yield 0. The script uses this as the sort
	    key when ordering sub-directories during the walk.
	    """
	    # The body had lost its indentation, leaving the def without a
	    # suite; the structure is restored here.
	    found = re.search(r'\d+', directoryname)
	    if found is None:
	        return 0
	    return int(found.group())

	# Output file for the generated WordPress eXtended RSS (WXR) document.
	# The handle stays open for the rest of the script: the statements that
	# follow write one <item> per converted page and then the closing tags.
	htmlfile = open('jsr-import.xml', 'w')

	# Fixed preamble: XML declaration, the importer instructions WordPress puts
	# in its own exports, the <rss> root with its namespaces, and the <channel>
	# metadata (title, site URLs, a single <wp:author> record, generator).
	# <channel> and <rss> are deliberately left open here.
	htmlfile.write('''<?xml version="1.0" encoding="UTF-8" ?>
	<!-- This is a WordPress eXtended RSS file generated by WordPress as an export of your site. -->
	<!-- It contains information about your site's posts, pages, comments, categories, and other content. -->
	<!-- You may use this file to transfer that content from one site to another. -->
	<!-- This file is not intended to serve as a complete backup of your site. -->

	<!-- To import this information into a WordPress site follow these steps: -->
	<!-- 1. Log in to that site as an administrator. -->
	<!-- 2. Go to Tools: Import in the WordPress admin panel. -->
	<!-- 3. Install the "WordPress" importer from the list. -->
	<!-- 4. Activate & Run Importer. -->
	<!-- 5. Upload this file using the form provided on that page. -->
	<!-- 6. You will first be asked to map the authors in this export file to users -->
	<!-- on the site. For each author, you may choose to map to an -->
	<!-- existing user on the site or to create a new user. -->
	<!-- 7. WordPress will then import each of the posts, pages, comments, categories, etc. -->
	<!-- contained in this file into your site. -->

	<!-- generator="WordPress/3.5.2" created="2013-10-22 18:44" -->
	<rss version="2.0"
	xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:wp="http://wordpress.org/export/1.2/"
	>

	<channel>
	<title>JSR</title>
	<link>http://example.com</link>
	<description>The Journal of Scriptural Reasoning</description>
	<pubDate>Tue, 22 Oct 2013 18:44:04 +0000</pubDate>
	<language>en-US</language>
	<wp:wxr_version>1.2</wp:wxr_version>
	<wp:base_site_url>http://example.com</wp:base_site_url>
	<wp:base_blog_url>http://example.com</wp:base_blog_url>

	<wp:author><wp:author_id>1877</wp:author_id><wp:author_login>rag9b</wp:author_login><wp:author_email>[email protected]</wp:author_email><wp:author_display_name><![CDATA[rag9b]]></wp:author_display_name><wp:author_first_name><![CDATA[Ronda]]></wp:author_first_name><wp:author_last_name><![CDATA[Grizzle]]></wp:author_last_name></wp:author>


	<generator>http://wordpress.org/?v=3.5.2</generator>
	''')

	# Local import: XML escaping is only needed while emitting <item> elements.
	from xml.sax.saxutils import escape

	# Titles that get the source file path appended so the generated page
	# titles (and the links derived from them) can be told apart.
	multipletitles = ['The Rules of Scriptural Reasoning','Editor\'s Preface','PROLOGUE','Editors\' Introduction', 'Introduction', 'INTRODUCTION', 'LEVITICUS 25', 'Departure, but Not Yet Arrival: Performance in Exodus 15:22-26']

	# Walk the tree below the current directory and emit one WXR <item> (a
	# WordPress page) for every usable *.html file found. The nesting below
	# restores the block structure that this copy of the script had lost.
	for (indexnumber, (dirpath, dirs, files)) in enumerate(os.walk('.')):

	    # Editing dirs in place prunes the walk, so .svn is never descended into.
	    if ".svn" in dirs:
	        dirs.remove(".svn")

	    # Sub-directories are visited from the highest embedded number down.
	    dirs.sort(key = directoryNumber, reverse=True)

	    # Sort once per directory and BEFORE filtering; sorting after the
	    # filtered list has been built cannot influence postnumber.
	    files.sort()

	    # Per-directory id: the path with slashes, dots and the words
	    # "volume"/"number" removed, followed by this directory's walk index.
	    index_id = str(dirpath.replace('/','').replace('.', '').replace('volume','').replace('number','')) + str(indexnumber)

	    for (postnumber, filename) in enumerate(fnmatch.filter(files, '*.html')):

	        if filename == 'template.html':
	            continue

	        if filename == 'index.html':
	            # The directory's index page takes the directory id itself.
	            post_id = index_id
	            # NOTE(review): 41 is presumably the id of an existing parent
	            # page on the target site -- confirm before importing.
	            post_parent = 41
	        else:
	            # Every other page becomes a child of its directory's index page.
	            post_id = index_id + str(postnumber)
	            post_parent = index_id

	        filepath = os.path.join(dirpath, filename)

	        with open(filepath) as f:
	            soup = BeautifulSoup(f.read())

	        # The page body is taken from the first cell of the table with this
	        # background colour; files without such a table are recorded and skipped.
	        table = soup.find("table", bgcolor="#FFFFF0")

	        if table is None:
	            skipped.append(filepath)
	            continue

	        container = table.find("td")

	        if container is None:
	            # A matching table without any cell has nothing to import;
	            # record it instead of crashing on container.find below.
	            skipped.append(filepath)
	            continue

	        heading = container.find(['h1','h2','h3','h4', 'h5', 'h6'])

	        if heading is None:
	            noheading.append(filepath)
	            continue

	        # First heading becomes the title, with whitespace runs collapsed.
	        article_title = unicode(heading.text).encode('utf-8')
	        article_title = " ".join(article_title.split())

	        if article_title in multipletitles:
	            article_title = article_title + " " + filepath

	        article_title = article_title.replace('The Journal of Scriptural Reasoning', '')

	        if not article_title:
	            article_title = filepath

	        # Drop the heading from the tree so it is not repeated in the body.
	        heading.decompose()

	        # Slug: lower-cased title with each run of non-word characters
	        # replaced by a hyphen, placed under the directory's relative path.
	        article_link = re.sub(r'\W+','-',article_title.lower())
	        article_link = 'http://example.com/' + str(dirpath.replace('./', '')) + '/' + article_link + '/'

	        if filename == 'index.html':
	            # Index pages are imported empty and ordered by walk position.
	            article_content = ''
	            menu_order = indexnumber + 1
	        else:
	            article_content = container.prettify()
	            article_content = unicode(article_content).encode('utf-8')
	            # split() already discards all whitespace, so joining the
	            # tokens is enough to flatten the markup onto one line.
	            article_content = " ".join(article_content.split())
	            # A literal "]]>" would end the CDATA section early; split it
	            # across two adjacent CDATA sections instead.
	            article_content = article_content.replace(']]>', ']]]]><![CDATA[>')
	            menu_order = postnumber + 1

	        htmlfile.write('<item>\n')
	        htmlfile.write('<!-- '+filepath+ ' -->\n')
	        # Title and link are plain element text, so &, < and > must be escaped
	        # or the export stops being well-formed XML.
	        htmlfile.write('<title>'+escape(article_title)+'</title>\n')
	        htmlfile.write('<link>'+escape(article_link)+'</link>')
	        htmlfile.write('<content:encoded><![CDATA['+article_content+']]></content:encoded>\n')
	        htmlfile.write('<excerpt:encoded><![CDATA[]]></excerpt:encoded>\n')
	        htmlfile.write('<wp:post_id>'+post_id+'</wp:post_id>\n')
	        htmlfile.write('<wp:post_parent>'+str(post_parent)+'</wp:post_parent>\n')
	        htmlfile.write('<wp:menu_order>'+str(menu_order)+'</wp:menu_order>\n')
	        htmlfile.write('''<wp:comment_status>closed</wp:comment_status>
	<wp:ping_status>closed</wp:ping_status>
	<wp:status>publish</wp:status>
	<wp:post_type>page</wp:post_type>
	<wp:post_password></wp:post_password>
	<wp:is_sticky>0</wp:is_sticky>
	<wp:postmeta>
	<wp:meta_key>_wp_page_template</wp:meta_key>
	<wp:meta_value><![CDATA[default]]></wp:meta_value>
	</wp:postmeta>
	''')

	        htmlfile.write('</item>\n')

	# Close the <channel> and <rss> elements that the preamble left open.
	htmlfile.write('''
	</channel>
	</rss>''')

	# Close explicitly so buffered output is flushed and the handle released
	# now, rather than whenever the interpreter happens to tear it down.
	htmlfile.close()