Skip to content

Instantly share code, notes, and snippets.

@jeremyboggs
Created October 24, 2013 19:47
Show Gist options
  • Save jeremyboggs/7143775 to your computer and use it in GitHub Desktop.
Save jeremyboggs/7143775 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import fnmatch
import os
import random
import re
import sys
from xml.sax.saxutils import escape

from bs4 import BeautifulSoup
# Paths of HTML files left out of the export, kept as two separate lists:
#   skipped   -- no <table bgcolor="#FFFFF0"> was found in the page
#   noheading -- the table's first cell contained no <h1>..<h6> element
skipped, noheading = [], []
def directoryNumber(directoryname):
    """Return the first run of decimal digits in *directoryname* as an int.

    Serves as the sort key for the directory names visited by the walk
    below, so that e.g. ``number10`` orders after ``number9`` rather than
    lexicographically.

    Returns 0 when the name contains no digits.
    """
    match = re.search(r'\d+', directoryname)
    return int(match.group()) if match else 0
# Static preamble of the WXR (WordPress eXtended RSS) document: the XML
# declaration, the importer instructions, the <rss>/<channel> opening tags
# and the channel-level metadata. Everything written after it is expected
# to live inside the still-open <channel> element.
WXR_PREAMBLE = '''<?xml version="1.0" encoding="UTF-8" ?>
<!-- This is a WordPress eXtended RSS file generated by WordPress as an export of your site. -->
<!-- It contains information about your site's posts, pages, comments, categories, and other content. -->
<!-- You may use this file to transfer that content from one site to another. -->
<!-- This file is not intended to serve as a complete backup of your site. -->
<!-- To import this information into a WordPress site follow these steps: -->
<!-- 1. Log in to that site as an administrator. -->
<!-- 2. Go to Tools: Import in the WordPress admin panel. -->
<!-- 3. Install the "WordPress" importer from the list. -->
<!-- 4. Activate & Run Importer. -->
<!-- 5. Upload this file using the form provided on that page. -->
<!-- 6. You will first be asked to map the authors in this export file to users -->
<!-- on the site. For each author, you may choose to map to an -->
<!-- existing user on the site or to create a new user. -->
<!-- 7. WordPress will then import each of the posts, pages, comments, categories, etc. -->
<!-- contained in this file into your site. -->
<!-- generator="WordPress/3.5.2" created="2013-10-22 18:44" -->
<rss version="2.0"
xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:wp="http://wordpress.org/export/1.2/"
>
<channel>
<title>JSR</title>
<link>http://example.com</link>
<description>The Journal of Scriptural Reasoning</description>
<pubDate>Tue, 22 Oct 2013 18:44:04 +0000</pubDate>
<language>en-US</language>
<wp:wxr_version>1.2</wp:wxr_version>
<wp:base_site_url>http://example.com</wp:base_site_url>
<wp:base_blog_url>http://example.com</wp:base_blog_url>
<wp:author><wp:author_id>1877</wp:author_id><wp:author_login>rag9b</wp:author_login><wp:author_email>[email protected]</wp:author_email><wp:author_display_name><![CDATA[rag9b]]></wp:author_display_name><wp:author_first_name><![CDATA[Ronda]]></wp:author_first_name><wp:author_last_name><![CDATA[Grizzle]]></wp:author_last_name></wp:author>
<generator>http://wordpress.org/?v=3.5.2</generator>
'''

# The export is written to the current working directory; the handle stays
# open at module level so the rest of the script can append to it.
htmlfile = open('jsr-import.xml', 'w')
htmlfile.write(WXR_PREAMBLE)
# Headings that get the source file path appended to them.
# NOTE(review): presumably these titles recur across issues and would
# otherwise collide as page titles/slugs -- confirm.
_REPEATED_TITLES = frozenset([
    'The Rules of Scriptural Reasoning',
    'Editor\'s Preface',
    'PROLOGUE',
    'Editors\' Introduction',
    'Introduction',
    'INTRODUCTION',
    'LEVITICUS 25',
    'Departure, but Not Yet Arrival: Performance in Exodus 15:22-26',
])

# Fixed WXR fields that are identical for every generated page.
_ITEM_TRAILER = '''<wp:comment_status>closed</wp:comment_status>
<wp:ping_status>closed</wp:ping_status>
<wp:status>publish</wp:status>
<wp:post_type>page</wp:post_type>
<wp:post_password></wp:post_password>
<wp:is_sticky>0</wp:is_sticky>
<wp:postmeta>
<wp:meta_key>_wp_page_template</wp:meta_key>
<wp:meta_value><![CDATA[default]]></wp:meta_value>
</wp:postmeta>
'''


def _cdata(text):
    """Wrap *text* in a CDATA section.

    An embedded ``]]>`` would terminate the section early, so it is split
    across two adjacent CDATA sections.
    """
    return '<![CDATA[' + text.replace(']]>', ']]]]><![CDATA[>') + ']]>'


# Walk the tree below the current directory and emit one WXR <item> (a
# WordPress page) per HTML file. Each directory's index.html becomes the
# parent page of the other pages found in that directory.
for (indexnumber, (dirpath, dirs, files)) in enumerate(os.walk('.')):
    # Editing ``dirs`` in place steers os.walk: drop Subversion metadata and
    # visit numbered directories from highest to lowest.
    if ".svn" in dirs:
        dirs.remove(".svn")
    dirs.sort(key=directoryNumber, reverse=True)

    # Numeric-looking id for the directory: its path with separators, dots
    # and the words "volume"/"number" removed, followed by the walk index.
    index_id = str(dirpath.replace('/', '').replace('.', '')
                   .replace('volume', '').replace('number', '')) + str(indexnumber)

    # Sort BEFORE enumerating so post numbers (and therefore post ids and
    # menu order) are reproducible instead of following listdir order.
    html_files = sorted(fnmatch.filter(files, '*.html'))
    for (postnumber, filename) in enumerate(html_files):
        if filename == 'template.html':
            continue

        is_index = filename == 'index.html'
        if is_index:
            post_id = index_id
            # NOTE(review): 41 is presumably the id of an existing page on
            # the target site that all index pages hang under -- confirm.
            post_parent = 41
        else:
            post_id = index_id + str(postnumber)
            post_parent = index_id

        filepath = os.path.join(dirpath, filename)
        with open(filepath) as f:
            # NOTE(review): no parser is named, so the result depends on
            # which parsers bs4 finds installed.
            soup = BeautifulSoup(f.read())

        # The page body is taken from the first cell of the cream-coloured
        # layout table; pages without that table (or without any cell in
        # it) are recorded and left out of the export.
        table = soup.find("table", bgcolor="#FFFFF0")
        container = table.find("td") if table is not None else None
        if container is None:
            skipped.append(filepath)
            continue

        heading = container.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        if heading is None:
            noheading.append(filepath)
            continue

        # Title: heading text as UTF-8 with runs of whitespace collapsed.
        article_title = unicode(heading.text).encode('utf-8')
        article_title = " ".join(article_title.split())
        if article_title in _REPEATED_TITLES:
            article_title = article_title + " " + filepath
        # Drop the journal name and re-collapse the whitespace it leaves.
        article_title = " ".join(
            article_title.replace('The Journal of Scriptural Reasoning', '').split())
        if not article_title:
            article_title = filepath

        # Remove the heading from the tree so it is not repeated in the body.
        heading.decompose()

        slug = re.sub(r'\W+', '-', article_title.lower())
        article_link = ('http://example.com/' + str(dirpath.replace('./', ''))
                        + '/' + slug + '/')

        if is_index:
            article_content = ''
            menu_order = indexnumber + 1
        else:
            article_content = unicode(container.prettify()).encode('utf-8')
            article_content = " ".join(article_content.split())
            menu_order = postnumber + 1

        htmlfile.write(''.join([
            '<item>\n',
            # "--" is not allowed inside an XML comment.
            '<!-- ' + re.sub(r'-(?=-)', '- ', filepath) + ' -->\n',
            # Escape &, < and > so arbitrary headings/paths stay well-formed.
            '<title>' + escape(article_title) + '</title>\n',
            '<link>' + escape(article_link) + '</link>\n',
            '<content:encoded>' + _cdata(article_content) + '</content:encoded>\n',
            '<excerpt:encoded>' + _cdata('') + '</excerpt:encoded>\n',
            '<wp:post_id>' + post_id + '</wp:post_id>\n',
            '<wp:post_parent>' + str(post_parent) + '</wp:post_parent>\n',
            '<wp:menu_order>' + str(menu_order) + '</wp:menu_order>\n',
            _ITEM_TRAILER,
            '</item>\n',
        ]))
# Close the <channel> and <rss> elements opened by the preamble.
htmlfile.write('''
</channel>
</rss>''')
# Close explicitly so buffered output reaches disk even if the interpreter
# does not finalise the file object on exit.
htmlfile.close()

# Surface the files that were left out of the export; without this the two
# lists are filled in but never shown to anyone.
for label, paths in (('skipped (no content table)', skipped),
                     ('skipped (no heading)', noheading)):
    for path in paths:
        sys.stderr.write('%s: %s\n' % (label, path))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment