trentrichardson · November 2, 2018 12:13 · trentrichardson · Nov 2, 2018
diff --git a/wp-to-md.py b/wp-to-md.py
 import sys
 import os
 import io
 import time
 import datetime
 import dateutil.parser
 import re

 # for xml parsing
 from bs4 import BeautifulSoup

 # for converting to markdown
 import html2text

 # for downloading attachments
 import wget


 # wordpress shortcode patterns for embedded code samples; each one captures the
 # text between the opening and closing tag in group 1, and re.DOTALL lets that
 # capture span several lines
 # [sourcecode language="..."] ... [/sourcecode]
 coderegex1 = re.compile(r'\[sourcecode language=\"[a-zA-Z0-9]*\"\](.*?)\[\/sourcecode\]', re.DOTALL)
 # [code language="..."] ... [/code]
 coderegex2 = re.compile(r'\[code language=\"[a-zA-Z0-9]*\"\](.*?)\[\/code\]', re.DOTALL)
 # [code lang=...] ... [/code] (language name without quotes)
 coderegex3 = re.compile(r'\[code lang=[a-zA-Z0-9]*\](.*?)\[\/code\]', re.DOTALL)

 # matches the scheme, host and /wp-content/uploads/ prefix of an upload url
 # (e.g. http://www.example.com/wp-content/uploads/); parse_doc replaces the
 # whole match with '{attach}images/'
 uploadregex = re.compile(r'(https?\:\/\/[a-zA-Z0-9\-\.]+\/wp-content\/uploads\/)')

 class Post:
 	"""Plain record holding the fields of one wordpress post from the export."""

 	def __init__(self, title, author, date, content, category, status, slug, tags):
 		self.title, self.author = title, author
 		# the date arrives as text; it is stored as whatever dateutil parses it into
 		self.date = dateutil.parser.parse(date)
 		self.content, self.category = content, category
 		self.status, self.slug, self.tags = status, slug, tags


 def load_doc(filename):
 	"""Read the whole export file as UTF-8 text and return it.

 	filename -- path of the wordpress .xml export file to read
 	"""

 	print("> Loading document!")

 	with io.open(filename, mode='r', encoding='UTF-8') as source:
 		return source.read()


 def parse_doc(doc):
 	"""Extract posts and attachment references from a wordpress export.

 	doc -- full text of the export file

 	Returns a (posts, attachments) tuple: a list with one Post per <item>
 	whose wp:post_type is "post", and a list with the guid string of every
 	<item> whose wp:post_type is "attachment". Items of any other type are
 	ignored.
 	"""

 	print("> Parsing document!")

 	posts = []
 	attachments = []
 	soup = BeautifulSoup(doc, 'html.parser')

 	for item in soup.find_all('item'):

 		# look the type up once instead of once per branch
 		post_type = item.find('wp:post_type').string

 		if post_type == "post":

 			# get the tags
 			tags = [tag['nicename'] for tag in item.find_all('category', { 'domain': 'post_tag'})]

 			# .string is None when the element holds no text (a post with an
 			# empty body); fall back to "" so the substitution does not raise
 			raw_content = item.find('content:encoded').string or ""

 			# fix urls to not use wp-content nor full url
 			content = uploadregex.sub('{attach}images/', raw_content)

 			posts.append(Post(
 				item.find('title').string,
 				item.find('dc:creator').string,
 				item.find('wp:post_date').string,
 				content,
 				item.find('category', { 'domain': 'category'})['nicename'],
 				item.find('wp:status').string,
 				item.find('wp:post_name').string,
 				'; '.join(tags) ))

 		elif post_type == "attachment":
 			attachments.append(item.guid.string)

 	return posts, attachments


 def gen_markdown(post):
 	"""Render one post as a markdown document with a metadata header.

 	post -- object providing title, date, category, tags, slug, author and
 	        content attributes (content is the post body as html)

 	Returns the header (Title/Date/Category/Tags/Slug/Author lines followed
 	by a blank line) concatenated with the body as converted by html2text.
 	"""

 	h = html2text.HTML2Text()
 	h.unicode_snob = 1
 	h.body_width = 0
 	h.dash_unordered_list = True

 	# double quotes and colons in the title are written as numeric character references
 	title = post.title.translate(str.maketrans({"\"": "&#34;", ":": "&#58;"}))
 	body = post.content

 	# built line by line so the header text does not depend on how this
 	# source file happens to be indented
 	header = (
 		"Title: %s\n"
 		"Date: %s\n"
 		"Category: %s\n"
 		"Tags: %s\n"
 		"Slug: %s\n"
 		"Author: %s\n"
 		"\n"
 	) % (title, post.date.strftime("%Y-%m-%d %H:%M:%S"), post.category, post.tags, post.slug, post.author)

 	# The patterns are already compiled, so use their own sub(). The old
 	# re.sub(pattern, repl, body, re.U) put re.U in the positional `count`
 	# slot, which capped the number of replacements instead of setting a flag.
 	for pattern in (coderegex1, coderegex2, coderegex3):
 		body = pattern.sub(r"<pre>\1</pre>", body)

 	body = h.handle(body)

 	return header + body


 def save_posts(output, posts):
 	"""Write every post to a markdown file in a directory chosen by status.

 	output -- base output directory
 	posts  -- iterable of Post objects

 	Posts with status "publish" go to <output>/_posts, "draft" to
 	<output>/_drafts and any other status to <output>/_other. Each file is
 	named <YYYY-MM-DD>-<slug>.md and written as UTF-8.
 	"""

 	print("> Saving posts!")

 	subdirs = {"publish": "_posts", "draft": "_drafts"}

 	for p in posts:
 		# os.path.join keeps the sub-directory inside `output` even when it
 		# is given without a trailing slash ("out" used to become "out_posts/")
 		directory = os.path.join(output, subdirs.get(p.status, "_other"))

 		# exist_ok replaces the separate exists() check
 		os.makedirs(directory, exist_ok=True)

 		path = os.path.join(directory, p.date.strftime("%Y-%m-%d") + "-" + p.slug + ".md")

 		print("Saving", path)

 		with io.open(path, 'w', encoding='UTF-8') as f:
 			f.write(gen_markdown(p))


 def download_attachments(output, attachments):
 	"""Placeholder for fetching attachments; for now it only prints a message.

 	output      -- base output directory (not used yet)
 	attachments -- attachment entries gathered from the export (not used yet)
 	"""
 	# todo
 	print("> Wget'ing attachments!")


 def main():
 	"""Command-line entry point: convert a wordpress export to markdown files.

 	Reads the export filename and an optional output directory (default
 	"./") from sys.argv. With any other number of arguments the usage
 	message is printed and nothing is converted.
 	"""

 	args = sys.argv[1:]

 	# only one or two arguments are meaningful; three or more used to fall
 	# through every branch and leave `filename` unbound
 	if len(args) not in (1, 2):
 		print("Parameters: filename for wordpress .xml export file, optional output directory")
 		return

 	filename = args[0]
 	output = args[1] if len(args) == 2 else "./"

 	doc = load_doc(filename)
 	posts, attachments = parse_doc(doc)

 	save_posts(output, posts)
 	download_attachments(output, attachments)


 # run the conversion only when this file is executed as a script
 if __name__ == '__main__':
 	main()
	import sys
	import os
	import io
	import time
	import datetime
	import dateutil.parser
	import re

	# for xml parsing
	from bs4 import BeautifulSoup

	# for converting to markdown
	import html2text

	# for downloading attachments
	import wget


	# wordpress shortcode patterns for embedded code samples; group 1 captures the
	# text between the opening and closing tag, and re.DOTALL lets it span lines.
	# The `*` quantifiers are required: without them only a one-character language
	# name and a one-character body would match.
	coderegex1 = re.compile(r'\[sourcecode language=\"[a-zA-Z0-9]*\"\](.*?)\[\/sourcecode\]', re.DOTALL)
	coderegex2 = re.compile(r'\[code language=\"[a-zA-Z0-9]*\"\](.*?)\[\/code\]', re.DOTALL)
	coderegex3 = re.compile(r'\[code lang=[a-zA-Z0-9]*\](.*?)\[\/code\]', re.DOTALL)

	# matches the scheme, host and /wp-content/uploads/ prefix of an upload url
	# (e.g. http://www.example.com/wp-content/uploads/) so it can be replaced
	uploadregex = re.compile(r'(https?\:\/\/[a-zA-Z0-9\-\.]+\/wp-content\/uploads\/)')

	class Post:
		"""Plain record holding the fields of one wordpress post from the export."""

		def __init__(self, title, author, date, content, category, status, slug, tags):
			self.title = title
			self.author = author
			# the date arrives as text; it is stored as whatever dateutil parses it into
			self.date = dateutil.parser.parse(date)
			self.content = content
			self.category = category
			self.status = status
			self.slug = slug
			self.tags = tags


	def load_doc(filename):
		"""Read the whole export file as UTF-8 text and return it.

		filename -- path of the wordpress .xml export file to read
		"""

		print("> Loading document!")

		with io.open(filename, 'r', encoding='UTF-8') as f:
			return f.read()


	def parse_doc(doc):
		"""Extract posts and attachment references from a wordpress export.

		doc -- full text of the export file

		Returns a (posts, attachments) tuple: a list with one Post per <item>
		whose wp:post_type is "post", and a list with the guid string of every
		<item> whose wp:post_type is "attachment". Other items are ignored.
		"""

		print("> Parsing document!")

		posts = []
		attachments = []
		soup = BeautifulSoup(doc, 'html.parser')

		for item in soup.find_all('item'):

			# look the type up once instead of once per branch
			post_type = item.find('wp:post_type').string

			if post_type == "post":

				# get the tags
				tags = [tag['nicename'] for tag in item.find_all('category', { 'domain': 'post_tag'})]

				# .string is None when the element holds no text (a post with
				# an empty body); fall back to "" so the substitution does not raise
				raw_content = item.find('content:encoded').string or ""

				# fix urls to not use wp-content nor full url
				content = uploadregex.sub('{attach}images/', raw_content)

				posts.append(Post(
					item.find('title').string,
					item.find('dc:creator').string,
					item.find('wp:post_date').string,
					content,
					item.find('category', { 'domain': 'category'})['nicename'],
					item.find('wp:status').string,
					item.find('wp:post_name').string,
					'; '.join(tags) ))

			elif post_type == "attachment":
				attachments.append(item.guid.string)

		return posts, attachments


	def gen_markdown(post):
		"""Render one post as a markdown document with a metadata header.

		post -- object providing title, date, category, tags, slug, author and
		        content attributes (content is the post body as html)

		Returns the header (Title/Date/Category/Tags/Slug/Author lines followed
		by a blank line) concatenated with the body as converted by html2text.
		"""

		h = html2text.HTML2Text()
		h.unicode_snob = 1
		h.body_width = 0
		h.dash_unordered_list = True

		# double quotes and colons in the title are written as numeric
		# character references (&#34; and &#58;)
		title = post.title.translate(str.maketrans({"\"": "&#34;", ":": "&#58;"}))
		body = post.content

		# built line by line so the header text does not depend on how this
		# source file happens to be indented
		header = (
			"Title: %s\n"
			"Date: %s\n"
			"Category: %s\n"
			"Tags: %s\n"
			"Slug: %s\n"
			"Author: %s\n"
			"\n"
		) % (title, post.date.strftime("%Y-%m-%d %H:%M:%S"), post.category, post.tags, post.slug, post.author)

		# The patterns are already compiled, so use their own sub(). Passing
		# re.U as the fourth positional argument of re.sub() would set `count`,
		# not a flag.
		for pattern in (coderegex1, coderegex2, coderegex3):
			body = pattern.sub(r"<pre>\1</pre>", body)

		body = h.handle(body)

		return header + body


	def save_posts(output, posts):
		"""Write every post to a markdown file in a directory chosen by status.

		output -- base output directory
		posts  -- iterable of Post objects

		Posts with status "publish" go to <output>/_posts, "draft" to
		<output>/_drafts and any other status to <output>/_other. Each file is
		named <YYYY-MM-DD>-<slug>.md and written as UTF-8.
		"""

		print("> Saving posts!")

		subdirs = {"publish": "_posts", "draft": "_drafts"}

		for p in posts:
			# os.path.join keeps the sub-directory inside `output` even when
			# it is given without a trailing slash
			directory = os.path.join(output, subdirs.get(p.status, "_other"))

			# exist_ok replaces the separate exists() check
			os.makedirs(directory, exist_ok=True)

			path = os.path.join(directory, p.date.strftime("%Y-%m-%d") + "-" + p.slug + ".md")

			print("Saving", path)

			with io.open(path, 'w', encoding='UTF-8') as f:
				f.write(gen_markdown(p))


	def download_attachments(output, attachments):
		"""Placeholder for fetching attachments; for now it only prints a message.

		output      -- base output directory (not used yet)
		attachments -- attachment entries gathered from the export (not used yet)
		"""

		print("> Wget'ing attachments!")
		# todo


	def main():
		"""Command-line entry point: convert a wordpress export to markdown files.

		Reads the export filename and an optional output directory (default
		"./") from sys.argv. With any other number of arguments the usage
		message is printed and nothing is converted.
		"""

		args = sys.argv[1:]

		# only one or two arguments are meaningful; anything else would
		# otherwise leave `filename` unbound
		if len(args) not in (1, 2):
			print("Parameters: filename for wordpress .xml export file, optional output directory")
			return

		filename = args[0]
		output = args[1] if len(args) == 2 else "./"

		doc = load_doc(filename)
		posts, attachments = parse_doc(doc)

		save_posts(output, posts)
		download_attachments(output, attachments)


	# run the conversion only when this file is executed as a script
	if __name__ == '__main__':
		main()