aharonium · July 19, 2019 01:31 · aharonium · Jul 16, 2019
diff --git a/wxr2txt.py b/wxr2txt.py
 #!/usr/bin/env python

 """This script converts WXR file to a number of plain text files.

 WXR stands for "WordPress eXtended RSS", which basically is just a
 regular XML file. This script extracts entries from the WXR file into
 plain text files. Output format: the permalink slug plus ".html.md" for
 posts, the permalink slug plus ".html" for pages.

 Usage: wxr2txt.py filename [-o output_dir]
 """

 import os
 import re
 import sys
 from xml.etree import ElementTree
 from urlparse import urlparse

 # XML namespace prefixes used when looking up prefixed WXR elements
 # (e.g. 'wp:post_type', 'content:encoded', 'dc:creator').
 NAMESPACES = {
 	'content': 'http://purl.org/rss/1.0/modules/content/',
 	'wp': 'http://wordpress.org/export/1.2/',
 	'excerpt': 'http://wordpress.org/export/1.2/excerpt/',
 	'wfw': 'http://wellformedweb.org/CommentAPI/',
 	'dc': 'http://purl.org/dc/elements/1.1/',
 }

 # Original author's note: at the time of writing, the -o output argument
 # was not honoured when files were written.
 # This note used to be a triple-quoted string placed right after the usage
 # literal; Python concatenates adjacent string literals, so the note was
 # being appended to the usage text shown to users. It is a real comment now.
 USAGE_STRING = "Usage: wxr2txt.py filename [-o output_dir]"


 def main(argv):
 	"""Write every post and page of a WXR export to its own HTML file.

 	Args:
 		argv: Command-line style argument list; it is validated by
 			_parse_and_validate_output(), which yields the input file name
 			and the output directory.

 	Each item whose 'wp:post_type' is 'post' or 'page' is written below the
 	output directory, in a sub-directory tree that mirrors the host and path
 	of the item's permalink. Posts are saved as '<slug>.html.md' and pages
 	as '<slug>.html'. A summary line is printed when all items are done.

 	Exits through _error() when the input cannot be parsed as XML or has no
 	'channel' element.
 	"""
 	filename, output_dir = _parse_and_validate_output(argv)
 	try:
 		data = ElementTree.parse(filename).getroot()
 	except ElementTree.ParseError:
 		_error("Invalid input file format. Can not parse the input.")
 	channel = data.find('channel')
 	if channel is None:
 		_error("Invalid input file format. Can not parse the input.")
 	page_counter, post_counter = 0, 0

 	for post in channel.findall('item'):
 		post_type = _item_text(post, 'wp:post_type')
 		if post_type not in ('post', 'page'):
 			continue

 		content = _item_text(post, 'content:encoded')
 		postid = _item_text(post, 'wp:post_id')
 		author = _item_text(post, 'dc:creator')
 		title = _item_text(post, 'title')
 		postidlink = u'http://opensiddur.org/?p=' + postid

 		# Components are kept relative so that os.path.join() cannot discard
 		# output_dir (an absolute component would replace everything before it).
 		subdirs, linktitle = _split_link(
 			_item_text(post, 'link'), postid or u'index')
 		target_dir = os.path.join(output_dir, *subdirs)
 		if not os.path.isdir(target_dir):
 			os.makedirs(target_dir)

 		# Count each item exactly once, according to its type.
 		if post_type == 'post':
 			post_filename = linktitle + '.html.md'
 			post_counter += 1
 		else:
 			post_filename = linktitle + '.html'
 			page_counter += 1

 		document = u''.join([
 			u'<html>\n<head></head>\n<body>\nTitle: ', title, u'<br />\n',
 			u'Primary contributor: ', author, u'<br />\n',
 			u'For attribution and license, please consult the following URL: <a href="',
 			postidlink, u'">', postidlink, u'</a>\n<p />\n<hr />\n\n',
 			content,
 			u'\n</body>\n</html>',
 		])
 		# Encode once and write bytes, so non-ASCII titles, authors and
 		# content are all stored as UTF-8 without implicit ASCII coercion.
 		with open(os.path.join(target_dir, post_filename), 'wb') as post_file:
 			post_file.write(document.encode('utf8'))

 	print("Saved {} posts and {} pages in directory '{}'.".format(
 		post_counter, page_counter, output_dir))


 def _item_text(item, tag):
 	"""Return the text of child element *tag* of *item*, or u'' if absent or empty."""
 	node = item.find(tag, namespaces=NAMESPACES)
 	if node is None or node.text is None:
 		return u''
 	return node.text


 def _split_link(link, fallback_slug):
 	"""Split a permalink into (relative directory components, file slug).

 	The slug is the last path segment of *link*; the directory components
 	are the host followed by the remaining path segments. Empty, '.' and
 	'..' segments are dropped so the result can never point outside the
 	directory it is joined onto. *fallback_slug* is used when the link has
 	no path segment at all.
 	"""
 	parsed = urlparse(link)
 	segments = [s for s in parsed.path.split('/') if s not in ('', '.', '..')]
 	slug = segments.pop() if segments else fallback_slug
 	if parsed.netloc:
 		segments.insert(0, parsed.netloc)
 	return segments, slug


 def _parse_and_validate_output(argv):
 	if len(argv) not in (2, 4):
 		_error("Wrong number of arguments.")
 	filename = argv[1]
 	if not os.path.isfile(filename):
 		_error("Input file does not exist (or not enough permissions).")
 	output_dir = argv[3] if len(argv) == 4 and argv[2] == '-o' else os.getcwd()
 	if not os.path.isdir(output_dir):
 		_error("Output directory does not exist (or not enough permissions).")
 	return filename, output_dir


 def _error(text):
 	"""Report a fatal error together with the usage line and exit with status 1.

 	Args:
 		text: Human-readable description of what went wrong.

 	The message goes to standard error so that it is not mixed into
 	standard output when the script's output is captured or piped.
 	"""
 	sys.stderr.write("{}\n{}\n".format(text, USAGE_STRING))
 	sys.exit(1)

 # Run the conversion only when this file is executed as a script,
 # passing the raw command-line arguments through to main().
 if __name__ == "__main__":
 	main(sys.argv)
	#!/usr/bin/env python

	"""This script converts WXR file to a number of plain text files.

	WXR stands for "WordPress eXtended RSS", which basically is just a
	regular XML file. This script extracts entries from the WXR file into
	plain text files. Output format: the permalink slug plus ".html.md" for
	posts, the permalink slug plus ".html" for pages.

	Usage: wxr2txt.py filename [-o output_dir]
	"""

	import os
	import re
	import sys
	from xml.etree import ElementTree
	from urlparse import urlparse

	# XML namespace prefixes used when looking up prefixed WXR elements
	# (e.g. 'wp:post_type', 'content:encoded', 'dc:creator').
	NAMESPACES = {
		'content': 'http://purl.org/rss/1.0/modules/content/',
		'wp': 'http://wordpress.org/export/1.2/',
		'excerpt': 'http://wordpress.org/export/1.2/excerpt/',
		'wfw': 'http://wellformedweb.org/CommentAPI/',
		'dc': 'http://purl.org/dc/elements/1.1/',
	}

	# Original author's note: at the time of writing, the -o output argument
	# was not honoured when files were written.
	# This note used to be a triple-quoted string placed right after the usage
	# literal; Python concatenates adjacent string literals, so the note was
	# being appended to the usage text shown to users. It is a real comment now.
	USAGE_STRING = "Usage: wxr2txt.py filename [-o output_dir]"


	def main(argv):
		"""Write every post and page of a WXR export to its own HTML file.

		Args:
			argv: Command-line style argument list; it is validated by
				_parse_and_validate_output(), which yields the input file name
				and the output directory.

		Each item whose 'wp:post_type' is 'post' or 'page' is written below the
		output directory, in a sub-directory tree that mirrors the host and path
		of the item's permalink. Posts are saved as '<slug>.html.md' and pages
		as '<slug>.html'. A summary line is printed when all items are done.

		Exits through _error() when the input cannot be parsed as XML or has no
		'channel' element.
		"""
		filename, output_dir = _parse_and_validate_output(argv)
		try:
			data = ElementTree.parse(filename).getroot()
		except ElementTree.ParseError:
			_error("Invalid input file format. Can not parse the input.")
		channel = data.find('channel')
		if channel is None:
			_error("Invalid input file format. Can not parse the input.")
		page_counter, post_counter = 0, 0

		for post in channel.findall('item'):
			post_type = _item_text(post, 'wp:post_type')
			if post_type not in ('post', 'page'):
				continue

			content = _item_text(post, 'content:encoded')
			postid = _item_text(post, 'wp:post_id')
			author = _item_text(post, 'dc:creator')
			title = _item_text(post, 'title')
			postidlink = u'http://opensiddur.org/?p=' + postid

			# Components are kept relative so that os.path.join() cannot discard
			# output_dir (an absolute component would replace everything before it).
			subdirs, linktitle = _split_link(
				_item_text(post, 'link'), postid or u'index')
			target_dir = os.path.join(output_dir, *subdirs)
			if not os.path.isdir(target_dir):
				os.makedirs(target_dir)

			# Count each item exactly once, according to its type.
			if post_type == 'post':
				post_filename = linktitle + '.html.md'
				post_counter += 1
			else:
				post_filename = linktitle + '.html'
				page_counter += 1

			document = u''.join([
				u'<html>\n<head></head>\n<body>\nTitle: ', title, u'<br />\n',
				u'Primary contributor: ', author, u'<br />\n',
				u'For attribution and license, please consult the following URL: <a href="',
				postidlink, u'">', postidlink, u'</a>\n<p />\n<hr />\n\n',
				content,
				u'\n</body>\n</html>',
			])
			# Encode once and write bytes, so non-ASCII titles, authors and
			# content are all stored as UTF-8 without implicit ASCII coercion.
			with open(os.path.join(target_dir, post_filename), 'wb') as post_file:
				post_file.write(document.encode('utf8'))

		print("Saved {} posts and {} pages in directory '{}'.".format(
			post_counter, page_counter, output_dir))


	def _item_text(item, tag):
		"""Return the text of child element *tag* of *item*, or u'' if absent or empty."""
		node = item.find(tag, namespaces=NAMESPACES)
		if node is None or node.text is None:
			return u''
		return node.text


	def _split_link(link, fallback_slug):
		"""Split a permalink into (relative directory components, file slug).

		The slug is the last path segment of *link*; the directory components
		are the host followed by the remaining path segments. Empty, '.' and
		'..' segments are dropped so the result can never point outside the
		directory it is joined onto. *fallback_slug* is used when the link has
		no path segment at all.
		"""
		parsed = urlparse(link)
		segments = [s for s in parsed.path.split('/') if s not in ('', '.', '..')]
		slug = segments.pop() if segments else fallback_slug
		if parsed.netloc:
			segments.insert(0, parsed.netloc)
		return segments, slug


	def _parse_and_validate_output(argv):
	if len(argv) not in (2, 4):
	_error("Wrong number of arguments.")
	filename = argv[1]
	if not os.path.isfile(filename):
	_error("Input file does not exist (or not enough permissions).")
	output_dir = argv[3] if len(argv) == 4 and argv[2] == '-o' else os.getcwd()
	if not os.path.isdir(output_dir):
	_error("Output directory does not exist (or not enough permissions).")
	return filename, output_dir


	def _error(text):
		"""Report a fatal error together with the usage line and exit with status 1.

		Args:
			text: Human-readable description of what went wrong.

		The message goes to standard error so that it is not mixed into
		standard output when the script's output is captured or piped.
		"""
		sys.stderr.write("{}\n{}\n".format(text, USAGE_STRING))
		sys.exit(1)

	# Run the conversion only when this file is executed as a script,
	# passing the raw command-line arguments through to main().
	# The call is indented under the guard; with the indentation stripped
	# the guard had no body and the block was a syntax error.
	if __name__ == "__main__":
		main(sys.argv)