-
-
Save ruslanosipov/b748a138389db2cda1e8 to your computer and use it in GitHub Desktop.
| #!/usr/bin/env python | |
| """This script converts WXR file to a number of plain text files. | |
| WXR stands for "WordPress eXtended RSS", which basically is just a | |
| regular XML file. This script extracts entries from the WXR file into | |
| plain text files. Output format: article name prefixed by date for | |
| posts, article name for pages. | |
| Usage: wxr2txt.py filename [-o output_dir] | |
| """ | |
| import os | |
| import re | |
| import sys | |
| from xml.etree import ElementTree | |
| NAMESPACES = { | |
| 'content': 'http://purl.org/rss/1.0/modules/content/', | |
| 'wp': 'http://wordpress.org/export/1.2/', | |
| } | |
| USAGE_STRING = "Usage: wxr2txt.py filename [-o output_dir]" | |
| def main(argv): | |
| filename, output_dir = _parse_and_validate_output(argv) | |
| try: | |
| data = ElementTree.parse(filename).getroot() | |
| except ElementTree.ParseError: | |
| _error("Invalid input file format. Can not parse the input.") | |
| page_counter, post_counter = 0, 0 | |
| for post in data.find('channel').findall('item'): | |
| post_type = post.find('wp:post_type', namespaces=NAMESPACES).text | |
| if post_type not in ('post', 'page'): | |
| continue | |
| content = post.find('content:encoded', namespaces=NAMESPACES).text | |
| date = post.find('wp:post_date', namespaces=NAMESPACES).text | |
| title = post.find('title').text | |
| date = date.split(' ')[0].replace('-', '') | |
| title = re.sub(r'[_]+', '_', re.sub(r'[^a-z0-9+]', '_', title.lower())) | |
| if post_type == 'post': | |
| post_filename = date + '_' + title + '.txt' | |
| post_counter += 1 | |
| else: | |
| post_filename = title + '.txt' | |
| page_counter += 1 | |
| with open(os.path.join(output_dir, post_filename), 'w') as post_file: | |
| post_file.write(content.encode('utf8')) | |
| post_counter += 1 | |
| print "Saved {} posts and {} pages in directory '{}'.".format( | |
| post_counter, page_counter, output_dir) | |
| def _parse_and_validate_output(argv): | |
| if len(argv) not in (2, 4): | |
| _error("Wrong number of arguments.") | |
| filename = argv[1] | |
| if not os.path.isfile(filename): | |
| _error("Input file does not exist (or not enough permissions).") | |
| output_dir = argv[3] if len(argv) == 4 and argv[2] == '-o' else os.getcwd() | |
| if not os.path.isdir(output_dir): | |
| _error("Output directory does not exist (or not enough permissions).") | |
| return filename, output_dir | |
| def _error(text): | |
| print text | |
| print USAGE_STRING | |
| sys.exit(1) | |
| if __name__ == "__main__": | |
| main(sys.argv) |
Why? What does that mean and what does it do?
Why? What does that mean and what does it do?
I don't really know the details of "why is this actually a problem" but needed to get this script working and came across this post:
https://www.sharooq.com/solved-typeerror-write-argument-must-be-str-not-bytes-in-python
Excellent. Thanks.
Can verify the change suggested by @gkv-ckultzow has worked in Python 3.12, to replace 'w' with 'wb' to read:
with open(os.path.join(output_dir, post_filename), 'wb') as post_file:
Also for Python 3.12, put () around the content in each print statement as mentioned by @werowe. For example:
print("Saved {} posts and {} pages in directory '{}'.".format(
post_counter, page_counter, output_dir))
No changes to any output text files created after executing the program, all 60 posts were captured in plain text in individual text files. Thanks.
for this line:
with open(os.path.join(output_dir, post_filename), 'w') as post_file:
replace 'w' with 'wb'