Skip to content

Instantly share code, notes, and snippets.

@cyrillbolliger
Last active October 6, 2020 13:29
Show Gist options
  • Save cyrillbolliger/5423f9d2c3f7f03f86ca8946f17159f3 to your computer and use it in GitHub Desktop.
Save cyrillbolliger/5423f9d2c3f7f03f86ca8946f17159f3 to your computer and use it in GitHub Desktop.
Strip WordPress Block Editor Tags (Gutenberg) from WXR
### usage: python3 wp_remove_gutenberg_tags.py < wordpress-export-with-tags.xml > cleaned.xml
import fileinput
import re
# Read the whole export into memory: the CDATA sections matched further down
# span multiple lines, so the text cannot be processed line by line.
# fileinput reads the files named on the command line, or stdin if none.
with fileinput.input() as source:
    data = ''.join(source)
# Offsets into `data` used while walking the matches; m_stop marks the end of
# the most recently processed match.
m_start = 0
m_stop = 0
# A complete <content:encoded> element with its CDATA wrapper; group 1 is the
# CDATA payload. DOTALL lets the payload span multiple lines.
cdata_regex = re.compile(r'<content:encoded><!\[CDATA\[(.*?)]]></content:encoded>', re.DOTALL)
# An attribute-less paragraph block; the named group captures the text between
# <p> and </p> with surrounding whitespace trimmed.
p_regex = re.compile(r'<!-- wp:paragraph -->\s*<p>\s*(?P<paragraph>.*?)\s*</p>\s*<!-- /wp:paragraph -->', re.DOTALL)
# Any HTML comment that opens and closes on the same line (no DOTALL here).
comment_regex = re.compile(r'<!--.*?-->')
# A run of one or more consecutive newline characters.
newlines_regex = re.compile('\n+')
def clean(cdata):
    """
    Strip block editor tags and collapse runs of newlines.

    Applies the module-level patterns in order: paragraph blocks matched by
    ``p_regex`` are replaced by their captured text, everything matched by
    ``comment_regex`` is removed, and each run matched by ``newlines_regex``
    becomes a single newline.

    :param cdata: string
    :return: string
    """
    # Raw string: '\g' is not a valid escape in a normal string literal.
    stripped = p_regex.sub(r'\g<paragraph>', cdata)
    stripped = comment_regex.sub('', stripped)
    return newlines_regex.sub('\n', stripped)
# Walk every <content:encoded> element: copy the text before it verbatim, then
# emit the cleaned element. end='' keeps print from adding line breaks that are
# not in the input, so the output differs only where clean() changed something.
for match in cdata_regex.finditer(data):
    print(data[m_stop:match.start()], end='')
    # The whole element (wrapper included) goes through clean(); the wrapper
    # itself contains nothing the cleaning patterns match.
    print(clean(match.group(0)), end='')
    m_stop = match.end()
# Copy whatever follows the last match (the entire input if nothing matched).
print(data[m_stop:], end='')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment