Skip to content

Instantly share code, notes, and snippets.

@greg-randall
Last active September 17, 2024 14:30
Show Gist options
  • Save greg-randall/3d7dde3a37271f72f9608cad2d613026 to your computer and use it in GitHub Desktop.
Save greg-randall/3d7dde3a37271f72f9608cad2d613026 to your computer and use it in GitHub Desktop.
Remove items that aren't from 2024 from a WordPress export.
import xml.etree.ElementTree as ET
def trim_export(source='wordpress_export.xml', destination='trimmed.xml', year=2024):
    """Write a copy of a WordPress export keeping only items from *year*.

    Every ``<item>`` directly under ``<channel>`` whose ``<pubDate>`` text does
    not contain *year* is removed. Items with no ``<pubDate>`` element, or with
    an empty one, are left in place; they count toward the total but toward
    neither the kept nor the dropped tally.

    Args:
        source: Path of the export file to read.
        destination: Path the trimmed document is written to.
        year: Year to keep; compared as text against each ``<pubDate>``.

    Returns:
        A ``(kept, skipped, total)`` tuple of item counts.

    Raises:
        ValueError: If the document root has no ``<channel>`` child.
    """
    year_text = str(year)

    # Parse in a single pass while collecting the namespace declarations.
    # Without registering them, ElementTree renames every prefix to
    # ns0/ns1/... when the tree is serialized again.
    parser = ET.iterparse(source, events=("start-ns",))
    for _event, (prefix, uri) in parser:
        try:
            ET.register_namespace(prefix, uri)
        except ValueError:
            # register_namespace rejects reserved prefixes (the "ns<digits>"
            # form); such a prefix is simply left to ElementTree's defaults.
            pass
    root = parser.root

    # Find the <channel> element
    channel = root.find('channel')
    if channel is None:
        raise ValueError(f"{source}: no <channel> element found under the root")

    total = 0
    skipped = 0
    kept = 0
    # findall() returns a list, so removing items from the channel while
    # looping over that list is safe.
    for item in channel.findall('item'):
        total += 1
        pubdate = item.find('pubDate')
        if pubdate is None or pubdate.text is None:
            # No usable date: leave the item untouched.
            continue
        if year_text in pubdate.text:
            print(pubdate.text)
            kept += 1
        else:
            channel.remove(item)
            # Dropped dates are tab-indented to set them apart in the log.
            print(f"\t{pubdate.text}")
            skipped += 1

    # Write the modified XML to the destination. UTF-8 plus an explicit
    # declaration keeps non-ASCII text readable instead of turning it into
    # numeric character references.
    ET.ElementTree(root).write(destination, encoding='utf-8', xml_declaration=True)
    print(f"\nKept {kept} items, and dropped {skipped} items out of {total} total items.")
    return kept, skipped, total


if __name__ == "__main__":
    trim_export()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment