Last active
September 17, 2024 14:30
-
-
Save greg-randall/3d7dde3a37271f72f9608cad2d613026 to your computer and use it in GitHub Desktop.
Remove items that aren't from 2024 from a WordPress export.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import xml.etree.ElementTree as ET


def _parse_preserving_prefixes(source):
    """Parse *source* and register its namespace prefixes with ElementTree.

    ElementTree forgets the prefixes used in the input and would serialize
    namespaced tags as ``ns0:...``, ``ns1:...``.  Registering every prefix
    declared in the document makes the output reuse the original prefixes
    (for example ``wp:`` and ``content:`` in a WordPress export).

    Returns:
        An ``ElementTree`` wrapping the parsed document root.
    """
    events = ET.iterparse(source, events=("start-ns",))
    for _event, (prefix, uri) in events:
        if not prefix:
            # A default namespace has no prefix to preserve.
            continue
        try:
            ET.register_namespace(prefix, uri)
        except ValueError:
            # Prefixes of the form "ns<digits>" are reserved by ElementTree;
            # such a namespace simply gets an auto-generated prefix on output.
            pass
    # The iterator exposes the document root once it has been fully consumed.
    return ET.ElementTree(events.root)


def trim_export(source="wordpress_export.xml", destination="trimmed.xml",
                year=2024):
    """Copy an export file, dropping every item not published in *year*.

    An ``<item>`` is dropped when the text of its ``<pubDate>`` child does not
    contain the year.  Items with a missing or empty ``<pubDate>`` cannot be
    dated, so they are left in place and counted as kept.  The publication
    date of each dated item is printed: tab-indented when the item is
    dropped, unindented when it is kept.

    Args:
        source: Path of the export file to read.
        destination: Path the trimmed document is written to.
        year: Year whose items are retained.

    Returns:
        A ``(kept, skipped, total)`` tuple of item counts, where
        ``kept + skipped == total``.

    Raises:
        ValueError: If the document root has no ``<channel>`` child.
    """
    tree = _parse_preserving_prefixes(source)
    channel = tree.getroot().find("channel")
    if channel is None:
        raise ValueError(f"{source!r} has no <channel> element")

    wanted = str(year)
    kept = skipped = total = 0

    # findall() returns a list, so removing items from the channel while
    # looping over the result is safe.
    for item in channel.findall("item"):
        total += 1
        stamp = item.findtext("pubDate")
        if not stamp:
            # Missing or empty pubDate: keep the item untouched.
            kept += 1
            continue
        if wanted in stamp:
            print(stamp)
            kept += 1
        else:
            channel.remove(item)
            print(f"\t{stamp}")
            skipped += 1

    # Write UTF-8 with an explicit declaration instead of the ASCII default.
    tree.write(destination, encoding="utf-8", xml_declaration=True)
    return kept, skipped, total


def main():
    """Trim ``wordpress_export.xml`` into ``trimmed.xml`` and print a summary."""
    kept, skipped, total = trim_export()
    print(f"\nKept {kept} items, and dropped {skipped} items out of {total} total items.")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment