#!/usr/bin/env -S pipx run
# This program walks through the URLs in the sitemap and checks to see if they
# are in the Internet Archive Wayback Machine.
#
# You can run it like:
#
#     pipx run data-usaid-gov-check.py > results.csv
#
#
# Backstory:
#
# There was some concern about the state of USAID GHSC-PSM Health Commodity
# Delivery Dataset:
#
#   https://catalog.data.gov/dataset/usaid-ghsc-psm-health-commodity-delivery-dataset
#
# which linked to data stored at:
#
#   https://data.usaid.gov/api/views/tikn-bfy4/rows.csv?accessType=DOWNLOAD
#
# and is no longer available.
#
# Surprisingly https://data.usaid.gov/ is still live, and
# https://data.usaid.gov/robots.txt has a link to a sitemap:
#
#   https://s3.amazonaws.com/sa-socrata-sitemaps-us-east-1-fedramp-prod/sitemaps/sitemap-data.usaid.gov.xml
#
# Which in turn contains a link to another sitemap:
#
#   https://s3.amazonaws.com/sa-socrata-sitemaps-us-east-1-fedramp-prod/sitemaps/sitemap-datasets-data.usaid.gov0.xml
#
# /// script
# dependencies = ["requests", "wayback"]
# ///

import csv
import sys
import xml.etree.ElementTree as etree

import requests
import wayback

# The dataset-level sitemap discovered via robots.txt (see backstory above).
SITEMAP_URL = "https://s3.amazonaws.com/sa-socrata-sitemaps-us-east-1-fedramp-prod/sitemaps/sitemap-datasets-data.usaid.gov0.xml"

wb = wayback.WaybackClient()


def main():
    """Write a CSV of every sitemap URL and its latest Wayback archive time."""
    out = csv.DictWriter(sys.stdout, fieldnames=["url", "last_archive"])
    out.writeheader()
    for url in sitemap_urls():
        out.writerow({"url": url, "last_archive": last_archive(url)})


def sitemap_urls():
    """Yield every <loc> URL found in the data.usaid.gov dataset sitemap.

    Raises requests.HTTPError when the sitemap can't be fetched, rather than
    handing an error page to the XML parser.
    """
    # timeout so a stalled S3 connection can't hang the run forever
    resp = requests.get(SITEMAP_URL, timeout=30)
    resp.raise_for_status()
    sitemap = etree.fromstring(resp.text)
    for url_el in sitemap.findall(
        ".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc"
    ):
        yield url_el.text


def last_archive(url):
    """Return the ISO-8601 timestamp of the most recent HTTP-200 Wayback
    snapshot of *url*, or None if it was never successfully archived.

    NOTE: the Wayback CDX search yields records oldest-first by default, so
    we must scan all records and keep the maximum timestamp — returning on
    the first 200 record would report the *earliest* archive, not the last.
    """
    latest = None
    for record in wb.search(url):
        if record.status_code == 200 and (latest is None or record.timestamp > latest):
            latest = record.timestamp
    return latest.isoformat() if latest is not None else None


if __name__ == "__main__":
    main()