Skip to content

Instantly share code, notes, and snippets.

@turicas
Created November 1, 2019 20:52
Show Gist options
  • Save turicas/786121adcf6b36db16ab23f90889a38e to your computer and use it in GitHub Desktop.
Save turicas/786121adcf6b36db16ab23f90889a38e to your computer and use it in GitHub Desktop.

Check Errors on CKAN resources

This script connects to a CKAN API and, for all available datasets, checks the URLs of all resources to identify the ones that are not available.

Installing

pip install scrapy

Running

Example: check http://dados.gov.br/ and save results to dadosgovbr.csv:

time scrapy runspider \
    -a base_url=http://dados.gov.br/ \
    -s HTTPCACHE_ENABLED=true \
    --loglevel=INFO \
    ckan_resource_status.py \
    -o dadosgovbr.csv
import json
from urllib.parse import urljoin, urlparse
import scrapy
def is_valid(url):
    """Return True if *url* looks like a fetchable resource URL.

    A URL is accepted only when its scheme is one of file/http/https/s3,
    it has a network location, and it does not look like a Windows-style
    path pasted into the netloc (backslashes with an empty path).
    """
    parts = urlparse(url)
    if parts.scheme not in ("file", "http", "https", "s3"):
        return False
    if not parts.netloc:
        return False
    # Reject things like "http://C:\Users\..." where a local Windows path
    # ended up parsed entirely into the netloc.
    if not parts.path and "\\" in parts.netloc:
        return False
    return True
class CKANResourceStatusSpider(scrapy.Spider):
    """Crawl a CKAN instance and report the availability of every resource.

    Walks the CKAN Action API: `package_list` -> `package_show` per package,
    then issues a HEAD request to each resource URL. Yields one item per
    resource with the HTTP status (or an error/invalid-url marker).

    Run with: scrapy runspider -a base_url=<ckan-url> ckan_resource_status.py
    """

    name = "ckan-resource-status"

    def __init__(self, base_url, *args, **kwargs):
        # Fix: forward to scrapy.Spider.__init__ so scrapy's standard spider
        # initialization (e.g. attribute setup from -a arguments) still runs.
        # Accepting *args/**kwargs keeps the signature backward-compatible.
        super().__init__(*args, **kwargs)
        self.base_url = base_url

    def make_request(self, *args, **kwargs):
        """Build a scrapy.Request with this spider's defaults applied.

        - dont_filter=True: the same URL may legitimately appear in several
          packages; do not let the dedup filter drop it.
        - handle_httpstatus_all=True: deliver 4xx/5xx responses to the
          callback instead of discarding them — error statuses are exactly
          what this spider reports on.
        Caller-supplied values for either key take precedence.
        """
        kwargs["dont_filter"] = kwargs.get("dont_filter", True)
        meta = kwargs.get("meta", {})
        meta["handle_httpstatus_all"] = meta.get("handle_httpstatus_all", True)
        kwargs["meta"] = meta
        return scrapy.Request(*args, **kwargs)

    def start_requests(self):
        """Kick off the crawl by listing every package id on the instance."""
        yield self.make_request(
            url=urljoin(self.base_url, "/api/3/action/package_list"),
            callback=self.parse_package_list,
        )

    def parse_package_list(self, response):
        """Fan out one package_show request per package id."""
        result = json.loads(response.body)["result"]
        for package_id in result:
            package_url = urljoin(
                self.base_url, "/api/3/action/package_show?id=" + package_id
            )
            yield self.make_request(
                url=package_url,
                callback=self.parse_package_detail,
                # Carry identifiers through so every output item can be
                # traced back to its package.
                meta={"package_id": package_id, "package_url": package_url},
            )

    def parse_package_detail(self, response):
        """Check each resource of a package: HEAD valid URLs, flag bad ones."""
        meta = response.request.meta
        result = json.loads(response.body)["result"]
        for resource in result["resources"]:
            url = resource["url"]
            if not is_valid(url):
                # Unfetchable URL (bad scheme, missing host, local path...):
                # report it directly without issuing a request.
                yield {
                    "headers": None,
                    "http_status_code": None,
                    "package_id": meta["package_id"],
                    "package_url": meta["package_url"],
                    "resource": resource,
                    "status": "invalid-url",
                }
            else:
                yield self.make_request(
                    url=url,
                    method="HEAD",
                    callback=self.parse_resource_head,
                    errback=self.parse_resource_error,
                    meta={
                        "package_id": meta["package_id"],
                        "package_url": meta["package_url"],
                        "resource": resource,
                    },
                )

    def parse_resource_error(self, failure):
        """Report a resource whose HEAD request failed at the network level
        (DNS error, timeout, connection refused...); the Twisted failure
        value is recorded as the status."""
        meta = failure.request.meta
        yield {
            "headers": None,
            "http_status_code": None,
            "package_id": meta["package_id"],
            "package_url": meta["package_url"],
            "resource": meta["resource"],
            "status": failure.value,
        }

    def parse_resource_head(self, response):
        """Report a resource that answered the HEAD request, with its HTTP
        status code and headers (header values joined with "|")."""
        meta = response.request.meta
        headers = {
            key.decode("ascii"): "|".join(item.decode("ascii") for item in value)
            for key, value in response.headers.items()
        }
        yield {
            "headers": headers,
            "http_status_code": response.status,
            "package_id": meta["package_id"],
            "package_url": meta["package_url"],
            "resource": meta["resource"],
            "status": "head_response",
        }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment