|
import json |
|
from urllib.parse import urljoin, urlparse |
|
|
|
import scrapy |
|
|
|
|
|
def is_valid(url):
    """Return True when *url* looks like a fetchable resource location.

    A URL is accepted only if all of the following hold:

    * it is a ``str`` that ``urlparse`` can parse without error;
    * it has both a scheme and a network location;
    * its scheme is one of ``file``, ``http``, ``https`` or ``s3``;
    * it is not a path-less URL whose network location contains a
      backslash.

    Args:
        url: The candidate URL, normally a resource ``url`` field.

    Returns:
        bool: ``True`` if the URL passes every check, ``False`` otherwise.
        The function never raises for malformed input.
    """
    # Non-string values (None, bytes, numbers, ...) can never be valid; the
    # explicit check also avoids attribute errors inside urlparse.
    if not isinstance(url, str):
        return False

    try:
        meta = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed netlocs outright, for example an
        # unbalanced IPv6 bracket such as "http://[::1".
        return False

    if not meta.scheme or not meta.netloc:
        return False

    # Reject path-less URLs whose network location contains a backslash.
    if not meta.path and "\\" in meta.netloc:
        return False

    return meta.scheme in ("file", "http", "https", "s3")
|
|
|
|
|
class CKANResourceStatusSpider(scrapy.Spider):
    """Report the reachability of every resource URL listed by a CKAN site.

    The crawl has three stages: request the package list from the CKAN
    action API, request the details of each package, then send a ``HEAD``
    request to every resource URL found. One item is yielded per resource
    with the keys ``headers``, ``http_status_code``, ``package_id``,
    ``package_url``, ``resource`` and ``status``.
    """

    name = "ckan-resource-status"

    def __init__(self, base_url, *args, **kwargs):
        """Create the spider.

        Args:
            base_url: Root URL of the CKAN site whose API is queried.
            *args: Forwarded to ``scrapy.Spider.__init__``.
            **kwargs: Forwarded to ``scrapy.Spider.__init__`` (for example
                additional ``-a`` spider arguments).
        """
        # Let the base class run its own initialisation; without this call
        # extra spider arguments would raise TypeError.
        super().__init__(*args, **kwargs)
        self.base_url = base_url

    def make_request(self, *args, **kwargs):
        """Build a ``scrapy.Request`` with this spider's defaults applied.

        Unless the caller says otherwise, ``dont_filter`` is set to True and
        ``meta["handle_httpstatus_all"]`` is set to True so that responses
        with any HTTP status reach the callback.

        Args:
            *args: Positional arguments for ``scrapy.Request``.
            **kwargs: Keyword arguments for ``scrapy.Request``.

        Returns:
            scrapy.Request: The configured request.
        """
        kwargs.setdefault("dont_filter", True)

        # Work on a copy so the caller's dict is never mutated, and accept
        # an explicit ``meta=None``.
        meta = dict(kwargs.get("meta") or {})
        meta.setdefault("handle_httpstatus_all", True)
        kwargs["meta"] = meta

        return scrapy.Request(*args, **kwargs)

    def start_requests(self):
        """Yield the initial request for the CKAN package list."""
        yield self.make_request(
            url=urljoin(self.base_url, "/api/3/action/package_list"),
            callback=self.parse_package_list,
        )

    def parse_package_list(self, response):
        """Yield one ``package_show`` request per package id in the listing.

        Args:
            response: Response whose JSON body holds the package ids under
                the ``result`` key.
        """
        result = json.loads(response.body)["result"]
        for package_id in result:
            package_url = urljoin(
                self.base_url, "/api/3/action/package_show?id=" + package_id
            )
            yield self.make_request(
                url=package_url,
                callback=self.parse_package_detail,
                meta={"package_id": package_id, "package_url": package_url},
            )

    def parse_package_detail(self, response):
        """Yield a ``HEAD`` request, or an "invalid-url" item, per resource.

        Args:
            response: Response whose JSON body holds the package under the
                ``result`` key, including its ``resources`` list.
        """
        meta = response.request.meta
        result = json.loads(response.body)["result"]
        for resource in result["resources"]:
            url = resource["url"]
            if not is_valid(url):
                # Nothing can be fetched, so report the resource directly.
                yield {
                    "headers": None,
                    "http_status_code": None,
                    "package_id": meta["package_id"],
                    "package_url": meta["package_url"],
                    "resource": resource,
                    "status": "invalid-url",
                }
            else:
                yield self.make_request(
                    url=url,
                    method="HEAD",
                    callback=self.parse_resource_head,
                    errback=self.parse_resource_error,
                    meta={
                        "package_id": meta["package_id"],
                        "package_url": meta["package_url"],
                        "resource": resource,
                    },
                )

    def parse_resource_error(self, failure):
        """Yield an item describing a resource request that failed.

        Args:
            failure: The failure passed to the errback; its ``request``
                carries the resource metadata and its ``value`` is the
                underlying exception.
        """
        request = failure.request
        meta = request.meta
        yield {
            "headers": None,
            "http_status_code": None,
            "package_id": meta["package_id"],
            "package_url": meta["package_url"],
            "resource": meta["resource"],
            # NOTE(review): this is the raw exception object, not a string;
            # confirm that the configured exporter can serialise it.
            "status": failure.value,
        }

    def parse_resource_head(self, response):
        """Yield an item with the status code and headers of a ``HEAD`` reply.

        Header names and values are decoded to text; multiple values for
        the same header are joined with ``|``.

        Args:
            response: The response to the resource ``HEAD`` request.
        """
        meta = response.request.meta
        # Latin-1 maps every byte to a character, so a header containing
        # non-ASCII bytes cannot abort the callback; for pure-ASCII headers
        # the result is identical to an ASCII decode.
        headers = {
            key.decode("latin-1"): "|".join(
                item.decode("latin-1") for item in value
            )
            for key, value in response.headers.items()
        }

        yield {
            "headers": headers,
            "http_status_code": response.status,
            "package_id": meta["package_id"],
            "package_url": meta["package_url"],
            "resource": meta["resource"],
            "status": "head_response",
        }