Created
May 1, 2020 09:19
-
-
Save amercader/1da5e8d3bacddf19f90b88f207adb202 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
'''
Helper script to export datasets metadata from a CKAN instance using the
ckanapi library.

Usage:

    ./export_ckan_metadata.py <CKAN URL> <API key>

For example:

    ./export_ckan_metadata.py https://ridl.unhcr.org XXXXX-YYY..
'''
import argparse

import ckanapi

# Number of datasets requested per `package_search` call. Results are paged
# with this size so sites with many datasets don't require one huge request.
PAGE_SIZE = 50
def export(url, api_key):
    '''Print the id and title of every dataset on a CKAN site.

    Iterates over all datasets via the ``package_search`` API action,
    ``PAGE_SIZE`` datasets at a time, until an empty page is returned.

    :param url: base URL of the CKAN instance, eg https://ridl.unhcr.org
    :param api_key: API key of a user on that site (private datasets are
        only returned for users authorized to see them)
    '''
    ckan = ckanapi.RemoteCKAN(url, api_key)

    page = 1
    cnt = 0

    # Sorting options include:
    # Date created: metadata_created desc
    # Date modified: metadata_modified desc
    # Title: title asc
    sort = 'metadata_modified desc'

    # Filters, some examples (can be combined in the same list):
    # ['dataset_type:deposited-dataset'] # Custom dataset types
    # ['organization:afghanistan'] # Single org (data container)
    # ['visibility:public'] # Downloadable files
    # ['data_collection_technique:tri'] # Custom fields
    fq_list = []

    while True:
        result = ckan.action.package_search(
            q='*:*',
            fq_list=fq_list,
            rows=PAGE_SIZE,
            start=PAGE_SIZE * (page - 1),
            sort=sort
        )
        datasets = result['results']
        # An empty page means every dataset has been seen: stop paging
        if not datasets:
            break
        for dataset in datasets:
            cnt += 1
            # At this point `dataset` is an object that contains all the
            # dataset metadata, including resources. Here's an example:
            # https://ridl-uat.unhcr.org/api/action/package_show?id=test-deployment-september
            # This can be exported in different formats depending on the needs, eg:

            # 1. Output single fields:
            print('%s. %s - %s' % (cnt, dataset['id'], dataset['title']))

            # 2. Save as JSON
            # import json
            # with open('{}.json'.format(dataset['name']), 'w') as f:
            #     json.dump(dataset, f)

            # 3. Export all as CSV
            # This is easy using the `csv` module. The only thing that needs to be taken
            # into account if resources need to be included is that there are multiple
            # resources per dataset so they need to be stored on different rows, eg:
            # id    | name      | title     | ... | resource_id | resource_name | ... |
            # ------|-----------|-----------|-----|-------------|---------------|-----|
            # ds1   | dataset-1 | Dataset 1 | ... | res1        | Resource 1    | ... |
            # ds1   | dataset-1 | Dataset 1 | ... | res2        | Resource 2    | ... |
            # ds1   | dataset-1 | Dataset 1 | ... | res3        | Resource 3    | ... |
            # or
            # id    | name      | title     | ... | resource_id | resource_name | ... |
            # ------|-----------|-----------|-----|-------------|---------------|-----|
            # ds1   | dataset-1 | Dataset 1 | ... | res1        | Resource 1    | ... |
            #       |           |           |     | res2        | Resource 2    | ... |
            #       |           |           |     | res3        | Resource 3    | ... |
        page += 1
if __name__ == '__main__':
    # Command-line entry point: both positional arguments are required,
    # in this order.
    arg_parser = argparse.ArgumentParser(
        description='Export all datasets from a CKAN instance')
    arg_parser.add_argument('url', help='CKAN site to update')
    arg_parser.add_argument('api_key', help='API key on that site')
    parsed = arg_parser.parse_args()

    export(parsed.url, parsed.api_key)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment