Skip to content

Instantly share code, notes, and snippets.

@amercader
Created May 1, 2020 09:19
Show Gist options
  • Save amercader/1da5e8d3bacddf19f90b88f207adb202 to your computer and use it in GitHub Desktop.
Save amercader/1da5e8d3bacddf19f90b88f207adb202 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
'''
Helper script to export datasets metadata from a CKAN instance using the ckanapi library
Usage:
./export_ckan_metadata.py <CKAN URL> <API key>
For example:
./export_ckan_metadata.py https://ridl.unhcr.org XXXXX-YYY..
'''
import argparse
import ckanapi
# Number of datasets requested per `package_search` call (passed as `rows`)
PAGE_SIZE = 50
def export(url, api_key):
    '''
    Page through the datasets of a CKAN instance and print one line for each.

    Datasets are fetched with the `package_search` action, PAGE_SIZE at a
    time, until a request returns no results. For every dataset a line with
    a running counter, the dataset id and its title is printed.

    :param url: address of the CKAN site, passed to `ckanapi.RemoteCKAN`
    :param api_key: API key passed to `ckanapi.RemoteCKAN`
    '''
    ckan = ckanapi.RemoteCKAN(url, api_key)

    # Sorting options include:
    # Date created: metadata_created desc
    # Date modified: metadata_modified desc
    # Title: title asc
    sort = 'metadata_modified desc'

    # Filters, some examples (can be combined in the same list):
    # ['dataset_type:deposited-dataset']   # Custom dataset types
    # ['organization:afghanistan']         # Single org (data container)
    # ['visibility:public']                # Downloadable files
    # ['data_collection_technique:tri']    # Custom fields
    fq_list = []

    start = 0  # Offset of the first dataset of the next page
    cnt = 0    # Running number of datasets processed so far

    while True:
        result = ckan.action.package_search(
            q='*:*',
            fq_list=fq_list,
            rows=PAGE_SIZE,
            start=start,
            sort=sort
        )
        datasets = result['results']
        if not datasets:
            # An empty page means there is nothing left to fetch
            break

        for dataset in datasets:
            cnt += 1

            # At this point `dataset` is an object that contains all the
            # dataset metadata, including resources. Here's an example:
            # https://ridl-uat.unhcr.org/api/action/package_show?id=test-deployment-september
            # This can be exported in different formats depending on the needs, eg:

            # 1. Output single fields:
            print('%s. %s - %s' % (cnt, dataset['id'], dataset['title']))

            # 2. Save as JSON
            # import json
            # with open('{}.json'.format(dataset['name']), 'w') as f:
            #     json.dump(dataset, f)

            # 3. Export all as CSV
            # This is easy using the `csv` module. The only thing that needs to be taken
            # into account if resources need to be included is that there are multiple
            # resources per dataset so they need to be stored on different rows, eg:
            # id    | name      | title     | ... | resource_id | resource_name | ... |
            # ------|-----------|-----------|-----|-------------|---------------|-----|
            # ds1   | dataset-1 | Dataset 1 | ... | res1        | Resource 1    | ... |
            # ds1   | dataset-1 | Dataset 1 | ... | res2        | Resource 2    | ... |
            # ds1   | dataset-1 | Dataset 1 | ... | res3        | Resource 3    | ... |
            # or
            # id    | name      | title     | ... | resource_id | resource_name | ... |
            # ------|-----------|-----------|-----|-------------|---------------|-----|
            # ds1   | dataset-1 | Dataset 1 | ... | res1        | Resource 1    | ... |
            #       |           |           |     | res2        | Resource 2    | ... |
            #       |           |           |     | res3        | Resource 3    | ... |

        # Advance by the number of datasets actually returned rather than by
        # PAGE_SIZE, so nothing is skipped if a page comes back shorter than
        # requested.
        start += len(datasets)
if __name__ == '__main__':
    # Command line entry point: both arguments are positional and required
    parser = argparse.ArgumentParser(description='Export all datasets from a CKAN instance')
    # This script only reads from the site, so the help text says "export"
    parser.add_argument('url', help='CKAN site to export datasets from')
    parser.add_argument('api_key', help='API key on that site')
    args = parser.parse_args()

    export(args.url, args.api_key)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment