Created
May 1, 2020 09:19
-
-
Save amercader/1da5e8d3bacddf19f90b88f207adb202 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
'''
Helper script to export datasets metadata from a CKAN instance using the
ckanapi library.

Usage:

    ./export_ckan_metadata.py <CKAN URL> <API key>

For example:

    ./export_ckan_metadata.py https://ridl.unhcr.org XXXXX-YYY..
'''
import argparse

import ckanapi

# Number of datasets requested per `package_search` call. Results are paged
# with this size so sites with many datasets don't require one huge request.
PAGE_SIZE = 50
def export(url, api_key):
    '''Print the id and title of every dataset on a CKAN site.

    Iterates over all datasets via the ``package_search`` API action,
    ``PAGE_SIZE`` datasets at a time, until an empty page is returned.

    :param url: base URL of the CKAN instance, eg https://ridl.unhcr.org
    :param api_key: API key of a user on that site (private datasets are
        only returned for users authorized to see them)
    '''
    ckan = ckanapi.RemoteCKAN(url, api_key)

    page = 1
    cnt = 0

    # Sorting options include:
    # Date created: metadata_created desc
    # Date modified: metadata_modified desc
    # Title: title asc
    sort = 'metadata_modified desc'

    # Filters, some examples (can be combined in the same list):
    # ['dataset_type:deposited-dataset'] # Custom dataset types
    # ['organization:afghanistan'] # Single org (data container)
    # ['visibility:public'] # Downloadable files
    # ['data_collection_technique:tri'] # Custom fields
    fq_list = []

    while True:
        result = ckan.action.package_search(
            q='*:*',
            fq_list=fq_list,
            rows=PAGE_SIZE,
            start=PAGE_SIZE * (page - 1),
            sort=sort
        )
        datasets = result['results']
        # An empty page means every dataset has been seen: stop paging
        if not datasets:
            break
        for dataset in datasets:
            cnt += 1
            # At this point `dataset` is an object that contains all the
            # dataset metadata, including resources. Here's an example:
            # https://ridl-uat.unhcr.org/api/action/package_show?id=test-deployment-september
            # This can be exported in different formats depending on the needs, eg:

            # 1. Output single fields:
            print('%s. %s - %s' % (cnt, dataset['id'], dataset['title']))

            # 2. Save as JSON
            # import json
            # with open('{}.json'.format(dataset['name']), 'w') as f:
            #     json.dump(dataset, f)

            # 3. Export all as CSV
            # This is easy using the `csv` module. The only thing that needs to be taken
            # into account if resources need to be included is that there are multiple
            # resources per dataset so they need to be stored on different rows, eg:
            # id    | name      | title     | ... | resource_id | resource_name | ... |
            # ------|-----------|-----------|-----|-------------|---------------|-----|
            # ds1   | dataset-1 | Dataset 1 | ... | res1        | Resource 1    | ... |
            # ds1   | dataset-1 | Dataset 1 | ... | res2        | Resource 2    | ... |
            # ds1   | dataset-1 | Dataset 1 | ... | res3        | Resource 3    | ... |
            # or
            # id    | name      | title     | ... | resource_id | resource_name | ... |
            # ------|-----------|-----------|-----|-------------|---------------|-----|
            # ds1   | dataset-1 | Dataset 1 | ... | res1        | Resource 1    | ... |
            #       |           |           |     | res2        | Resource 2    | ... |
            #       |           |           |     | res3        | Resource 3    | ... |
        page += 1
if __name__ == '__main__':
    # Command-line entry point: both positional arguments are required,
    # in this order.
    arg_parser = argparse.ArgumentParser(
        description='Export all datasets from a CKAN instance')
    arg_parser.add_argument('url', help='CKAN site to update')
    arg_parser.add_argument('api_key', help='API key on that site')
    parsed = arg_parser.parse_args()

    export(parsed.url, parsed.api_key)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment