Last active
January 24, 2020 14:54
-
-
Save avdata99/fcf8f17cb90fd0f1cbd81529e06e4a2b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import os
from pathlib import Path

import requests
from slugify import slugify
# Base URL of the CKAN instance to inspect, plus the Action API endpoints
# (relative to that base) used to list datasets and to fetch one dataset.
instance_url = 'https://ckan.io'
package_list_url = 'api/3/action/package_list'
package_show_url = 'api/3/action/package_show'

# Human-readable topic names. The script slugifies these and looks for the
# resulting slugs among each dataset's tag names.
expected_topics = [
    'College Costs',
    'Student Aid',
    'Colleges/Universities',
    'Early Childhood',
    'English Language Learners',
    'Postsecondary',
    'K-12',
    'Elementary/Secondary',
    'Students',
    'Public Schools',
    'Private Schools',
    'Teachers and Administrators',
    'Safety/Bullying',
    'Special Education',
    'STEM',
    'Families',
    'Student Demographics',
    'Student Outcomes/Graduation Rates',
    'Suspension/Discipline',
    'Technical/Adult Education',
    'Assessments',
    'Data Systems',
    'School Geography',
    'International',
    'Library Programs',
    'Reports',
    'Contracts',
    'Programs',
]
# Fetch the list of dataset identifiers from the instance.
# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() turns an HTTP error into an explicit failure instead
# of a confusing KeyError on 'result' below.
results = requests.get(f'{instance_url}/{package_list_url}', timeout=30)
results.raise_for_status()
data = results.json()
datasets = data['result']

# Accumulators filled in while walking the datasets.
categorized = 0                     # datasets that belong to at least one group
non_categorized = 0                 # datasets with no groups
groups_in_use = {}                  # group name -> number of datasets in it
categorized_datasets = []           # full dataset dicts that have groups
datasets_with_expected_tags = []    # {'dataset': ..., 'tag': ...} per match
tags_found = {}                     # tag name -> number of occurrences counted
datasets_with_tags_found = set()    # dataset ids having an expected tag
# Slugified topic names, computed once instead of on every iteration.
# A set is enough because it is only used for membership tests.
expected_topics_slug = {slugify(topic) for topic in expected_topics}

# Walk every dataset: load its metadata (from a local '<id>.json' cache when
# present, otherwise from the API), then tally expected tags and groups.
for dataset in datasets:
    print(f'Analyzing {dataset}')

    cache_path = Path(f'{dataset}.json')
    if cache_path.is_file():
        with cache_path.open(encoding='utf-8') as cache_file:
            data = json.load(cache_file)
    else:
        # Let requests URL-encode the id rather than concatenating it.
        dataset_response = requests.get(
            f'{instance_url}/{package_show_url}',
            params={'id': dataset},
            timeout=30,
        )
        # Fail before caching so an error payload never poisons the cache.
        dataset_response.raise_for_status()
        data = dataset_response.json()
        cache_path.write_text(dataset_response.text, encoding='utf-8')

    full_dataset = data['result']

    # NOTE(review): the original indentation was lost; the tags_found counter
    # is assumed to count only tags that match an expected topic - confirm.
    tags = full_dataset.get('tags', [])
    for tag in tags:
        name = tag['name']
        if name not in expected_topics_slug:
            continue
        datasets_with_tags_found.add(dataset)
        print(f'Dataset {dataset} has {name} TAG')
        datasets_with_expected_tags.append({'dataset': dataset, 'tag': name})
        tags_found[name] = tags_found.get(name, 0) + 1

    # A dataset counts as "categorized" when it belongs to at least one group.
    groups = full_dataset.get('groups', [])
    if groups:
        categorized += 1
        categorized_datasets.append(full_dataset)
    else:
        non_categorized += 1

    for group in groups:
        name = group['name']
        groups_in_use[name] = groups_in_use.get(name, 0) + 1
# Summary report of the tallies gathered above.
total = categorized + non_categorized
# Guard the ratio: an empty package list would otherwise divide by zero.
perc_cat = round(categorized / total * 100, 2) if total else 0.0
# The summary line was previously printed twice in a row; once is enough.
print(f'{total} datasets. {categorized} categorized ({perc_cat} %)')

print('Groups in use')
print(groups_in_use)

print('Categorized datasets:')
for cd in categorized_datasets:
    title = cd['title']
    group_names = ', '.join(group['name'] for group in cd['groups'])
    print(f' - Dataset: {title}')
    print(f' + Groups: {group_names}')

print(f'Datasets with expected tags: {len(datasets_with_expected_tags)}')
print(datasets_with_expected_tags)

print(f'Tags found: {len(tags_found)}')
print(tags_found)

print(f'Datasets with tags found: {len(datasets_with_tags_found)}')
print(datasets_with_tags_found)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment