Created
January 6, 2017 18:35
-
-
Save jennyd/806cc592350837b72459283d81ddadac to your computer and use it in GitHub Desktop.
Analyse govuk-delivery subscription URLs and their query params
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import csv | |
import urlparse | |
from collections import Counter | |
'''Analyse data from govuk-delivery's database to find out which URLs exist as | |
subscription topics and which query params they use. This will help us work out | |
whether all Whitehall content is already tagged to enough things in the links | |
hash for us to model these subscriptions in email-alert-api and still be able | |
to match all relevant content to them. | |
We haven't yet cleaned up topics with no subscribers or ones which have never | |
sent an email - that may shrink the list of query params. | |
''' | |
# Export govuk-delivery's database to CSV in dev:
# mongoexport --host localhost --db govuk_delivery --collection topics --csv --out govuk-delivery-topics.csv --fields _id,created,topic_id
with open('govuk-delivery-topics.csv') as f:
    # Each row is a dict keyed by the CSV header: _id, created, topic_id.
    topics = list(csv.DictReader(f))

# The _id column holds the subscription topic URL itself.
urls = [row['_id'] for row in topics]
# Path segments directly under /government/ that we expect subscription
# topic URLs to start with; anything else is reported separately below.
path_segments = [
    'announcements',
    'feed',
    'ministers',
    'organisations',
    'people',
    'policies',
    'publications',
    'statistics',
    'topical-events',
    'topics',  # policy areas
    'world'
]
def extract_query_params(url_list):
    '''Parse each URL's query string into a list of (key, value) pairs.

    Returns one list of pairs per input URL, in the same order; URLs with
    no query string yield an empty list.
    '''
    all_params = []
    for url in url_list:
        query = urlparse.urlparse(url).query
        all_params.append(urlparse.parse_qsl(query))
    return all_params
def query_param_keys(query_params_list):
    '''Flatten per-URL (key, value) pair lists into one flat list of keys.

    Keys are kept in order and repeated once per occurrence, so the caller
    can count usage with a Counter.
    '''
    keys = []
    for params in query_params_list:
        for key, _value in params:
            keys.append(key)
    return keys
def comment(key):
    '''Return an explanatory comment for some keys.

    They have out-of-date names or their meanings are otherwise unclear.
    Unknown keys get an empty string so the output line is unchanged.
    '''
    explanations = {
        'topics[]': ' # policy areas',
        'departments[]': ' # organisations',
        'announcement_filter_option': ' # some kind of format',
        'publication_filter_option': ' # some kind of format',
        'relevant_to_local_government': ' # boolean (1)',
        'official_document_status': ' # act_papers_only/command_papers_only/command_and_act_papers',
    }
    return explanations.get(key, '')
base_url = 'https://www.gov.uk/government/'

# For each known path segment, count the topic URLs under it and tally
# which query param keys those URLs use (with an explanatory comment
# where the key name is unclear).
for segment in path_segments:
    relevant_urls = [url for url in urls if url.startswith(base_url + segment)]
    print '{} URLs starting with {}'.format(len(relevant_urls), base_url + segment)
    print 'Query param keys and usage counts:'
    keys = query_param_keys(extract_query_params(relevant_urls))
    counter = Counter(keys)
    for key, count in counter.items():
        print '  {}: {}{}'.format(key, count, comment(key))
    print ''

# URLs that matched none of the known segments are printed in full so we
# can see what else exists in the database.
other_urls = [url for url in urls if not any(url.startswith(base_url + segment) for segment in path_segments)]
print 'Other URLs:'
for url in other_urls:
    print url
print '\n'

# Collect every value used with the two *_filter_option params across ALL
# URLs (not just the segment-matched ones) and list the unique set.
print 'Values for *_filter_options:'
filter_option_values = []
all_query_params = extract_query_params(urls)
for url_params in all_query_params:
    for key, value in url_params:
        if key in ('announcement_filter_option', 'publication_filter_option'):
            filter_option_values.append(value)
print '{} total, {} unique values:'.format(len(filter_option_values), len(set(filter_option_values)))
for v in set(filter_option_values):
    print '  {}'.format(v)
print ''
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment