Skip to content

Instantly share code, notes, and snippets.

@jennyd
Created January 6, 2017 18:35
Show Gist options
  • Save jennyd/806cc592350837b72459283d81ddadac to your computer and use it in GitHub Desktop.
Save jennyd/806cc592350837b72459283d81ddadac to your computer and use it in GitHub Desktop.
Analyse govuk-delivery subscription URLs and their query params
#!/usr/bin/env python
import csv
import urlparse
from collections import Counter
'''Analyse data from govuk-delivery's database to find out which URLs exist as
subscription topics and which query params they use. This will help us work out
whether all Whitehall content is already tagged to enough things in the links
hash for us to model these subscriptions in email-alert-api and still be able
to match all relevant content to them.
We haven't yet cleaned up topics with no subscribers or ones which have never
sent an email - that may shrink the list of query params.
'''
# Export govuk-delivery's database to CSV in dev:
# mongoexport --host localhost --db govuk_delivery --collection topics --csv --out govuk-delivery-topics.csv --fields _id,created,topic_id
#
# The export is expected in the current working directory. Each CSV row is
# kept as a dict keyed by the CSV header names.
with open('govuk-delivery-topics.csv') as f:
    reader = csv.DictReader(f)
    # Read every row while the file is still open.
    topics = list(reader)

# The `_id` column of each topic row is what the rest of the script treats as
# the subscription URL.
urls = [row['_id'] for row in topics]
# First path segments under https://www.gov.uk/government/ that get their own
# section in the report; URLs matching none of them are listed as "other".
path_segments = [
    'announcements',
    'feed',
    'ministers',
    'organisations',
    'people',
    'policies',
    'publications',
    'statistics',
    'topical-events',
    'topics',  # policy areas
    'world',
]
def extract_query_params(url_list):
    '''Parse the query string of every URL in url_list.

    Returns a list with one entry per URL, in the same order as url_list.
    Each entry is the list of (key, value) pairs that parse_qsl produces for
    that URL's query string, so a URL without a query string yields an empty
    list. parse_qsl is called with its defaults, which means params with a
    blank value are left out.
    '''
    parsed = []
    for url in url_list:
        query_string = urlparse.urlparse(url).query
        parsed.append(urlparse.parse_qsl(query_string))
    return parsed
def query_param_keys(query_params_list):
    '''Flatten per-URL query params into one list of keys.

    query_params_list is a list of lists of (key, value) pairs, as returned
    by extract_query_params. The result contains the first element of every
    pair, in order, with duplicates kept so the keys can be counted.
    '''
    keys = []
    for params in query_params_list:
        keys.extend(pair[0] for pair in params)
    return keys
# Explanatory notes for query param keys whose names are out of date or whose
# meanings are otherwise unclear. Each note starts with a space so it can be
# appended directly after a count in the report.
_KEY_COMMENTS = {
    'topics[]': ' # policy areas',
    'departments[]': ' # organisations',
    'announcement_filter_option': ' # some kind of format',
    'publication_filter_option': ' # some kind of format',
    'relevant_to_local_government': ' # boolean (1)',
    'official_document_status': ' # act_papers_only/command_papers_only/command_and_act_papers',
}


def comment(key):
    '''Return an explanatory comment for some keys.

    They have out-of-date names or their meanings are otherwise unclear.
    Keys without a note get an empty string, so the result can always be
    appended to a report line.
    '''
    return _KEY_COMMENTS.get(key, '')
# For each known path segment, report how many subscription URLs start with
# it and how often each query param key appears across those URLs.
base_url = 'https://www.gov.uk/government/'
for segment in path_segments:
    prefix = base_url + segment
    relevant_urls = [url for url in urls if url.startswith(prefix)]
    print('{} URLs starting with {}'.format(len(relevant_urls), prefix))
    print('Query param keys and usage counts:')
    keys = query_param_keys(extract_query_params(relevant_urls))
    counter = Counter(keys)
    # Counter iteration order is arbitrary; sort by key so the report is
    # stable and easy to compare between runs.
    for key, count in sorted(counter.items()):
        print(' {}: {}{}'.format(key, count, comment(key)))
    # Blank line between one segment's section and the next.
    print('')
# List every URL that does not start with any of the known path segments.
# str.startswith accepts a tuple of prefixes, so each URL needs one call.
known_prefixes = tuple(base_url + segment for segment in path_segments)
other_urls = [url for url in urls if not url.startswith(known_prefixes)]
print('Other URLs:')
for url in other_urls:
    print(url)
print('\n')
# Collect every value used for the two *_filter_option params across all
# URLs, then report the total number of uses and the distinct values.
print('Values for *_filter_options:')
filter_option_values = []
all_query_params = extract_query_params(urls)
for url_params in all_query_params:
    for key, value in url_params:
        if key in ('announcement_filter_option', 'publication_filter_option'):
            filter_option_values.append(value)
# Build the set of distinct values once and sort it: set iteration order is
# arbitrary, which would make the report differ between runs.
unique_filter_option_values = sorted(set(filter_option_values))
print('{} total, {} unique values:'.format(len(filter_option_values), len(unique_filter_option_values)))
for v in unique_filter_option_values:
    print(' {}'.format(v))
print('')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment