jennyd · January 6, 2017 18:35
diff --git a/analyse-topics.py b/analyse-topics.py
 #!/usr/bin/env python

 import csv
 import urlparse
 from collections import Counter


 '''Analyse data from govuk-delivery's database to find out which URLs exist as
 subscription topics and which query params they use. This will help us work out
 whether all Whitehall content is already tagged to enough things in the links
 hash for us to model these subscriptions in email-alert-api and still be able
 to match all relevant content to them.

 We haven't yet cleaned up topics with no subscribers or ones which have never
 sent an email - that may shrink the list of query params.
 '''


 # Export govuk-delivery's database to CSV in dev:
 # mongoexport --host localhost --db govuk_delivery --collection topics --csv --out govuk-delivery-topics.csv --fields _id,created,topic_id
 # Read the whole export up front; DictReader yields one dict per CSV row,
 # keyed by the column names in the header line.
 with open('govuk-delivery-topics.csv') as export_file:
    topics = list(csv.DictReader(export_file))

 # The _id field of each topic row is what the rest of the script treats as
 # the subscription URL.
 urls = [topic['_id'] for topic in topics]


 # URL prefixes (appended to base_url) that the per-segment report groups by.
 path_segments = [
    'announcements', 'feed', 'ministers', 'organisations', 'people',
    'policies', 'publications', 'statistics', 'topical-events',
    'topics',  # policy areas
    'world',
 ]


 def extract_query_params(url_list):
    '''Parse the query string of every URL in url_list.

    Returns a list with one entry per URL, in input order; each entry is the
    list of (key, value) pairs produced by urlparse.parse_qsl, which is empty
    when the URL has no query string.
    '''
    parsed_queries = []
    for url in url_list:
        query_string = urlparse.urlparse(url).query
        parsed_queries.append(urlparse.parse_qsl(query_string))
    return parsed_queries


 def query_param_keys(query_params_list):
    '''Flatten per-URL lists of query param pairs into one list of keys.

    A key appears once for every pair it occurs in, so duplicates are kept
    and the result can be counted directly.
    '''
    keys = []
    for params in query_params_list:
        keys.extend(pair[0] for pair in params)
    return keys


 def comment(key):
    '''Look up the clarifying note to print after a query param key.

    Some keys carry out-of-date names or otherwise unclear meanings; for
    those this returns a short ' # ...' suffix. Any other key yields the
    empty string.
    '''
    notes = (
        (('topics[]',), ' # policy areas'),
        (('departments[]',), ' # organisations'),
        (('announcement_filter_option', 'publication_filter_option'),
         ' # some kind of format'),
        (('relevant_to_local_government',), ' # boolean (1)'),
        (('official_document_status',),
         ' # act_papers_only/command_papers_only/command_and_act_papers'),
    )
    for matching_keys, note in notes:
        if key in matching_keys:
            return note
    return ''


 base_url = 'https://www.gov.uk/government/'


 # Per URL family: how many topic URLs fall under it, and which query param
 # keys those URLs use. Single-argument print(...) is used throughout because
 # it behaves the same on Python 2 and Python 3.
 for segment in path_segments:
    prefix = base_url + segment
    relevant_urls = [url for url in urls if url.startswith(prefix)]
    print('{} URLs starting with {}'.format(len(relevant_urls), prefix))
    print('Query param keys and usage counts:')
    keys = query_param_keys(extract_query_params(relevant_urls))
    counter = Counter(keys)
    # Most-used keys first, ties broken alphabetically, so repeated runs give
    # identical, diffable output instead of arbitrary dict iteration order.
    for key, count in sorted(counter.items(), key=lambda item: (-item[1], item[0])):
        print('    {}: {}{}'.format(key, count, comment(key)))
    print('')


 # URLs that fall under none of the known path segments. str.startswith accepts
 # a tuple of prefixes, so one call checks every segment at once.
 known_prefixes = tuple(base_url + segment for segment in path_segments)
 other_urls = [url for url in urls if not url.startswith(known_prefixes)]
 # Single-argument print(...) behaves the same on Python 2 and Python 3.
 print('Other URLs:')
 for url in other_urls:
    print(url)
 print('\n')


 # Collect every value used with the two *_filter_option keys across all URLs.
 # Single-argument print(...) behaves the same on Python 2 and Python 3.
 print('Values for *_filter_options:')
 filter_option_keys = ('announcement_filter_option', 'publication_filter_option')
 all_query_params = extract_query_params(urls)
 filter_option_values = [
    value
    for url_params in all_query_params
    for key, value in url_params
    if key in filter_option_keys
 ]

 # Build the set once and sort it so the listing is stable between runs.
 unique_filter_option_values = sorted(set(filter_option_values))
 print('{} total, {} unique values:'.format(len(filter_option_values), len(unique_filter_option_values)))
 for v in unique_filter_option_values:
    print('  {}'.format(v))
 print('')
	#!/usr/bin/env python

	import csv
	import urlparse
	from collections import Counter


	'''Analyse data from govuk-delivery's database to find out which URLs exist as
	subscription topics and which query params they use. This will help us work out
	whether all Whitehall content is already tagged to enough things in the links
	hash for us to model these subscriptions in email-alert-api and still be able
	to match all relevant content to them.

	We haven't yet cleaned up topics with no subscribers or ones which have never
	sent an email - that may shrink the list of query params.
	'''


	# Export govuk-delivery's database to CSV in dev:
	# mongoexport --host localhost --db govuk_delivery --collection topics --csv --out govuk-delivery-topics.csv --fields _id,created,topic_id
	# Read the whole export up front; DictReader yields one dict per CSV row,
	# keyed by the column names in the header line.
	with open('govuk-delivery-topics.csv') as f:
	    reader = csv.DictReader(f)
	    topics = list(reader)

	# The _id field of each topic row is what the rest of the script treats as
	# the subscription URL.
	urls = [row['_id'] for row in topics]


	# URL prefixes (appended to base_url) that the per-segment report groups by.
	path_segments = [
	    'announcements', 'feed', 'ministers', 'organisations', 'people',
	    'policies', 'publications', 'statistics', 'topical-events',
	    'topics',  # policy areas
	    'world',
	]


	def extract_query_params(url_list):
	    '''Parse the query string of every URL in url_list.

	    Returns a list with one entry per URL, in input order; each entry is
	    the list of (key, value) pairs produced by urlparse.parse_qsl, which is
	    empty when the URL has no query string.
	    '''
	    return [urlparse.parse_qsl(urlparse.urlparse(url).query) for url in url_list]


	def query_param_keys(query_params_list):
	    '''Flatten per-URL lists of query param pairs into one list of keys.

	    A key appears once for every pair it occurs in, so duplicates are kept
	    and the result can be counted directly.
	    '''
	    return [pair[0] for params in query_params_list for pair in params]


	def comment(key):
	    '''Return an explanatory comment for some keys.

	    They have out-of-date names or their meanings are otherwise unclear.
	    Keys that need no explanation get the empty string.
	    '''
	    if key == 'topics[]':
	        return ' # policy areas'
	    elif key == 'departments[]':
	        return ' # organisations'
	    elif key in ('announcement_filter_option', 'publication_filter_option'):
	        return ' # some kind of format'
	    elif key == 'relevant_to_local_government':
	        return ' # boolean (1)'
	    elif key == 'official_document_status':
	        return ' # act_papers_only/command_papers_only/command_and_act_papers'
	    else:
	        return ''


	base_url = 'https://www.gov.uk/government/'


	# Per URL family: how many topic URLs fall under it, and which query param
	# keys those URLs use. Single-argument print(...) is used throughout because
	# it behaves the same on Python 2 and Python 3.
	for segment in path_segments:
	    prefix = base_url + segment
	    relevant_urls = [url for url in urls if url.startswith(prefix)]
	    print('{} URLs starting with {}'.format(len(relevant_urls), prefix))
	    print('Query param keys and usage counts:')
	    keys = query_param_keys(extract_query_params(relevant_urls))
	    counter = Counter(keys)
	    # Most-used keys first, ties broken alphabetically, so repeated runs give
	    # identical, diffable output instead of arbitrary dict iteration order.
	    for key, count in sorted(counter.items(), key=lambda item: (-item[1], item[0])):
	        print('    {}: {}{}'.format(key, count, comment(key)))
	    print('')


	# URLs that fall under none of the known path segments. str.startswith accepts
	# a tuple of prefixes, so one call checks every segment at once.
	known_prefixes = tuple(base_url + segment for segment in path_segments)
	other_urls = [url for url in urls if not url.startswith(known_prefixes)]
	# Single-argument print(...) behaves the same on Python 2 and Python 3.
	print('Other URLs:')
	for url in other_urls:
	    print(url)
	print('\n')


	# Collect every value used with the two *_filter_option keys across all URLs.
	# Single-argument print(...) behaves the same on Python 2 and Python 3.
	print('Values for *_filter_options:')
	filter_option_values = []
	all_query_params = extract_query_params(urls)
	for url_params in all_query_params:
	    for key, value in url_params:
	        if key in ('announcement_filter_option', 'publication_filter_option'):
	            filter_option_values.append(value)

	# Build the set once and sort it so the listing is stable between runs.
	unique_filter_option_values = sorted(set(filter_option_values))
	print('{} total, {} unique values:'.format(len(filter_option_values), len(unique_filter_option_values)))
	for v in unique_filter_option_values:
	    print('  {}'.format(v))
	print('')