Skip to content

Instantly share code, notes, and snippets.

@brucebolt
Last active March 28, 2019 15:46
Show Gist options
  • Save brucebolt/ec211868299b8efbdde7877e9a2d3c10 to your computer and use it in GitHub Desktop.
Save brucebolt/ec211868299b8efbdde7877e9a2d3c10 to your computer and use it in GitHub Desktop.
Compare top 5000 GOV.UK search results between ES2 and ES5
import requests
import sys
import urllib
print "Keyword\tPosition\tMatch\tES2 Result (rummager)\tES5 Result (search-api)"
counter = 1
f = open('top_5000_search_terms.csv', 'r')
for keyword in f:
sys.stderr.write("{}\n".format(str(counter)))
keyword = keyword.strip()
query = urllib.urlencode({'q': keyword})
try:
resp_es2 = requests.get("https://www.gov.uk/api/search.json?{}".format(query))
data_es2 = resp_es2.json()
resp_es5 = requests.get("http://localhost:3233/api/search.json?{}".format(query))
data_es5 = resp_es5.json()
except:
continue
max_results = max(len(data_es2['results']), len(data_es5['results']))
num_results = min(10, max_results)
if num_results > 0:
for i in range(0, num_results):
es2_value = data_es2['results'][i]['link'] if i < len(data_es2['results']) else '(no result at this position)'
es5_value = data_es5['results'][i]['link'] if i < len(data_es5['results']) else '(no result at this position)'
match = 'MATCH' if es2_value == es5_value else ''
print keyword,"\t", i+1, "\t", match, "\t", es2_value, "\t", es5_value
counter += 1
import csv
import re
def load_results(lines):
    """Parse a tab-separated comparison report into per-keyword result maps.

    ``lines`` is any iterable of report lines with the header line first (an
    open file works).  Columns are read as: 0 keyword, 1 position, 3 ES2
    link, 4 ES5 link.

    Returns ``(results_es2, results_es5)``.  Each maps keyword ->
    {position: link}; positions stay the strings read from the file.
    """
    results_es2 = {}
    results_es5 = {}
    # Quote handling is switched off: with the default dialect a keyword that
    # contains a double quote is mis-parsed and can swallow the rows after it.
    # NOTE(review): presumably the report is written with plain print and is
    # never quoted or escaped -- confirm against the producing script.
    rows = csv.reader(lines, delimiter='\t', quoting=csv.QUOTE_NONE)
    next(rows, None)  # skip the header row
    for row in rows:
        if len(row) < 5:
            # Blank or truncated line: nothing usable, and indexing it
            # would raise IndexError.
            continue
        # Cells are compared exactly later on, so drop trailing whitespace.
        cells = [cell.rstrip() for cell in row]
        keyword, position = cells[0], cells[1]
        results_es2.setdefault(keyword, {})[position] = cells[3]
        results_es5.setdefault(keyword, {})[position] = cells[4]
    return results_es2, results_es5


def top_result_report(results_es2, results_es5):
    """Yield one line per keyword: keyword, ES2 top link, TRUE/FALSE.

    The flag is TRUE when the ES2 link at position '1' appears at any
    position in the ES5 results for the same keyword.

    Raises KeyError if a keyword has no position '1' entry.
    """
    for key in results_es2:
        top_link = results_es2[key]['1']
        found = "TRUE" if top_link in results_es5[key].values() else "FALSE"
        yield "{}\t{}\t{}".format(key, top_link, found)


if __name__ == "__main__":
    # 'rb' is the mode the Python 2 csv module expects.
    with open('2019-03-25.txt', 'rb') as f:
        results_es2, results_es5 = load_results(f)
    for line in top_result_report(results_es2, results_es5):
        # Single-argument print(...) also parses as a Python 2 print statement.
        print(line)
import csv
import re
def load_results(lines):
    """Parse a tab-separated comparison report into per-keyword result maps.

    ``lines`` is any iterable of report lines with the header line first (an
    open file works).  Columns are read as: 0 keyword, 1 position, 3 ES2
    link, 4 ES5 link.

    Returns ``(results_es2, results_es5)``.  Each maps keyword ->
    {position: link}; positions stay the strings read from the file.
    """
    results_es2 = {}
    results_es5 = {}
    # Quote handling is switched off: with the default dialect a keyword that
    # contains a double quote is mis-parsed and can swallow the rows after it.
    # NOTE(review): presumably the report is written with plain print and is
    # never quoted or escaped -- confirm against the producing script.
    rows = csv.reader(lines, delimiter='\t', quoting=csv.QUOTE_NONE)
    next(rows, None)  # skip the header row
    for row in rows:
        if len(row) < 5:
            # Blank or truncated line: nothing usable, and indexing it
            # would raise IndexError.
            continue
        # Cells are compared exactly later on, so drop trailing whitespace.
        cells = [cell.rstrip() for cell in row]
        keyword, position = cells[0], cells[1]
        results_es2.setdefault(keyword, {})[position] = cells[3]
        results_es5.setdefault(keyword, {})[position] = cells[4]
    return results_es2, results_es5


def all_results_report(results_es2, results_es5):
    """Yield one line per keyword: keyword, found count, TRUE/FALSE, missing.

    ``found`` counts the ES2 entries whose link appears at any position in
    the ES5 results for the same keyword.  The flag is TRUE only when every
    ES2 entry was found.  ``missing`` is a ';'-joined list of
    "link (position)" for the entries that were not.
    """
    for key in results_es2:
        # Build the lookup once per keyword instead of once per ES2 entry.
        es5_links = set(results_es5[key].values())
        found = 0
        missing = []
        # NOTE(review): every ES2 entry is checked as-is, so a placeholder
        # such as '(no result at this position)' would be reported as
        # missing -- confirm that is the intended reading of the report.
        for position, link in results_es2[key].items():
            if link in es5_links:
                found += 1
            else:
                missing.append("{} ({})".format(link, position))
        result = 'TRUE' if found == len(results_es2[key]) else 'FALSE'
        yield "{}\t{}\t{}\t{}".format(key, found, result, ";".join(missing))


if __name__ == "__main__":
    # 'rb' is the mode the Python 2 csv module expects.
    with open('2019-03-25.txt', 'rb') as f:
        results_es2, results_es5 = load_results(f)
    for line in all_results_report(results_es2, results_es5):
        # Single-argument print(...) also parses as a Python 2 print statement.
        print(line)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment