Created
April 26, 2018 08:31
-
-
Save jaklinger/b09713b2e3884b0db056b213870ef1fa to your computer and use it in GitHub Desktop.
Example of matching organisations to Companies House by name and address, including fuzzy matching
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random
import re
import time
from collections import Counter

import requests
from fuzzywuzzy import fuzz
from fuzzywuzzy import process as fuzzy_process
from retrying import retry

# Note: superfuzz can be found here: https://github.com/jaklinger/nesta_toolbox/blob/master/sandbox/jaklinger/superfuzz/superfuzz.py
from superfuzz.superfuzz import superfuzz
# Gather global stop words: the ten most frequent whitespace-separated tokens
# across all company names (counted case-sensitively, then lower-cased).
# destop() strips these from names before they are compared.
# NOTE(review): `ch_df` is not defined in this file; presumably a pandas
# DataFrame of Companies House records with a "CompanyName" column that must
# be loaded before this point -- TODO confirm.
words = [word
         for company_name in ch_df["CompanyName"].values
         for word in company_name.split()]
stops = [w.lower() for w, _ in Counter(words).most_common(10)]

# Matches runs of non-word characters and underscores; used to flatten
# punctuation in addresses into single spaces. Written as a raw string so
# that "\W" is not parsed as an (invalid) string escape sequence.
pattern = re.compile(r'[\W_]+')

# Companies House company-search endpoint and postcodes.io lookup endpoint.
ch_url = "https://api.companieshouse.gov.uk/search/companies"
postcodes_url = "https://api.postcodes.io/postcodes/"

# API keys for Companies House; query_ch picks one at random per request.
# Replace the placeholder with real key(s).
ch_api_keys = ['GET_A_KEY',]
def destop(value):
    """Return *value* lower-cased with every global stop word removed.

    The input is split on whitespace, tokens found in the module-level
    ``stops`` list are dropped (comparison is case-insensitive), and the
    remaining tokens are re-joined with single spaces.
    """
    kept = [token for token in value.lower().split() if token not in stops]
    return " ".join(kept)
@retry(stop_max_attempt_number=5, wait_fixed=5000)
def query_ch(q):
    """Search Companies House for companies matching the query string *q*.

    The request asks for up to 20 results and authenticates with HTTP basic
    auth, using a randomly chosen entry of ``ch_api_keys`` as the username
    and an empty password. The ``retry`` decorator is configured for at most
    5 attempts with a fixed wait (``wait_fixed=5000``) between them.

    Relies on ``random`` being imported at module level.

    Args:
        q: Free-text search string sent as the ``q`` query parameter.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: If the response status is 4xx/5xx
            (via ``raise_for_status``).
    """
    params = dict(items_per_page=20, q=q)
    # Spread requests across the available API keys.
    auth = (random.choice(ch_api_keys), '')
    r = requests.get(url=ch_url, auth=auth, params=params)
    r.raise_for_status()
    return r.json()
def get_lon_lat(postcode):
    """Look up *postcode* on postcodes.io and return ``(longitude, latitude)``.

    The postcode is appended directly to ``postcodes_url``. An HTTP error
    status raises ``requests.exceptions.HTTPError`` via ``raise_for_status``.
    """
    response = requests.get(url=postcodes_url+postcode)
    response.raise_for_status()
    # The coordinates live under the "result" key of the JSON payload.
    result = response.json()["result"]
    return result["longitude"], result["latitude"]
def match_by_postcode(postcode, query_result):
    """Return the first search result whose postal code equals *postcode*.

    Postcodes are compared after removing whitespace and upper-casing, so
    "sw1a1aa" and "SW1A 1AA" are treated as the same postcode.

    Args:
        postcode: Postcode to look for, or None.
        query_result: Mapping with an "items" list of result dicts; each may
            carry an "address" dict with a "postal_code" entry.

    Returns:
        The first matching item, or None if *postcode* is None or no item
        has a matching postal code.
    """
    if postcode is None:
        return None
    target = _normalise_postcode(postcode)
    for item in query_result["items"]:
        address = item.get("address")
        # Skip results with no address or no postal code recorded.
        if address is None or "postal_code" not in address:
            continue
        if _normalise_postcode(address["postal_code"]) == target:
            return item
    return None


def _normalise_postcode(value):
    """Strip all whitespace from a postcode string and upper-case it.

    Non-string values (e.g. None) are returned unchanged so that they simply
    fail the equality comparison instead of raising.
    """
    if isinstance(value, str):
        return "".join(value.split()).upper()
    return value
def get_match(names_addresses, threshold=80, threshold_address=90):
    """Find the Companies House search result that best matches an organisation.

    Matching is attempted in this order, returning the first hit:

    1. Search by name; return a result whose postcode matches the
       organisation's postcode (via ``match_by_postcode``).
    2. From the same results, return the first whose de-stopped title scores
       above *threshold* against the de-stopped organisation name.
    3. If an address is available, search by name plus address; return the
       first result whose title scores above *threshold_address*.
    4. From those results, return the first whose ``address_snippet`` scores
       above *threshold* against the organisation's address.

    Results are scanned in the order returned by the search, so the first
    acceptable score wins rather than the highest one.

    Args:
        names_addresses: Dict with a "name" entry (organisation name) and an
            "address" entry, which is either None or a dict of address
            fields (optionally including "postCode").
        threshold: Score a name match (step 2) or address match (step 4)
            must exceed.
        threshold_address: Score a name match must exceed when the search
            was made by name plus address (step 3).

    Returns:
        The matching search result item, or None if nothing matched.
    """
    name = names_addresses["name"]
    address = names_addresses["address"]
    _name = destop(name)

    # Extract the postcode and flatten the address fields into one string.
    postcode = None
    if address is not None:
        postcode = address.get("postCode")
        # Drop placeholder "Unknown" fields; also skip non-string values
        # (e.g. None), which str.join would reject with a TypeError.
        address = " ".join(v for v in address.values()
                           if isinstance(v, str) and v != "Unknown")
        address = pattern.sub(' ', address)

    # Query CH by name: prefer a postcode match, then fall back to name score.
    r = query_ch(name)
    items = r['items']
    if items:
        match = match_by_postcode(postcode, r)
        if match is not None:
            return match
        match = _first_name_match(_name, items, threshold)
        if match is not None:
            return match

    # Query CH by name and address.
    if address is not None:
        r = query_ch(name + " " + address)
        items = r['items']
        # First try a name match.
        # NOTE(review): the name match here uses threshold_address while the
        # address match below uses threshold -- confirm this is intended.
        match = _first_name_match(_name, items, threshold_address)
        if match is not None:
            return match
        # Otherwise try an address match.
        for item in items:
            snippet = item.get('address_snippet')
            if snippet is None:
                continue
            _address = pattern.sub(' ', snippet)
            score = superfuzz(address, _address,
                              [fuzz.partial_ratio, fuzz.token_sort_ratio])
            if score > threshold:
                return item

    # No result
    return None


def _first_name_match(clean_name, items, min_score):
    """Return the first item whose de-stopped title scores above *min_score*.

    Scoring is delegated to ``superfuzz`` with fuzzywuzzy's ``partial_ratio``
    and ``token_sort_ratio``. Returns None when no item qualifies.
    """
    for item in items:
        score = superfuzz(clean_name, destop(item['title']),
                          [fuzz.partial_ratio, fuzz.token_sort_ratio])
        # If the score is good, trust Companies House's own result ordering.
        if score > min_score:
            return item
    return None
if __name__ == "__main__":
    # Input: a list of dicts, one per organisation, each with a "name" key
    # (organisation name) and an "address" key (None, or a dict of address
    # fields optionally including "postCode") -- the shape get_match reads.
    names_addresses = []
    # Maps organisation name --> matched Companies House item (or None).
    results = {}
    for i, data in enumerate(names_addresses):
        name = data["name"]
        # Skip organisations that have already been resolved.
        if name in results:
            continue
        try:
            result = get_match(data)
        except Exception:
            # Presumably the API is rate limiting or temporarily unavailable:
            # back off for ten minutes and retry once. A second failure is
            # allowed to propagate instead of storing a stale result.
            print("Sleeping on", i, "of", len(names_addresses))
            time.sleep(600)
            result = get_match(data)
        results[name] = result
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment