jaklinger · April 26, 2018 08:31
diff --git a/companies_house_match.py b/companies_house_match.py
 # Note superfuzz can be found here: https://github.com/jaklinger/nesta_toolbox/blob/master/sandbox/jaklinger/superfuzz/superfuzz.py

 import requests
 from retrying import retry
 import re
 import time
 from collections import Counter
 from superfuzz.superfuzz import superfuzz
 from fuzzywuzzy import fuzz
 from fuzzywuzzy import process as fuzzy_process

 # Gather the most frequent company-name tokens to use as global stop words.
 # NOTE(review): ch_df is not defined in this file; presumably a table of
 # Companies House records with a "CompanyName" column is loaded first -- confirm.
 words = []
 for k in ch_df["CompanyName"].values:
    words.extend(k.split())

 # The ten most common tokens (lowercased) are ignored when comparing names.
 stops = [w.lower() for w, _ in Counter(words).most_common(10)]
 # Raw string: "\W" inside a plain literal is an invalid escape sequence.
 # Matches runs of characters that are not letters or digits.
 pattern = re.compile(r'[\W_]+')
 ch_url = "https://api.companieshouse.gov.uk/search/companies"
 postcodes_url = "https://api.postcodes.io/postcodes/"
 # Placeholder: replace with one or more real Companies House API keys.
 ch_api_keys = ['GET_A_KEY',]

 def destop(value):
    """Drop the global stop words from ``value`` and lowercase what is left.

    A whitespace-separated token is removed when its lowercase form appears
    in the module-level ``stops`` list. The surviving tokens are joined with
    single spaces and returned in lowercase.
    """
    kept = []
    for token in value.split():
        if token.lower() in stops:
            continue
        kept.append(token)
    joined = " ".join(kept)
    return joined.lower()

 @retry(stop_max_attempt_number=5,wait_fixed=5000)
 def query_ch(q):
    """Search Companies House for companies matching ``q``.

    The ``retry`` decorator re-runs the call (at most 5 attempts, with a
    fixed 5000 ms wait between them) when it raises.

    Args:
        q: Free-text search string sent as the ``q`` query parameter.

    Returns:
        The decoded JSON body of the search response (up to 20 items).

    Raises:
        requests.exceptions.HTTPError: if the last attempt got an error status.
        requests.exceptions.Timeout: if the last attempt got no response
            within 30 seconds.
    """
    # Local import: ``random`` is not imported at module level, so without
    # this the key selection below fails with a NameError.
    import random

    params = dict(items_per_page=20, q=q)
    # Basic auth with the API key as user name and an empty password.
    auth = (random.choice(ch_api_keys), '')
    # A timeout keeps a stalled connection from blocking forever.
    r = requests.get(url=ch_url, auth=auth, params=params, timeout=30)
    r.raise_for_status()
    return r.json()

 def get_lon_lat(postcode):
    """Look up the coordinates of a postcode via the postcodes.io API.

    Args:
        postcode: Postcode string; it is appended to the lookup URL.

    Returns:
        A ``(longitude, latitude)`` tuple read from the ``"result"`` object
        of the JSON response.

    Raises:
        requests.exceptions.HTTPError: if the service returns an error status.
        requests.exceptions.Timeout: if there is no response within 30 seconds.
    """
    # A timeout keeps a stalled connection from blocking forever.
    r = requests.get(url=postcodes_url + postcode, timeout=30)
    r.raise_for_status()
    result = r.json()["result"]
    return result["longitude"], result["latitude"]

 def match_by_postcode(postcode, query_result):
    """Return the first search result whose address has the given postcode.

    Postcodes are compared ignoring case and whitespace, so ``"sw1a1aa"``
    matches ``"SW1A 1AA"``.

    Args:
        postcode: Postcode to look for, or None.
        query_result: Search response dict holding an ``"items"`` list.

    Returns:
        The first item whose ``address["postal_code"]`` equals ``postcode``,
        or None when ``postcode`` is None or no item matches.
    """
    if postcode is None:
        return None

    def canonical(value):
        # Only strings are normalised; anything else is compared as-is.
        if isinstance(value, str):
            return "".join(value.split()).upper()
        return value

    wanted = canonical(postcode)
    for item in query_result["items"]:
        address = item.get("address")
        # Skip results that carry no usable postcode.
        if address is None or "postal_code" not in address:
            continue
        if canonical(address["postal_code"]) == wanted:
            return item
    return None

 def get_match(names_addresses, threshold=80, threshold_address=90):
    """Find the Companies House search result that matches an organisation.

    Strategies are tried in this order and the first hit is returned:

    1. search by name and take a result with the same postcode;
    2. take the first result of that search whose title fuzzy-matches the
       name with a score above ``threshold``;
    3. search by name plus address and take the first result whose title
       scores above ``threshold_address``;
    4. take the first result of the second search whose address snippet
       fuzzy-matches the address with a score above ``threshold``.

    Args:
        names_addresses: Dict with a ``"name"`` string and an ``"address"``
            that is either None or a dict of address fields (optionally
            including ``"postCode"``).
        threshold: Minimum score (exclusive) for steps 2 and 4.
        threshold_address: Minimum score (exclusive) for step 3.

    Returns:
        The matching search-result item, or None if nothing matched.
    """
    name = names_addresses["name"]
    address = names_addresses["address"]
    _name = destop(name)

    def fuzzy_score(left, right):
        return superfuzz(left, right,
                         [fuzz.partial_ratio, fuzz.token_sort_ratio])

    def first_title_match(items, minimum):
        # Items are scanned in the order Companies House returned them, so
        # the first title that scores well enough is trusted.
        for item in items:
            if fuzzy_score(_name, destop(item['title'])) > minimum:
                return item
        return None

    # Extract the postcode, then flatten the address into a single string.
    postcode = None
    if address is not None:
        postcode = address.get("postCode")
        # Placeholder values are dropped; so are non-string values, which
        # would make str.join raise TypeError.
        address = " ".join(v for v in address.values()
                           if isinstance(v, str) and v != "Unknown")
        address = pattern.sub(' ', address)

    # Query CH by name
    r = query_ch(name)

    # An exact postcode hit among the name results wins outright.
    match = match_by_postcode(postcode, r)
    if match is not None:
        return match

    match = first_title_match(r["items"], threshold)
    if match is not None:
        return match

    if address is None:
        return None

    # Query CH by name and address together
    r = query_ch(name + " " + address)

    # First try a name match.
    # NOTE(review): the title check here uses threshold_address while the
    # address check below uses threshold -- confirm this is intended.
    match = first_title_match(r["items"], threshold_address)
    if match is not None:
        return match

    # Otherwise try an address match
    for item in r["items"]:
        snippet = item.get('address_snippet')
        if snippet is None:
            continue
        _address = pattern.sub(' ', snippet)
        if fuzzy_score(address, _address) > threshold:
            return item

    # No result
    return None

 if __name__ == "__main__":

    # List of dicts, one per organisation; get_match reads the "name" and
    # "address" keys of each entry.
    names_addresses = []

    # Maps organisation name -> matched search result (or None).
    results = {}
    for i, data in enumerate(names_addresses):
        name = data["name"]
        # Each distinct name is looked up only once.
        if name in results:
            continue
        try:
            result = get_match(data)
        except Exception:
            # Back off for ten minutes and try once more (presumably to
            # wait out API rate limiting -- confirm).
            print("Sleeping on",i,"of",len(names_addresses))
            time.sleep(600)
            # A second failure propagates instead of storing a stale or
            # unbound ``result`` for this name.
            result = get_match(data)
        results[name] = result
	# Note superfuzz can be found here: https://github.com/jaklinger/nesta_toolbox/blob/master/sandbox/jaklinger/superfuzz/superfuzz.py

	import requests
	from retrying import retry
	import re
	import time
	from collections import Counter
	from superfuzz.superfuzz import superfuzz
	from fuzzywuzzy import fuzz
	from fuzzywuzzy import process as fuzzy_process

	# Gather the most frequent company-name tokens to use as global stop words.
	# NOTE(review): ch_df is not defined in this file; presumably a table of
	# Companies House records with a "CompanyName" column is loaded first -- confirm.
	words = []
	for k in ch_df["CompanyName"].values:
	    words.extend(k.split())

	# The ten most common tokens (lowercased) are ignored when comparing names.
	stops = [w.lower() for w, _ in Counter(words).most_common(10)]
	# Raw string: "\W" inside a plain literal is an invalid escape sequence.
	# Matches runs of characters that are not letters or digits.
	pattern = re.compile(r'[\W_]+')
	ch_url = "https://api.companieshouse.gov.uk/search/companies"
	postcodes_url = "https://api.postcodes.io/postcodes/"
	# Placeholder: replace with one or more real Companies House API keys.
	ch_api_keys = ['GET_A_KEY',]

	def destop(value):
	    """Drop the global stop words from ``value`` and lowercase what is left.

	    A whitespace-separated token is removed when its lowercase form appears
	    in the module-level ``stops`` list. The surviving tokens are joined with
	    single spaces and returned in lowercase.
	    """
	    kept = [x for x in value.split() if x.lower() not in stops]
	    return " ".join(kept).lower()

	@retry(stop_max_attempt_number=5,wait_fixed=5000)
	def query_ch(q):
	    """Search Companies House for companies matching ``q``.

	    The ``retry`` decorator re-runs the call (at most 5 attempts, with a
	    fixed 5000 ms wait between them) when it raises.

	    Args:
	        q: Free-text search string sent as the ``q`` query parameter.

	    Returns:
	        The decoded JSON body of the search response (up to 20 items).

	    Raises:
	        requests.exceptions.HTTPError: if the last attempt got an error status.
	        requests.exceptions.Timeout: if the last attempt got no response
	            within 30 seconds.
	    """
	    # Local import: ``random`` is not imported at module level, so without
	    # this the key selection below fails with a NameError.
	    import random

	    params = dict(items_per_page=20, q=q)
	    # Basic auth with the API key as user name and an empty password.
	    auth = (random.choice(ch_api_keys), '')
	    # A timeout keeps a stalled connection from blocking forever.
	    r = requests.get(url=ch_url, auth=auth, params=params, timeout=30)
	    r.raise_for_status()
	    return r.json()

	def get_lon_lat(postcode):
	    """Look up the coordinates of a postcode via the postcodes.io API.

	    Args:
	        postcode: Postcode string; it is appended to the lookup URL.

	    Returns:
	        A ``(longitude, latitude)`` tuple read from the ``"result"`` object
	        of the JSON response.

	    Raises:
	        requests.exceptions.HTTPError: if the service returns an error status.
	        requests.exceptions.Timeout: if there is no response within 30 seconds.
	    """
	    # A timeout keeps a stalled connection from blocking forever.
	    r = requests.get(url=postcodes_url + postcode, timeout=30)
	    r.raise_for_status()
	    result = r.json()["result"]
	    return result["longitude"], result["latitude"]

	def match_by_postcode(postcode, query_result):
	    """Return the first search result whose address has the given postcode.

	    Postcodes are compared ignoring case and whitespace, so ``"sw1a1aa"``
	    matches ``"SW1A 1AA"``.

	    Args:
	        postcode: Postcode to look for, or None.
	        query_result: Search response dict holding an ``"items"`` list.

	    Returns:
	        The first item whose ``address["postal_code"]`` equals ``postcode``,
	        or None when ``postcode`` is None or no item matches.
	    """
	    if postcode is None:
	        return None

	    def canonical(value):
	        # Only strings are normalised; anything else is compared as-is.
	        if isinstance(value, str):
	            return "".join(value.split()).upper()
	        return value

	    wanted = canonical(postcode)
	    for item in query_result["items"]:
	        address = item.get("address")
	        # Skip results that carry no usable postcode.
	        if address is None or "postal_code" not in address:
	            continue
	        if canonical(address["postal_code"]) == wanted:
	            return item
	    return None

	def get_match(names_addresses, threshold=80, threshold_address=90):
	    """Find the Companies House search result that matches an organisation.

	    Strategies are tried in this order and the first hit is returned:

	    1. search by name and take a result with the same postcode;
	    2. take the first result of that search whose title fuzzy-matches the
	       name with a score above ``threshold``;
	    3. search by name plus address and take the first result whose title
	       scores above ``threshold_address``;
	    4. take the first result of the second search whose address snippet
	       fuzzy-matches the address with a score above ``threshold``.

	    Args:
	        names_addresses: Dict with a ``"name"`` string and an ``"address"``
	            that is either None or a dict of address fields (optionally
	            including ``"postCode"``).
	        threshold: Minimum score (exclusive) for steps 2 and 4.
	        threshold_address: Minimum score (exclusive) for step 3.

	    Returns:
	        The matching search-result item, or None if nothing matched.
	    """
	    name = names_addresses["name"]
	    address = names_addresses["address"]
	    _name = destop(name)

	    def fuzzy_score(left, right):
	        return superfuzz(left, right,
	                         [fuzz.partial_ratio, fuzz.token_sort_ratio])

	    def first_title_match(items, minimum):
	        # Items are scanned in the order Companies House returned them, so
	        # the first title that scores well enough is trusted.
	        for item in items:
	            if fuzzy_score(_name, destop(item['title'])) > minimum:
	                return item
	        return None

	    # Extract the postcode, then flatten the address into a single string.
	    postcode = None
	    if address is not None:
	        postcode = address.get("postCode")
	        # Placeholder values are dropped; so are non-string values, which
	        # would make str.join raise TypeError.
	        address = " ".join(v for v in address.values()
	                           if isinstance(v, str) and v != "Unknown")
	        address = pattern.sub(' ', address)

	    # Query CH by name
	    r = query_ch(name)

	    # An exact postcode hit among the name results wins outright.
	    match = match_by_postcode(postcode, r)
	    if match is not None:
	        return match

	    match = first_title_match(r["items"], threshold)
	    if match is not None:
	        return match

	    if address is None:
	        return None

	    # Query CH by name and address together
	    r = query_ch(name + " " + address)

	    # First try a name match.
	    # NOTE(review): the title check here uses threshold_address while the
	    # address check below uses threshold -- confirm this is intended.
	    match = first_title_match(r["items"], threshold_address)
	    if match is not None:
	        return match

	    # Otherwise try an address match
	    for item in r["items"]:
	        snippet = item.get('address_snippet')
	        if snippet is None:
	            continue
	        _address = pattern.sub(' ', snippet)
	        if fuzzy_score(address, _address) > threshold:
	            return item

	    # No result
	    return None

	if __name__ == "__main__":

	    # List of dicts, one per organisation; get_match reads the "name" and
	    # "address" keys of each entry.
	    names_addresses = []

	    # Maps organisation name -> matched search result (or None).
	    results = {}
	    for i, data in enumerate(names_addresses):
	        name = data["name"]
	        # Each distinct name is looked up only once.
	        if name in results:
	            continue
	        try:
	            result = get_match(data)
	        except Exception:
	            # Back off for ten minutes and try once more (presumably to
	            # wait out API rate limiting -- confirm).
	            print("Sleeping on",i,"of",len(names_addresses))
	            time.sleep(600)
	            # A second failure propagates instead of storing a stale or
	            # unbound ``result`` for this name.
	            result = get_match(data)
	        results[name] = result