Skip to content

Instantly share code, notes, and snippets.

@pjbull
Created October 9, 2015 17:04
Show Gist options
  • Save pjbull/5adc70cd6d01224b2c17 to your computer and use it in GitHub Desktop.
Save pjbull/5adc70cd6d01224b2c17 to your computer and use it in GitHub Desktop.
Boston restaurants to canonical name and address
import re
import sys
import unicodedata
import pandas as pd
def clean_string(s):
    """Return a canonical, ASCII-only, lowercase form of ``s``.

    Steps, in order:
      1. Text (unicode) input is NFKD-normalized and every character that
         has no ASCII representation is dropped (so accents are stripped).
      2. Everything is lowercased.
      3. Each run of whitespace becomes a single space.
      4. Every character other than ``a-z``, ``0-9`` and space is removed.

    NOTE: punctuation is removed *after* whitespace is collapsed, so
    ``"bar & grill"`` becomes ``"bar  grill"`` (two spaces). Leading and
    trailing spaces are also kept. This ordering is preserved on purpose so
    that the generated strings do not change.

    :param s: the string to clean.
    :returns: the cleaned string (a native ``str``).
    """
    # Python 2 has a separate ``unicode`` text type; on Python 3 that name
    # does not exist and ``str`` is already the text type.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    if isinstance(s, text_type):
        s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
        # On Python 3 ``encode`` yields ``bytes``, which the str regexes
        # below cannot process; go back to text. On Python 2 the encoded
        # value is already a ``str`` and is left untouched.
        if not isinstance(s, str):
            s = s.decode('ascii')

    # lowercase everything
    s = s.lower()

    # all whitespace to single space
    s = re.sub(r"\s+", " ", s)

    # all non alphanumeric removed
    s = re.sub(r"[^a-z0-9 ]", "", s)

    return s
def load_boston_data(path_to_boston_data):
    """Load the Boston inspections CSV and add a cleaned name+address key.

    The first column is renamed to ``BusinessName``, and a new
    ``name_and_address`` column is added. It holds the ``clean_string``-ed
    concatenation of the BusinessName, Address, City, State and Zip columns,
    separated by single spaces.

    :param path_to_boston_data: path (or anything ``pandas.read_csv``
        accepts) of the inspections CSV.
    :returns: the inspections ``DataFrame`` with the extra
        ``name_and_address`` column.
    """
    # Parenthesised single-argument form prints the same text on Python 2
    # and Python 3 (the bare ``print "..."`` statement is Python 2 only).
    print("Loading saved data and cleaning inspections...")

    # read CSV; Zip is read as text rather than being parsed as a number
    inspections = pd.read_csv(path_to_boston_data,
                              dtype={"Zip": str})

    # there's a unicode character at the start of the first column we
    # need to remove
    inspections.columns = ['BusinessName'] + inspections.columns[1:].tolist()

    # add name+address column for primary key
    combined = (inspections.BusinessName.astype(str) +
                " " + inspections.Address.astype(str) +
                " " + inspections.City.astype(str) +
                " " + inspections.State.astype(str) +
                " " + inspections.Zip.astype(str))

    # Build a real list: on Python 3 ``map`` returns a lazy iterator, which
    # is not a valid column value. A list is assigned positionally, exactly
    # as the Python 2 ``map`` result was.
    inspections["name_and_address"] = [clean_string(value) for value in combined]

    return inspections
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment