Last active
September 2, 2020 00:57
-
-
Save allenanie/2e352613ff917ad4cf45034fc8313e41 to your computer and use it in GitHub Desktop.
A very simple Geochecker!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib.request | |
import csv | |
from collections import defaultdict | |
import nltk | |
class GeoExtractor(object):
    """Guess which US state(s) a piece of text refers to.

    On construction the extractor downloads a CSV of US geo data and builds
    four lookup tables:

    * ``zipcode_to_state``      -- zipcode -> state name
    * ``statenames_to_state``   -- state name or abbreviation -> state name
    * ``countynames_to_states`` -- county name -> set of state names
    * ``citynames_to_states``   -- city name -> set of state names

    plus one ``*_match_set`` per table holding that table's keys, used for
    fast set intersection against the tokens of the input text.
    """

    def __init__(self):
        self.zipcode_to_state = {}
        self.statenames_to_state = {}
        # County and city names are not unique across states, so each name
        # maps to the set of all states in which it occurs.
        self.countynames_to_states = defaultdict(set)
        self.citynames_to_states = defaultdict(set)

        self.initialize_database()
        self._build_match_sets()

    def _build_match_sets(self):
        """Snapshot the keys of every lookup table into a matching set."""
        self.zipcode_match_set = set(self.zipcode_to_state)
        self.statename_match_set = set(self.statenames_to_state)
        self.countyname_match_set = set(self.countynames_to_states)
        # City names must come from the city table (this previously reused
        # the county table, so city matching looked up county names).
        self.cityname_match_set = set(self.citynames_to_states)

    def initialize_database(self):
        """Download the geo-data CSV and load it into the lookup tables."""
        geo_data_url = "https://github.com/scpike/us-state-county-zip/raw/master/geo-data.csv"
        # The context manager guarantees the HTTP response is closed.
        with urllib.request.urlopen(geo_data_url) as response:
            text = response.read().decode('utf-8')
        self._load_csv_text(text)

    def _load_csv_text(self, text):
        """Populate the lookup tables from CSV ``text``.

        The first line is treated as a header and skipped. Every other
        non-empty row must have exactly six fields; the first is ignored and
        the rest are read as state name, state abbreviation, zipcode, county
        and city, in that order.

        Raises:
            ValueError: if a non-empty row does not have exactly six fields.
        """
        # csv.reader copes with quoted fields containing commas, and
        # splitlines() copes with "\r\n" line endings.
        reader = csv.reader(text.splitlines())
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                # Blank line (typically the one after the final newline).
                continue
            _, state_name, state_abbr, zipcode, county, city = row
            self.zipcode_to_state[zipcode] = state_name
            self.statenames_to_state[state_name] = state_name
            self.statenames_to_state[state_abbr] = state_name
            self.countynames_to_states[county].add(state_name)
            self.citynames_to_states[city].add(state_name)

    def check_overlap(self, keyword_match_set, text_set, state_dic):
        """Find which keywords occur in the text and the states they map to.

        Args:
            keyword_match_set: set of keywords to look for.
            text_set: set of tokens taken from the text.
            state_dic: mapping from keyword to either a single state name or
                a set of state names.

        Returns:
            A ``(matched, state_names)`` tuple: ``matched`` is True when at
            least one keyword occurs in ``text_set``; ``state_names`` is the
            set of all states the matched keywords map to.
        """
        overlap = keyword_match_set.intersection(text_set)
        state_names = set()
        for keyword in overlap:
            states = state_dic[keyword]
            if isinstance(states, (set, frozenset)):
                state_names.update(states)
            else:
                state_names.add(states)
        return len(overlap) > 0, state_names

    def extract_state_from_text(self, text):
        """Return the candidate state names mentioned in ``text``.

        Simple priority matching over the tokens of ``text``:

        1. Zipcodes and state names / abbreviations (exact, case-sensitive
           token match). If either matches, their combined states are
           returned.
        2. Otherwise county names; if any match, their states are returned.
        3. Otherwise city names; all states of matching cities are returned.

        Returns:
            A list of state names for steps 1 and 2, or a (possibly empty)
            set of state names for step 3.

        NOTE(review): matching is done against single tokens, so multi-word
        names (e.g. "New York") presumably can never match -- confirm.
        """
        tokens = nltk.word_tokenize(text)
        text_set = set(tokens)

        state_proposals = []

        zip_check, state_names = self.check_overlap(
            self.zipcode_match_set, text_set, self.zipcode_to_state)
        state_proposals.extend(state_names)

        statename_check, state_names = self.check_overlap(
            self.statename_match_set, text_set, self.statenames_to_state)
        state_proposals.extend(state_names)

        # Stop early on a zipcode/state hit: city/county level matching is a
        # lot messier, so it is only used as a fallback.
        if zip_check or statename_check:
            return state_proposals

        county_check, state_names = self.check_overlap(
            self.countyname_match_set, text_set, self.countynames_to_states)
        state_proposals.extend(state_names)

        if county_check:
            return state_proposals

        _, state_names = self.check_overlap(
            self.cityname_match_set, text_set, self.citynames_to_states)
        return state_names
# Demo: constructing the extractor loads the geo database; then print the
# state(s) detected in a sample sentence.
geoe = GeoExtractor()
detected_states = geoe.extract_state_from_text("I live in 94506, TX")
print(detected_states)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment