Last active
August 15, 2018 16:52
-
-
Save estasney/4c530447fd7797862eb4401cb4001d8c to your computer and use it in GitHub Desktop.
Performance oriented string search across multiple datasets
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import OrderedDict | |
from operator import itemgetter | |
class NameData(object):
    """A named, prioritised lookup table built from ``(key, value)`` pairs.

    Attributes:
        data: ``OrderedDict`` mapping key -> value, ordered by key.
        name_set: ``set`` of the keys, used for membership tests.
        name: Label for this dataset; reported back with every hit.
        priority: Ranking value for this dataset; reported back with
            every hit.
        preprocessor: Optional callable applied to a query item before it
            is looked up. It is NOT applied to the stored keys.
    """

    def __init__(self, data, name, priority, preprocessor=None):
        """Build the lookup structures.

        Args:
            data: Iterable of ``(key, value)`` pairs. May be a one-shot
                iterator such as ``zip(...)``.
            name: Label for this dataset.
            priority: Ranking value for this dataset.
            preprocessor: Optional callable used to normalise query items.
        """
        # Materialise once: ``data`` feeds two builders, and a one-shot
        # iterator would be exhausted by the first, leaving ``name_set``
        # empty and every lookup a miss.
        pairs = list(data)
        self.data = self.structure_data(pairs)
        self.name_set = self.generate_set(pairs)
        self.name = name
        self.priority = priority
        self.preprocessor = preprocessor

    def structure_data(self, data):
        """Return an ``OrderedDict`` of the pairs in ``data``, sorted by key.

        The sort is stable, so when a key occurs more than once the value
        from its last occurrence wins.
        """
        return OrderedDict(sorted(data, key=itemgetter(0)))

    def generate_set(self, data):
        """Return the set of keys found in the ``(key, value)`` pairs."""
        # A set is unordered, so sorting beforehand would be wasted work.
        return {key for key, _ in data}

    @property
    def _signature_(self):
        """``(name, priority)`` tuple identifying this dataset."""
        return self.name, self.priority

    def _preprocess(self, item):
        """Apply the preprocessor to ``item`` when one is configured."""
        if self.preprocessor:
            return self.preprocessor(item)
        return item

    def search(self, item):
        """Look up ``item`` after preprocessing.

        Returns:
            ``(value, (name, priority))`` when the item is a known key,
            otherwise ``False``.
        """
        item = self._preprocess(item)
        if item not in self.name_set:
            return False
        return self.data[item], self._signature_

    def __contains__(self, item):
        """Return ``True`` when the preprocessed ``item`` is a known key."""
        return self._preprocess(item) in self.name_set

    def __repr__(self):
        return "<NameData {}, Priority: {}>".format(self.name, self.priority)
class NameSearch(object):
    """Query several datasets at once, in ascending ``priority`` order.

    Each dataset must expose a ``priority`` attribute, support the ``in``
    operator, and provide a ``search(item)`` method (as ``NameData`` does).
    """

    def __init__(self, datasets):
        """Store ``datasets`` sorted by their ``priority`` attribute.

        Args:
            datasets: Iterable of dataset objects.
        """
        self.datasets = sorted(datasets, key=lambda x: x.priority)

    def __contains__(self, item):
        """Return ``True`` when at least one dataset contains ``item``."""
        # A generator lets ``any`` stop at the first dataset that matches,
        # rather than probing every dataset before answering.
        return any(item in d for d in self.datasets)

    def __repr__(self):
        return "<NameSearch>"

    def search(self, item):
        """Search every dataset for ``item``.

        Returns:
            A list of the per-dataset ``search`` results, in dataset
            order, leaving out any result that is ``False``. The list is
            empty when nothing matched.
        """
        hits = (d.search(item) for d in self.datasets)
        # Identity check: only the literal ``False`` is dropped, so other
        # falsy results (0, "", empty tuple) are kept.
        return [hit for hit in hits if hit is not False]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment