Last active
June 12, 2018 21:57
-
-
Save jobelenus/e52b3c8cd86d07eca88c90fc50ec3332 to your computer and use it in GitHub Desktop.
How to get a set of KNOWN_GOOD
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class SourceDimension(models.Model):
    """A named source plus the KNOWN_GOOD group it was fuzzy-matched to.

    ``fuzz_grouped_source_name`` and ``fuzz_ratio`` are derived columns: they
    are wiped and recomputed for every row by :meth:`update_fuzz`.
    """

    source_id = models.IntegerField(db_index=True)
    name = models.CharField(max_length=256, db_index=True)
    category = models.CharField(max_length=128, default='')
    # Group name chosen by update_fuzz(); '' when nothing matched.
    fuzz_grouped_source_name = models.CharField(max_length=256, default='')
    # Score of the match stored above; 0 when nothing matched.
    fuzz_ratio = models.PositiveIntegerField(default=0)

    # Order matters: update_fuzz() only replaces a match with a strictly
    # better one, so on a tie the entry listed FIRST wins.
    KNOWN_GOOD = [
        # These are first on purpose
        'Edmunds CarCode', 'Edmunds Price Promise', 'TrueCar/USAA',
        # generally we want the MOST specific first, least specific last
        'Dealer.com', 'Cars.com', 'CarGurus', 'TrueCar', 'Edmunds',
        'Carloan.com', 'Kelley Blue Book', 'Third Party Core',
        'CARFAX', 'DealerFire', 'Costco', 'Tdds',
        'TradePending', 'Jazel', 'PureCars',
    ]
    # Names that form their own group and bypass fuzzy matching entirely.
    EXACT_GROUPS = ('Strolid Phone Up', 'Strolid Chat')
    DEFAULT_THRESHOLD = 85

    @staticmethod
    def find_source_name_commonalities(delimiter=',', max_length=5):
        """Count the delimiter-separated tokens found across all source names.

        Usage::

            from strolid_reporting.models import SourceDimension
            counter = SourceDimension.find_source_name_commonalities()

        Args:
            delimiter: separator used to split each non-empty ``name``.
            max_length: despite its name this is an exclusive *minimum*;
                only tokens whose stripped length is greater than it are
                counted.

        Returns:
            A ``Counter`` mapping token -> number of occurrences. Its keys
            iterate longest first (assuming longer = more specific, see the
            ordering comment on KNOWN_GOOD). ``most_common()`` still ranks
            by count.
        """
        counter = Counter()
        names = SourceDimension.objects.exclude(name='').values_list('name', flat=True)
        for name in names:
            # Strip BEFORE measuring so surrounding whitespace cannot push a
            # short token over the length cut-off.
            stripped = (token.strip() for token in name.split(delimiter))
            counter.update(token for token in stripped if len(token) > max_length)
        # sorted() is stable, so tokens of equal length keep first-seen order.
        longest_first = sorted(
            counter.items(), key=lambda item: len(item[0]), reverse=True
        )
        return Counter(dict(longest_first))

    @staticmethod
    def update_fuzz(threshold=None):
        """Recompute the fuzzy group assignment of every row.

        Usage::

            from strolid_reporting.models import SourceDimension
            SourceDimension.update_fuzz()
            d = SourceDimension.objects.exclude(fuzz_ratio=0).order_by('fuzz_ratio').first()
            print(d.name, '|', d.fuzz_grouped_source_name, '|', d.fuzz_ratio)

            >>> SourceDimension.objects.filter(fuzz_ratio=0).count()
            2851
            >>> SourceDimension.objects.exclude(fuzz_ratio=0).count()
            1943

        Args:
            threshold: a KNOWN_GOOD entry is accepted only when its
                ``fuzz.partial_ratio`` against the row name is strictly
                greater than this. ``None`` selects DEFAULT_THRESHOLD; an
                explicit ``0`` is honoured.

        Side effects:
            Resets both fuzz columns on ALL rows, then saves each row with a
            non-empty name that received a match.
        """
        if threshold is None:
            threshold = SourceDimension.DEFAULT_THRESHOLD

        # Start from a clean slate so stale matches never survive a re-run.
        SourceDimension.objects.update(fuzz_grouped_source_name='', fuzz_ratio=0)

        for dim in SourceDimension.objects.exclude(name=''):
            if dim.name in SourceDimension.EXACT_GROUPS:
                best_name, best_ratio = dim.name, 100
            else:
                best_name, best_ratio = '', 0
                for grouped_name in SourceDimension.KNOWN_GOOD:
                    ratio = fuzz.partial_ratio(grouped_name, dim.name)
                    # Strictly better only: on ties the earlier (more
                    # specific) KNOWN_GOOD entry is kept.
                    if ratio > threshold and ratio > best_ratio:
                        best_name, best_ratio = grouped_name, ratio
                        if best_ratio == 100:
                            # Nothing can beat a perfect score.
                            break

            if not best_ratio:
                # The bulk reset above already stored ''/0 for this row.
                continue

            dim.fuzz_grouped_source_name = best_name
            dim.fuzz_ratio = best_ratio
            # Write only the derived columns so other fields are untouched.
            dim.save(update_fields=['fuzz_grouped_source_name', 'fuzz_ratio'])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment