jobelenus · June 12, 2018 21:57
diff --git a/example.py b/example.py
 class SourceDimension(models.Model):
    source_id = models.IntegerField(db_index=True)
    name = models.CharField(max_length=256, db_index=True)
    category = models.CharField(max_length=128, default='')
    fuzz_grouped_source_name = models.CharField(max_length=256, default='')
    fuzz_ratio = models.PositiveIntegerField(default=0)

    KNOWN_GOOD = [
        # These are first on purpose
        'Edmunds CarCode', 'Edmunds Price Promise', 'TrueCar/USAA',
        # generally we want the MOST specific first, least specific last
        'Dealer.com', 'Cars.com', 'CarGurus', 'TrueCar', 'Edmunds',
        'Carloan.com', 'Kelley Blue Book', 'Third Party Core',
        'CARFAX', 'DealerFire', 'Costco', 'Tdds',
        'TradePending', 'Jazel', 'PureCars'
    ]
    DEFAULT_THRESHOLD = 85
    
        @staticmethod
    def find_source_name_commonalities(delimiter=',', max_length=5):
        """
 from strolid_reporting.models import SourceDimension
 counter = SourceDimension.find_source_name_commonalities()
        Note: this MUST return the values longest first (assuming longer = more specific, see comment above in KNOWN_GOOD)
        """
        counter = Counter()
        for name in SourceDimension.objects.exclude(name='').values_list('name', flat=True):
            tokens = name.split(delimiter)
            tokens = list(filter(lambda x: True if len(x) > max_length else False, tokens))
            tokens = list(map(str.strip, tokens))
            counter.update(tokens)
        return counter

    @staticmethod
    def update_fuzz(threshold=None):
        """
 from strolid_reporting.models import SourceDimension
 SourceDimension.update_fuzz()
 d = SourceDimension.objects.exclude(fuzz_ratio=0).order_by('fuzz_ratio').first()
 print(d.name, '|', d.fuzz_grouped_source_name, '|', d.fuzz_ratio)

 >>> SourceDimension.objects.filter(fuzz_ratio=0).count()
 2851
 >>> SourceDimension.objects.exclude(fuzz_ratio=0).count()
 1943
        """
        SourceDimension.objects.update(fuzz_grouped_source_name='', fuzz_ratio=0)
        if not threshold:
            threshold = SourceDimension.DEFAULT_THRESHOLD
        for dim in SourceDimension.objects.exclude(name=''):
            if dim.name == 'Strolid Phone Up':
                dim.fuzz_grouped_source_name = 'Strolid Phone Up'
                dim.fuzz_ratio = 100
            elif dim.name == 'Strolid Chat':
                dim.fuzz_grouped_source_name = 'Strolid Chat'
                dim.fuzz_ratio = 100
            else:
                for grouped_name in SourceDimension.KNOWN_GOOD:
                    ratio = fuzz.partial_ratio(grouped_name, dim.name)
                    # don't replace unless its better than threshold or what we have (and dont overwrite 100s)!
                    if ratio > threshold and ratio > dim.fuzz_ratio and dim.fuzz_ratio != 100:
                        dim.fuzz_grouped_source_name = grouped_name
                        dim.fuzz_ratio = ratio
                        dim.save()
	class SourceDimension(models.Model):
	source_id = models.IntegerField(db_index=True)
	name = models.CharField(max_length=256, db_index=True)
	category = models.CharField(max_length=128, default='')
	fuzz_grouped_source_name = models.CharField(max_length=256, default='')
	fuzz_ratio = models.PositiveIntegerField(default=0)

	KNOWN_GOOD = [
	# These are first on purpose
	'Edmunds CarCode', 'Edmunds Price Promise', 'TrueCar/USAA',
	# generally we want the MOST specific first, least specific last
	'Dealer.com', 'Cars.com', 'CarGurus', 'TrueCar', 'Edmunds',
	'Carloan.com', 'Kelley Blue Book', 'Third Party Core',
	'CARFAX', 'DealerFire', 'Costco', 'Tdds',
	'TradePending', 'Jazel', 'PureCars'
	]
	DEFAULT_THRESHOLD = 85

	@staticmethod
	def find_source_name_commonalities(delimiter=',', max_length=5):
	"""
	from strolid_reporting.models import SourceDimension
	counter = SourceDimension.find_source_name_commonalities()
	Note: this MUST return the values longest first (assuming longer = more specific, see comment above in KNOWN_GOOD)
	"""
	counter = Counter()
	for name in SourceDimension.objects.exclude(name='').values_list('name', flat=True):
	tokens = name.split(delimiter)
	tokens = list(filter(lambda x: True if len(x) > max_length else False, tokens))
	tokens = list(map(str.strip, tokens))
	counter.update(tokens)
	return counter

	@staticmethod
	def update_fuzz(threshold=None):
	"""
	from strolid_reporting.models import SourceDimension
	SourceDimension.update_fuzz()
	d = SourceDimension.objects.exclude(fuzz_ratio=0).order_by('fuzz_ratio').first()
	print(d.name, '\|', d.fuzz_grouped_source_name, '\|', d.fuzz_ratio)

	>>> SourceDimension.objects.filter(fuzz_ratio=0).count()
	2851
	>>> SourceDimension.objects.exclude(fuzz_ratio=0).count()
	1943
	"""
	SourceDimension.objects.update(fuzz_grouped_source_name='', fuzz_ratio=0)
	if not threshold:
	threshold = SourceDimension.DEFAULT_THRESHOLD
	for dim in SourceDimension.objects.exclude(name=''):
	if dim.name == 'Strolid Phone Up':
	dim.fuzz_grouped_source_name = 'Strolid Phone Up'
	dim.fuzz_ratio = 100
	elif dim.name == 'Strolid Chat':
	dim.fuzz_grouped_source_name = 'Strolid Chat'
	dim.fuzz_ratio = 100
	else:
	for grouped_name in SourceDimension.KNOWN_GOOD:
	ratio = fuzz.partial_ratio(grouped_name, dim.name)
	# don't replace unless its better than threshold or what we have (and dont overwrite 100s)!
	if ratio > threshold and ratio > dim.fuzz_ratio and dim.fuzz_ratio != 100:
	dim.fuzz_grouped_source_name = grouped_name
	dim.fuzz_ratio = ratio
	dim.save()