Skip to content

Instantly share code, notes, and snippets.

@devniel
Last active August 27, 2015 18:57
Show Gist options
  • Save devniel/af206664f3db0949d578 to your computer and use it in GitHub Desktop.
Save devniel/af206664f3db0949d578 to your computer and use it in GitHub Desktop.
clone a string with a percentage of similarity
# -*- coding: utf-8 -*-
from datetime import datetime, timedelta
import random
import unittest
import difflib
def ratio(original, changed):
    """Return the positional similarity between two sequences.

    Characters are compared pairwise, position by position. The result is
    ``1 - differing_positions / len(original)``, rounded to two decimals.

    Only the positions present in both arguments are compared (``zip``
    stops at the shorter one), so extra or missing trailing characters in
    ``changed`` are not counted as differences.

    Args:
        original: Reference string; its length is the denominator.
        changed: String to compare against ``original``.

    Returns:
        A float rounded to two decimals; 1.0 means no compared position
        differs. An empty ``original`` yields 1.0.
    """
    if not original:
        # Nothing can differ, and dividing by len("") would raise
        # ZeroDivisionError.
        return 1.0
    changes = sum(1 for left, right in zip(original, changed) if left != right)
    # float() forces true division even on interpreters where int / int
    # truncates (Python 2), which would collapse the ratio to 0 or 1.
    return round(1.0 - (float(changes) / len(original)), 2)
def clone_date(dt, percentage, comparison='='):
    """Return a ``YYMMDD`` date string with a requested similarity to ``dt``.

    Random dates between 0 and 2999 days after ``dt`` are formatted with
    ``"%y%m%d"`` and compared to the formatted original using ``ratio``
    until one satisfies the requested comparison.

    Args:
        dt: Reference ``datetime`` (anything supporting ``strftime`` and
            addition of a ``timedelta``).
        percentage: Target similarity, compared against the two-decimal
            value returned by ``ratio``.
        comparison: ``'='`` for an exact match, ``'>'`` for strictly more
            similar, ``'<'`` for strictly less similar than ``percentage``.

    Returns:
        The matching date as a six-character ``YYMMDD`` string. For
        ``comparison == '='`` with ``percentage == 1`` the original date
        string is returned directly.

    Raises:
        ValueError: If ``comparison`` is not one of ``'='``, ``'>'``, ``'<'``.
        Exception: If no candidate matched within 10000 attempts.
    """
    if comparison not in ('=', '>', '<'):
        # Without this check an unknown operator can never match, so the
        # loop would run all attempts and report a misleading error.
        raise ValueError(
            "Unsupported comparison %r; expected '=', '>' or '<'." % (comparison,)
        )

    fecha_original_str = dt.strftime("%y%m%d")
    if comparison == '=' and percentage == 1:
        return fecha_original_str

    max_intents = 10000
    for intents in range(max_intents):
        candidate = dt + timedelta(days=random.randrange(0, 3000))
        fecha_str = candidate.strftime("%y%m%d")
        # Compute the similarity once per attempt and reuse it for both the
        # progress output and the comparison.
        similarity = ratio(fecha_original_str, fecha_str)
        print("Trying ..." + str(intents))
        print("Ratio : " + str(similarity))
        if comparison == '=':
            matched = similarity == percentage
        elif comparison == '>':
            matched = similarity > percentage
        else:
            matched = similarity < percentage
        if matched:
            return fecha_str

    raise Exception('Max intents exceeded. Remenber that some percentages of similarity are imposible to get.')
def clone(string, percentage, only_numbers=False, start=0):
    """Return a copy of ``string`` with a share of its characters replaced.

    The number of replaced characters is
    ``round(len(string) * round(1 - percentage, 2))``. Positions are picked
    at random, without repetition, among the indices from ``start`` onwards
    that hold neither a space nor a newline. Each picked character is
    replaced by a different character drawn from the replacement alphabet.

    Args:
        string: Text to clone.
        percentage: Desired similarity between 0 and 1 (1 keeps the text
            unchanged).
        only_numbers: If true, replacements are drawn from the digits
            only; otherwise from upper-case ASCII letters and digits.
        start: First index that may be modified.

    Returns:
        A new string of the same length as ``string``.

    Raises:
        ValueError: If more characters must be changed than there are
            eligible positions (previously this looped forever).
    """
    lstring = list(string)
    length = len(lstring)
    words_to_change = int(round(length * (round(1 - percentage, 2))))
    if words_to_change <= 0:
        return "".join(lstring)

    # Collect the eligible positions up front so the selection below always
    # terminates and never wastes draws on whitespace.
    candidates = [
        index for index in range(start, length)
        if lstring[index] not in ('\n', ' ')
    ]
    if words_to_change > len(candidates):
        raise ValueError(
            "Cannot change %d characters: only %d eligible positions "
            "from index %d." % (words_to_change, len(candidates), start)
        )

    if only_numbers:
        alphabet = '1234567890'
    else:
        alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'

    for index in random.sample(candidates, words_to_change):
        # Drop the current character from the pool so the replacement is
        # guaranteed to differ from it.
        pool = alphabet.replace(lstring[index], '')
        lstring[index] = random.choice(pool)
    return "".join(lstring)
class TestClone(unittest.TestCase):
    """Tests for the similarity helpers ``clone_date`` and ``ratio``."""

    def test_clone_date(self):
        """A cloned date must satisfy the requested similarity comparison."""
        dt = datetime.now() + timedelta(days=random.randrange(0,3))
        fecha_original_str = dt.strftime("%y%m%d")
        percentage = 0.95
        comparison = '<'
        fecha_str = clone_date(dt, percentage, comparison=comparison)
        _ratio = ratio(fecha_original_str, fecha_str)
        print("========== TEST CLONE DATE ==========")
        print(fecha_original_str + " === " + fecha_str + " , ratio = " + str(_ratio))
        print("=====================================")
        # Dedicated assert methods report both operands on failure.
        if comparison == '=':
            self.assertEqual(_ratio, percentage)
        elif comparison == '<':
            self.assertLess(_ratio, percentage)
        elif comparison == '>':
            self.assertGreater(_ratio, percentage)

    def test_clone(self):
        """A sample with five changed characters must score 0.95."""
        original_word = """
:50K:/191000695986399
IBM D5L PERU
IBMDELIERU S.A.C.
AV. PROLOCGACIHN JAVIER PRADO EETE
LIMA 10 PERU
"""
        percentage = 0.95
        # Fixed sample of what clone(original_word, percentage, start=22)
        # could produce; kept hard-coded so the assertion does not depend
        # on random output.
        changed_word = """
:50K:/191000695986399
IBM D5L PERU
IBMDELIERU S.O.D.
AV. PROLOCKACIHN JAVIER PRADO EETE
LIM2 1D PERU
"""
        print("ORIGINAL : " + original_word)
        print("CHANGED : " + changed_word)
        changes = sum(1 for i, j in zip(original_word, changed_word) if i != j)
        print("LEN : " + str(len(original_word)))
        print("CHANGES : " + str(changes))
        # Use the shared helper instead of duplicating its arithmetic here.
        _ratio = ratio(original_word, changed_word)
        print("RATIO : " + str(_ratio))
        self.assertEqual(percentage, _ratio)
# Run the test suite only when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment