Last active
July 6, 2023 23:44
-
-
Save leongkui/c69c840eda526243017cdb19d1551ed2 to your computer and use it in GitHub Desktop.
Fuzzy String Matching and Removal
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# fuzzy_removal scans the text word by word and substitutes `replacement` for any run of words that is similar to `target`.
# The default replacement is '<Patient Name>'.
# The default minimum similarity score (min_score) is 80; a run of words must score strictly above it to be replaced.
def fuzzy_removal(
    text: str,
    target: str,
    replacement: str='<Patient Name>',
    min_score: int=80,
    debug: bool=False,
) -> str:
    """Replace runs of words in *text* that fuzzily match *target*.

    The text is split on single spaces and scanned left to right. At each
    position, windows of 1 up to ``len(target.split(' ')) + 1`` words are
    joined with spaces and scored against *target* with ``fuzz.ratio``.
    The first (shortest) window whose score is strictly greater than
    *min_score* is replaced as a whole, and scanning resumes right after it.

    Args:
        text: The text to scan.
        target: The string to look for (e.g. a patient's name).
        replacement: Text substituted for each matching window.
        min_score: Threshold a window's score must exceed to count as a
            match. Presumably on the 0-100 scale used by
            fuzzywuzzy/thefuzz-style ``fuzz.ratio`` -- confirm against the
            library this module imports.
        debug: When true, print every comparison and every match.

    Returns:
        The text with every matching window replaced. Words that are not
        part of a match, and the single-space separators between them, are
        kept as they were. A blank *target* returns *text* unchanged.
    """
    words = text.split(' ')
    full_length = len(words)
    if debug:
        print("text=%s" % text)
        print("Full length = %d" % full_length)

    # A blank target carries nothing to match against; scoring it could only
    # ever "match" empty windows, so leave the text alone.
    if not target.strip():
        return text

    # Allow windows one word longer than the target so a name split into an
    # extra token (initials, stray punctuation) can still be caught.
    max_window = len(target.split(' ')) + 1

    pieces = []
    i = 0
    while i < full_length:
        matched_end = None
        # Windows start at one word (never empty), so a match always consumes
        # input and the scan is guaranteed to advance. The upper bound is
        # clamped so the tail of the text is not re-scored repeatedly.
        last_end = min(i + max_window, full_length)
        for final_pos in range(i + 1, last_end + 1):
            candidate = ' '.join(words[i:final_pos])
            score = fuzz.ratio(candidate, target)
            if debug:
                print("comparing (%s) with (%s), score = %d" % (candidate, target, score))
            if score > min_score:
                if debug:
                    print('Match: %s' % candidate)
                matched_end = final_pos
                break

        if matched_end is None:
            pieces.append(words[i])
            i += 1
        else:
            # Substitute by position rather than with str.replace, so only the
            # matched window changes: identical substrings inside other words
            # or inside earlier replacements are not touched.
            pieces.append(replacement)
            i = matched_end

    return ' '.join(pieces)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment