Skip to content

Instantly share code, notes, and snippets.

@alexeyev
Created July 2, 2018 11:55
Show Gist options
  • Save alexeyev/b74e7ef043424e03ed8eb1ee3a8a8273 to your computer and use it in GitHub Desktop.
Save alexeyev/b74e7ef043424e03ed8eb1ee3a8a8273 to your computer and use it in GitHub Desktop.
Comparing texts as sequences
import difflib
# Load both texts in full. The context managers close the file handles
# deterministically instead of leaving them to the garbage collector.
with open("1.txt", "r") as file_one:
    text_one = file_one.read()
with open("2.txt", "r") as file_two:
    text_two = file_two.read()

# Compare the texts as sequences of characters; no explicit junk predicate
# is supplied (isjunk=None).
sm = difflib.SequenceMatcher(isjunk=None, a=text_one, b=text_two)
mbs = sm.get_matching_blocks()

# Print every common run longer than 10 characters, once as it appears in
# each text, with newlines flattened to spaces so each run stays on one line.
for mb in mbs:
    if mb.size > 10:
        print("{", text_one[mb.a:mb.a + mb.size].replace("\n", " "))
        print("}", text_two[mb.b:mb.b + mb.size].replace("\n", " "))
        print()

# get_matching_blocks() always ends with a zero-length sentinel triple
# (len(a), len(b), 0); it is not a real match, so leave it out of the count.
print("difflib: a total of ", len(mbs) - 1, "blocks matching exactly")
print("difflib: ratio ", sm.ratio())
## -----------------
from fuzzywuzzy import fuzz
# Score the two complete texts with fuzzywuzzy's plain `ratio` and report it.
# NOTE(review): presumably an integer similarity in the range 0-100 --
# confirm against the fuzzywuzzy documentation.
simple_ratio = fuzz.ratio(text_one, text_two)
print("fuzzywuzzy: simple ratio =", simple_ratio)
## The remaining fuzzywuzzy ratios take a while to compute, so they are left commented out.
# partial_ratio = fuzz.partial_ratio(text_one, text_two)
# print(partial_ratio)
#
#
# token_sort_ratio = fuzz.token_sort_ratio(text_one, text_two)
# print(token_sort_ratio)
#
#
# token_set_ratio = fuzz.token_set_ratio(text_one, text_two)
# print(token_set_ratio)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment