Skip to content

Instantly share code, notes, and snippets.

@alexeyev
Created November 27, 2020 12:29
Show Gist options
  • Save alexeyev/d1de72c1041f7683a97dd5dfbd19d1b6 to your computer and use it in GitHub Desktop.
Save alexeyev/d1de72c1041f7683a97dd5dfbd19d1b6 to your computer and use it in GitHub Desktop.
# coding: utf-8
from difflib import SequenceMatcher
def normalize_whitespace(text):
    """Return *text* stripped, with every whitespace run collapsed to one space.

    ``str.split()`` with no arguments splits on any run of whitespace
    (newlines, tabs, repeated spaces) and drops leading/trailing whitespace,
    so re-joining with a single space normalizes the text for comparison.
    """
    return " ".join(text.split())


def read_normalized(path, encoding=None):
    """Read the file at *path* and return its whitespace-normalized text.

    The file is opened read-only and closed as soon as it has been read.
    ``encoding=None`` keeps the platform default encoding.
    """
    with open(path, "r", encoding=encoding) as handle:
        return normalize_whitespace(handle.read())


def is_insignificant_gap(gap_a, gap_b, min_gap=10):
    """Tell whether the unmatched text between two matching blocks is noise.

    A gap is insignificant when both sides are equal after stripping
    whitespace, or when both stripped sides are shorter than *min_gap*
    characters.
    """
    core_a, core_b = gap_a.strip(), gap_b.strip()
    return core_a == core_b or (len(core_a) < min_gap and len(core_b) < min_gap)


def compare_texts(t0, t1, min_gap=10, verbose=True):
    """Compare two strings and return ``(ratio, true_ratio)``.

    ``ratio`` is ``SequenceMatcher.ratio()`` for the two strings.
    ``true_ratio`` is a variant in which insignificant gaps between matching
    blocks (see :func:`is_insignificant_gap`) are counted as if they matched.

    When *verbose* is true, every significant gap is printed together with
    the matching block that follows it.
    """
    matcher = SequenceMatcher(a=t0, b=t1)
    ratio = matcher.ratio()

    count_match = 0
    count_all = 0
    # Offsets just past the previous matching block in each text.
    end_a = 0
    end_b = 0

    # get_matching_blocks() ends with a zero-size block at (len(a), len(b)),
    # so the trailing unmatched text is handled by the same loop body.
    for mb in matcher.get_matching_blocks():
        gap_a = t0[end_a:mb.a]
        gap_b = t1[end_b:mb.b]

        if is_insignificant_gap(gap_a, gap_b, min_gap):
            # Count the gap as fully matched, weighted by its longer side.
            weight = 2 * max(len(gap_a), len(gap_b))
            count_match += weight
            count_all += weight
        else:
            if verbose:
                print()
                print("== Meaningful?== ")
                print(gap_a, "] [", t0[mb.a:mb.a + mb.size])
                print(gap_b, "] [", t1[mb.b:mb.b + mb.size])
                print("YES!")
            # A real difference: it enlarges the total but not the match count.
            count_all += len(gap_a) + len(gap_b)

        # The matching block itself is present in both texts.
        count_match += 2 * mb.size
        count_all += 2 * mb.size

        end_a = mb.a + mb.size
        end_b = mb.b + mb.size

    # Two empty texts produce no weight at all; report them as identical,
    # as SequenceMatcher.ratio() does, instead of dividing by zero.
    true_ratio = count_match / count_all if count_all else 1.0
    return ratio, true_ratio


def main():
    """Compare ``text0.txt`` with ``text1.txt`` and print both ratios."""
    t0 = read_normalized("text0.txt")
    t1 = read_normalized("text1.txt")
    ratio, true_ratio = compare_texts(t0, t1)
    print(ratio)
    print("TRUE RATIO:", true_ratio)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment