Last active
November 9, 2017 01:14
-
-
Save schcriher/7388722 to your computer and use it in GitHub Desktop.
Calcula la diferencia entre dos textos
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
#-*- coding: utf-8 -*-
#
# Copyright (C) 2013-2017 Cristian Hernán Schmidt
#
# texdiff.py is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# texdiff.py is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with texdiff.py. If not, see <http://www.gnu.org/licenses/>.
import re | |
import unicodedata | |
# Matches any single character that is not a regex "word" character (\w).
# Raw string: "\w" is not a valid escape sequence in a normal string literal.
NO_WORD_RE = re.compile(r'[^\w]')
def remove_diacritics(string):
    """Return *string* with its nonspacing marks stripped.

    The text is first decomposed with NFKD normalization, which splits an
    accented letter into a base letter followed by its marks; every
    character whose Unicode category is ``Mn`` is then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', string)
    # Translation table mapping each nonspacing mark present to "delete".
    marks = {
        ord(char): None
        for char in set(decomposed)
        if unicodedata.category(char) == 'Mn'
    }
    return decomposed.translate(marks)
def get_positions(string, letter, offset=0):
    """Yield each index at which *letter* occurs in *string*.

    Items of *string* are compared to *letter* one at a time, in order,
    and *offset* is added to every index before it is yielded.
    """
    yield from (
        index + offset
        for index, char in enumerate(string)
        if char == letter
    )
def texdiff(a, b, insensitive=True, accents=False, onlyword=False):
    """Return the fraction of difference between the texts "a" and "b".

    Two penalties are accumulated per distinct letter: how much its number
    of occurrences differs between the texts, and how many of its positions
    are not shared by both. The sum is divided by twice the combined length,
    so an incorrect letter is penalized more in a short word than in a
    long one.

    Parameters:
        insensitive=True    Lower-case both texts before comparing.
        accents=False       When false, diacritics are removed first, so
                            there is no distinction between words with and
                            without accents.
        onlyword=False      When true, only the characters considered
                            "word" characters by regular expressions are
                            analyzed; everything else is discarded.

    Returns:
        A float "fraction" with 0 <= fraction <= 1:
            0.0  Zero difference, "a" and "b" are the same (this includes
                 the case where both texts are empty after normalization)
            0.5  Half difference, example: the same letters in another order
            1.0  Full difference, no letter matches

    Use:
        fraction = texdiff(a, b)

    Design: Schmidt Cristian Hernán
    """
    if insensitive:
        a = a.lower()
        b = b.lower()
    if not accents:
        a = remove_diacritics(a)
        b = remove_diacritics(b)
    if onlyword:
        a = NO_WORD_RE.sub('', a)
        b = NO_WORD_RE.sub('', b)

    n = len(a) + len(b)
    if n == 0:
        # Both texts are empty, hence identical; the ratio would be 0 / 0.
        return 0.0

    # When one text is contained in the other, shift its positions to where
    # it occurs so the shared part lines up. The membership test guards
    # against the -1 that str.find returns when there is no match.
    a_offset = b.find(a) if a in b else 0
    b_offset = a.find(b) if b in a else 0

    diff_quantity = 0
    diff_position = 0
    for letter in set(a + b):
        pos_a = set(get_positions(a, letter, a_offset))
        pos_b = set(get_positions(b, letter, b_offset))
        # Positions are unique, so each set's size is the occurrence count.
        diff_quantity += abs(len(pos_a) - len(pos_b))
        diff_position += len(pos_a ^ pos_b)
    return (diff_quantity + diff_position) / (2 * n)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment