Created
September 20, 2024 10:21
-
-
Save fungiboletus/b203e8c9179cb5886de1c631c741ca5d to your computer and use it in GitHub Desktop.
Febrl's sgram algorithm as a standalone version
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ============================================================================= | |
# AUSTRALIAN NATIONAL UNIVERSITY OPEN SOURCE LICENSE (ANUOS LICENSE) | |
# VERSION 1.3 | |
# | |
# The contents of this file are subject to the ANUOS License Version 1.3 | |
# (the "License"); you may not use this file except in compliance with | |
# the License. You may obtain a copy of the License at: | |
# | |
# https://sourceforge.net/projects/febrl/ | |
# | |
# Software distributed under the License is distributed on an "AS IS" | |
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See | |
# the License for the specific language governing rights and limitations | |
# under the License. | |
# | |
# The Original Software is: "stringcmp.py" | |
# | |
# The Initial Developer of the Original Software is: | |
# Dr Peter Christen (Research School of Computer Science, The Australian | |
# National University) | |
# | |
# Copyright (C) 2002 - 2011 the Australian National University and | |
# others. All Rights Reserved. | |
# | |
# Contributors: | |
# | |
# Alternatively, the contents of this file may be used under the terms | |
# of the GNU General Public License Version 2 or later (the "GPL"), in | |
# which case the provisions of the GPL are applicable instead of those | |
# above. The GPL is available at the following URL: http://www.gnu.org/ | |
# If you wish to allow use of your version of this file only under the | |
# terms of the GPL, and not to allow others to use your version of this | |
# file under the terms of the ANUOS License, indicate your decision by | |
# deleting the provisions above and replace them with the notice and | |
# other provisions required by the GPL. If you do not delete the | |
# provisions above, a recipient may use your version of this file under | |
# the terms of any one of the ANUOS License or the GPL. | |
# ============================================================================= | |
# | |
# Freely extensible biomedical record linkage (Febrl) - Version 0.4.2 | |
# | |
# See: http://datamining.anu.edu.au/linkage.html | |
# | |
# ============================================================================= | |
QGRAM_START_CHAR = chr(1) | |
QGRAM_END_CHAR = chr(2) | |
def sgram(str1, str2, gc=[[0],[0,1],[1,2]], common_divisor='average', padded=True): | |
"""Return approximate string comparator measure (between 0.0 and 1.0) | |
using s-grams (skip-grams) with bigrams. | |
USAGE: | |
score = sgram(str1, str2, gc, common_divisor, min_threshold, padded) | |
ARGUMENTS: | |
str1 The first string | |
str2 The second string | |
gc Gram class list (see below). | |
common_divisor Method of how to calculate the divisor, it can be set to | |
'average','shortest', or 'longest' , and is calculated | |
according to the lengths of the two input strings | |
min_threshold Minimum threshold between 0 and 1 | |
padded If set to True (default), the beginnng and end of the | |
strings will be padded with (q-1) special characters, if | |
False no padding will be done. | |
DESCRIPTION: | |
Uses s-grams as described in: | |
"Non-adjacent Digrams Improve Matching of Cross-Lingual Spelling Variants" | |
by H. Keskustalo, A. Pirkola, K. Visala, E. Leppanen and J. Jarvelin, | |
SPIRE 2003. | |
Padding will result in special start and end characters being added at the | |
beginning and the end of the character, similar to as done for the qgram | |
and posqgram routines. | |
""" | |
# Quick check if the strings are empty or the same | |
if (str1 == '') or (str2 == ''): | |
return 0.0 | |
elif (str1 == str2): | |
return 1.0 | |
# Check if divisor is OK | |
if (common_divisor not in ['average', 'shortest', 'longest']): | |
raise ValueError('Illegal value for common divisor: %s' % (common_divisor)) | |
# Extend strings with start and end characters | |
if padded: | |
tmp_str1 = QGRAM_START_CHAR + str1 + QGRAM_END_CHAR | |
tmp_str2 = QGRAM_START_CHAR + str2 + QGRAM_END_CHAR | |
else: | |
tmp_str1 = str1 | |
tmp_str2 = str2 | |
len1 = len(tmp_str1) | |
len2 = len(tmp_str2) | |
common = 0.0 # Sum number of common s-grams over gram classes | |
divisor = 0.0 # Sum of divisors over gram classes | |
# Loop over all gram classes given | |
for c in gc: | |
sgram_list1 = [] | |
sgram_list2 = [] | |
for s in c: # Skip distances | |
for i in range(0, len1 - s - 1): | |
sgram_list1.append(tmp_str1[i] + tmp_str1[i + s + 1]) | |
for i in range(0, len2 - s - 1): | |
sgram_list2.append(tmp_str2[i] + tmp_str2[i + s + 1]) | |
num_sgram1 = len(sgram_list1) | |
num_sgram2 = len(sgram_list2) | |
if common_divisor == 'average': | |
this_divisor = 0.5 * (num_sgram1 + num_sgram2) # Average number of s-grams | |
elif common_divisor == 'shortest': | |
this_divisor = min(num_sgram1, num_sgram2) | |
else: # Longest | |
this_divisor = max(num_sgram1, num_sgram2) | |
if num_sgram1 < num_sgram2: # Count using the shorter s-gram list | |
short_sgram_list = sgram_list1 | |
long_sgram_list = sgram_list2 | |
else: | |
short_sgram_list = sgram_list2 | |
long_sgram_list = sgram_list1 | |
this_common = 0 # Number of common s-grams for this gram class | |
for s_gram in short_sgram_list: | |
if s_gram in long_sgram_list: | |
this_common += 1 | |
long_sgram_list.remove(s_gram) # Remove the counted s-gram | |
common += this_common | |
divisor += this_divisor | |
if divisor == 0: # One string did not have any s-grams | |
w = 0.0 | |
else: | |
w = common / divisor | |
assert (w >= 0.0) and (w <= 1.0), 'Similarity weight outside 0-1: %f' % (w) | |
return w | |
def sgram_ci(str1, str2): | |
return sgram(str1.lower(), str2.lower()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment