Skip to content

Instantly share code, notes, and snippets.

@fungiboletus
Created September 20, 2024 10:21
Show Gist options
  • Save fungiboletus/b203e8c9179cb5886de1c631c741ca5d to your computer and use it in GitHub Desktop.
Save fungiboletus/b203e8c9179cb5886de1c631c741ca5d to your computer and use it in GitHub Desktop.
Febrl's sgram algorithm as a standalone version
# =============================================================================
# AUSTRALIAN NATIONAL UNIVERSITY OPEN SOURCE LICENSE (ANUOS LICENSE)
# VERSION 1.3
#
# The contents of this file are subject to the ANUOS License Version 1.3
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# https://sourceforge.net/projects/febrl/
#
# Software distributed under the License is distributed on an "AS IS"
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
# the License for the specific language governing rights and limitations
# under the License.
#
# The Original Software is: "stringcmp.py"
#
# The Initial Developer of the Original Software is:
# Dr Peter Christen (Research School of Computer Science, The Australian
# National University)
#
# Copyright (C) 2002 - 2011 the Australian National University and
# others. All Rights Reserved.
#
# Contributors:
#
# Alternatively, the contents of this file may be used under the terms
# of the GNU General Public License Version 2 or later (the "GPL"), in
# which case the provisions of the GPL are applicable instead of those
# above. The GPL is available at the following URL: http://www.gnu.org/
# If you wish to allow use of your version of this file only under the
# terms of the GPL, and not to allow others to use your version of this
# file under the terms of the ANUOS License, indicate your decision by
# deleting the provisions above and replace them with the notice and
# other provisions required by the GPL. If you do not delete the
# provisions above, a recipient may use your version of this file under
# the terms of any one of the ANUOS License or the GPL.
# =============================================================================
#
# Freely extensible biomedical record linkage (Febrl) - Version 0.4.2
#
# See: http://datamining.anu.edu.au/linkage.html
#
# =============================================================================
QGRAM_START_CHAR = chr(1)
QGRAM_END_CHAR = chr(2)
def sgram(str1, str2, gc=[[0],[0,1],[1,2]], common_divisor='average', padded=True):
"""Return approximate string comparator measure (between 0.0 and 1.0)
using s-grams (skip-grams) with bigrams.
USAGE:
score = sgram(str1, str2, gc, common_divisor, min_threshold, padded)
ARGUMENTS:
str1 The first string
str2 The second string
gc Gram class list (see below).
common_divisor Method of how to calculate the divisor, it can be set to
'average','shortest', or 'longest' , and is calculated
according to the lengths of the two input strings
min_threshold Minimum threshold between 0 and 1
padded If set to True (default), the beginnng and end of the
strings will be padded with (q-1) special characters, if
False no padding will be done.
DESCRIPTION:
Uses s-grams as described in:
"Non-adjacent Digrams Improve Matching of Cross-Lingual Spelling Variants"
by H. Keskustalo, A. Pirkola, K. Visala, E. Leppanen and J. Jarvelin,
SPIRE 2003.
Padding will result in special start and end characters being added at the
beginning and the end of the character, similar to as done for the qgram
and posqgram routines.
"""
# Quick check if the strings are empty or the same
if (str1 == '') or (str2 == ''):
return 0.0
elif (str1 == str2):
return 1.0
# Check if divisor is OK
if (common_divisor not in ['average', 'shortest', 'longest']):
raise ValueError('Illegal value for common divisor: %s' % (common_divisor))
# Extend strings with start and end characters
if padded:
tmp_str1 = QGRAM_START_CHAR + str1 + QGRAM_END_CHAR
tmp_str2 = QGRAM_START_CHAR + str2 + QGRAM_END_CHAR
else:
tmp_str1 = str1
tmp_str2 = str2
len1 = len(tmp_str1)
len2 = len(tmp_str2)
common = 0.0 # Sum number of common s-grams over gram classes
divisor = 0.0 # Sum of divisors over gram classes
# Loop over all gram classes given
for c in gc:
sgram_list1 = []
sgram_list2 = []
for s in c: # Skip distances
for i in range(0, len1 - s - 1):
sgram_list1.append(tmp_str1[i] + tmp_str1[i + s + 1])
for i in range(0, len2 - s - 1):
sgram_list2.append(tmp_str2[i] + tmp_str2[i + s + 1])
num_sgram1 = len(sgram_list1)
num_sgram2 = len(sgram_list2)
if common_divisor == 'average':
this_divisor = 0.5 * (num_sgram1 + num_sgram2) # Average number of s-grams
elif common_divisor == 'shortest':
this_divisor = min(num_sgram1, num_sgram2)
else: # Longest
this_divisor = max(num_sgram1, num_sgram2)
if num_sgram1 < num_sgram2: # Count using the shorter s-gram list
short_sgram_list = sgram_list1
long_sgram_list = sgram_list2
else:
short_sgram_list = sgram_list2
long_sgram_list = sgram_list1
this_common = 0 # Number of common s-grams for this gram class
for s_gram in short_sgram_list:
if s_gram in long_sgram_list:
this_common += 1
long_sgram_list.remove(s_gram) # Remove the counted s-gram
common += this_common
divisor += this_divisor
if divisor == 0: # One string did not have any s-grams
w = 0.0
else:
w = common / divisor
assert (w >= 0.0) and (w <= 1.0), 'Similarity weight outside 0-1: %f' % (w)
return w
def sgram_ci(str1, str2):
return sgram(str1.lower(), str2.lower())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment