fungiboletus · September 20, 2024 10:21
diff --git a/sgram.py b/sgram.py
 # =============================================================================
 # AUSTRALIAN NATIONAL UNIVERSITY OPEN SOURCE LICENSE (ANUOS LICENSE)
 # VERSION 1.3
 # 
 # The contents of this file are subject to the ANUOS License Version 1.3
 # (the "License"); you may not use this file except in compliance with
 # the License. You may obtain a copy of the License at:
 # 
 #   https://sourceforge.net/projects/febrl/
 # 
 # Software distributed under the License is distributed on an "AS IS"
 # basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 # the License for the specific language governing rights and limitations
 # under the License.
 # 
 # The Original Software is: "stringcmp.py"
 # 
 # The Initial Developer of the Original Software is:
 #   Dr Peter Christen (Research School of Computer Science, The Australian
 #                      National University)
 # 
 # Copyright (C) 2002 - 2011 the Australian National University and
 # others. All Rights Reserved.
 # 
 # Contributors:
 # 
 # Alternatively, the contents of this file may be used under the terms
 # of the GNU General Public License Version 2 or later (the "GPL"), in
 # which case the provisions of the GPL are applicable instead of those
 # above. The GPL is available at the following URL: http://www.gnu.org/
 # If you wish to allow use of your version of this file only under the
 # terms of the GPL, and not to allow others to use your version of this
 # file under the terms of the ANUOS License, indicate your decision by
 # deleting the provisions above and replace them with the notice and
 # other provisions required by the GPL. If you do not delete the
 # provisions above, a recipient may use your version of this file under
 # the terms of any one of the ANUOS License or the GPL.
 # =============================================================================
 #
 # Freely extensible biomedical record linkage (Febrl) - Version 0.4.2
 #
 # See: http://datamining.anu.edu.au/linkage.html
 #
 # =============================================================================

 QGRAM_START_CHAR = chr(1)
 QGRAM_END_CHAR = chr(2)

 def sgram(str1, str2, gc=[[0],[0,1],[1,2]], common_divisor='average', padded=True):
    """Return approximate string comparator measure (between 0.0 and 1.0)
       using s-grams (skip-grams) with bigrams.

    USAGE:
      score = sgram(str1, str2, gc, common_divisor, min_threshold, padded)

    ARGUMENTS:
      str1            The first string
      str2            The second string
      gc              Gram class list (see below).
      common_divisor  Method of how to calculate the divisor, it can be set to
                      'average','shortest', or 'longest' , and is calculated
                      according to the lengths of the two input strings
      min_threshold   Minimum threshold between 0 and 1
      padded          If set to True (default), the beginnng and end of the
                      strings will be padded with (q-1) special characters, if
                      False no padding will be done.

    DESCRIPTION:
      Uses s-grams as described in:

      "Non-adjacent Digrams Improve Matching of Cross-Lingual Spelling Variants"
      by H. Keskustalo, A. Pirkola, K. Visala, E. Leppanen and J. Jarvelin,
      SPIRE 2003.

      Padding will result in special start and end characters being added at the
      beginning and the end of the character, similar to as done for the qgram
      and posqgram routines.
    """

    # Quick check if the strings are empty or the same
    if (str1 == '') or (str2 == ''):
        return 0.0
    elif (str1 == str2):
        return 1.0

    # Check if divisor is OK
    if (common_divisor not in ['average', 'shortest', 'longest']):
        raise ValueError('Illegal value for common divisor: %s' % (common_divisor))

    # Extend strings with start and end characters

    if padded:
        tmp_str1 = QGRAM_START_CHAR + str1 + QGRAM_END_CHAR
        tmp_str2 = QGRAM_START_CHAR + str2 + QGRAM_END_CHAR
    else:
        tmp_str1 = str1
        tmp_str2 = str2

    len1 = len(tmp_str1)
    len2 = len(tmp_str2)

    common = 0.0   # Sum number of common s-grams over gram classes
    divisor = 0.0  # Sum of divisors over gram classes

    # Loop over all gram classes given
    for c in gc:
        sgram_list1 = []
        sgram_list2 = []

        for s in c:  # Skip distances
            for i in range(0, len1 - s - 1):
                sgram_list1.append(tmp_str1[i] + tmp_str1[i + s + 1])
            for i in range(0, len2 - s - 1):
                sgram_list2.append(tmp_str2[i] + tmp_str2[i + s + 1])

        num_sgram1 = len(sgram_list1)
        num_sgram2 = len(sgram_list2)

        if common_divisor == 'average':
            this_divisor = 0.5 * (num_sgram1 + num_sgram2)  # Average number of s-grams
        elif common_divisor == 'shortest':
            this_divisor = min(num_sgram1, num_sgram2)
        else:  # Longest
            this_divisor = max(num_sgram1, num_sgram2)

        if num_sgram1 < num_sgram2:  # Count using the shorter s-gram list
            short_sgram_list = sgram_list1
            long_sgram_list = sgram_list2
        else:
            short_sgram_list = sgram_list2
            long_sgram_list = sgram_list1

        this_common = 0  # Number of common s-grams for this gram class

        for s_gram in short_sgram_list:
            if s_gram in long_sgram_list:
                this_common += 1
                long_sgram_list.remove(s_gram)  # Remove the counted s-gram

        common += this_common
        divisor += this_divisor

    if divisor == 0:  # One string did not have any s-grams
        w = 0.0
    else:
        w = common / divisor

    assert (w >= 0.0) and (w <= 1.0), 'Similarity weight outside 0-1: %f' % (w)

    return w

 def sgram_ci(str1, str2):
  return sgram(str1.lower(), str2.lower())
	# =============================================================================
	# AUSTRALIAN NATIONAL UNIVERSITY OPEN SOURCE LICENSE (ANUOS LICENSE)
	# VERSION 1.3
	#
	# The contents of this file are subject to the ANUOS License Version 1.3
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at:
	#
	# https://sourceforge.net/projects/febrl/
	#
	# Software distributed under the License is distributed on an "AS IS"
	# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
	# the License for the specific language governing rights and limitations
	# under the License.
	#
	# The Original Software is: "stringcmp.py"
	#
	# The Initial Developer of the Original Software is:
	# Dr Peter Christen (Research School of Computer Science, The Australian
	# National University)
	#
	# Copyright (C) 2002 - 2011 the Australian National University and
	# others. All Rights Reserved.
	#
	# Contributors:
	#
	# Alternatively, the contents of this file may be used under the terms
	# of the GNU General Public License Version 2 or later (the "GPL"), in
	# which case the provisions of the GPL are applicable instead of those
	# above. The GPL is available at the following URL: http://www.gnu.org/
	# If you wish to allow use of your version of this file only under the
	# terms of the GPL, and not to allow others to use your version of this
	# file under the terms of the ANUOS License, indicate your decision by
	# deleting the provisions above and replace them with the notice and
	# other provisions required by the GPL. If you do not delete the
	# provisions above, a recipient may use your version of this file under
	# the terms of any one of the ANUOS License or the GPL.
	# =============================================================================
	#
	# Freely extensible biomedical record linkage (Febrl) - Version 0.4.2
	#
	# See: http://datamining.anu.edu.au/linkage.html
	#
	# =============================================================================

	QGRAM_START_CHAR = chr(1)
	QGRAM_END_CHAR = chr(2)

	def sgram(str1, str2, gc=[[0],[0,1],[1,2]], common_divisor='average', padded=True):
	"""Return approximate string comparator measure (between 0.0 and 1.0)
	using s-grams (skip-grams) with bigrams.

	USAGE:
	score = sgram(str1, str2, gc, common_divisor, min_threshold, padded)

	ARGUMENTS:
	str1 The first string
	str2 The second string
	gc Gram class list (see below).
	common_divisor Method of how to calculate the divisor, it can be set to
	'average','shortest', or 'longest' , and is calculated
	according to the lengths of the two input strings
	min_threshold Minimum threshold between 0 and 1
	padded If set to True (default), the beginnng and end of the
	strings will be padded with (q-1) special characters, if
	False no padding will be done.

	DESCRIPTION:
	Uses s-grams as described in:

	"Non-adjacent Digrams Improve Matching of Cross-Lingual Spelling Variants"
	by H. Keskustalo, A. Pirkola, K. Visala, E. Leppanen and J. Jarvelin,
	SPIRE 2003.

	Padding will result in special start and end characters being added at the
	beginning and the end of the character, similar to as done for the qgram
	and posqgram routines.
	"""

	# Quick check if the strings are empty or the same
	if (str1 == '') or (str2 == ''):
	return 0.0
	elif (str1 == str2):
	return 1.0

	# Check if divisor is OK
	if (common_divisor not in ['average', 'shortest', 'longest']):
	raise ValueError('Illegal value for common divisor: %s' % (common_divisor))

	# Extend strings with start and end characters

	if padded:
	tmp_str1 = QGRAM_START_CHAR + str1 + QGRAM_END_CHAR
	tmp_str2 = QGRAM_START_CHAR + str2 + QGRAM_END_CHAR
	else:
	tmp_str1 = str1
	tmp_str2 = str2

	len1 = len(tmp_str1)
	len2 = len(tmp_str2)

	common = 0.0 # Sum number of common s-grams over gram classes
	divisor = 0.0 # Sum of divisors over gram classes

	# Loop over all gram classes given
	for c in gc:
	sgram_list1 = []
	sgram_list2 = []

	for s in c: # Skip distances
	for i in range(0, len1 - s - 1):
	sgram_list1.append(tmp_str1[i] + tmp_str1[i + s + 1])
	for i in range(0, len2 - s - 1):
	sgram_list2.append(tmp_str2[i] + tmp_str2[i + s + 1])

	num_sgram1 = len(sgram_list1)
	num_sgram2 = len(sgram_list2)

	if common_divisor == 'average':
	this_divisor = 0.5 * (num_sgram1 + num_sgram2) # Average number of s-grams
	elif common_divisor == 'shortest':
	this_divisor = min(num_sgram1, num_sgram2)
	else: # Longest
	this_divisor = max(num_sgram1, num_sgram2)

	if num_sgram1 < num_sgram2: # Count using the shorter s-gram list
	short_sgram_list = sgram_list1
	long_sgram_list = sgram_list2
	else:
	short_sgram_list = sgram_list2
	long_sgram_list = sgram_list1

	this_common = 0 # Number of common s-grams for this gram class

	for s_gram in short_sgram_list:
	if s_gram in long_sgram_list:
	this_common += 1
	long_sgram_list.remove(s_gram) # Remove the counted s-gram

	common += this_common
	divisor += this_divisor

	if divisor == 0: # One string did not have any s-grams
	w = 0.0
	else:
	w = common / divisor

	assert (w >= 0.0) and (w <= 1.0), 'Similarity weight outside 0-1: %f' % (w)

	return w

	def sgram_ci(str1, str2):
	return sgram(str1.lower(), str2.lower())