yardstick17 · January 5, 2017 04:37
diff --git a/caverphone.py b/caverphone.py
 import re


 def metaphone(term):
    "returns metaphone code for a given string"

    # implementation of the original algorithm from Lawrence Philips
    # extended/rewritten by M. Kuhn
    # improvements with thanks to John Machin <[email protected]>

    # define return value
    code = ""
    term_length = len(term)
    if (term_length == 0):
        # empty string ?
        return code
    # end if

    # extension #1 (added 2005-01-28)
    # convert to lowercase
    term = str(term).lower()

    # extension #2 (added 2005-01-28)
    # remove all non-english characters, first
    term = re.sub(r'[^a-z]', '', term)
    if len(term) == 0:
        # nothing left
        return code
    # end if

    # extension #3 (added 2005-01-24)
    # conflate repeated letters
    firstChar = term[0]
    str2 = firstChar
    for x in term:
        if x != str2[-1]:
            str2 = str2 + x
            # end if
    # end for

    # extension #4 (added 2005-01-24)
    # remove any vowels unless a vowel is the first letter
    firstChar = str2[0]
    str3 = firstChar
    for x in str2[1:]:
        if (re.search(r'[^aeiou]', x)):
            str3 = str3 + x
            # end if
    # end for

    term = str3
    term_length = len(term)
    if term_length == 0:
        # nothing left
        return code
    # end if

    # check for exceptions
    if (term_length > 1):
        # get first two characters
        first_chars = term[0:2]

        # build translation table
        table = {
            "ae": "e",
            "gn": "n",
            "kn": "n",
            "pn": "n",
            "wr": "n",
            "wh": "w"
        }

        if first_chars in table.keys():
            term = term[2:]
            code = table[first_chars]
            term_length = len(term)
            # end if

    elif (term[0] == "x"):
        term = ""
        code = "s"
        term_length = 0
    # end if

    # define standard translation table
    st_trans = {
        "b": "b",
        "c": "k",
        "d": "t",
        "g": "k",
        "h": "h",
        "k": "k",
        "p": "p",
        "q": "k",
        "s": "s",
        "t": "t",
        "v": "f",
        "w": "w",
        "x": "ks",
        "y": "y",
        "z": "s"
    }

    i = 0
    while (i < term_length):
        # init character to add, init basic patterns
        add_char = ""
        part_n_2 = ""
        part_n_3 = ""
        part_n_4 = ""
        part_c_2 = ""
        part_c_3 = ""

        # extract a number of patterns, if possible
        if (i < (term_length - 1)):
            part_n_2 = term[i:i + 2]

            if (i > 0):
                part_c_2 = term[i - 1:i + 1]
                part_c_3 = term[i - 1:i + 2]
                # end if
        # end if

        if (i < (term_length - 2)):
            part_n_3 = term[i:i + 3]
        # end if

        if (i < (term_length - 3)):
            part_n_4 = term[i:i + 4]
        # end if

        # use table with conditions for translations
        if (term[i] == "b"):
            add_char = st_trans["b"]
            if (i == (term_length - 1)):
                if (i > 0):
                    if (term[i - 1] == "m"):
                        add_char = ""
                        # end if
                        # end if
                        # end if
        elif (term[i] == "c"):
            add_char = st_trans["c"]
            if (part_n_2 == "ch"):
                add_char = "x"
            elif (re.search(r'c[iey]', part_n_2)):
                add_char = "s"
            # end if

            if (part_n_3 == "cia"):
                add_char = "x"
            # end if

            if (re.search(r'sc[iey]', part_c_3)):
                add_char = ""
                # end if

        elif (term[i] == "d"):
            add_char = st_trans["d"]
            if (re.search(r'dg[eyi]', part_n_3)):
                add_char = "j"
                # end if

        elif (term[i] == "g"):
            add_char = st_trans["g"]

            if (part_n_2 == "gh"):
                if (i == (term_length - 2)):
                    add_char = ""
                    # end if
            elif (re.search(r'gh[aeiouy]', part_n_3)):
                add_char = ""
            elif (part_n_2 == "gn"):
                add_char = ""
            elif (part_n_4 == "gned"):
                add_char = ""
            elif (re.search(r'dg[eyi]', part_c_3)):
                add_char = ""
            elif (part_n_2 == "gi"):
                if (part_c_3 != "ggi"):
                    add_char = "j"
                    # end if
            elif (part_n_2 == "ge"):
                if (part_c_3 != "gge"):
                    add_char = "j"
                    # end if
            elif (part_n_2 == "gy"):
                if (part_c_3 != "ggy"):
                    add_char = "j"
                    # end if
            elif (part_n_2 == "gg"):
                add_char = ""
                # end if
        elif (term[i] == "h"):
            add_char = st_trans["h"]
            if (re.search(r'[aeiouy]h[^aeiouy]', part_c_3)):
                add_char = ""
            elif (re.search(r'[csptg]h', part_c_2)):
                add_char = ""
                # end if
        elif (term[i] == "k"):
            add_char = st_trans["k"]
            if (part_c_2 == "ck"):
                add_char = ""
                # end if
        elif (term[i] == "p"):
            add_char = st_trans["p"]
            if (part_n_2 == "ph"):
                add_char = "f"
                # end if
        elif (term[i] == "q"):
            add_char = st_trans["q"]
        elif (term[i] == "s"):
            add_char = st_trans["s"]
            if (part_n_2 == "sh"):
                add_char = "x"
            # end if

            if (re.search(r'si[ao]', part_n_3)):
                add_char = "x"
                # end if
        elif (term[i] == "t"):
            add_char = st_trans["t"]
            if (part_n_2 == "th"):
                add_char = "0"
            # end if

            if (re.search(r'ti[ao]', part_n_3)):
                add_char = "x"
                # end if
        elif (term[i] == "v"):
            add_char = st_trans["v"]
        elif (term[i] == "w"):
            add_char = st_trans["w"]
            if (re.search(r'w[^aeiouy]', part_n_2)):
                add_char = ""
                # end if
        elif (term[i] == "x"):
            add_char = st_trans["x"]
        elif (term[i] == "y"):
            add_char = st_trans["y"]
        elif (term[i] == "z"):
            add_char = st_trans["z"]
        else:
            # alternative
            add_char = term[i]
        # end if

        code = code + add_char
        i += 1
    # end while

    # return metaphone code
    return code


 def caverphone(term):
    "returns the language key using the caverphone algorithm 2.0"

    # Developed at the University of Otago, New Zealand.
    # Project: Caversham Project (http://caversham.otago.ac.nz)
    # Developer: David Hood, University of Otago, New Zealand
    # Contact: [email protected]
    # Project Technical Paper: http://caversham.otago.ac.nz/files/working/ctp150804.pdf
    # Version 2.0 (2004-08-15)

    code = ""
    term_length = len(term)
    if (term_length == 0):
        # empty string ?
        return code
    # end if

    # convert to lowercase
    code = str(term).lower()

    # remove anything not in the standard alphabet (a-z)
    code = re.sub(r'[^a-z]', '', code)

    # remove final e
    if code.endswith("e"):
        code = code[:-1]

    # if the name starts with cough, rough, tough, enough or trough -> cou2f (rou2f, tou2f, enou2f, trough)
    code = re.sub(r'^([crt]|(en)|(tr))ough', r'\1ou2f', code)

    # if the name starts with gn -> 2n
    code = re.sub(r'^gn', r'2n', code)

    # if the name ends with mb -> m2
    code = re.sub(r'mb$', r'm2', code)

    # replace cq -> 2q
    code = re.sub(r'cq', r'2q', code)

    # replace c[i,e,y] -> s[i,e,y]
    code = re.sub(r'c([iey])', r's\1', code)

    # replace tch -> 2ch
    code = re.sub(r'tch', r'2ch', code)

    # replace c,q,x -> k
    code = re.sub(r'[cqx]', r'k', code)

    # replace v -> f
    code = re.sub(r'v', r'f', code)

    # replace dg -> 2g
    code = re.sub(r'dg', r'2g', code)

    # replace ti[o,a] -> si[o,a]
    code = re.sub(r'ti([oa])', r'si\1', code)

    # replace d -> t
    code = re.sub(r'd', r't', code)

    # replace ph -> fh
    code = re.sub(r'ph', r'fh', code)

    # replace b -> p
    code = re.sub(r'b', r'p', code)

    # replace sh -> s2
    code = re.sub(r'sh', r's2', code)

    # replace z -> s
    code = re.sub(r'z', r's', code)

    # replace initial vowel [aeiou] -> A
    code = re.sub(r'^[aeiou]', r'A', code)

    # replace all other vowels [aeiou] -> 3
    code = re.sub(r'[aeiou]', r'3', code)

    # replace j -> y
    code = re.sub(r'j', r'y', code)

    # replace an initial y3 -> Y3
    code = re.sub(r'^y3', r'Y3', code)

    # replace an initial y -> A
    code = re.sub(r'^y', r'A', code)

    # replace y -> 3
    code = re.sub(r'y', r'3', code)

    # replace 3gh3 -> 3kh3
    code = re.sub(r'3gh3', r'3kh3', code)

    # replace gh -> 22
    code = re.sub(r'gh', r'22', code)

    # replace g -> k
    code = re.sub(r'g', r'k', code)

    # replace groups of s,t,p,k,f,m,n by its single, upper-case equivalent
    for single_letter in ["s", "t", "p", "k", "f", "m", "n"]:
        otherParts = re.split(single_letter + "+", code)
        # code = string.join(otherParts, string.upper(single_letter))
        code = str(single_letter).upper().join(otherParts)

    # replace w[3,h3] by W[3,h3]
    code = re.sub(r'w(h?3)', r'W\1', code)

    # replace final w with 3
    code = re.sub(r'w$', r'3', code)

    # replace w -> 2
    code = re.sub(r'w', r'2', code)

    # replace h at the beginning with an A
    code = re.sub(r'^h', r'A', code)

    # replace all other occurrences of h with a 2
    code = re.sub(r'h', r'2', code)

    # replace r3 with R3
    code = re.sub(r'r3', r'R3', code)

    # replace final r -> 3
    code = re.sub(r'r$', r'3', code)

    # replace r with 2
    code = re.sub(r'r', r'2', code)

    # replace l3 with L3
    code = re.sub(r'l3', r'L3', code)

    # replace final l -> 3
    code = re.sub(r'l$', r'3', code)

    # replace l with 2
    code = re.sub(r'l', r'2', code)

    # remove all 2's
    code = re.sub(r'2', r'', code)

    # replace the final 3 -> A
    code = re.sub(r'3$', r'A', code)

    # remove all 3's
    code = re.sub(r'3', r'', code)

    # extend the code by 10 '1' (one)
    code += '1' * 10

    # take the first 10 characters
    caverphoneCode = code[:10]

    # return caverphone code
    return caverphoneCode
	import re


	def metaphone(term):
	"returns metaphone code for a given string"

	# implementation of the original algorithm from Lawrence Philips
	# extended/rewritten by M. Kuhn
	# improvements with thanks to John Machin <[email protected]>

	# define return value
	code = ""
	term_length = len(term)
	if (term_length == 0):
	# empty string ?
	return code
	# end if

	# extension #1 (added 2005-01-28)
	# convert to lowercase
	term = str(term).lower()

	# extension #2 (added 2005-01-28)
	# remove all non-english characters, first
	term = re.sub(r'[^a-z]', '', term)
	if len(term) == 0:
	# nothing left
	return code
	# end if

	# extension #3 (added 2005-01-24)
	# conflate repeated letters
	firstChar = term[0]
	str2 = firstChar
	for x in term:
	if x != str2[-1]:
	str2 = str2 + x
	# end if
	# end for

	# extension #4 (added 2005-01-24)
	# remove any vowels unless a vowel is the first letter
	firstChar = str2[0]
	str3 = firstChar
	for x in str2[1:]:
	if (re.search(r'[^aeiou]', x)):
	str3 = str3 + x
	# end if
	# end for

	term = str3
	term_length = len(term)
	if term_length == 0:
	# nothing left
	return code
	# end if

	# check for exceptions
	if (term_length > 1):
	# get first two characters
	first_chars = term[0:2]

	# build translation table
	table = {
	"ae": "e",
	"gn": "n",
	"kn": "n",
	"pn": "n",
	"wr": "n",
	"wh": "w"
	}

	if first_chars in table.keys():
	term = term[2:]
	code = table[first_chars]
	term_length = len(term)
	# end if

	elif (term[0] == "x"):
	term = ""
	code = "s"
	term_length = 0
	# end if

	# define standard translation table
	st_trans = {
	"b": "b",
	"c": "k",
	"d": "t",
	"g": "k",
	"h": "h",
	"k": "k",
	"p": "p",
	"q": "k",
	"s": "s",
	"t": "t",
	"v": "f",
	"w": "w",
	"x": "ks",
	"y": "y",
	"z": "s"
	}

	i = 0
	while (i < term_length):
	# init character to add, init basic patterns
	add_char = ""
	part_n_2 = ""
	part_n_3 = ""
	part_n_4 = ""
	part_c_2 = ""
	part_c_3 = ""

	# extract a number of patterns, if possible
	if (i < (term_length - 1)):
	part_n_2 = term[i:i + 2]

	if (i > 0):
	part_c_2 = term[i - 1:i + 1]
	part_c_3 = term[i - 1:i + 2]
	# end if
	# end if

	if (i < (term_length - 2)):
	part_n_3 = term[i:i + 3]
	# end if

	if (i < (term_length - 3)):
	part_n_4 = term[i:i + 4]
	# end if

	# use table with conditions for translations
	if (term[i] == "b"):
	add_char = st_trans["b"]
	if (i == (term_length - 1)):
	if (i > 0):
	if (term[i - 1] == "m"):
	add_char = ""
	# end if
	# end if
	# end if
	elif (term[i] == "c"):
	add_char = st_trans["c"]
	if (part_n_2 == "ch"):
	add_char = "x"
	elif (re.search(r'c[iey]', part_n_2)):
	add_char = "s"
	# end if

	if (part_n_3 == "cia"):
	add_char = "x"
	# end if

	if (re.search(r'sc[iey]', part_c_3)):
	add_char = ""
	# end if

	elif (term[i] == "d"):
	add_char = st_trans["d"]
	if (re.search(r'dg[eyi]', part_n_3)):
	add_char = "j"
	# end if

	elif (term[i] == "g"):
	add_char = st_trans["g"]

	if (part_n_2 == "gh"):
	if (i == (term_length - 2)):
	add_char = ""
	# end if
	elif (re.search(r'gh[aeiouy]', part_n_3)):
	add_char = ""
	elif (part_n_2 == "gn"):
	add_char = ""
	elif (part_n_4 == "gned"):
	add_char = ""
	elif (re.search(r'dg[eyi]', part_c_3)):
	add_char = ""
	elif (part_n_2 == "gi"):
	if (part_c_3 != "ggi"):
	add_char = "j"
	# end if
	elif (part_n_2 == "ge"):
	if (part_c_3 != "gge"):
	add_char = "j"
	# end if
	elif (part_n_2 == "gy"):
	if (part_c_3 != "ggy"):
	add_char = "j"
	# end if
	elif (part_n_2 == "gg"):
	add_char = ""
	# end if
	elif (term[i] == "h"):
	add_char = st_trans["h"]
	if (re.search(r'[aeiouy]h[^aeiouy]', part_c_3)):
	add_char = ""
	elif (re.search(r'[csptg]h', part_c_2)):
	add_char = ""
	# end if
	elif (term[i] == "k"):
	add_char = st_trans["k"]
	if (part_c_2 == "ck"):
	add_char = ""
	# end if
	elif (term[i] == "p"):
	add_char = st_trans["p"]
	if (part_n_2 == "ph"):
	add_char = "f"
	# end if
	elif (term[i] == "q"):
	add_char = st_trans["q"]
	elif (term[i] == "s"):
	add_char = st_trans["s"]
	if (part_n_2 == "sh"):
	add_char = "x"
	# end if

	if (re.search(r'si[ao]', part_n_3)):
	add_char = "x"
	# end if
	elif (term[i] == "t"):
	add_char = st_trans["t"]
	if (part_n_2 == "th"):
	add_char = "0"
	# end if

	if (re.search(r'ti[ao]', part_n_3)):
	add_char = "x"
	# end if
	elif (term[i] == "v"):
	add_char = st_trans["v"]
	elif (term[i] == "w"):
	add_char = st_trans["w"]
	if (re.search(r'w[^aeiouy]', part_n_2)):
	add_char = ""
	# end if
	elif (term[i] == "x"):
	add_char = st_trans["x"]
	elif (term[i] == "y"):
	add_char = st_trans["y"]
	elif (term[i] == "z"):
	add_char = st_trans["z"]
	else:
	# alternative
	add_char = term[i]
	# end if

	code = code + add_char
	i += 1
	# end while

	# return metaphone code
	return code


	def caverphone(term):
	"returns the language key using the caverphone algorithm 2.0"

	# Developed at the University of Otago, New Zealand.
	# Project: Caversham Project (http://caversham.otago.ac.nz)
	# Developer: David Hood, University of Otago, New Zealand
	# Contact: [email protected]
	# Project Technical Paper: http://caversham.otago.ac.nz/files/working/ctp150804.pdf
	# Version 2.0 (2004-08-15)

	code = ""
	term_length = len(term)
	if (term_length == 0):
	# empty string ?
	return code
	# end if

	# convert to lowercase
	code = str(term).lower()

	# remove anything not in the standard alphabet (a-z)
	code = re.sub(r'[^a-z]', '', code)

	# remove final e
	if code.endswith("e"):
	code = code[:-1]

	# if the name starts with cough, rough, tough, enough or trough -> cou2f (rou2f, tou2f, enou2f, trough)
	code = re.sub(r'^([crt]\|(en)\|(tr))ough', r'\1ou2f', code)

	# if the name starts with gn -> 2n
	code = re.sub(r'^gn', r'2n', code)

	# if the name ends with mb -> m2
	code = re.sub(r'mb$', r'm2', code)

	# replace cq -> 2q
	code = re.sub(r'cq', r'2q', code)

	# replace c[i,e,y] -> s[i,e,y]
	code = re.sub(r'c([iey])', r's\1', code)

	# replace tch -> 2ch
	code = re.sub(r'tch', r'2ch', code)

	# replace c,q,x -> k
	code = re.sub(r'[cqx]', r'k', code)

	# replace v -> f
	code = re.sub(r'v', r'f', code)

	# replace dg -> 2g
	code = re.sub(r'dg', r'2g', code)

	# replace ti[o,a] -> si[o,a]
	code = re.sub(r'ti([oa])', r'si\1', code)

	# replace d -> t
	code = re.sub(r'd', r't', code)

	# replace ph -> fh
	code = re.sub(r'ph', r'fh', code)

	# replace b -> p
	code = re.sub(r'b', r'p', code)

	# replace sh -> s2
	code = re.sub(r'sh', r's2', code)

	# replace z -> s
	code = re.sub(r'z', r's', code)

	# replace initial vowel [aeiou] -> A
	code = re.sub(r'^[aeiou]', r'A', code)

	# replace all other vowels [aeiou] -> 3
	code = re.sub(r'[aeiou]', r'3', code)

	# replace j -> y
	code = re.sub(r'j', r'y', code)

	# replace an initial y3 -> Y3
	code = re.sub(r'^y3', r'Y3', code)

	# replace an initial y -> A
	code = re.sub(r'^y', r'A', code)

	# replace y -> 3
	code = re.sub(r'y', r'3', code)

	# replace 3gh3 -> 3kh3
	code = re.sub(r'3gh3', r'3kh3', code)

	# replace gh -> 22
	code = re.sub(r'gh', r'22', code)

	# replace g -> k
	code = re.sub(r'g', r'k', code)

	# replace groups of s,t,p,k,f,m,n by its single, upper-case equivalent
	for single_letter in ["s", "t", "p", "k", "f", "m", "n"]:
	otherParts = re.split(single_letter + "+", code)
	# code = string.join(otherParts, string.upper(single_letter))
	code = str(single_letter).upper().join(otherParts)

	# replace w[3,h3] by W[3,h3]
	code = re.sub(r'w(h?3)', r'W\1', code)

	# replace final w with 3
	code = re.sub(r'w$', r'3', code)

	# replace w -> 2
	code = re.sub(r'w', r'2', code)

	# replace h at the beginning with an A
	code = re.sub(r'^h', r'A', code)

	# replace all other occurrences of h with a 2
	code = re.sub(r'h', r'2', code)

	# replace r3 with R3
	code = re.sub(r'r3', r'R3', code)

	# replace final r -> 3
	code = re.sub(r'r$', r'3', code)

	# replace r with 2
	code = re.sub(r'r', r'2', code)

	# replace l3 with L3
	code = re.sub(r'l3', r'L3', code)

	# replace final l -> 3
	code = re.sub(r'l$', r'3', code)

	# replace l with 2
	code = re.sub(r'l', r'2', code)

	# remove all 2's
	code = re.sub(r'2', r'', code)

	# replace the final 3 -> A
	code = re.sub(r'3$', r'A', code)

	# remove all 3's
	code = re.sub(r'3', r'', code)

	# extend the code by 10 '1' (one)
	code += '1' * 10

	# take the first 10 characters
	caverphoneCode = code[:10]

	# return caverphone code
	return caverphoneCode