Skip to content

Instantly share code, notes, and snippets.

@yardstick17
Created January 5, 2017 04:37
Show Gist options
  • Save yardstick17/ba88ed55f3b26c7d041c8b13287c268c to your computer and use it in GitHub Desktop.
Save yardstick17/ba88ed55f3b26c7d041c8b13287c268c to your computer and use it in GitHub Desktop.
Phonetic Algorithm
import re
def metaphone(term):
"returns metaphone code for a given string"
# implementation of the original algorithm from Lawrence Philips
# extended/rewritten by M. Kuhn
# improvements with thanks to John Machin <[email protected]>
# define return value
code = ""
term_length = len(term)
if (term_length == 0):
# empty string ?
return code
# end if
# extension #1 (added 2005-01-28)
# convert to lowercase
term = str(term).lower()
# extension #2 (added 2005-01-28)
# remove all non-english characters, first
term = re.sub(r'[^a-z]', '', term)
if len(term) == 0:
# nothing left
return code
# end if
# extension #3 (added 2005-01-24)
# conflate repeated letters
firstChar = term[0]
str2 = firstChar
for x in term:
if x != str2[-1]:
str2 = str2 + x
# end if
# end for
# extension #4 (added 2005-01-24)
# remove any vowels unless a vowel is the first letter
firstChar = str2[0]
str3 = firstChar
for x in str2[1:]:
if (re.search(r'[^aeiou]', x)):
str3 = str3 + x
# end if
# end for
term = str3
term_length = len(term)
if term_length == 0:
# nothing left
return code
# end if
# check for exceptions
if (term_length > 1):
# get first two characters
first_chars = term[0:2]
# build translation table
table = {
"ae": "e",
"gn": "n",
"kn": "n",
"pn": "n",
"wr": "n",
"wh": "w"
}
if first_chars in table.keys():
term = term[2:]
code = table[first_chars]
term_length = len(term)
# end if
elif (term[0] == "x"):
term = ""
code = "s"
term_length = 0
# end if
# define standard translation table
st_trans = {
"b": "b",
"c": "k",
"d": "t",
"g": "k",
"h": "h",
"k": "k",
"p": "p",
"q": "k",
"s": "s",
"t": "t",
"v": "f",
"w": "w",
"x": "ks",
"y": "y",
"z": "s"
}
i = 0
while (i < term_length):
# init character to add, init basic patterns
add_char = ""
part_n_2 = ""
part_n_3 = ""
part_n_4 = ""
part_c_2 = ""
part_c_3 = ""
# extract a number of patterns, if possible
if (i < (term_length - 1)):
part_n_2 = term[i:i + 2]
if (i > 0):
part_c_2 = term[i - 1:i + 1]
part_c_3 = term[i - 1:i + 2]
# end if
# end if
if (i < (term_length - 2)):
part_n_3 = term[i:i + 3]
# end if
if (i < (term_length - 3)):
part_n_4 = term[i:i + 4]
# end if
# use table with conditions for translations
if (term[i] == "b"):
add_char = st_trans["b"]
if (i == (term_length - 1)):
if (i > 0):
if (term[i - 1] == "m"):
add_char = ""
# end if
# end if
# end if
elif (term[i] == "c"):
add_char = st_trans["c"]
if (part_n_2 == "ch"):
add_char = "x"
elif (re.search(r'c[iey]', part_n_2)):
add_char = "s"
# end if
if (part_n_3 == "cia"):
add_char = "x"
# end if
if (re.search(r'sc[iey]', part_c_3)):
add_char = ""
# end if
elif (term[i] == "d"):
add_char = st_trans["d"]
if (re.search(r'dg[eyi]', part_n_3)):
add_char = "j"
# end if
elif (term[i] == "g"):
add_char = st_trans["g"]
if (part_n_2 == "gh"):
if (i == (term_length - 2)):
add_char = ""
# end if
elif (re.search(r'gh[aeiouy]', part_n_3)):
add_char = ""
elif (part_n_2 == "gn"):
add_char = ""
elif (part_n_4 == "gned"):
add_char = ""
elif (re.search(r'dg[eyi]', part_c_3)):
add_char = ""
elif (part_n_2 == "gi"):
if (part_c_3 != "ggi"):
add_char = "j"
# end if
elif (part_n_2 == "ge"):
if (part_c_3 != "gge"):
add_char = "j"
# end if
elif (part_n_2 == "gy"):
if (part_c_3 != "ggy"):
add_char = "j"
# end if
elif (part_n_2 == "gg"):
add_char = ""
# end if
elif (term[i] == "h"):
add_char = st_trans["h"]
if (re.search(r'[aeiouy]h[^aeiouy]', part_c_3)):
add_char = ""
elif (re.search(r'[csptg]h', part_c_2)):
add_char = ""
# end if
elif (term[i] == "k"):
add_char = st_trans["k"]
if (part_c_2 == "ck"):
add_char = ""
# end if
elif (term[i] == "p"):
add_char = st_trans["p"]
if (part_n_2 == "ph"):
add_char = "f"
# end if
elif (term[i] == "q"):
add_char = st_trans["q"]
elif (term[i] == "s"):
add_char = st_trans["s"]
if (part_n_2 == "sh"):
add_char = "x"
# end if
if (re.search(r'si[ao]', part_n_3)):
add_char = "x"
# end if
elif (term[i] == "t"):
add_char = st_trans["t"]
if (part_n_2 == "th"):
add_char = "0"
# end if
if (re.search(r'ti[ao]', part_n_3)):
add_char = "x"
# end if
elif (term[i] == "v"):
add_char = st_trans["v"]
elif (term[i] == "w"):
add_char = st_trans["w"]
if (re.search(r'w[^aeiouy]', part_n_2)):
add_char = ""
# end if
elif (term[i] == "x"):
add_char = st_trans["x"]
elif (term[i] == "y"):
add_char = st_trans["y"]
elif (term[i] == "z"):
add_char = st_trans["z"]
else:
# alternative
add_char = term[i]
# end if
code = code + add_char
i += 1
# end while
# return metaphone code
return code
def caverphone(term):
"returns the language key using the caverphone algorithm 2.0"
# Developed at the University of Otago, New Zealand.
# Project: Caversham Project (http://caversham.otago.ac.nz)
# Developer: David Hood, University of Otago, New Zealand
# Contact: [email protected]
# Project Technical Paper: http://caversham.otago.ac.nz/files/working/ctp150804.pdf
# Version 2.0 (2004-08-15)
code = ""
term_length = len(term)
if (term_length == 0):
# empty string ?
return code
# end if
# convert to lowercase
code = str(term).lower()
# remove anything not in the standard alphabet (a-z)
code = re.sub(r'[^a-z]', '', code)
# remove final e
if code.endswith("e"):
code = code[:-1]
# if the name starts with cough, rough, tough, enough or trough -> cou2f (rou2f, tou2f, enou2f, trough)
code = re.sub(r'^([crt]|(en)|(tr))ough', r'\1ou2f', code)
# if the name starts with gn -> 2n
code = re.sub(r'^gn', r'2n', code)
# if the name ends with mb -> m2
code = re.sub(r'mb$', r'm2', code)
# replace cq -> 2q
code = re.sub(r'cq', r'2q', code)
# replace c[i,e,y] -> s[i,e,y]
code = re.sub(r'c([iey])', r's\1', code)
# replace tch -> 2ch
code = re.sub(r'tch', r'2ch', code)
# replace c,q,x -> k
code = re.sub(r'[cqx]', r'k', code)
# replace v -> f
code = re.sub(r'v', r'f', code)
# replace dg -> 2g
code = re.sub(r'dg', r'2g', code)
# replace ti[o,a] -> si[o,a]
code = re.sub(r'ti([oa])', r'si\1', code)
# replace d -> t
code = re.sub(r'd', r't', code)
# replace ph -> fh
code = re.sub(r'ph', r'fh', code)
# replace b -> p
code = re.sub(r'b', r'p', code)
# replace sh -> s2
code = re.sub(r'sh', r's2', code)
# replace z -> s
code = re.sub(r'z', r's', code)
# replace initial vowel [aeiou] -> A
code = re.sub(r'^[aeiou]', r'A', code)
# replace all other vowels [aeiou] -> 3
code = re.sub(r'[aeiou]', r'3', code)
# replace j -> y
code = re.sub(r'j', r'y', code)
# replace an initial y3 -> Y3
code = re.sub(r'^y3', r'Y3', code)
# replace an initial y -> A
code = re.sub(r'^y', r'A', code)
# replace y -> 3
code = re.sub(r'y', r'3', code)
# replace 3gh3 -> 3kh3
code = re.sub(r'3gh3', r'3kh3', code)
# replace gh -> 22
code = re.sub(r'gh', r'22', code)
# replace g -> k
code = re.sub(r'g', r'k', code)
# replace groups of s,t,p,k,f,m,n by its single, upper-case equivalent
for single_letter in ["s", "t", "p", "k", "f", "m", "n"]:
otherParts = re.split(single_letter + "+", code)
# code = string.join(otherParts, string.upper(single_letter))
code = str(single_letter).upper().join(otherParts)
# replace w[3,h3] by W[3,h3]
code = re.sub(r'w(h?3)', r'W\1', code)
# replace final w with 3
code = re.sub(r'w$', r'3', code)
# replace w -> 2
code = re.sub(r'w', r'2', code)
# replace h at the beginning with an A
code = re.sub(r'^h', r'A', code)
# replace all other occurrences of h with a 2
code = re.sub(r'h', r'2', code)
# replace r3 with R3
code = re.sub(r'r3', r'R3', code)
# replace final r -> 3
code = re.sub(r'r$', r'3', code)
# replace r with 2
code = re.sub(r'r', r'2', code)
# replace l3 with L3
code = re.sub(r'l3', r'L3', code)
# replace final l -> 3
code = re.sub(r'l$', r'3', code)
# replace l with 2
code = re.sub(r'l', r'2', code)
# remove all 2's
code = re.sub(r'2', r'', code)
# replace the final 3 -> A
code = re.sub(r'3$', r'A', code)
# remove all 3's
code = re.sub(r'3', r'', code)
# extend the code by 10 '1' (one)
code += '1' * 10
# take the first 10 characters
caverphoneCode = code[:10]
# return caverphone code
return caverphoneCode
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment