Created
January 5, 2017 04:37
-
-
Save yardstick17/ba88ed55f3b26c7d041c8b13287c268c to your computer and use it in GitHub Desktop.
Phonetic Algorithm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
def metaphone(term): | |
"returns metaphone code for a given string" | |
# implementation of the original algorithm from Lawrence Philips | |
# extended/rewritten by M. Kuhn | |
# improvements with thanks to John Machin <[email protected]> | |
# define return value | |
code = "" | |
term_length = len(term) | |
if (term_length == 0): | |
# empty string ? | |
return code | |
# end if | |
# extension #1 (added 2005-01-28) | |
# convert to lowercase | |
term = str(term).lower() | |
# extension #2 (added 2005-01-28) | |
# remove all non-english characters, first | |
term = re.sub(r'[^a-z]', '', term) | |
if len(term) == 0: | |
# nothing left | |
return code | |
# end if | |
# extension #3 (added 2005-01-24) | |
# conflate repeated letters | |
firstChar = term[0] | |
str2 = firstChar | |
for x in term: | |
if x != str2[-1]: | |
str2 = str2 + x | |
# end if | |
# end for | |
# extension #4 (added 2005-01-24) | |
# remove any vowels unless a vowel is the first letter | |
firstChar = str2[0] | |
str3 = firstChar | |
for x in str2[1:]: | |
if (re.search(r'[^aeiou]', x)): | |
str3 = str3 + x | |
# end if | |
# end for | |
term = str3 | |
term_length = len(term) | |
if term_length == 0: | |
# nothing left | |
return code | |
# end if | |
# check for exceptions | |
if (term_length > 1): | |
# get first two characters | |
first_chars = term[0:2] | |
# build translation table | |
table = { | |
"ae": "e", | |
"gn": "n", | |
"kn": "n", | |
"pn": "n", | |
"wr": "n", | |
"wh": "w" | |
} | |
if first_chars in table.keys(): | |
term = term[2:] | |
code = table[first_chars] | |
term_length = len(term) | |
# end if | |
elif (term[0] == "x"): | |
term = "" | |
code = "s" | |
term_length = 0 | |
# end if | |
# define standard translation table | |
st_trans = { | |
"b": "b", | |
"c": "k", | |
"d": "t", | |
"g": "k", | |
"h": "h", | |
"k": "k", | |
"p": "p", | |
"q": "k", | |
"s": "s", | |
"t": "t", | |
"v": "f", | |
"w": "w", | |
"x": "ks", | |
"y": "y", | |
"z": "s" | |
} | |
i = 0 | |
while (i < term_length): | |
# init character to add, init basic patterns | |
add_char = "" | |
part_n_2 = "" | |
part_n_3 = "" | |
part_n_4 = "" | |
part_c_2 = "" | |
part_c_3 = "" | |
# extract a number of patterns, if possible | |
if (i < (term_length - 1)): | |
part_n_2 = term[i:i + 2] | |
if (i > 0): | |
part_c_2 = term[i - 1:i + 1] | |
part_c_3 = term[i - 1:i + 2] | |
# end if | |
# end if | |
if (i < (term_length - 2)): | |
part_n_3 = term[i:i + 3] | |
# end if | |
if (i < (term_length - 3)): | |
part_n_4 = term[i:i + 4] | |
# end if | |
# use table with conditions for translations | |
if (term[i] == "b"): | |
add_char = st_trans["b"] | |
if (i == (term_length - 1)): | |
if (i > 0): | |
if (term[i - 1] == "m"): | |
add_char = "" | |
# end if | |
# end if | |
# end if | |
elif (term[i] == "c"): | |
add_char = st_trans["c"] | |
if (part_n_2 == "ch"): | |
add_char = "x" | |
elif (re.search(r'c[iey]', part_n_2)): | |
add_char = "s" | |
# end if | |
if (part_n_3 == "cia"): | |
add_char = "x" | |
# end if | |
if (re.search(r'sc[iey]', part_c_3)): | |
add_char = "" | |
# end if | |
elif (term[i] == "d"): | |
add_char = st_trans["d"] | |
if (re.search(r'dg[eyi]', part_n_3)): | |
add_char = "j" | |
# end if | |
elif (term[i] == "g"): | |
add_char = st_trans["g"] | |
if (part_n_2 == "gh"): | |
if (i == (term_length - 2)): | |
add_char = "" | |
# end if | |
elif (re.search(r'gh[aeiouy]', part_n_3)): | |
add_char = "" | |
elif (part_n_2 == "gn"): | |
add_char = "" | |
elif (part_n_4 == "gned"): | |
add_char = "" | |
elif (re.search(r'dg[eyi]', part_c_3)): | |
add_char = "" | |
elif (part_n_2 == "gi"): | |
if (part_c_3 != "ggi"): | |
add_char = "j" | |
# end if | |
elif (part_n_2 == "ge"): | |
if (part_c_3 != "gge"): | |
add_char = "j" | |
# end if | |
elif (part_n_2 == "gy"): | |
if (part_c_3 != "ggy"): | |
add_char = "j" | |
# end if | |
elif (part_n_2 == "gg"): | |
add_char = "" | |
# end if | |
elif (term[i] == "h"): | |
add_char = st_trans["h"] | |
if (re.search(r'[aeiouy]h[^aeiouy]', part_c_3)): | |
add_char = "" | |
elif (re.search(r'[csptg]h', part_c_2)): | |
add_char = "" | |
# end if | |
elif (term[i] == "k"): | |
add_char = st_trans["k"] | |
if (part_c_2 == "ck"): | |
add_char = "" | |
# end if | |
elif (term[i] == "p"): | |
add_char = st_trans["p"] | |
if (part_n_2 == "ph"): | |
add_char = "f" | |
# end if | |
elif (term[i] == "q"): | |
add_char = st_trans["q"] | |
elif (term[i] == "s"): | |
add_char = st_trans["s"] | |
if (part_n_2 == "sh"): | |
add_char = "x" | |
# end if | |
if (re.search(r'si[ao]', part_n_3)): | |
add_char = "x" | |
# end if | |
elif (term[i] == "t"): | |
add_char = st_trans["t"] | |
if (part_n_2 == "th"): | |
add_char = "0" | |
# end if | |
if (re.search(r'ti[ao]', part_n_3)): | |
add_char = "x" | |
# end if | |
elif (term[i] == "v"): | |
add_char = st_trans["v"] | |
elif (term[i] == "w"): | |
add_char = st_trans["w"] | |
if (re.search(r'w[^aeiouy]', part_n_2)): | |
add_char = "" | |
# end if | |
elif (term[i] == "x"): | |
add_char = st_trans["x"] | |
elif (term[i] == "y"): | |
add_char = st_trans["y"] | |
elif (term[i] == "z"): | |
add_char = st_trans["z"] | |
else: | |
# alternative | |
add_char = term[i] | |
# end if | |
code = code + add_char | |
i += 1 | |
# end while | |
# return metaphone code | |
return code | |
def caverphone(term): | |
"returns the language key using the caverphone algorithm 2.0" | |
# Developed at the University of Otago, New Zealand. | |
# Project: Caversham Project (http://caversham.otago.ac.nz) | |
# Developer: David Hood, University of Otago, New Zealand | |
# Contact: [email protected] | |
# Project Technical Paper: http://caversham.otago.ac.nz/files/working/ctp150804.pdf | |
# Version 2.0 (2004-08-15) | |
code = "" | |
term_length = len(term) | |
if (term_length == 0): | |
# empty string ? | |
return code | |
# end if | |
# convert to lowercase | |
code = str(term).lower() | |
# remove anything not in the standard alphabet (a-z) | |
code = re.sub(r'[^a-z]', '', code) | |
# remove final e | |
if code.endswith("e"): | |
code = code[:-1] | |
# if the name starts with cough, rough, tough, enough or trough -> cou2f (rou2f, tou2f, enou2f, trough) | |
code = re.sub(r'^([crt]|(en)|(tr))ough', r'\1ou2f', code) | |
# if the name starts with gn -> 2n | |
code = re.sub(r'^gn', r'2n', code) | |
# if the name ends with mb -> m2 | |
code = re.sub(r'mb$', r'm2', code) | |
# replace cq -> 2q | |
code = re.sub(r'cq', r'2q', code) | |
# replace c[i,e,y] -> s[i,e,y] | |
code = re.sub(r'c([iey])', r's\1', code) | |
# replace tch -> 2ch | |
code = re.sub(r'tch', r'2ch', code) | |
# replace c,q,x -> k | |
code = re.sub(r'[cqx]', r'k', code) | |
# replace v -> f | |
code = re.sub(r'v', r'f', code) | |
# replace dg -> 2g | |
code = re.sub(r'dg', r'2g', code) | |
# replace ti[o,a] -> si[o,a] | |
code = re.sub(r'ti([oa])', r'si\1', code) | |
# replace d -> t | |
code = re.sub(r'd', r't', code) | |
# replace ph -> fh | |
code = re.sub(r'ph', r'fh', code) | |
# replace b -> p | |
code = re.sub(r'b', r'p', code) | |
# replace sh -> s2 | |
code = re.sub(r'sh', r's2', code) | |
# replace z -> s | |
code = re.sub(r'z', r's', code) | |
# replace initial vowel [aeiou] -> A | |
code = re.sub(r'^[aeiou]', r'A', code) | |
# replace all other vowels [aeiou] -> 3 | |
code = re.sub(r'[aeiou]', r'3', code) | |
# replace j -> y | |
code = re.sub(r'j', r'y', code) | |
# replace an initial y3 -> Y3 | |
code = re.sub(r'^y3', r'Y3', code) | |
# replace an initial y -> A | |
code = re.sub(r'^y', r'A', code) | |
# replace y -> 3 | |
code = re.sub(r'y', r'3', code) | |
# replace 3gh3 -> 3kh3 | |
code = re.sub(r'3gh3', r'3kh3', code) | |
# replace gh -> 22 | |
code = re.sub(r'gh', r'22', code) | |
# replace g -> k | |
code = re.sub(r'g', r'k', code) | |
# replace groups of s,t,p,k,f,m,n by its single, upper-case equivalent | |
for single_letter in ["s", "t", "p", "k", "f", "m", "n"]: | |
otherParts = re.split(single_letter + "+", code) | |
# code = string.join(otherParts, string.upper(single_letter)) | |
code = str(single_letter).upper().join(otherParts) | |
# replace w[3,h3] by W[3,h3] | |
code = re.sub(r'w(h?3)', r'W\1', code) | |
# replace final w with 3 | |
code = re.sub(r'w$', r'3', code) | |
# replace w -> 2 | |
code = re.sub(r'w', r'2', code) | |
# replace h at the beginning with an A | |
code = re.sub(r'^h', r'A', code) | |
# replace all other occurrences of h with a 2 | |
code = re.sub(r'h', r'2', code) | |
# replace r3 with R3 | |
code = re.sub(r'r3', r'R3', code) | |
# replace final r -> 3 | |
code = re.sub(r'r$', r'3', code) | |
# replace r with 2 | |
code = re.sub(r'r', r'2', code) | |
# replace l3 with L3 | |
code = re.sub(r'l3', r'L3', code) | |
# replace final l -> 3 | |
code = re.sub(r'l$', r'3', code) | |
# replace l with 2 | |
code = re.sub(r'l', r'2', code) | |
# remove all 2's | |
code = re.sub(r'2', r'', code) | |
# replace the final 3 -> A | |
code = re.sub(r'3$', r'A', code) | |
# remove all 3's | |
code = re.sub(r'3', r'', code) | |
# extend the code by 10 '1' (one) | |
code += '1' * 10 | |
# take the first 10 characters | |
caverphoneCode = code[:10] | |
# return caverphone code | |
return caverphoneCode |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment