Created
May 1, 2024 18:32
-
-
Save tobchen/4e2ad939ca9b16c48ab9d0ec771b8c95 to your computer and use it in GitHub Desktop.
My go at Cologne Phonetics in PostgreSQL
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE FUNCTION cologne_phonetics_preprocess(txt varchar) RETURNS varchar AS $$ | |
BEGIN | |
RETURN replace(replace(replace(replace(upper(txt), 'Ä', 'A'), 'Ö', 'O'), 'Ü', 'U'), 'ß', 'SS'); | |
END; | |
$$ LANGUAGE plpgsql | |
IMMUTABLE | |
RETURNS NULL ON NULL INPUT; | |
CREATE FUNCTION cologne_phonetics_encode_except_cx(ch varchar, next_ch varchar) RETURNS varchar AS $$ | |
DECLARE | |
result varchar; | |
BEGIN | |
IF ch = 'A' OR ch = 'E' OR ch = 'I' OR ch = 'J' OR ch = 'O' OR ch = 'U' OR ch = 'Y' THEN | |
result := '0'; | |
/* ELSIF ch = 'H' THEN | |
result := '-'; */ | |
ELSIF ch = 'B' THEN | |
result := '1'; | |
ELSIF ch = 'F' OR ch = 'V' OR ch = 'W' THEN | |
result := '3'; | |
ELSIF ch = 'G' OR ch = 'K' OR ch = 'Q' THEN | |
result := '4'; | |
ELSIF ch = 'L' THEN | |
result := '5'; | |
ELSIF ch = 'M' OR ch = 'N' THEN | |
result := '6'; | |
ELSIF ch = 'R' THEN | |
result := '7'; | |
ELSIF ch = 'S' OR ch = 'Z' THEN | |
result := '8'; | |
ELSIF ch = 'P' THEN | |
IF next_ch IS NULL OR next_ch != 'H' THEN | |
result := '1'; | |
ELSE | |
result := '3'; | |
END IF; | |
ELSIF ch = 'D' OR ch = 'T' THEN | |
IF next_ch IS NULL OR (next_ch != 'C' AND next_ch != 'S' AND next_ch != 'Z') THEN | |
result := '2'; | |
ELSE | |
result := '8'; | |
END IF; | |
ELSE | |
result := ''; | |
END IF; | |
RETURN result; | |
END; | |
$$ LANGUAGE plpgsql | |
IMMUTABLE | |
CALLED ON NULL INPUT; | |
CREATE FUNCTION cologne_phonetics_encode_only_c(prev_ch varchar, next_ch varchar) RETURNS varchar AS $$ | |
DECLARE | |
result varchar; | |
BEGIN | |
IF prev_ch IS NULL THEN | |
IF next_ch IS NOT NULL AND (next_ch = 'A' OR next_ch = 'H' OR next_ch = 'K' OR next_ch = 'L' | |
OR next_ch = 'O' OR next_ch = 'Q' OR next_ch = 'R' OR next_ch = 'U' OR next_ch = 'X') THEN | |
result := '4'; | |
ELSE | |
result := '8'; | |
END IF; | |
ELSIF prev_ch = 'S' OR prev_ch = 'Z' THEN | |
result := '8'; | |
ELSIF next_ch IS NOT NULL AND (next_ch = 'A' OR next_ch = 'H' OR next_ch = 'K' OR next_ch = 'O' | |
OR next_ch = 'Q' OR next_ch = 'U' OR next_ch = 'X') THEN | |
result := '4'; | |
ELSE | |
result := '8'; | |
END IF; | |
RETURN result; | |
END; | |
$$ LANGUAGE plpgsql | |
IMMUTABLE | |
CALLED ON NULL INPUT; | |
CREATE FUNCTION cologne_phonetics_encode_only_x(prev_ch varchar) RETURNS varchar AS $$ | |
DECLARE | |
result varchar; | |
BEGIN | |
IF prev_ch IS NOT NULL AND (prev_ch = 'C' OR prev_ch = 'K' OR prev_ch = 'Q') THEN | |
result := '48'; | |
ELSE | |
result := '8'; | |
END IF; | |
RETURN result; | |
END; | |
$$ LANGUAGE plpgsql | |
IMMUTABLE | |
CALLED ON NULL INPUT; | |
CREATE FUNCTION cologne_phonetics_encode(prepped varchar) RETURNS varchar AS $$ | |
DECLARE | |
result varchar; | |
prepped_len int; | |
prepped_array varchar[]; | |
i int; | |
prev_ch varchar; | |
ch varchar; | |
next_ch varchar; | |
BEGIN | |
prepped_array := regexp_split_to_array(prepped, ''); | |
prepped_len := array_length(prepped_array, 1); | |
result := ''; | |
i := 1; | |
ch := prepped_array[1]; | |
WHILE i <= prepped_len LOOP | |
IF (i + 1) <= prepped_len THEN | |
next_ch := prepped_array[i + 1]; | |
ELSE | |
next_ch := NULL; | |
END IF; | |
IF ch = 'C' THEN | |
result := result || cologne_phonetics_encode_only_c(prev_ch, next_ch); | |
ELSIF ch = 'X' THEN | |
result := result || cologne_phonetics_encode_only_x(prev_ch); | |
ELSE | |
result := result || cologne_phonetics_encode_except_cx(ch, next_ch); | |
END IF; | |
prev_ch := ch; | |
ch := next_ch; | |
i := i + 1; | |
END LOOP; | |
RETURN result; | |
END; | |
$$ LANGUAGE plpgsql | |
IMMUTABLE | |
RETURNS NULL ON NULL INPUT; | |
CREATE FUNCTION cologne_phonetics_finalize(encoded varchar) RETURNS varchar AS $$ | |
DECLARE | |
encoded_array varchar[]; | |
result varchar; | |
ch varchar; | |
prev_ch varchar; | |
BEGIN | |
encoded_array := regexp_split_to_array(encoded, ''); | |
FOREACH ch IN ARRAY encoded_array | |
LOOP | |
IF prev_ch IS NULL THEN | |
result := ch; | |
prev_ch := ch; | |
ELSIF ch != prev_ch AND ch != '0' THEN | |
result := result || ch; | |
prev_ch := ch; | |
END IF; | |
END LOOP; | |
RETURN result; | |
END; | |
$$ LANGUAGE plpgsql | |
IMMUTABLE | |
RETURNS NULL ON NULL INPUT; | |
CREATE FUNCTION cologne_phonetics(txt varchar) RETURNS varchar AS $$ | |
DECLARE | |
prepped varchar; | |
encoded varchar; | |
result varchar; | |
BEGIN | |
prepped := cologne_phonetics_preprocess(txt); | |
encoded := cologne_phonetics_encode(prepped); | |
result := cologne_phonetics_finalize(encoded); | |
RETURN result; | |
END; | |
$$ LANGUAGE plpgsql | |
IMMUTABLE | |
RETURNS NULL ON NULL INPUT; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment