Skip to content

Instantly share code, notes, and snippets.

@dawu76
Created April 13, 2024 00:40
Show Gist options
  • Save dawu76/ad23809f06e6efc2db28fd8b12e14d72 to your computer and use it in GitHub Desktop.
Save dawu76/ad23809f06e6efc2db28fd8b12e14d72 to your computer and use it in GitHub Desktop.
illustration of BigQuery normalization options
-- https://medium.com/codex/normalize-and-casefold-in-bigquery-675c670976b0
-- example of two strings that are converted to same string under all normalization modes
SELECT a, b,
NORMALIZE_AND_CASEFOLD(a, NFD) as a_nfd,
NORMALIZE_AND_CASEFOLD(b, NFD) AS b_nfd,
NORMALIZE_AND_CASEFOLD(a, NFC) as a_nfc,
NORMALIZE_AND_CASEFOLD(b, NFC) AS b_nfc,
NORMALIZE_AND_CASEFOLD(a, NFKD) as a_nfkd,
NORMALIZE_AND_CASEFOLD(b, NFKD) AS b_nkfd,
NORMALIZE_AND_CASEFOLD(a, NFKC) as a_nkfc,
NORMALIZE_AND_CASEFOLD(b, NFKC) AS b_nkfc
FROM (SELECT "A\u0308\uFB03n" AS a, "Ä\uFB03n" AS b) t;
-- [{
-- "a": "Äffin",
-- "b": "Äffin",
-- "a_nfd": "äffin", # 'a_nfd' and 'b_nfd' are equal
-- "b_nfd": "äffin",
-- "a_nfc": "äffin", # 'a_nfc' and 'b_nfc' are equal
-- "b_nfc": "äffin",
-- "a_nfkd": "äffin", # 'a_nfkd' and b_nkfd' are equal
-- "b_nkfd": "äffin",
-- "a_nkfc": "äffin", # 'a_nkfc' and 'b_nkfc' are equal
-- "b_nkfc": "äffin"
-- }]
-- example of two strings that remain different under certain normalization modes
SELECT a, b,
NORMALIZE_AND_CASEFOLD(a, NFD) as a_nfd,
NORMALIZE_AND_CASEFOLD(b, NFD) AS b_nfd,
NORMALIZE_AND_CASEFOLD(a, NFC) as a_nfc,
NORMALIZE_AND_CASEFOLD(b, NFC) AS b_nfc,
NORMALIZE_AND_CASEFOLD(a, NFKD) as a_nfkd,
NORMALIZE_AND_CASEFOLD(b, NFKD) AS b_nkfd,
NORMALIZE_AND_CASEFOLD(a, NFKC) as a_nkfc,
NORMALIZE_AND_CASEFOLD(b, NFKC) AS b_nkfc
FROM (SELECT '\u2168' AS a, 'IX' AS b) t;
-- [{
-- "a": "Ⅸ",
-- "b": "IX",
-- "a_nfd": "ⅸ", # 'a_nfd' != 'b_nfd'
-- "b_nfd": "ix",
-- "a_nfc": "ⅸ", # 'a_nfc' != 'b_nfc'
-- "b_nfc": "ix",
-- "a_nfkd": "ix", # 'a_nfkd' == 'b_nkfd'
-- "b_nkfd": "ix",
-- "a_nkfc": "ix", # 'a_nkfc' == 'b_nkfc'
-- "b_nkfc": "ix"
-- }]
SELECT a, b,
REGEXP_REPLACE(NORMALIZE_AND_CASEFOLD(a, NFD), r'\pM', '') as a_nfd,
REGEXP_REPLACE(NORMALIZE_AND_CASEFOLD(b, NFD), r'\pM', '') AS b_nfd,
REGEXP_REPLACE(NORMALIZE_AND_CASEFOLD(a, NFC), r'\pM', '') as a_nfc,
REGEXP_REPLACE(NORMALIZE_AND_CASEFOLD(b, NFC), r'\pM', '') AS b_nfc,
REGEXP_REPLACE(NORMALIZE_AND_CASEFOLD(a, NFKD), r'\pM', '') as a_nfkd,
REGEXP_REPLACE(NORMALIZE_AND_CASEFOLD(b, NFKD), r'\pM', '') AS b_nkfd,
REGEXP_REPLACE(NORMALIZE_AND_CASEFOLD(a, NFKC), r'\pM', '') as a_nkfc,
REGEXP_REPLACE(NORMALIZE_AND_CASEFOLD(b, NFKC), r'\pM', '') AS b_nkfc
FROM (SELECT '\u2168' AS a, 'IX' AS b) t;
-- [{
-- "a": "Ⅸ",
-- "b": "IX",
-- "a_nfd": "ⅸ",
-- "b_nfd": "ix",
-- "a_nfc": "ⅸ",
-- "b_nfc": "ix",
-- "a_nfkd": "ix",
-- "b_nkfd": "ix",
-- "a_nkfc": "ix",
-- "b_nkfc": "ix"
-- }]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment