Last active
August 7, 2023 21:18
-
-
Save sacundim/e9fa7cba6d5cf403153d5903160e36a8 to your computer and use it in GitHub Desktop.
Expanding PANGO lineage aliases with DuckDB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WITH alias_key AS ( | |
UNPIVOT ( | |
SELECT COLUMNS('^(A.+|B.+|[C-WY-Z].*)') | |
FROM 'https://raw.githubusercontent.com/cov-lineages/pango-designation/master/pango_designation/alias_key.json' | |
) | |
ON COLUMNS (*) | |
INTO NAME prefix VALUE expansion | |
), lineage_notes AS ( | |
SELECT | |
regexp_matches(line, '^\*') AS withdrawn, | |
regexp_extract(line, '^\*?([A-Z]+)', 1) AS prefix, | |
regexp_extract(line, '^\*?([A-Z]+)((\.[0-9]+)*)', 2) AS numbers, | |
regexp_extract(line, '^\*?([A-Z]+)((\.[0-9]+)*)\t(.*)$', 4) AS description | |
FROM read_csv( | |
'https://raw.githubusercontent.com/cov-lineages/pango-designation/master/lineage_notes.txt', | |
delim=NULL, header=TRUE, columns={'line': 'VARCHAR'} | |
) | |
), unaliased AS ( | |
SELECT | |
withdrawn, | |
prefix || numbers AS lineage, | |
COALESCE(expansion, prefix) || numbers | |
AS unaliased, | |
regexp_split_to_array(COALESCE(expansion, prefix) || numbers, '\.') | |
AS split, | |
description | |
FROM lineage_notes | |
LEFT OUTER JOIN alias_key | |
USING (prefix) | |
) | |
SELECT | |
withdrawn, | |
lineage, | |
unaliased, | |
split[1] AS root, | |
[CAST(str AS UINT16) FOR str IN split[2:]] | |
AS numbers, | |
description | |
FROM unaliased | |
ORDER BY root, numbers; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment