Created
May 7, 2019 21:06
-
-
Save hoehrmann/496ffec6e7e6ae8f73f15b86412610cf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WITH | |
bytes AS ( | |
SELECT 0x00 AS byte | |
UNION ALL | |
SELECT byte+1 FROM bytes WHERE byte < 0xFF | |
), | |
base AS ( | |
SELECT 0x0000 AS cp | |
UNION ALL | |
SELECT cp+1 FROM base WHERE cp < 0x10FFFF | |
), | |
surrogates AS ( | |
SELECT 0xd800 AS cp | |
UNION ALL | |
SELECT cp+1 FROM surrogates WHERE cp < 0xDFFF | |
), | |
valid AS ( | |
SELECT * FROM base | |
EXCEPT | |
SELECT * FROM surrogates | |
ORDER BY cp | |
), | |
cuts as ( | |
SELECT 0x00000 AS cp UNION SELECT 0x00007f -- 1b (ascii) | |
UNION SELECT 0x00080 UNION SELECT 0x0007ff -- 2b | |
UNION SELECT 0x00800 UNION SELECT 0x00d7ff -- 3b (1st) | |
UNION SELECT 0x0d800 UNION SELECT 0x00dfff -- 3b (surrogates) | |
UNION SELECT 0x0e000 UNION SELECT 0x00ffff -- 3b (2nd) | |
UNION SELECT 0x10000 UNION SELECT 0x10ffff -- 4b | |
), | |
u8 AS ( | |
SELECT | |
cp, | |
CASE | |
WHEN cp <= 0x0007f THEN cp | |
ELSE ( | |
( | |
((cp << 0) & 0x0000003f) | | |
((cp << 2) & 0x00003f00) | | |
((cp << 4) & 0x003f0000) | | |
((cp << 6) & 0x3f000000) | |
) | |
| | |
CASE | |
WHEN cp <= 0x0007FF THEN 0x0000c080 | |
WHEN cp <= 0x00FFFF THEN 0x00e08080 | |
WHEN cp <= 0x10FFFF THEN 0xf0808080 | |
END | |
) | |
END AS u8 | |
FROM | |
cuts -- ought to be `valid` but that is slow | |
), | |
stuff as ( | |
SELECT | |
NULL AS lp, | |
NULL AS rp, | |
NULL AS ch, | |
1234 AS id, | |
u8_1.u8 AS l8, | |
u8_2.u8 AS r8, | |
CASE | |
WHEN u8_1.u8 > 0x00ffffff then 4 | |
WHEN u8_1.u8 > 0x0000ffff then 3 | |
WHEN u8_1.u8 > 0x000000ff then 2 | |
ELSE 1 | |
END AS len | |
FROM | |
( | |
-- TODO: this is actually the properly segmented input data | |
SELECT 0x00000 AS lu, 0x00007f AS ru UNION -- 1b | |
SELECT 0x00080 AS lu, 0x0007ff AS ru UNION -- 2b | |
SELECT 0x00800 AS lu, 0x00d7ff AS ru UNION -- 3b (1st) | |
-- SELECT 0x0d800 AS lu, 0x00dfff AS ru UNION -- surrogates | |
SELECT 0x0e000 AS lu, 0x00ffff AS ru UNION -- 3b (2nd) | |
SELECT 0x10000 AS lu, 0x10ffff AS ru -- 4b | |
) x | |
INNER JOIN u8 u8_1 ON (u8_1.cp = x.lu) | |
INNER JOIN u8 u8_2 ON (u8_2.cp = x.ru) | |
), | |
rec AS ( | |
SELECT * FROM stuff | |
UNION | |
SELECT | |
rec.l8 AS lp, | |
rec.r8 AS rp, | |
bytes.byte AS ch, | |
rec.id AS id, | |
CASE | |
WHEN bytes.byte = rec.l8 >> 8*(rec.len - 1) | |
THEN rec.l8 & (0xFFFFFFFF >> 8*(4 - rec.len + 1)) | |
ELSE 0x80808080 & (0xFFFFFFFF >> 8*(4 - rec.len + 1)) | |
END AS l8, | |
CASE | |
WHEN bytes.byte = rec.r8 >> 8*(rec.len - 1) | |
THEN rec.r8 & (0xFFFFFFFF >> 8*(4 - rec.len + 1)) | |
ELSE 0xBFBFBFBF & (0xFFFFFFFF >> 8*(4 - rec.len + 1)) | |
END AS r8, | |
rec.len - 1 AS len | |
FROM | |
rec | |
INNER JOIN bytes | |
WHERE | |
len > 0 | |
AND | |
bytes.byte | |
BETWEEN | |
rec.l8 >> 8*(rec.len - 1) | |
AND | |
rec.r8 >> 8*(rec.len - 1) | |
), | |
states AS ( | |
SELECT | |
l8, | |
r8, | |
JSON_GROUP_ARRAY(distinct id) AS ids, | |
row_number() OVER (ORDER BY l8 <> 0, r8 <> 0, l8, r8) AS state | |
FROM | |
rec | |
WHERE | |
lp is not null | |
GROUP BY | |
l8, | |
r8 | |
) | |
select | |
COALESCE(lhs.state, 1) AS lhs_state, | |
SUBSTR(JSON_GROUP_ARRAY(rec.ch), 0, 30) AS bytes, | |
JSON_GROUP_ARRAY(distinct rec.id) AS ids, | |
rhs.state AS rhs_state | |
from | |
rec | |
left join states rhs | |
on rhs.l8 = rec.l8 and rhs.r8 = rec.r8 | |
left join states lhs | |
on lhs.l8 = rec.lp and lhs.r8 = rec.rp | |
GROUP BY | |
rec.lp,rec.rp,rec.l8,rec.r8 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment