Created
April 30, 2024 04:54
-
-
Save cpatulea/8446030f4a70e92fd66fa9a090c238c3 to your computer and use it in GitHub Desktop.
Foulab wiki: non-ASCII characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
with h as ( | |
select *, convert(data using utf8) as data_utf8 from tiki_history | |
where data <> convert(convert(data using utf8) using ascii) | |
), | |
h1b as ( | |
with recursive cte as ( | |
select pageName, version, data_utf8, 0 as ascii, ' ' as c, 1 as pos | |
from h | |
union all | |
select pageName, version, data_utf8, | |
regexp_instr(substring(data_utf8, pos, 1), '[\r\n\t -~]'), | |
substring(data_utf8, pos, 1), | |
pos + 1 | |
from cte | |
where pos <= char_length(data_utf8) | |
) | |
select pageName, version, ascii, c | |
from cte | |
where pos >= 2 | |
and not ascii | |
) | |
-- select | |
-- hex(c), | |
-- c, | |
-- concat('https://laboratoires.foulab.org/w/tiki-pagehistory.php?page=', replace(pageName, ' ', '+'), '&preview=', version) | |
-- from h1b; | |
select | |
hex(c) as c_hex, | |
c, | |
count(*) | |
-- min(concat('https://laboratoires.foulab.org/w/tiki-pagehistory.php?page=', replace(pageName, ' ', '+'), '&preview=', version)) as url | |
from h1b group by c_hex, c order by c_hex; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment