Created
May 23, 2020 02:16
-
-
Save brito/36b5e57bf8ceed7f8c1a7dc7e704c1d8 to your computer and use it in GitHub Desktop.
Kitchen sink analysis of relations in schemas
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* __ _ _ _ __ | |
__ _ / _| |_ ___ __ _ _ __ __ _ ___| |_ ___ ___| |__ __ _ ___ _ __ ___ /_/ ___ | |
/ _` | |_| __/ _ \ / _` | '_ \ / _` / __| __/ _ \ / __| '_ \ / _` / __| '_ ` _ \ / _ \/ __| | |
| (_| | _| || (_) | (_| | | | | (_| \__ \ || (_) | (__| | | | (_| \__ \ | | | | | (_) \__ \ | |
\__,_|_| \__\___/ \__,_|_| |_|\__,_|___/\__\___/ \___|_| |_|\__,_|___/_| |_| |_|\___/|___/ | |
αυτοαναστοχασμός ergo legit */ | |
select | |
_column, _type, | |
_nulls, | |
_slider( | |
(_nulls::float/max(_nulls) over () * 30)::int, | |
(_distinct::float/max(_distinct) over () * 30)::int) | |
as "nulls vs distinct", | |
_distinct, | |
_range[1]::text, | |
_bar(_pct_nulls::int) as "% nulls", | |
_tally.*, | |
_iqt.*, | |
_histogram.*, | |
concat_ws(' ± ', | |
_avg_length[1]::text::numeric, | |
ceil((_max_length[1]::text::int - _min_length[1]::text::int)/2)) | |
as _width, | |
_schema, | |
_relname, | |
_kind | |
from _meta ( | |
'{app_public}', | |
'{data_report_session,data_raw_session,data_report}'), | |
-- tally columns | |
_tally(_schema,_relname,_column) as _tally, | |
-- interquartile | |
_iqt(_schema,_relname,_column,_type) as _iqt, | |
-- histogram buckets | |
_histogram(_schema,_relname,_column,_type) as _histogram, | |
-- count distinct values | |
round((xpath('/row/count/text()', query_to_xml(format($$ | |
select count(*) from (select distinct %3$I from %1$I.%2$I) d | |
$$,_schema,_relname,_column), false, true, '')))[1]::text::int) as _distinct, | |
-- count nulls | |
round((xpath('/row/count/text()', query_to_xml(format($$ | |
select count(*) from %I.%I where %I is null | |
$$,_schema,_relname,_column), false, true, '')))[1]::text::int) as _nulls, | |
-- percent null | |
round((xpath('/row/p/text()', query_to_xml(format($$ | |
select round(100*count(*) filter (where %3$I is null) / count(*)) as p from %1$I.%2$I | |
$$,_schema,_relname,_column), false, true, '')))[1]::text::int) as _pct_nulls, | |
-- count zeroes | |
-- FIXME not very useful either zeroes or the scale at which "near zero" is calculated | |
-- round((xpath('/row/count/text()', query_to_xml(format($$ | |
-- select count(*) from %1$I.%2$I | |
-- where '%4$I' ~ 'int|float' and %3$I::text ~ '^(-|00:00:)0' | |
-- $$,_schema,_relname,_column,_type), false, true, '')))[1]::text::int) as _near_zeroes, | |
-- range | |
xpath('/row/r/text()', query_to_xml(format(case | |
-- transforming a uuid into a number just to get a range is a bit unnecessary | |
-- when typname = '_uuid' then $$ | |
-- select numrange( | |
-- min(('x'||translate(left(%2$I::text,18),'-',''))::bit(64)::bigint), | |
-- max(('x'||translate(left(%2$I::text,18),'-',''))::bit(64)::bigint),'[]') | |
-- as r from %3$I.%4$I $$ | |
when _type ~ 'int\d|float|numeric' then $$ | |
select numrange(min(%3$I::numeric),max(%3$I::numeric),'[]') | |
as r from %1$I.%2$I $$ | |
when _type = '_timestamptz' then $$ | |
select tstzrange(min(%3$I),max(%3$I)) | |
as r from %1$I.%2$I $$ | |
when _type = '_tstzrange' then $$ | |
select tstzrange(min(lower(%3$I)),max(upper(%3$I))) | |
as r from %1$I.%2$I $$ | |
when _type = '_interval' then $$ | |
select concat_ws(',',min(%3$I),max(%3$I)) | |
as r from %1$I.%2$I $$ | |
when _type = '_bool' then $$ | |
select to_json(array[ | |
count(*) filter (where %3$I = true), | |
count(*) filter (where %3$I = false)]) | |
as r from %1$I.%2$I $$ | |
when _type = '_text' and '%2$I' ~ '^\d\.?\d*' then $$ | |
select numrange(min(%3$I::numeric),max(%3$I::numeric)) | |
as r from %1$I.%2$I $$ | |
when _type = '_text' and not '%2$I' ~ '^\d\.?\d*' then $$ | |
select numrange(min(length(%3$I)),max(length(%3$I))) | |
as r from %1$I.%2$I $$ | |
end,_schema,_relname,_column,_type), false, true, '')) as _range, | |
-- min length as text | |
xpath('/row/min/text()', query_to_xml(format($$ | |
select min(length(%3$I::text)) from %1$I.%2$I | |
$$,_schema,_relname,_column), false, true, '')) as _min_length, | |
-- avg length as text | |
xpath('/row/round/text()', query_to_xml(format($$ | |
select round(avg(length(%3$I::text))) from %1$I.%2$I | |
$$,_schema,_relname,_column), false, true, '')) as _avg_length, | |
-- max length as text | |
xpath('/row/max/text()', query_to_xml(format($$ | |
select max(length(%3$I::text)) from %1$I.%2$I | |
$$,_schema,_relname,_column), false, true, '')) as _max_length | |
; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Definitely room to improve with regards to syntax... I'm definitely misusing xml here because I was unnecessarily avoiding pgplsql.