Last active
December 18, 2016 15:20
-
-
Save cimmanon/04c6ead97fae2257dd32a5977898de76 to your computer and use it in GitHub Desktop.
PostgreSQL function for doing diagnostics on unfamiliar tables
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
begin; | |
/*----------------------------------------------------------------------------------------------------*\ | |
| Functions for generating statistics | |
\*----------------------------------------------------------------------------------------------------*/ | |
create or replace function rel_column_count(schema_name text, table_name text, column_name text, column_type text) returns table | |
( not_null_count bigint | |
, distinct_count bigint | |
, min_length int | |
, max_length int | |
) as $$ | |
DECLARE | |
s ALIAS FOR schema_name; | |
t ALIAS FOR table_name; | |
c ALIAS FOR column_name; | |
q text; | |
BEGIN | |
q := case when column_type = 'text' | |
then format('select count(*), count(distinct %I), min(length(%I)), max(length(%I)) from %I.%I where nullif(%I, '''') is not null', c, c, c, s, t, c) | |
else format('select count(*), count(distinct %I), null :: int, null :: int from %I.%I where %I is not null', c, s, t, c) END; | |
return query execute q; | |
END; | |
$$ language plpgsql stable; | |
------------------------------------- | |
create or replace function rel_statistics(schema_name text, table_name text) returns table | |
( column_name name | |
, column_type text | |
, maybe_fk bool | |
, maybe_bool bool | |
, not_null_count bigint | |
, not_null_percent numeric(5,2) | |
, distinct_count bigint | |
, min_length int | |
, max_length int | |
) as $$ | |
declare | |
s ALIAS FOR schema_name; | |
t ALIAS FOR table_name; | |
c bigint; | |
begin | |
execute format('select count(*) from %I.%I', s, t) into c; | |
return query | |
select | |
x.column_name | |
, x.column_type | |
, x.maybe_fk | |
, x.maybe_bool | |
, x.not_null_count | |
, x.not_null_percent | |
, x.distinct_count | |
, x.min_length | |
, x.max_length | |
from ( | |
select | |
r.column_name | |
, r.column_type | |
, r.column_name ilike '%_id' as maybe_fk | |
, r.column_type = 'text' | |
and cc.distinct_count <= 2 | |
-- ^ if there's 2 values, its a good sign that its a fake bool | |
and (cc.max_length = 1 or (cc.min_length >= 2 and cc.max_length <= 3)) as maybe_bool | |
-- ^ 1 letter = Y/N or 1/0 | |
-- ^ 2 or 3 letters = yes or no | |
, cc.not_null_count | |
, (round(((cc.not_null_count) :: numeric / c :: numeric) * 10000) / 100) :: numeric(5,2) as not_null_percent | |
, cc.distinct_count | |
, cc.min_length | |
, cc.max_length | |
from | |
( | |
select | |
-- ns.nspname as schema_name, | |
-- cls.relname as table_name, | |
attr.attname as column_name, | |
trim(leading '_' from tp.typname) as column_type | |
from | |
pg_catalog.pg_attribute as attr | |
join pg_catalog.pg_class as cls on cls.oid = attr.attrelid | |
join pg_catalog.pg_namespace as ns on ns.oid = cls.relnamespace | |
join pg_catalog.pg_type as tp on tp.typelem = attr.atttypid | |
where | |
ns.nspname = s | |
and cls.relname = t | |
and not attr.attisdropped | |
and tp.typanalyze :: text = 'array_typanalyze' | |
and attr.attnum > 0 | |
order by | |
attr.attnum | |
) as r | |
, rel_column_count(s, t, r.column_name, r.column_type) as cc | |
) as x | |
order by | |
x.not_null_count = 0 | |
, x.column_type | |
, x.maybe_fk desc | |
, x.maybe_bool desc | |
, x.not_null_count desc | |
, x.distinct_count | |
, x.column_name | |
; | |
end; | |
$$ language plpgsql; | |
--select * from rel_statistics('public', 'tax_items'); | |
--select * from rel_statistics('public', 'sales_reps'); | |
/*----------------------------------------------------------------------------------------------------*\ | |
| Generate the reports in CSV format | |
\*----------------------------------------------------------------------------------------------------*/ | |
DO $$ | |
DECLARE | |
s TEXT := 'public'; | |
r RECORD; | |
BEGIN | |
FOR r IN | |
SELECT * FROM system.netsuite_view_schedule ORDER BY name | |
/* | |
SELECT matviewname, schemaname | |
FROM pg_matviews JOIN system.netsuite_view_schedule ON matviewname = netsuite_view_schedule.name | |
ORDER BY matviewname | |
-- LIMIT 5 | |
*/ | |
LOOP | |
RAISE NOTICE 'Generating statistics for %...', r.name; | |
EXECUTE format('COPY (SELECT * FROM rel_statistics(''%s'', ''%s'')) to ''/home/deploy/notes/%s.csv'' csv header', s, r.name, r.name); | |
END LOOP; | |
END | |
$$ LANGUAGE plpgsql; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment