Last active
February 4, 2016 18:13
-
-
Save johnsmclay/c5899cd3552c7f55d470 to your computer and use it in GitHub Desktop.
Get basic statistical and variation info on every column in a table
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
DROP TABLE IF EXISTS `clay_sandbox`.`column_stats`; | |
CREATE TABLE `clay_sandbox`.`column_stats` ( | |
`database` varchar(64) NOT NULL, | |
`table` varchar(64) NOT NULL, | |
`column` varchar(64) NOT NULL, | |
`position_in_table` bigint(20) SIGNED NOT NULL, | |
`nonnull_vals` bigint(20) DEFAULT NULL, | |
`nonmnull_pctg` decimal(7,5) DEFAULT NULL, | |
`sum_val` decimal(32,16) DEFAULT NULL, | |
`min_val` decimal(32,16) DEFAULT NULL, | |
`max_val` decimal(32,16) DEFAULT NULL, | |
`avg_val` decimal(32,16) DEFAULT NULL, | |
`count_distinct` bigint(20) DEFAULT NULL, | |
`distinct_values` varchar(2048) DEFAULT NULL, | |
PRIMARY KEY (`customer`, `database_group`, `table`, `column`) | |
) ENGINE=`InnoDB` CHARACTER SET utf8; | |
DROP PROCEDURE IF EXISTS `clay_sandbox`.`get_column_stats`; | |
DELIMITER // | |
CREATE PROCEDURE `clay_sandbox`.`get_column_stats`(in database_name_req varchar(64), in table_name_req varchar(64)) | |
BEGIN | |
-- EXPLANATION: These hold the values used for the current iteration | |
declare current_database varchar(64); | |
declare current_table varchar(64); | |
declare current_column varchar(64); | |
declare current_position int(11); | |
declare current_column_type varchar(64); | |
declare current_nonnulls bigint(20); | |
declare current_count_distinct bigint(20); | |
declare current_distinct_values varchar(2048); | |
declare current_row_count bigint(20); | |
declare current_sum_val decimal(32,16); | |
declare current_min_val decimal(32,16); | |
declare current_max_val decimal(32,16); | |
declare current_avg_val decimal(32,16); | |
declare done boolean; | |
declare the_cursor cursor for | |
SELECT | |
TABLE_SCHEMA AS `database_name` | |
,TABLE_NAME AS `table` | |
,COLUMN_NAME AS `column` | |
,ORDINAL_POSITION AS position_in_table | |
,DATA_TYPE AS column_type | |
FROM `information_schema`.`COLUMNS` | |
WHERE 1=1 | |
AND TABLE_SCHEMA = database_name_req | |
AND TABLE_NAME = table_name_req | |
order by | |
ORDINAL_POSITION ASC | |
; | |
-- EXPLANATION: when doing the "fetch" later, if there is not another line set the "done" flag | |
declare continue handler for not found set done=TRUE; | |
SET | |
done = FALSE | |
,current_nonnulls := NULL | |
,current_count_distinct := NULL | |
,current_distinct_values := NULL | |
,current_sum_val := NULL | |
,current_min_val := NULL | |
,current_max_val := NULL | |
,current_avg_val := NULL | |
; | |
SET @sql = CONCAT('SELECT COUNT(*) FROM ',database_name_req,'.',table_name_req,' INTO @current_row_count;'); | |
PREPARE stmt FROM @sql; | |
EXECUTE stmt; | |
DEALLOCATE PREPARE stmt; | |
SET current_row_count := @current_row_count; | |
open the_cursor; | |
the_loop: loop | |
-- EXPLANATION: sets the values for the current iteration into the variables declared above | |
fetch the_cursor into | |
current_database | |
,current_table | |
,current_column | |
,current_position | |
,current_column_type | |
; | |
-- EXPLANATION: check the "done" flag to make sure we haven't processed all the segments/lines | |
if done = TRUE then leave the_loop; end if; | |
-- === DO ALL YOU THINGS FOR EACH SEGMENT BELOW HERE ======= | |
SET @sql = CONCAT( | |
'SELECT COUNT(DISTINCT ',current_column, | |
'),SUM(',current_column,'),MIN(',current_column, | |
'),MAX(',current_column,'),AVG(',current_column, | |
'),LEFT(GROUP_CONCAT(DISTINCT ',current_column,'),2048),SUM(IF(',current_column,' IS NOT NULL,1,0)) FROM ',current_database,'.',current_table, | |
' INTO @current_count_distinct,@current_sum_val,@current_min_val,@current_max_val,@current_avg_val, @current_distinct_values,@current_nonnulls;'); | |
PREPARE stmt FROM @sql; | |
EXECUTE stmt; | |
DEALLOCATE PREPARE stmt; | |
SET | |
current_count_distinct := @current_count_distinct | |
,current_sum_val := @current_sum_val | |
,current_min_val := @current_min_val | |
,current_max_val := @current_max_val | |
,current_avg_val := @current_avg_val | |
,current_distinct_values := @current_distinct_values | |
,current_nonnulls := @current_nonnulls | |
; | |
DELETE FROM `clay_sandbox`.`column_stats` | |
WHERE TRUE | |
AND `database` = current_database | |
AND `table` = current_table | |
AND `column` = current_column | |
; | |
INSERT INTO `clay_sandbox`.`column_stats` ( | |
,`database` | |
,`table` | |
,`column` | |
,position_in_table | |
,nonnull_vals | |
,nonmnull_pctg | |
,sum_val | |
,min_val | |
,max_val | |
,avg_val | |
,count_distinct | |
,distinct_values | |
) | |
VALUES ( | |
current_database | |
,current_table | |
,current_column | |
,current_position | |
,current_nonnulls | |
,(current_nonnulls/current_row_count) | |
,IF(current_column_type IN ( | |
'decimal' | |
,'int' | |
,'bigint' | |
,'tinyint' | |
,'double' | |
,'float' | |
,'mediumint' | |
,'smallint' | |
),current_sum_val,NULL) | |
,IF(current_column_type IN ( | |
'decimal' | |
,'int' | |
,'bigint' | |
,'tinyint' | |
,'double' | |
,'float' | |
,'mediumint' | |
,'smallint' | |
,'timestamp' | |
,'date' | |
,'datetime' | |
),current_min_val,NULL) | |
,IF(current_column_type IN ( | |
'decimal' | |
,'int' | |
,'bigint' | |
,'tinyint' | |
,'double' | |
,'float' | |
,'mediumint' | |
,'smallint' | |
,'timestamp' | |
,'date' | |
,'datetime' | |
),current_max_val,NULL) | |
,IF(current_column_type IN ( | |
'decimal' | |
,'int' | |
,'bigint' | |
,'tinyint' | |
,'double' | |
,'float' | |
,'mediumint' | |
,'smallint' | |
),current_avg_val,NULL) | |
,current_count_distinct | |
,current_distinct_values | |
); | |
-- === DO ALL YOU THINGS FOR EACH SEGMENT ABOVE HERE ======= | |
end loop; | |
close the_cursor; | |
SELECT 'Complete' AS status; | |
END // | |
DELIMITER ; | |
CALL `clay_sandbox`.`get_column_stats`('funtimesdb','new_landing_table'); | |
SELECT * FROM `clay_sandbox`.`column_stats`; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment