Last active
August 12, 2024 22:28
-
-
Save MarkPryceMaherMSFT/135771ae2cee0dc88b7f693da9c89d8a to your computer and use it in GitHub Desktop.
Proc to profile the data in a table. This proc will give to the min and max values in every column, with is useful when selecting the right numberic data type - bit/int/bigint It will give you the length of the of the value, which is useful when picking the right variable length datatype - char/varchar/nvarchar It will also give you the number o…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--exec profile_table 'dbo','Copy_into_example_c' | |
/* | |
Proc to profile the data in a table. | |
This proc will give to the min and max values in every column, with is useful when selecting the right numberic data type - bit/int/bigint | |
It will give you the length of the of the value, which is useful when picking the right variable length datatype - char/varchar/nvarchar | |
It will also give you the number of distinct values, which is useful for selecting a distribution column. | |
It will work over normal and external tables. | |
WARNING: This proc was built as a proof of concept - it is very very very very very very very slow. | |
So just be careful! Test out on small tables (i.e. in both rows and columns) | |
Note: The results are pushed to a table, *not a temp table* - this is so you can open another session and query the table to see how far through it is. | |
*/ | |
alter proc profile_table | |
@schema_name varchar(50), @table_name varchar(500) | |
as | |
begin | |
DECLARE @sSQL varchar(8000); | |
if exists ( | |
select 1 from sys.schemas s inner join sys.tables t | |
on s.schema_id = t.schema_id | |
where s.name = 'dbo' and t.name = 'tmp_cols' ) | |
BEGIN | |
PRINT 'DELETE TABLE' | |
set @sSQL = 'DROP TABLE [dbo].[tmp_cols];' | |
EXEC (@sSQL); | |
END | |
ELSE | |
BEGIN | |
PRINT 'TABLE DOES NOT EXISTS' | |
END | |
create table dbo.tmp_cols | |
( | |
seq_id bigint, | |
column_name varchar(255), | |
datatype varchar(255), | |
col_max_length bigint, | |
col_precision bigint, | |
col_scale bigint, | |
is_nullable bigint, | |
min_length bigint, | |
max_length bigint, | |
min_value varchar(max), | |
max_value varchar(max), | |
distinct_values bigint, | |
total_rows bigint, | |
empty_values bigint, | |
null_values bigint | |
) | |
with | |
( distribution=round_robin, heap) | |
insert into dbo.tmp_cols | |
select column_id, | |
c.name as 'column_name', st.name as 'datatype', | |
c.max_length, c.precision, c.scale, c.is_nullable , null,null,null,null ,null,null,null,null | |
from sys.columns c inner join sys.tables t on | |
t.object_id = c.object_id | |
inner join sys.schemas s on s.schema_id = t.schema_id | |
inner join sys.types st on c.system_type_id = st.system_type_id | |
where t.name = @table_name and s.name = @schema_name | |
declare @sql2 varchar(4000); | |
declare @sql3 varchar(4000); | |
set @sql3 = ' | |
declare @total_rows bigint; | |
select @total_rows = count(*) from ' + @schema_name + '.'+ @table_name + ' | |
update dbo.tmp_cols set total_rows = @total_rows | |
' | |
exec(@sql3) | |
DECLARE | |
@i INT = 1 | |
, @t INT = (SELECT COUNT(*) FROM dbo.tmp_cols) | |
,@min_length bigint | |
,@max_length bigint | |
,@min_value varchar(max) | |
,@max_value varchar(max) | |
,@column_name varchar(255) | |
,@datatype varchar(255) | |
WHILE @i <= @t | |
BEGIN | |
select @column_name = column_name , @datatype=datatype from dbo.tmp_cols | |
where @i = seq_id | |
set @sql2 = 'declare @min_length bigint; | |
declare @max_length bigint; | |
declare @min_col varchar(max); | |
declare @max_col varchar(max); | |
declare @distinct_values bigint; | |
declare @empty_values bigint; | |
declare @null_values bigint; | |
select | |
@distinct_values = count(distinct [' + @column_name +']), | |
@min_length = min(len([' + @column_name +'])) , | |
@max_length = max(len([' + @column_name +'])) , | |
@min_col = min([' + @column_name +']) , | |
@max_col= max([' + @column_name +']) , | |
@empty_values = count(case when [' + @column_name +'] = '''' then 1 end ) , | |
@null_values = count(case when [' + @column_name +'] is null then 1 end ) | |
from ' + @schema_name + '.'+ @table_name + ' | |
update dbo.tmp_cols | |
set | |
min_length = @min_length, | |
max_length = @max_length, | |
min_value = @min_col, | |
max_value = @max_col, | |
distinct_values = @distinct_values, | |
empty_values = @empty_values, | |
null_values = @null_values | |
where seq_id = ' + convert(varchar(400),@i) | |
print @sql2; | |
exec (@sql2) | |
SET @i+=1; | |
END | |
select * from dbo.tmp_cols ; | |
end | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
So Usefull! Thanks!