MarkPryceMaherMSFT · August 12, 2024 22:28 · EmilianoEmanuelSosa · Aug 12, 2024
diff --git a/Profile_table.sql b/Profile_table.sql

 --exec profile_table  'dbo','Copy_into_example_c'

 /*
 Proc to profile the data in a table.
 This proc will give to the min and max values in every column, with is useful when selecting the right numberic data type - bit/int/bigint

 It will give you the length of the of the value, which is useful when picking the right variable length datatype - char/varchar/nvarchar

 It will also give you the number of distinct values, which is useful for selecting a distribution column.

 It will work over normal and external tables.

 WARNING: This proc was built as a proof of concept - it is very very very very very very very slow.
 So just be careful! Test out on small tables (i.e. in both rows and columns)

 Note:  The results are pushed to a table, *not a temp table* - this is so you can open another session and query the table to see how far through it is.

 */
 alter proc profile_table
 @schema_name varchar(50), @table_name varchar(500)
 as
 begin


 DECLARE @sSQL varchar(8000);

 if exists (
 select 1 from sys.schemas s inner join sys.tables t 
 on s.schema_id = t.schema_id 
 where s.name =   'dbo' and t.name =  'tmp_cols' )
 BEGIN
    PRINT 'DELETE TABLE'
    set @sSQL = 'DROP TABLE [dbo].[tmp_cols];'
    EXEC (@sSQL);
 END
 ELSE
 BEGIN
    PRINT 'TABLE DOES NOT EXISTS'
 END

 create table dbo.tmp_cols 
 (
 	seq_id bigint,
 	column_name varchar(255),
 	datatype varchar(255),
 	col_max_length bigint,
 	col_precision bigint,
 	col_scale bigint,
 	is_nullable bigint,
 	min_length bigint,
 	max_length bigint,
 	min_value varchar(max),
 	max_value varchar(max),
 	distinct_values bigint,
 	total_rows bigint,
 	empty_values bigint,
 	null_values bigint
 )
 with
 ( distribution=round_robin, heap)


 insert into dbo.tmp_cols
 select column_id,
 c.name as 'column_name',  st.name as 'datatype',
 c.max_length, c.precision, c.scale, c.is_nullable , null,null,null,null ,null,null,null,null
 from sys.columns c inner join  sys.tables t on
 t.object_id = c.object_id 
 inner join sys.schemas s on s.schema_id = t.schema_id
 inner join sys.types st on c.system_type_id = st.system_type_id
 where t.name = @table_name and s.name = @schema_name

 declare @sql2 varchar(4000);
 declare @sql3 varchar(4000);

 set @sql3 = '
 declare @total_rows bigint;

 select @total_rows = count(*) from ' + @schema_name + '.'+ @table_name + '

 update dbo.tmp_cols set total_rows = @total_rows
 '
 exec(@sql3)

 DECLARE
 	@i INT = 1
 	, @t INT = (SELECT COUNT(*) FROM dbo.tmp_cols)
 	,@min_length bigint
 	,@max_length bigint
 	,@min_value varchar(max)
 	,@max_value varchar(max)
 	,@column_name varchar(255)
 	,@datatype varchar(255)
 WHILE @i <= @t
  BEGIN
 	
 select @column_name = column_name , @datatype=datatype  from dbo.tmp_cols
 where @i = seq_id


 	set @sql2 = 'declare @min_length bigint;
 declare @max_length bigint;
 declare @min_col varchar(max);
 declare @max_col varchar(max);
 declare @distinct_values bigint;
 declare @empty_values bigint;
 declare @null_values bigint;
 select 
 @distinct_values  = count(distinct [' + @column_name +']),
 @min_length = min(len([' + @column_name +'])) ,  
 @max_length = max(len([' + @column_name +'])) ,
 @min_col = min([' + @column_name +']) ,
 @max_col= max([' + @column_name +']) ,
 @empty_values = count(case when [' + @column_name +'] = '''' then 1 end ) ,
 @null_values = count(case when  [' + @column_name +'] is null then 1 end ) 
 from ' + @schema_name + '.'+ @table_name + '

 update  dbo.tmp_cols 
 set 
 	min_length = @min_length,
 	max_length = @max_length,
 	min_value = @min_col,
 	max_value = @max_col,
 	distinct_values = @distinct_values,
 	empty_values = @empty_values,
 	null_values = @null_values
 where seq_id = ' + convert(varchar(400),@i)

 print @sql2;
 
 exec (@sql2)
 	

 	SET @i+=1;
 END

 select * from dbo.tmp_cols ;

 end

	--exec profile_table 'dbo','Copy_into_example_c'

	/*
	Proc to profile the data in a table.
	This proc will give to the min and max values in every column, with is useful when selecting the right numberic data type - bit/int/bigint

	It will give you the length of the of the value, which is useful when picking the right variable length datatype - char/varchar/nvarchar

	It will also give you the number of distinct values, which is useful for selecting a distribution column.

	It will work over normal and external tables.

	WARNING: This proc was built as a proof of concept - it is very very very very very very very slow.
	So just be careful! Test out on small tables (i.e. in both rows and columns)

	Note: The results are pushed to a table, not a temp table - this is so you can open another session and query the table to see how far through it is.

	*/
	alter proc profile_table
	@schema_name varchar(50), @table_name varchar(500)
	as
	begin


	DECLARE @sSQL varchar(8000);

	if exists (
	select 1 from sys.schemas s inner join sys.tables t
	on s.schema_id = t.schema_id
	where s.name = 'dbo' and t.name = 'tmp_cols' )
	BEGIN
	PRINT 'DELETE TABLE'
	set @sSQL = 'DROP TABLE [dbo].[tmp_cols];'
	EXEC (@sSQL);
	END
	ELSE
	BEGIN
	PRINT 'TABLE DOES NOT EXISTS'
	END

	create table dbo.tmp_cols
	(
	seq_id bigint,
	column_name varchar(255),
	datatype varchar(255),
	col_max_length bigint,
	col_precision bigint,
	col_scale bigint,
	is_nullable bigint,
	min_length bigint,
	max_length bigint,
	min_value varchar(max),
	max_value varchar(max),
	distinct_values bigint,
	total_rows bigint,
	empty_values bigint,
	null_values bigint
	)
	with
	( distribution=round_robin, heap)


	insert into dbo.tmp_cols
	select column_id,
	c.name as 'column_name', st.name as 'datatype',
	c.max_length, c.precision, c.scale, c.is_nullable , null,null,null,null ,null,null,null,null
	from sys.columns c inner join sys.tables t on
	t.object_id = c.object_id
	inner join sys.schemas s on s.schema_id = t.schema_id
	inner join sys.types st on c.system_type_id = st.system_type_id
	where t.name = @table_name and s.name = @schema_name

	declare @sql2 varchar(4000);
	declare @sql3 varchar(4000);

	set @sql3 = '
	declare @total_rows bigint;

	select @total_rows = count(*) from ' + @schema_name + '.'+ @table_name + '

	update dbo.tmp_cols set total_rows = @total_rows
	'
	exec(@sql3)

	DECLARE
	@i INT = 1
	, @t INT = (SELECT COUNT(*) FROM dbo.tmp_cols)
	,@min_length bigint
	,@max_length bigint
	,@min_value varchar(max)
	,@max_value varchar(max)
	,@column_name varchar(255)
	,@datatype varchar(255)
	WHILE @i <= @t
	BEGIN

	select @column_name = column_name , @datatype=datatype from dbo.tmp_cols
	where @i = seq_id


	set @sql2 = 'declare @min_length bigint;
	declare @max_length bigint;
	declare @min_col varchar(max);
	declare @max_col varchar(max);
	declare @distinct_values bigint;
	declare @empty_values bigint;
	declare @null_values bigint;
	select
	@distinct_values = count(distinct [' + @column_name +']),
	@min_length = min(len([' + @column_name +'])) ,
	@max_length = max(len([' + @column_name +'])) ,
	@min_col = min([' + @column_name +']) ,
	@max_col= max([' + @column_name +']) ,
	@empty_values = count(case when [' + @column_name +'] = '''' then 1 end ) ,
	@null_values = count(case when [' + @column_name +'] is null then 1 end )
	from ' + @schema_name + '.'+ @table_name + '

	update dbo.tmp_cols
	set
	min_length = @min_length,
	max_length = @max_length,
	min_value = @min_col,
	max_value = @max_col,
	distinct_values = @distinct_values,
	empty_values = @empty_values,
	null_values = @null_values
	where seq_id = ' + convert(varchar(400),@i)

	print @sql2;

	exec (@sql2)


	SET @i+=1;
	END

	select * from dbo.tmp_cols ;

	end