Last active
July 25, 2024 15:44
-
-
Save ducnh1022/7a5c5ee27e6f356a36ef46ef73d07273 to your computer and use it in GitHub Desktop.
BigQuery Scripting with Dynamic SQL
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
DECLARE project_id STRING DEFAULT 'your_project_id'; | |
DECLARE dataset_id STRING DEFAULT 'your_dataset_id'; | |
DECLARE table_name STRING DEFAULT 'your_table_name'; | |
-- Step 1: Retrieve column details for relevant data types | |
WITH column_details AS ( | |
SELECT | |
column_name, | |
data_type | |
FROM | |
`project_id.dataset_id.INFORMATION_SCHEMA.COLUMNS` | |
WHERE | |
table_name = table_name | |
AND data_type IN ('STRING', 'INTEGER', 'FLOAT', 'NUMERIC', 'BOOLEAN', 'DATETIME', 'TIMESTAMP') | |
), | |
-- Step 2: Generate profiling queries for each column based on data type | |
profiling_queries AS ( | |
SELECT | |
column_name, | |
data_type, | |
CASE | |
WHEN data_type = 'STRING' THEN FORMAT(""" | |
SELECT | |
'%s' AS table_name, | |
'%s' AS column_name, | |
COUNT(*) AS total_count, | |
COUNT(DISTINCT %s) AS unique_count, | |
STRING_AGG(DISTINCT %s ORDER BY RAND() LIMIT 100) AS sample_values | |
FROM `project_id.dataset_id.%s` | |
""", table_name, column_name, column_name, column_name, table_name) | |
WHEN data_type = 'INTEGER' OR data_type = 'FLOAT' OR data_type = 'NUMERIC' THEN FORMAT(""" | |
SELECT | |
'%s' AS table_name, | |
'%s' AS column_name, | |
COUNT(*) AS total_count, | |
COUNT(DISTINCT %s) AS unique_count, | |
MIN(%s) AS min_value, | |
MAX(%s) AS max_value, | |
AVG(%s) AS mean_value, | |
STDDEV(%s) AS std_dev, | |
ARRAY_AGG(DISTINCT %s ORDER BY RAND() LIMIT 100) AS sample_values | |
FROM `project_id.dataset_id.%s` | |
""", table_name, column_name, column_name, column_name, column_name, column_name, column_name, table_name) | |
WHEN data_type = 'BOOLEAN' THEN FORMAT(""" | |
SELECT | |
'%s' AS table_name, | |
'%s' AS column_name, | |
COUNT(*) AS total_count, | |
COUNT(IF(%s, 1, NULL)) AS true_count, | |
COUNT(IF(NOT %s, 1, NULL)) AS false_count, | |
ARRAY_AGG(DISTINCT %s ORDER BY RAND() LIMIT 100) AS sample_values | |
FROM `project_id.dataset_id.%s` | |
""", table_name, column_name, column_name, column_name, column_name, table_name) | |
WHEN data_type = 'DATETIME' THEN FORMAT(""" | |
SELECT | |
'%s' AS table_name, | |
'%s' AS column_name, | |
COUNT(*) AS total_count, | |
MIN(%s) AS earliest_value, | |
MAX(%s) AS latest_value, | |
TIMESTAMP_DIFF(MAX(%s), MIN(%s), SECOND) AS duration_seconds, | |
ARRAY_AGG(DISTINCT %s ORDER BY RAND() LIMIT 100) AS sample_values | |
FROM `project_id.dataset_id.%s` | |
""", table_name, column_name, column_name, column_name, column_name, column_name, column_name, table_name) | |
WHEN data_type = 'TIMESTAMP' THEN FORMAT(""" | |
SELECT | |
'%s' AS table_name, | |
'%s' AS column_name, | |
COUNT(*) AS total_count, | |
MIN(CAST(%s AS DATE)) AS earliest_date, | |
MAX(CAST(%s AS DATE)) AS latest_date, | |
TIMESTAMP_DIFF(MAX(%s), MIN(%s), SECOND) AS duration_seconds, | |
ARRAY_AGG(DISTINCT CAST(%s AS DATE) ORDER BY RAND() LIMIT 100) AS sample_dates | |
FROM `project_id.dataset_id.%s` | |
""", table_name, column_name, column_name, column_name, column_name, column_name, column_name, table_name) | |
END AS profiling_query | |
FROM | |
column_details | |
) | |
-- Step 3: Output profiling queries | |
SELECT | |
column_name, | |
data_type, | |
profiling_query | |
FROM | |
profiling_queries; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment