Skip to content

Instantly share code, notes, and snippets.

@ducnh1022
Last active July 25, 2024 15:44
Show Gist options
  • Save ducnh1022/7a5c5ee27e6f356a36ef46ef73d07273 to your computer and use it in GitHub Desktop.
Save ducnh1022/7a5c5ee27e6f356a36ef46ef73d07273 to your computer and use it in GitHub Desktop.
BigQuery Scripting with Dynamic SQL
DECLARE project_id STRING DEFAULT 'your_project_id';
DECLARE dataset_id STRING DEFAULT 'your_dataset_id';
DECLARE table_name STRING DEFAULT 'your_table_name';
-- Step 1: Retrieve column details for relevant data types
WITH column_details AS (
SELECT
column_name,
data_type
FROM
`project_id.dataset_id.INFORMATION_SCHEMA.COLUMNS`
WHERE
table_name = table_name
AND data_type IN ('STRING', 'INTEGER', 'FLOAT', 'NUMERIC', 'BOOLEAN', 'DATETIME', 'TIMESTAMP')
),
-- Step 2: Generate profiling queries for each column based on data type
profiling_queries AS (
SELECT
column_name,
data_type,
CASE
WHEN data_type = 'STRING' THEN FORMAT("""
SELECT
'%s' AS table_name,
'%s' AS column_name,
COUNT(*) AS total_count,
COUNT(DISTINCT %s) AS unique_count,
STRING_AGG(DISTINCT %s ORDER BY RAND() LIMIT 100) AS sample_values
FROM `project_id.dataset_id.%s`
""", table_name, column_name, column_name, column_name, table_name)
WHEN data_type = 'INTEGER' OR data_type = 'FLOAT' OR data_type = 'NUMERIC' THEN FORMAT("""
SELECT
'%s' AS table_name,
'%s' AS column_name,
COUNT(*) AS total_count,
COUNT(DISTINCT %s) AS unique_count,
MIN(%s) AS min_value,
MAX(%s) AS max_value,
AVG(%s) AS mean_value,
STDDEV(%s) AS std_dev,
ARRAY_AGG(DISTINCT %s ORDER BY RAND() LIMIT 100) AS sample_values
FROM `project_id.dataset_id.%s`
""", table_name, column_name, column_name, column_name, column_name, column_name, column_name, table_name)
WHEN data_type = 'BOOLEAN' THEN FORMAT("""
SELECT
'%s' AS table_name,
'%s' AS column_name,
COUNT(*) AS total_count,
COUNT(IF(%s, 1, NULL)) AS true_count,
COUNT(IF(NOT %s, 1, NULL)) AS false_count,
ARRAY_AGG(DISTINCT %s ORDER BY RAND() LIMIT 100) AS sample_values
FROM `project_id.dataset_id.%s`
""", table_name, column_name, column_name, column_name, column_name, table_name)
WHEN data_type = 'DATETIME' THEN FORMAT("""
SELECT
'%s' AS table_name,
'%s' AS column_name,
COUNT(*) AS total_count,
MIN(%s) AS earliest_value,
MAX(%s) AS latest_value,
TIMESTAMP_DIFF(MAX(%s), MIN(%s), SECOND) AS duration_seconds,
ARRAY_AGG(DISTINCT %s ORDER BY RAND() LIMIT 100) AS sample_values
FROM `project_id.dataset_id.%s`
""", table_name, column_name, column_name, column_name, column_name, column_name, column_name, table_name)
WHEN data_type = 'TIMESTAMP' THEN FORMAT("""
SELECT
'%s' AS table_name,
'%s' AS column_name,
COUNT(*) AS total_count,
MIN(CAST(%s AS DATE)) AS earliest_date,
MAX(CAST(%s AS DATE)) AS latest_date,
TIMESTAMP_DIFF(MAX(%s), MIN(%s), SECOND) AS duration_seconds,
ARRAY_AGG(DISTINCT CAST(%s AS DATE) ORDER BY RAND() LIMIT 100) AS sample_dates
FROM `project_id.dataset_id.%s`
""", table_name, column_name, column_name, column_name, column_name, column_name, column_name, table_name)
END AS profiling_query
FROM
column_details
)
-- Step 3: Output profiling queries
SELECT
column_name,
data_type,
profiling_query
FROM
profiling_queries;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment