Last active
October 31, 2023 13:53
-
-
Save sayle-doit/d1d14b951b2b4173e9014cbc93d370c6 to your computer and use it in GitHub Desktop.
Query to count out the number of query jobs that reference each table in a project.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- | |
-- This query will pull out a count of all tables referenced over the past | |
-- 7 days (configurable) by query jobs in the current project. | |
-- Note this will pull in queries that query the information schema views. | |
-- | |
-- Note by default this hits the US multi-region, so if using a different | |
-- region then change region-us below to that region name. | |
-- | |
DECLARE start_time TIMESTAMP DEFAULT | |
-- Change this start time here if wanting to change from 7 days to present | |
TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 DAY); | |
DECLARE end_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP(); | |
WITH tables AS ( | |
SELECT | |
jobs.job_id, | |
tables.* | |
FROM | |
`region-us`.INFORMATION_SCHEMA.JOBS AS jobs, | |
UNNEST(referenced_tables) AS tables | |
WHERE | |
-- Only include query jobs and exclude script query jobs. | |
-- The reason is that script job types have sub-jobs that | |
-- will be counted as well and lead to skewed data. | |
job_type = 'QUERY' | |
AND statement_type <> 'SCRIPT' | |
-- This line filters out CDC queries that have a type of NULL | |
-- and job_type of BACKGROUND | |
AND statement_TYPE IS NOT NULL | |
), | |
-- Grab the information schema jobs using some basic regex | |
info_schema_jobs AS ( | |
SELECT | |
-- Note doing a distinct here as sometimes this query will pull back | |
-- duplicate values which skew the final results. | |
DISTINCT tables.* | |
-- Doing a lowercase on all of the region dataset names. | |
-- Sometimes they come out with uppercases in them that hurts | |
-- being able to filter them later. An example is region-us and | |
-- region-US are both returned at times. | |
REPLACE (LOWER(dataset_id) AS dataset_id) | |
FROM | |
tables | |
WHERE | |
REGEXP_CONTAINS(dataset_id, '^(region-[a-zA-Z]+)$') = TRUE | |
OR REGEXP_CONTAINS(dataset_id, '^([a-z]+-[a-z]+[0-9]+)$') = TRUE | |
), | |
-- Remove all jobs from temporary datasets or the information schema views | |
real_tables_only AS ( | |
SELECT | |
tables.* | |
FROM | |
tables | |
JOIN | |
`region-us`.INFORMATION_SCHEMA.SCHEMATA | |
ON | |
schema_name = dataset_id | |
), | |
-- Union the "real table" results and the information schema results | |
all_jobs AS ( | |
SELECT | |
CONCAT(project_id, '.', dataset_id, '.', table_id) AS table_id | |
FROM | |
real_tables_only | |
UNION ALL | |
SELECT | |
CONCAT(project_id, '.', dataset_id, '.', table_id) AS table_id | |
FROM | |
info_schema_jobs ) | |
-- Final result that does counting | |
SELECT | |
table_id, | |
COUNT(table_id) AS table_count | |
FROM | |
all_jobs | |
GROUP BY | |
table_id | |
ORDER BY | |
table_count ASC; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment