sayle-doit · October 31, 2023 13:53
diff --git a/bq_tables_queried_count.sql b/bq_tables_queried_count.sql
  --
  --  This query will pull out a count of all tables referenced over the past
  --  7 days (configurable) by query jobs in the current project.
  --  Note this will pull in queries that query the information schema views.
  --
  --  Note by default this hits the US multi-region, so if using a different
  --  region then change region-us below to that region name.
  --

 DECLARE start_time TIMESTAMP DEFAULT
  -- Change this start time here if wanting to change from 7 days to present
  TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 DAY);
 DECLARE end_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP();

 WITH tables AS (
  SELECT
    jobs.job_id,
    tables.*
  FROM
    `region-us`.INFORMATION_SCHEMA.JOBS AS jobs,
    UNNEST(referenced_tables) AS tables
  WHERE
    -- Only include query jobs and exclude script query jobs.
    -- The reason is that script job types have sub-jobs that
    -- will be counted as well and lead to skewed data.
    job_type = 'QUERY'
    AND statement_type <> 'SCRIPT'
    -- This line filters out CDC queries that have a type of NULL
    -- and job_type of BACKGROUND
    AND statement_TYPE IS NOT NULL
 ),
 -- Grab the information schema jobs using some basic regex
 info_schema_jobs AS (
  SELECT
    -- Note doing a distinct here as sometimes this query will pull back
    -- duplicate values which skew the final results.
    DISTINCT tables.*
    -- Doing a lowercase on all of the region dataset names.
    -- Sometimes they come out with uppercases in them that hurts
    -- being able to filter them later. An example is region-us and
    -- region-US are both returned at times.
    REPLACE (LOWER(dataset_id) AS dataset_id)
  FROM
    tables
  WHERE
    REGEXP_CONTAINS(dataset_id, '^(region-[a-zA-Z]+)$') = TRUE
    OR REGEXP_CONTAINS(dataset_id, '^([a-z]+-[a-z]+[0-9]+)$') = TRUE
 ),
 -- Remove all jobs from temporary datasets or the information schema views
 real_tables_only AS (
  SELECT
    tables.*
  FROM
    tables
  JOIN
    `region-us`.INFORMATION_SCHEMA.SCHEMATA
  ON
    schema_name = dataset_id
 ),
 -- Union the "real table" results and the information schema results
 all_jobs AS (
  SELECT
    CONCAT(project_id, '.', dataset_id, '.', table_id) AS table_id
  FROM
    real_tables_only
  UNION ALL
  SELECT
    CONCAT(project_id, '.', dataset_id, '.', table_id) AS table_id
  FROM
    info_schema_jobs )

 -- Final result that does counting
 SELECT
  table_id,
  COUNT(table_id) AS table_count
 FROM
  all_jobs
 GROUP BY
  table_id
 ORDER BY
  table_count ASC;
	--
	-- This query will pull out a count of all tables referenced over the past
	-- 7 days (configurable) by query jobs in the current project.
	-- Note this will pull in queries that query the information schema views.
	--
	-- Note by default this hits the US multi-region, so if using a different
	-- region then change region-us below to that region name.
	--

	DECLARE start_time TIMESTAMP DEFAULT
	-- Change this start time here if wanting to change from 7 days to present
	TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 DAY);
	DECLARE end_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP();

	WITH tables AS (
	SELECT
	jobs.job_id,
	tables.*
	FROM
	`region-us`.INFORMATION_SCHEMA.JOBS AS jobs,
	UNNEST(referenced_tables) AS tables
	WHERE
	-- Only include query jobs and exclude script query jobs.
	-- The reason is that script job types have sub-jobs that
	-- will be counted as well and lead to skewed data.
	job_type = 'QUERY'
	AND statement_type <> 'SCRIPT'
	-- This line filters out CDC queries that have a type of NULL
	-- and job_type of BACKGROUND
	AND statement_TYPE IS NOT NULL
	),
	-- Grab the information schema jobs using some basic regex
	info_schema_jobs AS (
	SELECT
	-- Note doing a distinct here as sometimes this query will pull back
	-- duplicate values which skew the final results.
	DISTINCT tables.*
	-- Doing a lowercase on all of the region dataset names.
	-- Sometimes they come out with uppercases in them that hurts
	-- being able to filter them later. An example is region-us and
	-- region-US are both returned at times.
	REPLACE (LOWER(dataset_id) AS dataset_id)
	FROM
	tables
	WHERE
	REGEXP_CONTAINS(dataset_id, '^(region-[a-zA-Z]+)$') = TRUE
	OR REGEXP_CONTAINS(dataset_id, '^([a-z]+-[a-z]+[0-9]+)$') = TRUE
	),
	-- Remove all jobs from temporary datasets or the information schema views
	real_tables_only AS (
	SELECT
	tables.*
	FROM
	tables
	JOIN
	`region-us`.INFORMATION_SCHEMA.SCHEMATA
	ON
	schema_name = dataset_id
	),
	-- Union the "real table" results and the information schema results
	all_jobs AS (
	SELECT
	CONCAT(project_id, '.', dataset_id, '.', table_id) AS table_id
	FROM
	real_tables_only
	UNION ALL
	SELECT
	CONCAT(project_id, '.', dataset_id, '.', table_id) AS table_id
	FROM
	info_schema_jobs )

	-- Final result that does counting
	SELECT
	table_id,
	COUNT(table_id) AS table_count
	FROM
	all_jobs
	GROUP BY
	table_id
	ORDER BY
	table_count ASC;