Skip to content

Instantly share code, notes, and snippets.

@justisr
Forked from hui-zheng/BQ_partition_dedup.sql
Created April 11, 2026 23:06
Show Gist options
  • Select an option

  • Save justisr/2139e44eb005faec6909719d0cdefe6e to your computer and use it in GitHub Desktop.

Select an option

Save justisr/2139e44eb005faec6909719d0cdefe6e to your computer and use it in GitHub Desktop.
This list provides BigQuery SQL templates that remove duplicates for large size timestamp partitioned table (using MERGE statement) and for small size table or a non-partition table (Using REPLACE TABLE statement)
-- WARNING: back up the table before this operation
-- FOR large size timestamp partitioned table
-- -------------------------------------------
-- -- To de-duplicate rows of a given range of a partition table, using surrage_key as unique id
-- -------------------------------------------
DECLARE dt_start DEFAULT TIMESTAMP("2019-09-17T00:00:00", "America/Los_Angeles") ;
DECLARE dt_end DEFAULT TIMESTAMP("2019-09-22T00:00:00", "America/Los_Angeles");
MERGE INTO `gcp_project`.`data_set`.`the_table` AS INTERNAL_DEST
USING (
SELECT k.*
FROM (
SELECT ARRAY_AGG(original_data LIMIT 1)[OFFSET(0)] k
FROM `gcp_project`.`data_set`.`the_table` AS original_data
WHERE stamp BETWEEN dt_start AND dt_end
GROUP BY surrogate_key
)
) AS INTERNAL_SOURCE
ON FALSE
WHEN NOT MATCHED BY SOURCE
AND INTERNAL_DEST.stamp BETWEEN dt_start AND dt_end -- remove all data in partiion range
THEN DELETE
WHEN NOT MATCHED THEN INSERT ROW
-- FOR a small size table or a non-partition table
-------------------------------------------
-- To de-duplicate rows of the whole table using surrage_key as unique id
-------------------------------------------
CREATE OR REPLACE TABLE `semios-dbt`.`data_set`.`pois`
AS
SELECT k.*
FROM (
SELECT ARRAY_AGG(original_data LIMIT 1)[OFFSET(0)] k
FROM `gcp_project`.`data_set`.`the_table` AS original_data
GROUP BY surrogate_key
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment