Last active
May 26, 2023 14:14
-
-
Save hui-zheng/f7e972bcbe9cde0c6cb6318f7270b67a to your computer and use it in GitHub Desktop.
This list provides BigQuery SQL templates that remove duplicates for large size timestamp partitioned table (using MERGE statement) and for small size table or a non-partition table (Using REPLACE TABLE statement)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- WARNING: back up the table before this operation | |
-- FOR large size timestamp partitioned table | |
-- ------------------------------------------- | |
-- -- To de-duplicate rows of a given range of a partition table, using surrage_key as unique id | |
-- ------------------------------------------- | |
DECLARE dt_start DEFAULT TIMESTAMP("2019-09-17T00:00:00", "America/Los_Angeles") ; | |
DECLARE dt_end DEFAULT TIMESTAMP("2019-09-22T00:00:00", "America/Los_Angeles"); | |
MERGE INTO `gcp_project`.`data_set`.`the_table` AS INTERNAL_DEST | |
USING ( | |
SELECT k.* | |
FROM ( | |
SELECT ARRAY_AGG(original_data LIMIT 1)[OFFSET(0)] k | |
FROM `gcp_project`.`data_set`.`the_table` AS original_data | |
WHERE stamp BETWEEN dt_start AND dt_end | |
GROUP BY surrogate_key | |
) | |
) AS INTERNAL_SOURCE | |
ON FALSE | |
WHEN NOT MATCHED BY SOURCE | |
AND INTERNAL_DEST.stamp BETWEEN dt_start AND dt_end -- remove all data in partiion range | |
THEN DELETE | |
WHEN NOT MATCHED THEN INSERT ROW |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- FOR a small size table or a non-partition table | |
------------------------------------------- | |
-- To de-duplicate rows of the whole table using surrage_key as unique id | |
------------------------------------------- | |
CREATE OR REPLACE TABLE `semios-dbt`.`data_set`.`pois` | |
AS | |
SELECT k.* | |
FROM ( | |
SELECT ARRAY_AGG(original_data LIMIT 1)[OFFSET(0)] k | |
FROM `gcp_project`.`data_set`.`the_table` AS original_data | |
GROUP BY surrogate_key | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment