Last active
August 31, 2020 22:04
-
-
Save mmasashi/32602818772b110071bc to your computer and use it in GitHub Desktop.
Duplicate Record Delete Query Generator For Amazon Redshift
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Duplicate record delete query generator for Amazon Redshift | |
# By running a generated query, duplicate rows in a specified table will be removed. | |
# | |
# Usage: | |
# ruby delete_dup_records_redshift.rb <table-name> <priary-keys-with-comma-separator> | |
unless ARGV.count == 2 | |
puts <<EOT | |
Usage: | |
ruby delete_dup_records_redshift.rb <table name> <primary keys> | |
Example: | |
# Single primary key | |
ruby delete_dup_records_redshift.rb users id | |
# Composite primary keys | |
ruby delete_dup_records_redshift.rb users_groups user_id,group_id | |
# Specify schema name | |
ruby delete_dup_records_redshift.rb public.users id | |
EOT | |
exit 1 | |
end | |
table = ARGV[0] # ex: 'm_test_table_multi_pk' | |
primary_keys = ARGV[1].split(',') # ex: 'id1,id2' | |
temp_table = "#{table}_temp_for_dup_rows_#{Time.now.strftime('%Y%m%d_%H%M%S')}" | |
#### Main | |
QUERY_TMPL = <<EOT | |
-- Check duplicate row count | |
SELECT '%{table}' as table, count(*) as num_dup_keys FROM (SELECT %{primary_keys} FROM %{table} GROUP BY %{primary_keys} HAVING count(*) <> 1); | |
-- Delete duplicate rows | |
BEGIN; | |
LOCK %{table}; | |
SELECT count(*) FROM (SELECT %{primary_keys} FROM %{table} GROUP BY %{primary_keys} HAVING count(*) <> 1); | |
CREATE TABLE %{temp_table} (LIKE %{table}); | |
INSERT INTO %{temp_table} (SELECT distinct a.* FROM %{table} a, (SELECT %{primary_keys} FROM %{table} GROUP BY %{primary_keys} HAVING count(*) <> 1) b where %{insert_condition} ); | |
DELETE FROM %{table} using %{temp_table} where %{delete_condition}; | |
INSERT INTO %{table} (select * from %{temp_table}); | |
DROP TABLE %{temp_table}; | |
END; | |
EOT | |
puts QUERY_TMPL % { | |
table: table, | |
temp_table: temp_table, | |
primary_keys: primary_keys.join(','), | |
insert_condition: primary_keys.collect{|pk| "a.#{pk} = b.#{pk}"}.join(" AND "), | |
delete_condition: primary_keys.collect{|pk| "#{table}.#{pk} = #{temp_table}.#{pk}"}.join(" AND "), | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Single primary key | |
$ ruby delete_dup_records_redshift.rb m_test_table id | |
-- Check duplicate row count | |
SELECT 'm_test_table' as table, count(*) as num_dup_keys FROM (SELECT id FROM m_test_table GROUP BY id HAVING count(*) <> 1); | |
-- Delete duplicate rows | |
BEGIN; | |
LOCK m_test_table; | |
SELECT count(*) FROM (SELECT id FROM m_test_table GROUP BY id HAVING count(*) <> 1); | |
CREATE TABLE m_test_table_temp_for_dup_rows_20160315_153707 (LIKE m_test_table); | |
INSERT INTO m_test_table_temp_for_dup_rows_20160315_153707 (SELECT distinct a.* FROM m_test_table a, (SELECT id FROM m_test_table GROUP BY id HAVING count(*) <> 1) b where a.id = b.id ); | |
DELETE FROM m_test_table using m_test_table_temp_for_dup_rows_20160315_153707 where m_test_table.id = m_test_table_temp_for_dup_rows_20160315_153707.id; | |
INSERT INTO m_test_table (select * from m_test_table_temp_for_dup_rows_20160315_153707); | |
DROP TABLE m_test_table_temp_for_dup_rows_20160315_153707; | |
END; | |
# Composite primary keys | |
$ ruby delete_dup_records_redshift.rb m_test_table_multi_pk id1,id2 | |
-- Check duplicate row count | |
SELECT 'm_test_table_multi_pk' as table, count(*) as num_dup_keys FROM (SELECT id1,id2 FROM m_test_table_multi_pk GROUP BY id1,id2 HAVING count(*) <> 1); | |
-- Delete duplicate rows | |
BEGIN; | |
LOCK m_test_table_multi_pk; | |
SELECT count(*) FROM (SELECT id1,id2 FROM m_test_table_multi_pk GROUP BY id1,id2 HAVING count(*) <> 1); | |
CREATE TABLE m_test_table_multi_pk_temp_for_dup_rows_20160315_153607 (LIKE m_test_table_multi_pk); | |
INSERT INTO m_test_table_multi_pk_temp_for_dup_rows_20160315_153607 (SELECT distinct a.* FROM m_test_table_multi_pk a, (SELECT id1,id2 FROM m_test_table_multi_pk GROUP BY id1,id2 HAVING count(*) <> 1) b where a.id1 = b.id1 AND a.id2 = b.id2 ); | |
DELETE FROM m_test_table_multi_pk using m_test_table_multi_pk_temp_for_dup_rows_20160315_153607 where m_test_table_multi_pk.id1 = m_test_table_multi_pk_temp_for_dup_rows_20160315_153607.id1 AND m_test_table_multi_pk.id2 = m_test_table_multi_pk_temp_for_dup_rows_20160315_153607.id2; | |
INSERT INTO m_test_table_multi_pk (select * from m_test_table_multi_pk_temp_for_dup_rows_20160315_153607); | |
DROP TABLE m_test_table_multi_pk_temp_for_dup_rows_20160315_153607; | |
END; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment