Created
August 22, 2019 18:06
-
-
Save tbnorth/6e2d7b8a4225cb70d56fb153a85a368e to your computer and use it in GitHub Desktop.
Script that generates SQL to remove duplicate records from a table
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# change these bits then run the script to generate the SQL, e.g. | |
# <edit file> | |
# <save file> | |
# bash uniqify.sql.sh >fix_mytable.sql | |
# then execute the SQL line by line checking the results, preferably in | |
# a transaction you can roll back or a copy of the DB | |
# START OF THINGS TO CHANGE | |
# the table containing duplicates | |
TABLE=intake | |
# the primary key field in the table | |
PK=intake | |
# the set of fields representing the identities that *should* be unique in the table | |
IDENTITY="site, src, rep, year, start_month, end_month" | |
# OPTIONAL, result fields to check results are the same for dupes | |
RESULT_FLDS="agents_in, agents_near, agents_crossing" | |
# END OF THINGS TO CHANGE | |
cat << EOT | |
-- how many you expect | |
select count(*) from (select distinct $IDENTITY from $TABLE); | |
-- how many you have | |
select count(*) from $TABLE; | |
-- reps of each identity | |
create temp table tmp_reps as select $IDENTITY, count(*) as dupes_n from $TABLE group by $IDENTITY; | |
-- should match the expected number of records | |
select count(*) from tmp_reps; | |
-- examine some | |
SELECT * from tmp_reps limit 5; | |
-- see the highest number of dupes | |
select max(dupes_n) from tmp_reps; | |
-- min PK for each IDENTITY, the one we'll keep | |
create temp table tmp_min_pk as select $IDENTITY, min($PK) as min_pk, dupes_n from $TABLE join tmp_reps using ($IDENTITY) group by $IDENTITY; | |
-- should match the expected number of records | |
select count(*) from tmp_min_pk; | |
-- should match the expected number of records | |
select count(*) from (select distinct min_pk from tmp_min_pk); | |
-- optional check of targetting | |
select * from $TABLE join (select * from tmp_min_pk where dupes_n > 1 limit 3) using ($IDENTITY); | |
select count(*) from (select * from $TABLE join (select * from tmp_min_pk where dupes_n > 1 limit 3) as x using ($IDENTITY)); | |
select $PK, min_pk, $RESULT_FLDS from $TABLE join (select * from tmp_min_pk where dupes_n > 1 limit 3) as x using ($IDENTITY); | |
-- drop dupes - CAN BE QUITE SLOW | |
delete from $TABLE where not exists (select 1 from tmp_min_pk where $TABLE.$PK = tmp_min_pk.min_pk); | |
-- check result | |
select count(*) from (select distinct $IDENTITY from $TABLE); | |
select count(*) from $TABLE; | |
EOT |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment