Skip to content

Instantly share code, notes, and snippets.

@tbnorth
Created August 22, 2019 18:06
Show Gist options
  • Save tbnorth/6e2d7b8a4225cb70d56fb153a85a368e to your computer and use it in GitHub Desktop.
Save tbnorth/6e2d7b8a4225cb70d56fb153a85a368e to your computer and use it in GitHub Desktop.
Script that generates SQL to remove duplicate records from a table
# change these bits then run the script to generate the SQL, e.g.
# <edit file>
# <save file>
# bash uniqify.sql.sh >fix_mytable.sql
# then execute the SQL line by line checking the results, preferably in
# a transaction you can roll back or a copy of the DB
# START OF THINGS TO CHANGE
# the table containing duplicates
TABLE=intake
# the primary key field in the table
PK=intake
# the set of fields representing the identities that *should* be unique in the table
IDENTITY="site, src, rep, year, start_month, end_month"
# OPTIONAL, result fields to check results are the same for dupes
RESULT_FLDS="agents_in, agents_near, agents_crossing"
# END OF THINGS TO CHANGE
cat << EOT
-- how many you expect
select count(*) from (select distinct $IDENTITY from $TABLE);
-- how many you have
select count(*) from $TABLE;
-- reps of each identity
create temp table tmp_reps as select $IDENTITY, count(*) as dupes_n from $TABLE group by $IDENTITY;
-- should match the expected number of records
select count(*) from tmp_reps;
-- examine some
SELECT * from tmp_reps limit 5;
-- see the highest number of dupes
select max(dupes_n) from tmp_reps;
-- min PK for each IDENTITY, the one we'll keep
create temp table tmp_min_pk as select $IDENTITY, min($PK) as min_pk, dupes_n from $TABLE join tmp_reps using ($IDENTITY) group by $IDENTITY;
-- should match the expected number of records
select count(*) from tmp_min_pk;
-- should match the expected number of records
select count(*) from (select distinct min_pk from tmp_min_pk);
-- optional check of targetting
select * from $TABLE join (select * from tmp_min_pk where dupes_n > 1 limit 3) using ($IDENTITY);
select count(*) from (select * from $TABLE join (select * from tmp_min_pk where dupes_n > 1 limit 3) as x using ($IDENTITY));
select $PK, min_pk, $RESULT_FLDS from $TABLE join (select * from tmp_min_pk where dupes_n > 1 limit 3) as x using ($IDENTITY);
-- drop dupes - CAN BE QUITE SLOW
delete from $TABLE where not exists (select 1 from tmp_min_pk where $TABLE.$PK = tmp_min_pk.min_pk);
-- check result
select count(*) from (select distinct $IDENTITY from $TABLE);
select count(*) from $TABLE;
EOT
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment