tbnorth · August 22, 2019 18:06
diff --git a/uniqify.sql.sh b/uniqify.sql.sh
 # change these bits then run the script to generate the SQL, e.g.
 #   <edit file>
 #   <save file>
 #   bash uniqify.sql.sh >fix_mytable.sql
 # then execute the SQL line by line checking the results, preferably in
 # a transaction you can roll back or a copy of the DB

 # START OF THINGS TO CHANGE

 # the table containing duplicates
 TABLE=intake
 # the primary key field in the table
 PK=intake
 # the set of fields representing the identities that *should* be unique in the table
 IDENTITY="site, src, rep, year, start_month, end_month"
 # OPTIONAL, result fields to check results are the same for dupes
 RESULT_FLDS="agents_in, agents_near, agents_crossing"

 # END OF THINGS TO CHANGE

 cat << EOT
 -- how many you expect
 select count(*) from (select distinct $IDENTITY from $TABLE);

 -- how many you have
 select count(*) from $TABLE;

 -- reps of each identity
 create temp table tmp_reps as select $IDENTITY, count(*) as dupes_n from $TABLE group by $IDENTITY;
 -- should match the expected number of records
 select count(*) from tmp_reps;
 -- examine some
 SELECT * from tmp_reps limit 5;
 -- see the highest number of dupes
 select max(dupes_n) from tmp_reps;

 -- min PK for each IDENTITY, the one we'll keep
 create temp table tmp_min_pk as select $IDENTITY, min($PK) as min_pk, dupes_n from $TABLE join tmp_reps using ($IDENTITY) group by $IDENTITY;
 -- should match the expected number of records
 select count(*) from tmp_min_pk;
 -- should match the expected number of records
 select count(*) from (select distinct min_pk from tmp_min_pk);

 -- optional check of targetting
 select * from $TABLE join (select * from tmp_min_pk where dupes_n > 1 limit 3) using ($IDENTITY);
 select count(*) from (select * from $TABLE join (select * from tmp_min_pk where dupes_n > 1 limit 3) as x using ($IDENTITY));
 select $PK, min_pk, $RESULT_FLDS from $TABLE join (select * from tmp_min_pk where dupes_n > 1 limit 3) as x using ($IDENTITY);

 -- drop dupes - CAN BE QUITE SLOW
 delete from $TABLE where not exists (select 1 from tmp_min_pk where $TABLE.$PK = tmp_min_pk.min_pk);
 -- check result
 select count(*) from (select distinct $IDENTITY from $TABLE);
 select count(*) from $TABLE;
 EOT
	# change these bits then run the script to generate the SQL, e.g.
	# <edit file>
	# <save file>
	# bash uniqify.sql.sh >fix_mytable.sql
	# then execute the SQL line by line checking the results, preferably in
	# a transaction you can roll back or a copy of the DB

	# START OF THINGS TO CHANGE

	# the table containing duplicates
	TABLE=intake
	# the primary key field in the table
	PK=intake
	# the set of fields representing the identities that should be unique in the table
	IDENTITY="site, src, rep, year, start_month, end_month"
	# OPTIONAL, result fields to check results are the same for dupes
	RESULT_FLDS="agents_in, agents_near, agents_crossing"

	# END OF THINGS TO CHANGE

	cat << EOT
	-- how many you expect
	select count(*) from (select distinct $IDENTITY from $TABLE);

	-- how many you have
	select count(*) from $TABLE;

	-- reps of each identity
	create temp table tmp_reps as select $IDENTITY, count(*) as dupes_n from $TABLE group by $IDENTITY;
	-- should match the expected number of records
	select count(*) from tmp_reps;
	-- examine some
	SELECT * from tmp_reps limit 5;
	-- see the highest number of dupes
	select max(dupes_n) from tmp_reps;

	-- min PK for each IDENTITY, the one we'll keep
	create temp table tmp_min_pk as select $IDENTITY, min($PK) as min_pk, dupes_n from $TABLE join tmp_reps using ($IDENTITY) group by $IDENTITY;
	-- should match the expected number of records
	select count(*) from tmp_min_pk;
	-- should match the expected number of records
	select count(*) from (select distinct min_pk from tmp_min_pk);

	-- optional check of targetting
	select * from $TABLE join (select * from tmp_min_pk where dupes_n > 1 limit 3) using ($IDENTITY);
	select count() from (select from $TABLE join (select * from tmp_min_pk where dupes_n > 1 limit 3) as x using ($IDENTITY));
	select $PK, min_pk, $RESULT_FLDS from $TABLE join (select * from tmp_min_pk where dupes_n > 1 limit 3) as x using ($IDENTITY);

	-- drop dupes - CAN BE QUITE SLOW
	delete from $TABLE where not exists (select 1 from tmp_min_pk where $TABLE.$PK = tmp_min_pk.min_pk);
	-- check result
	select count(*) from (select distinct $IDENTITY from $TABLE);
	select count(*) from $TABLE;
	EOT