Skip to content

Instantly share code, notes, and snippets.

@royseto
Last active December 16, 2015 18:19
Show Gist options
  • Save royseto/5476480 to your computer and use it in GitHub Desktop.
Save royseto/5476480 to your computer and use it in GitHub Desktop.
Dedup a relational table using SQL delete (tested with PostgreSQL)
roy-setos-macbook:~ royseto$ ssh -i ~/.ec2/rs-keypair-2.pem [email protected]
Welcome to Postgresql, TurnKey Linux 12.0 / Debian 6.0.5 Squeeze
System information (as of Sun Apr 28 09:57:02 2013)
System load: 0.00 Memory usage: 41%
Processes: 62 Swap usage: 0%
Usage of /: 7.1% of 9.84GB IP address for eth0: 10.197.60.2
TKLBAM (Backup and Migration): NOT INITIALIZED
To initialize TKLBAM, run the "tklbam-init" command to link this
system to your TurnKey Hub account. For details see the man page or
go to:
http://www.turnkeylinux.org/tklbam
Last login: Sun Apr 28 09:35:10 2013 from 24.7.89.229
root@postgresql ~# psql postgres postgres
psql (8.4.17)
Type "help" for help.
postgres=# CREATE TABLE t (a INTEGER, b INTEGER);
CREATE TABLE
postgres=# INSERT INTO t VALUES (1, 2); -- nondup row
INSERT 0 1
postgres=# INSERT INTO t VALUES (2, 5); -- nondup row #2
INSERT 0 1
postgres=# INSERT INTO t VALUES (3, 7); -- row with 2 dups
INSERT 0 1
postgres=# INSERT INTO t VALUES (3, 7); -- row with 2 dups
INSERT 0 1
postgres=# INSERT INTO t VALUES (4, 2); -- row with 3 dups
INSERT 0 1
postgres=# INSERT INTO t VALUES (4, 2); -- row with 3 dups
INSERT 0 1
postgres=# INSERT INTO t VALUES (4, 2); -- row with 3 dups
INSERT 0 1
postgres=# INSERT INTO t VALUES (5, null); -- nondup row containing a null value
INSERT 0 1
postgres=# INSERT INTO t VALUES (6, null); -- dup row containing a null value
INSERT 0 1
postgres=# INSERT INTO t VALUES (6, null); -- dup row containing a null value
INSERT 0 1
postgres=# SELECT * FROM t;
a | b
---+---
1 | 2
2 | 5
3 | 7
3 | 7
4 | 2
4 | 2
4 | 2
5 |
6 |
6 |
(10 rows)
postgres=# -- remove duplicate rows from table t
postgres=# DELETE FROM t
postgres-# WHERE ctid NOT IN (
postgres(# SELECT MIN(ctid) FROM t GROUP BY a, b
postgres(# );
DELETE 4
postgres=# SELECT * FROM t; -- duplicates should be gone now
a | b
---+---
1 | 2
2 | 5
3 | 7
4 | 2
5 |
6 |
(6 rows)
postgres=#
postgres=# TRUNCATE TABLE t;
TRUNCATE TABLE
postgres=# INSERT INTO t VALUES (1, 2); -- nondup row
INSERT 0 1
postgres=# INSERT INTO t VALUES (2, 5); -- nondup row #2
INSERT 0 1
postgres=# INSERT INTO t VALUES (3, 7); -- row with 2 dups
INSERT 0 1
postgres=# INSERT INTO t VALUES (3, 7); -- row with 2 dups
INSERT 0 1
postgres=# INSERT INTO t VALUES (4, 2); -- row with 3 dups
INSERT 0 1
postgres=# INSERT INTO t VALUES (4, 2); -- row with 3 dups
INSERT 0 1
postgres=# INSERT INTO t VALUES (4, 2); -- row with 3 dups
INSERT 0 1
postgres=# INSERT INTO t VALUES (5, null); -- nondup row containing a null value
INSERT 0 1
postgres=# INSERT INTO t VALUES (6, null); -- dup row containing a null value
INSERT 0 1
postgres=# INSERT INTO t VALUES (6, null); -- dup row containing a null value
INSERT 0 1
postgres=# SELECT * FROM t;
a | b
---+---
1 | 2
2 | 5
3 | 7
3 | 7
4 | 2
4 | 2
4 | 2
5 |
6 |
6 |
(10 rows)
postgres=# SELECT a, b, ROW_NUMBER() OVER (PARTITION BY a, b ORDER BY (SELECT 0)) rn
postgres-# FROM t;
a | b | rn
---+---+----
1 | 2 | 1
2 | 5 | 1
3 | 7 | 1
3 | 7 | 2
4 | 2 | 1
4 | 2 | 2
4 | 2 | 3
5 | | 1
6 | | 1
6 | | 2
(10 rows)
postgres=# WITH cte
postgres-# AS (SELECT ROW_NUMBER() OVER (PARTITION BY A, B
postgres(# ORDER BY (SELECT 0)) RN
postgres(# FROM T)
postgres-# DELETE FROM cte
postgres-# WHERE RN > 1;
ERROR: syntax error at or near "DELETE"
LINE 5: DELETE FROM cte
^
postgres=#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment