Skip to content

Instantly share code, notes, and snippets.

@Dillie-O
Last active December 18, 2015 00:59
Show Gist options
  • Save Dillie-O/5700776 to your computer and use it in GitHub Desktop.
Save Dillie-O/5700776 to your computer and use it in GitHub Desktop.
Refactored Advanced Remove Duplicate Row Script in SQL
-- Algorithm --
-- 1. Run query to get all duplicate count values. Since we're getting cloned
-- rows, we can run the query based off of all fields in the table. Store
-- in table variable.
--
-- 2. Iterate through table.
--
-- 2a. Use DELETE TOP(x - 1) remove all duplicates.
--
-- 3. Done
DECLARE @RowsToProcess int
DECLARE @CurrentRow int
DECLARE @Message varchar(MAX)
DECLARE @duplicate_count int
DECLARE @application_id int
DECLARE @lock_version int
DECLARE @asset_id varchar(50),
DECLARE @customer_id int,
DECLARE @received_date datetime,
DECLARE @fee_amount money,
DECLARE @created_at datetime,
DECLARE @updated_at datetime
DECLARE @DuplicateVersions TABLE
(
row_id int NOT NULL PRIMARY KEY IDENTITY(1,1),
duplicate_count int,
application_id int,
lock_version int,
asset_id varchar(50),
customer_id int,
received_date datetime,
fee_amount money,
created_at datetime,
updated_at datetime
)
INSERT INTO @DuplicateVersions
(
duplicate_count,
application_id,
lock_version,
asset_id,
customer_id,
received_date,
fee_amount,
created_at,
updated_at
)
SELECT COUNT(application_id) AS duplicate_count,
[application_id],
[lock_version],
[asset_id],
[customer_id],
[received_date],
[fee_amount],
[created_at],
[updated_at]
FROM application_versions
GROUP BY [application_id],
[lock_version],
[asset_id],
[customer_id],
[received_date],
[fee_amount],
[created_at],
[updated_at]
HAVING COUNT(application_id) > 1
ORDER BY [application_id] ASC, [lock_version] ASC
SELECT @RowsToProcess = COUNT(row_id) FROM @DuplicateVersions
SET @Message = 'Duplicate rows retrieved. Records to process: ' + CAST(@RowsToProcess AS VARCHAR)
RAISERROR (@Message, 10, 1) WITH NOWAIT
SET @CurrentRow = 1
WHILE @CurrentRow <= @RowsToProcess
BEGIN
SELECT @duplicate_count = duplicate_count,
@application_id = application_id,
@lock_version = lock_version,
@asset_id = asset_id,
@customer_id = customer_id,
@received_date = received_date,
@fee_amount = fee_amount,
@created_at = created_at,
@updated_at = updated_at
FROM @DuplicateVersions AS DuplicateVersions
WHERE row_id = @CurrentRow
SET @Message = '[' + CAST(@CurrentRow AS VARCHAR) + ' of ' + CAST(@RowsToProcess AS VARCHAR) + '] AppID: ' + CAST(@application_id AS VARCHAR) + ' / Lock: ' + CAST(@lock_version AS VARCHAR) + ' - ' + CAST((@duplicate_count - 1) AS VARCHAR) + ' Duplicate(s)'
RAISERROR (@Message, 10, 1) WITH NOWAIT
-- Since we're using a DELETE TOP (x) command is handled by selecting the
-- proper duplicate count - 1 based on our initial query. Since we do this
-- as a subquery within our DELETE statement we can match based on column,
-- making sure to test for NULL values.
DELETE application_versions FROM
(
SELECT TOP (@duplicate_count - 1) id
FROM application_versions
WHERE (@application_id IS NULL OR application_id = @application_id)
AND (@lock_version IS NULL OR lock_version = @lock_version)
AND (@asset_id IS NULL OR asset_id = @asset_id)
AND (@customer_id IS NULL OR customer_id = @customer_id)
AND (@received_date IS NULL OR received_date = @received_date)
AND (@fee_amount IS NULL OR fee_amount = @fee_amount)
AND (@created_at IS NULL OR created_at = @created_at)
AND (@updated_at IS NULL OR updated_at = @updated_at)
) AS t1
WHERE application_versions.id = t1.id
SET @CurrentRow = @CurrentRow + 1
END
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment