Skip to content

Instantly share code, notes, and snippets.

@Bilbottom
Created December 4, 2024 09:29
Show Gist options
  • Save Bilbottom/7f1288c678cd70e06f9273779d07fccb to your computer and use it in GitHub Desktop.
Save Bilbottom/7f1288c678cd70e06f9273779d07fccb to your computer and use it in GitHub Desktop.
DuckDB CSV diff
id col_1 col_2 col_3
1 a b 0.1
2 c d -11
3 e f 111
id col_1 col_2 col_3 col_4
2 c d -11
3 e
4 f f
with
files as (
/* Union to align their data types */
select 'file_1' as file, *
from 'csv-1.csv'
union all by name
select 'file_2' as file, *
from 'csv-2.csv'
),
file_1 as (
select
* exclude (file),
sha256(concat_ws('|', *columns(* exclude (file)))) as hash
from files
where file = 'file_1'
),
file_2 as (
select
* exclude (file),
sha256(concat_ws('|', *columns(* exclude (file)))) as hash
from files
where file = 'file_2'
),
both_files as (
select
case
when file_1.id is null then 'insert'
when file_2.id is null then 'delete'
when file_1.hash != file_2.hash then 'update'
else 'same'
end as diff_type,
id,
columns(file_1.*) as "\0__f1",
columns(file_2.*) as "\0__f2",
from file_1
full join file_2
using (id) /* whatever column(s) uniquely identify records */
)
select * exclude (hash__f1, hash__f2)
from both_files
order by id
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment