Created
May 10, 2016 03:31
-
-
Save eirikbakke/4b77eeea5fbc79511f563f5dd15a0076 to your computer and use it in GitHub Desktop.
Panamapapers PostgreSQL import script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- PostgreSQL import script for panamapapers CSV files available from https://panamapapers.icij.org/blog/20160509-offshore-database-release.html | |
-- First convert CSV files from MacRoman to UTF-8 encoding: | |
-- iconv -f MACROMAN -t UTF-8 Entities.csv > Entities-utf8.csv | |
-- iconv -f MACROMAN -t UTF-8 Addresses.csv > Addresses-utf8.csv | |
-- iconv -f MACROMAN -t UTF-8 Intermediaries.csv > Intermediaries-utf8.csv | |
-- iconv -f MACROMAN -t UTF-8 Officers.csv > Officers-utf8.csv | |
-- iconv -f MACROMAN -t UTF-8 all_edges.csv > all_edges-utf8.csv | |
drop table if exists addresses cascade; | |
drop table if exists countries cascade; | |
create table addresses( | |
"address" varchar(1024) not null, | |
icij_id character(32), | |
valid_until character(47), | |
country_code character(3), | |
country varchar(64), | |
node_id integer not null, | |
source_id character(16) not null | |
); | |
copy addresses("address", icij_id, valid_until, country_code, country, node_id, source_id) from | |
'/Path/To/Files/offshore_leaks_csvs/Addresses-utf8.csv' | |
with delimiter E',' | |
header | |
null as '' | |
csv quote as E'"' | |
ENCODING 'UTF-8'; | |
select distinct country_code as "code", country as "name" into countries from addresses; | |
delete from countries where "code" is null and "name" is null; | |
alter table countries alter column "code" set not null; | |
alter table countries alter column "name" set not null; | |
alter table countries add primary key("code"); | |
alter table addresses add constraint addresses_country_fk foreign key (country_code) references countries("code"); | |
alter table addresses drop column country; | |
alter table addresses owner to duser; | |
alter table countries owner to duser; | |
alter table addresses add primary key(node_id); | |
drop table if exists entities cascade; | |
drop table if exists entities_countries cascade; | |
create table entities( | |
"name" varchar(256), | |
original_name varchar(256), | |
former_name varchar(256), | |
jurisdiction character(5) not null, | |
jurisdiction_description varchar(64) not null, | |
company_type varchar(64), | |
"address" varchar(1024), | |
internal_id character(8), | |
incorporation_date text, | |
inactivation_date text, | |
struck_off_date text, | |
dorm_date text, | |
"status" varchar(64), | |
service_provider varchar(32), | |
ibcRUC varchar(32), | |
country_codes text, | |
countries text, | |
note varchar(256), | |
valid_until character(47) not null, | |
node_id integer not null, | |
source_id character(16) not null | |
); | |
copy entities("name", original_name, former_name, jurisdiction, jurisdiction_description, company_type, "address", internal_id, incorporation_date, inactivation_date, struck_off_date, dorm_date, "status", service_provider, ibcRUC, country_codes, countries, note, valid_until, node_id, source_id) from | |
'/Path/To/Files/offshore_leaks_csvs/Entities-utf8.csv' | |
with delimiter E',' | |
header | |
null as '' | |
csv quote as E'"' | |
ENCODING 'UTF-8'; | |
alter table entities owner to duser; | |
ALTER TABLE entities ALTER COLUMN "incorporation_date" TYPE DATE | |
using to_date(incorporation_date, 'd-Mon-yyyy'); | |
ALTER TABLE entities ALTER COLUMN "inactivation_date" TYPE DATE | |
using to_date(inactivation_date, 'd-Mon-yyyy'); | |
ALTER TABLE entities ALTER COLUMN "struck_off_date" TYPE DATE | |
using to_date(struck_off_date, 'd-Mon-yyyy'); | |
ALTER TABLE entities ALTER COLUMN "dorm_date" TYPE DATE | |
using to_date(dorm_date, 'd-Mon-yyyy'); | |
create index entities_incorporation_date_idx on entities(incorporation_date); | |
create index entities_inactivation_date_idx on entities(inactivation_date); | |
create index entities_struck_off_date_idx on entities(struck_off_date); | |
create index entities_dorm_date_idx on entities(dorm_date); | |
alter table entities add primary key(node_id); | |
select distinct node_id, regexp_split_to_table(country_codes,';') as country_code into entities_countries from entities; | |
alter table entities_countries owner to duser; | |
alter table entities_countries add primary key("node_id", "country_code"); | |
alter table entities_countries add constraint entities_countries_node_id_fk foreign key (node_id) references entities("node_id"); | |
alter table entities_countries add constraint entities_countries_country_fk foreign key (country_code) references countries("code"); | |
alter table entities drop column country_codes; | |
alter table entities drop column countries; | |
drop table if exists intermediaries cascade; | |
drop table if exists intermediaries_countries cascade; | |
create table intermediaries( | |
"name" varchar(128), | |
internal_id character(6), | |
"address" varchar(256), | |
valid_until character(47) not null, | |
country_codes text, | |
countries text, | |
"status" varchar(64), | |
node_id integer not null, | |
source_id character(16) not null | |
); | |
copy intermediaries("name", internal_id, "address", "valid_until", "country_codes", "countries", "status", node_id, source_id) from | |
'/Path/To/Files/offshore_leaks_csvs/Intermediaries-utf8.csv' | |
with delimiter E',' | |
header | |
null as '' | |
csv quote as E'"' | |
ENCODING 'UTF-8'; | |
alter table intermediaries owner to duser; | |
alter table intermediaries add primary key(node_id); | |
select distinct node_id, regexp_split_to_table(country_codes,';') as country_code into intermediaries_countries from intermediaries; | |
alter table intermediaries_countries owner to duser; | |
alter table intermediaries_countries add primary key("node_id", "country_code"); | |
alter table intermediaries_countries add constraint intermediaries_countries_node_id_fk foreign key (node_id) references intermediaries("node_id"); | |
alter table intermediaries_countries add constraint intermediaries_countries_country_fk foreign key (country_code) references countries("code"); | |
alter table intermediaries drop column country_codes; | |
alter table intermediaries drop column countries; | |
drop table if exists officers cascade; | |
drop table if exists officers_countries cascade; | |
create table officers( | |
"name" varchar(256), | |
icij_id character(32), | |
valid_until character(47) not null, | |
country_codes text, | |
countries text, | |
node_id integer not null, | |
source_id character(16) not null | |
); | |
alter table officers owner to duser; | |
copy officers("name", icij_id, valid_until, country_codes, countries, node_id, source_id) from | |
'/Path/To/Files/offshore_leaks_csvs/Officers-utf8.csv' | |
with delimiter E',' | |
header | |
null as '' | |
csv quote as E'"' | |
ENCODING 'UTF-8'; | |
alter table officers add primary key(node_id); | |
select distinct node_id, regexp_split_to_table(country_codes,';') as country_code into officers_countries from officers; | |
alter table officers_countries owner to duser; | |
alter table officers_countries add primary key("node_id", "country_code"); | |
alter table officers_countries add constraint officers_countries_node_id_fk foreign key (node_id) references officers("node_id"); | |
alter table officers_countries add constraint officers_countries_country_fk foreign key (country_code) references countries("code"); | |
alter table officers drop column country_codes; | |
alter table officers drop column countries; | |
drop table if exists all_edges_tmp cascade; | |
drop table if exists all_edges cascade; | |
create table all_edges_tmp( | |
node1 integer not null, | |
rel_type character(18) not null, | |
node2 integer not null | |
); | |
copy all_edges_tmp(node1, rel_type, node2) from | |
'/Path/To/Files/offshore_leaks_csvs/all_edges-utf8.csv' | |
with delimiter E',' | |
header | |
null as '' | |
csv quote as E'"' | |
ENCODING 'UTF-8'; | |
select distinct * into all_edges from all_edges_tmp; | |
drop table all_edges_tmp cascade; | |
alter table all_edges owner to duser; | |
-- Note: Duplicate (node1,node2) pairs known to exist. | |
alter table all_edges add primary key(node1, rel_type, node2); | |
create index all_edges_rel_type_idx on all_edges(rel_type); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment