Last active
July 5, 2016 00:40
-
-
Save rvanbruggen/217738484702c2d0ca33 to your computer and use it in GitHub Desktop.
Importing the Belgian Corporate Registry
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//load the codes | |
create index on :CodeCategory(name); | |
using periodic commit 1000 | |
load csv with headers from | |
"file:/<path>/sourcecsv/code.csv" as csv | |
with distinct csv.Category as Category | |
merge (:CodeCategory {name: Category}); | |
create index on :Code(name); | |
using periodic commit | |
load csv with headers from | |
"file:/<path>/sourcecsv/code.csv" as csv | |
with distinct csv.Code as Code | |
merge (c:Code {name: Code}); | |
using periodic commit | |
load csv with headers from | |
"file:/<path>/sourcecsv/code.csv" as csv | |
with distinct csv.Category as Category, csv.Code as Code | |
match (cc:CodeCategory {name: Category}), (c:Code {name: Code}) | |
merge (cc)<-[:PART_OF]-(c); | |
create index on :CodeMeaning(description); | |
using periodic commit | |
load csv with headers from | |
"file:/<path>/sourcecsv/code.csv" as csv | |
merge (cm:CodeMeaning {language: csv.Language, description: csv.Description}); | |
using periodic commit | |
load csv with headers from | |
"file:/<path>/sourcecsv/code.csv" as csv | |
match (cc:CodeCategory {name: csv.Category})<-[:PART_OF]-(c:Code {name: csv.Code}), (cm:CodeMeaning {language: csv.Language, description: csv.Description}) | |
merge (c)<-[:MEANS]-(cm); | |
//load the enterprises | |
create constraint on (e:Enterprise) | |
assert e.EnterpriseNumber is unique; | |
using periodic commit 5000 | |
load csv with headers from | |
"file:/<path>/sourcecsv/enterprise.csv" as csv | |
create (e:Enterprise {EnterpriseNumber: csv.EnterpriseNumber, Status: csv.Status, JuridicalSituation: csv.JuridicalSituation, TypeOfEnterprise: csv.TypeOfEnterprise, JuridicalForm: csv.JuridicalForm, StartDate: toInt(substring(csv.StartDate,0,2))+toInt(substring(csv.StartDate,3,2))*100 + toInt(substring(csv.StartDate,6,4))*10000}); | |
//load the establishments | |
create constraint on (eb:Establishment) | |
assert eb.EstablishmentNumber is unique; | |
using periodic commit | |
load csv with headers from | |
"file:/<path>/sourcecsv/establishment.csv" as csv | |
create (es:Establishment {EstablishmentNumber: csv.EstablishmentNumber, StartDate: toInt(substring(csv.StartDate,0,2))+toInt(substring(csv.StartDate,3,2))*100+toInt(substring(csv.StartDate,6,4))*10000}); | |
using periodic commit | |
load csv with headers from | |
"file:/<path>/sourcecsv/establishment.csv" as csv | |
match (e:Enterprise {EnterpriseNumber: csv.EnterpriseNumber}), (es:Establishment {EstablishmentNumber: csv.EstablishmentNumber}) | |
create (es)-[:PART_OF]->(e); | |
//add contact information to enterprises and establishments | |
//add contact information to enterprises | |
using periodic commit | |
load csv with headers from | |
"file:/<path>/sourcecsv/contact.csv" as csv | |
match (e:Enterprise {EnterpriseNumber: csv.EntityNumber}) | |
create (e)-[:HAS_CONTACT_INFO]->(ci:ContactInfo {type: csv.ContactType, value: csv.Value}); | |
//add contact information to establishments | |
using periodic commit | |
load csv with headers from | |
"file:/<path>/sourcecsv/contact.csv" as csv | |
match (es:Establishment {EstablishmentNumber: csv.EntityNumber}) | |
create (es)-[:HAS_CONTACT_INFO]->(ci:ContactInfo {type: csv.ContactType, value: csv.Value}); | |
create constraint on (c:City) | |
assert c.name is unique; | |
create constraint on (z:Zip) | |
assert z.name is unique; | |
create index on :Street(name); | |
using periodic commit 100000 | |
load csv with headers from | |
"file:/<path>/sourcecsv/address.csv" as csv | |
with distinct toUpper(csv.MunicipalityNL) as MunicipalityNL | |
merge (city:City {name: MunicipalityNL}); | |
using periodic commit 100000 | |
load csv with headers from | |
"file:/<path>/sourcecsv/address.csv" as csv | |
with distinct toUpper(csv.Zipcode) as Zipcode | |
merge (zip:Zip {name: Zipcode}); | |
// connect the zips to the cities | |
using periodic commit 100000 | |
load csv with headers from | |
"file:/<path>/sourcecsv/address.csv" as csv | |
with distinct toUpper(csv.Zipcode) as Zipcode, toUpper(csv.MunicipalityNL) as MunicipalityNL | |
match (city:City {name: MunicipalityNL}), (zip:Zip {name: Zipcode}) | |
create unique (city)-[:HAS_ZIP_CODE]->(zip); | |
using periodic commit 50000 | |
load csv with headers from | |
"file:/<path>/sourcecsv/address.csv" as csv | |
with distinct toUpper(csv.Zipcode) as Zipcode, toUpper(csv.StreetNL) as StreetNL | |
match (zip:Zip {name: Zipcode}) | |
create unique (zip)<-[:PART_OF]-(str:Street {name: StreetNL, zip:Zipcode}); | |
//add the houseNumbers | |
create index on :HouseNumber(houseNumber); | |
using periodic commit 1000 | |
load csv with headers from | |
"file:/<path>/sourcecsv/address.csv" as csv | |
with toUpper(csv.Zipcode) as Zipcode, toUpper(csv.StreetNL) as StreetNL, toUpper(csv.HouseNumber) as HouseNumber | |
match (street:Street {name: StreetNL, zip:Zipcode}) | |
using index street:Street(name) | |
CREATE UNIQUE (:HouseNumber {houseNumber: HouseNumber})-[:PART_OF]->(street); | |
//connect the enterprises to the addresses | |
// DOES NOT WORK - solved with PYTHON script | |
// using periodic commit 10000 | |
// load csv with headers from | |
// "file:/<path>/sourcecsv/address.csv" as csv | |
// with distinct toUpper(csv.Zipcode) as Zipcode, toUpper(csv.StreetNL) as StreetNL, toUpper(csv.HouseNumber) as HouseNumber, csv.EntityNumber as EntityNumber | |
// match (e:Enterprise {EnterpriseNumber: EntityNumber}), | |
// (street:Street {name: StreetNL, zip:Zipcode})<-[:PART_OF]-(h:HouseNumber {houseNumber: HouseNumber}) | |
// create (e)-[:HAS_ADDRESS]->(h); | |
//load the activities of Enterprises | |
using periodic commit 25000 | |
load csv with headers from | |
"file:/<path>/sourcecsv/activity.csv" as csv | |
with csv limit 50000 | |
match (e:Enterprise {EnterpriseNumber: csv.EntityNumber}), | |
(cc2:CodeCategory {name: "ActivityGroup"})--(c2:Code {name: csv.ActivityGroup}) | |
merge (e)-[:HAS_ACTIVITY_GROUP {type: csv.Classification}]->(c2); | |
using periodic commit 25000 | |
load csv with headers from | |
"file:/<path>/sourcecsv/activity.csv" as csv | |
with csv limit 1000 | |
match (e:Enterprise {EnterpriseNumber: csv.EntityNumber}), | |
(cc1:CodeCategory {name: "Nace"+csv.NaceVersion})<-[:PART_OF]-(c1:Code {name: csv.NaceCode}) | |
merge (e)-[hnc:HAS_NACE_CODE {type: csv.Classification}]->(c1); | |
//load the activities of Establishments | |
using periodic commit 25000 | |
load csv with headers from | |
"file:/<path>/sourcecsv/activity.csv" as csv | |
match (e:Establishment {EstablishmentNumber: csv.EntityNumber}), | |
(cc2:CodeCategory {name: "ActivityGroup"})--(c2:Code {name: csv.ActivityGroup}) | |
merge (e)-[:HAS_ACTIVITY_GROUP {type: csv.Classification}]->(c2); | |
using periodic commit 25000 | |
load csv with headers from | |
"file:/<path>/sourcecsv/activity.csv" as csv | |
match (e:Establishment {EstablishmentNumber: csv.EntityNumber}), | |
(cc1:CodeCategory {name: "Nace"+csv.NaceVersion})--(c1:Code {name: csv.NaceCode}) | |
merge (e)-[hnc:HAS_NACE_CODE]->(c1) | |
on create set hnc.type=csv.Classification | |
on match set hnc.type=csv.Classification; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime | |
from py2neo import Graph | |
from py2neo.packages.httpstream import http | |
http.socket_timeout = 9999 | |
graph = Graph() | |
print "Starting to process links between Enterprises and Addresses..." | |
for filenr in range(0,113): | |
tx1 = graph.cypher.begin() | |
statement1 = """ | |
load csv with headers from | |
"file:/<path>/splitcsv/splitaddress/splitaddress_"""+str(filenr)+"""" as csv | |
with distinct toUpper(csv.Zipcode) as Zipcode, toUpper(csv.StreetNL) as StreetNL, toUpper(csv.HouseNumber) as HouseNumber, csv.EntityNumber as EntityNumber | |
match (e:Enterprise {EnterpriseNumber: EntityNumber}), | |
(street:Street {name: StreetNL, zip:Zipcode})<-[:PART_OF]-(h:HouseNumber {houseNumber: HouseNumber}) | |
create (e)-[:HAS_ADDRESS]->(h); | |
""" | |
tx1.append(statement1) | |
tx1.process() | |
tx1.commit() | |
print "Enterprise Filenr: "+str(filenr)+" processed, at "+str(datetime.datetime.now()) | |
print "Starting to process links between Establishments and Addresses..." | |
for filenr in range(0,113): | |
tx2 = graph.cypher.begin() | |
statement2 = """ | |
load csv with headers from | |
"file:/<path>/splitcsv/splitaddress/splitaddress_"""+str(filenr)+"""" as csv | |
with distinct toUpper(csv.Zipcode) as Zipcode, toUpper(csv.StreetNL) as StreetNL, toUpper(csv.HouseNumber) as HouseNumber, csv.EntityNumber as EntityNumber | |
match (e:Establishment {EstablishmentNumber: EntityNumber}), | |
(street:Street {name: StreetNL, zip:Zipcode})<-[:PART_OF]-(h:HouseNumber {houseNumber: HouseNumber}) | |
create (e)-[:HAS_ADDRESS]->(h); | |
""" | |
tx2.append(statement2) | |
tx2.process() | |
tx2.commit() | |
print "Establishment Filenr: "+str(filenr)+" processed, at "+str(datetime.datetime.now()) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//THIS IS PART 1 | |
//load the codes | |
create index on :CodeCategory(name); | |
import-cypher -i /<path>/sourcecsv/code.csv -b 10000 -d , -q with distinct {Category} as Category merge (:CodeCategory {name: Category}); | |
create index on :Code(name); | |
import-cypher -i /<path>/sourcecsv/code.csv -b 10000 -d , -q with distinct {Code} as Code merge (c:Code {name: Code}); | |
import-cypher -i /<path>/sourcecsv/code.csv -b 10000 -d , -q with distinct {Category} as Category, {Code} as Code match (cc:CodeCategory {name: Category}), (c:Code {name: Code}) merge (cc)<-[:PART_OF]-(c); | |
create index on :CodeMeaning(description); | |
import-cypher -i /<path>/sourcecsv/code.csv -b 10000 -d , -q merge (cm:CodeMeaning {language: {Language}, description: {Description}}); | |
import-cypher -i /<path>/sourcecsv/code.csv -b 10000 -d , -q match (cc:CodeCategory {name: {Category}})<-[:PART_OF]-(c:Code {name: {Code}}), (cm:CodeMeaning {language: {Language}, description: {Description}}) merge (c)<-[:MEANS]-(cm); | |
//load the enterprises | |
create constraint on (e:Enterprise) | |
assert e.EnterpriseNumber is unique; | |
import-cypher -i /<path>/sourcecsv/enterprise.csv -b 10000 -d , -q create (e:Enterprise {EnterpriseNumber: {EnterpriseNumber}, Status: {Status}, JuridicalSituation: {JuridicalSituation}, TypeOfEnterprise: {TypeOfEnterprise}, JuridicalForm: {JuridicalForm}, StartDate: toInt(substring({StartDate},0,2))+toInt(substring({StartDate},3,2))*100 + toInt(substring({StartDate},6,4))*10000}); | |
//load the establishments | |
create constraint on (eb:Establishment) | |
assert eb.EstablishmentNumber is unique; | |
import-cypher -i /<path>/sourcecsv/establishment.csv -b 10000 -d , -q create (es:Establishment {EstablishmentNumber: {EstablishmentNumber}, StartDate: toInt(substring({StartDate},0,2))+toInt(substring({StartDate},3,2))*100+toInt(substring({StartDate},6,4))*10000}); | |
import-cypher -i /<path>/sourcecsv/establishment.csv -b 10000 -d , -q match (e:Enterprise {EnterpriseNumber: {EnterpriseNumber}}), (es:Establishment {EstablishmentNumber: {EstablishmentNumber}}) create (es)-[:PART_OF]->(e); | |
//add contact information to enterprises and establishments | |
//add contact information to enterprises | |
import-cypher -i /<path>/sourcecsv/contact.csv -b 10000 -d , -q match (e:Enterprise {EnterpriseNumber: {EntityNumber}}) create (e)-[:HAS_CONTACT_INFO]->(ci:ContactInfo {type: {ContactType}, value: {Value}}); | |
//add contact information to establishments | |
import-cypher -i /<path>/sourcecsv/contact.csv -b 10000 -d , -q match (es:Establishment {EstablishmentNumber: {EntityNumber}}) create (es)-[:HAS_CONTACT_INFO]->(ci:ContactInfo {type: {ContactType}, value: {Value}}); | |
//execution time: 7min 16sec | |
//3180356 nodes | |
//1602575 rels | |
//THIS IS PART 2 | |
return timestamp(); | |
create constraint on (c:City) | |
assert c.name is unique; | |
create constraint on (z:Zip) | |
assert z.name is unique; | |
create index on :Street(name); | |
import-cypher -i /<path>/sourcecsv/address.csv -b 10000 -d , -q with distinct toUpper({MunicipalityNL}) as MunicipalityNL merge (city:City {name: MunicipalityNL}); | |
import-cypher -i /<path>/sourcecsv/address.csv -b 10000 -d , -q with distinct toUpper({Zipcode}) as Zipcode merge (zip:Zip {name: Zipcode}); | |
// connect the zips to the cities | |
import-cypher -i /<path>/sourcecsv/address.csv -b 10000 -d , -q with distinct toUpper({Zipcode}) as Zipcode, toUpper({MunicipalityNL}) as MunicipalityNL match (city:City {name: MunicipalityNL}), (zip:Zip {name: Zipcode}) create unique (city)-[:HAS_ZIP_CODE]->(zip); | |
import-cypher -i file:/<path>/sourcecsv/address.csv -b 5000 -d , -q with distinct toUpper({Zipcode}) as Zipcode, toUpper({StreetNL}) as StreetNL match (zip:Zip {name: Zipcode}) create unique (zip)<-[:PART_OF]-(str:Street {name: StreetNL, zip: Zipcode}); | |
//add the houseNumbers | |
create index on :HouseNumber(houseNumber); | |
import-cypher -i /<path>/sourcecsv/address.csv -b 10000 -d , -q with toUpper({Zipcode}) as Zipcode, toUpper({StreetNL}) as StreetNL, toUpper({HouseNumber}) as HouseNumber match (street:Street {name: StreetNL, zip: Zipcode}) using index street:Street(name) CREATE UNIQUE (:HouseNumber {houseNumber: HouseNumber})-[:PART_OF]->(street); | |
//connect the enterprises to the addresses | |
import-cypher -i /<path>/sourcecsv/address.csv -b 10000 -d , -q with distinct toUpper({Zipcode}) as Zipcode, toUpper({StreetNL}) as StreetNL, toUpper({HouseNumber}) as HouseNumber, {EntityNumber} as EntityNumber match (e:Enterprise {EnterpriseNumber: EntityNumber}), (street:Street {name: StreetNL, zip:Zipcode})<-[:PART_OF]-(h:HouseNumber {houseNumber: HouseNumber}) create (e)-[:HAS_ADDRESS]->(h); | |
//connect the Establishments to the addresses | |
import-cypher -i /<path>/sourcecsv/address.csv -b 10000 -d , -q with distinct toUpper({Zipcode}) as Zipcode, toUpper({StreetNL}) as StreetNL, toUpper({HouseNumber}) as HouseNumber, {EntityNumber} as EntityNumber match (e:Establishment {EstablishmentNumber: EntityNumber}), (street:Street {name: StreetNL, zip:Zipcode})<-[:PART_OF]-(h:HouseNumber {houseNumber: HouseNumber}) create (e)-[:HAS_ADDRESS]->(h); | |
//load the activities of Enterprises | |
import-cypher -i /<path>/sourcecsv/activity.csv -b 10000 -d , -q match (e:Enterprise {EnterpriseNumber: {EntityNumber}}), (cc2:CodeCategory {name: "ActivityGroup"})--(c2:Code {name: {ActivityGroup}}) merge (e)-[:HAS_ACTIVITY_GROUP {type: {Classification}}]->(c2); | |
MATCH (n:CodeCategory {name:"Nace2008"})--(c:Code) | |
set c.fullname="Nace2008-"+c.name; | |
MATCH (n:CodeCategory {name:"Nace2003"})--(c:Code) | |
set c.fullname="Nace2003-"+c.name; | |
create index on :Code(fullname); | |
import-cypher -i /<path>/sourcecsv/activity.csv -b 5000 -d , -q match (e:Enterprise {EnterpriseNumber: {EntityNumber}}), (c1:Code {fullname: "Nace"+{NaceVersion}+"-"+{NaceCode}}) create unique (e)-[hnc:HAS_NACE_CODE {type: {Classification}}]->(c1); | |
//load the activities of Establishments | |
import-cypher -i /<path>/sourcecsv/activity.csv -b 10000 -d , -q match (e:Establishment {EstablishmentNumber: {EntityNumber}}), (cc2:CodeCategory {name: "ActivityGroup"})--(c2:Code {name: {ActivityGroup}}) merge (e)-[:HAS_ACTIVITY_GROUP {type: {Classification}}]->(c2); | |
import-cypher -i /<path>/sourcecsv/activity.csv -b 5000 -d , -q match (e:Establishment {EstablishmentNumber: {EntityNumber}}), (c1:Code {fullname: "Nace"+{NaceVersion}+"-"+{NaceCode}}) create unique (e)-[hnc:HAS_NACE_CODE {type: {Classification}}]->(c1); | |
return timestamp(); | |
|
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment