Last active
November 3, 2020 19:09
-
-
Save jvilledieu/7845e027705f5d861736 to your computer and use it in GitHub Desktop.
Import script for Crunchbase. This is used to create a Neo4j data populated with the Crunchbase data (see blog post: http://linkurio.us/the-crunchbase-graph-data-modelling/). The data remains the property of Crunchbase and its use is limited by the following license: https://info.crunchbase.com/docs/licensing-policy-v1/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//Cleaning up the database | |
MATCH (n) | |
WITH n LIMIT 10000 | |
OPTIONAL MATCH (n)-[r]->() | |
DELETE n,r; | |
//Importing the file Companies.csv | |
//Creating the constraints | |
create constraint on (a:COMPANY) assert a.permalink is unique; | |
create constraint on (b:CATEGORY) assert b.name is unique; | |
create constraint on (c:MARKET) assert c.name is unique; | |
create constraint on (d:STATUS) assert d.name is unique; | |
create constraint on (e:COUNTRY) assert e.name is unique; | |
create constraint on (f:STATE) assert f.name is unique; | |
create constraint on (g:REGION) assert g.name is unique; | |
create constraint on (h:CITY) assert h.name is unique; | |
//Creating the nodes | |
USING PERIODIC COMMIT 1000 | |
LOAD CSV WITH HEADERS FROM "file:c:/Companies.csv" AS line | |
FIELDTERMINATOR ';' | |
MERGE (b:CATEGORY {name: line.category_list}) | |
MERGE (c:MARKET {name: line.market}) | |
MERGE (d:STATUS {name: line.status}) | |
MERGE (e:COUNTRY {name: line.country_code}) | |
MERGE (f:STATE {name: line.state_code}) | |
MERGE (g:REGION {name: line.region}) | |
MERGE (h:CITY {name: line.city}); | |
USING PERIODIC COMMIT 2000 | |
LOAD CSV WITH HEADERS FROM "file:c:/Companies.csv" AS line | |
FIELDTERMINATOR ';' | |
CREATE (a:COMPANY { | |
name: line.name, | |
permalink: line.permalink, | |
homepage_url: line.homepage_url, | |
funding_total: toInt(line.funding_total_usd), | |
funding_rounds: toInt(line.funding_rounds), | |
founded_at: line.founded_at, | |
founded_month: line.founded_month, | |
founded_quarter: line.founded_quarter, | |
founded_year: toInt(line.founded_year), | |
first_funding_at: line.first_funding_at, | |
last_funding_at: line.last_funding_at, | |
category: line.category_list, | |
market: line.market, | |
status: line.status, | |
country: line.country_code, | |
state: line.state_code, | |
region: line.region, | |
city: line.city, | |
url: 'http://www.crunchbase.com' + line.permalink, | |
logo: 'http://www.crunchbase.com' + line.permalink + '/primary-image/raw' | |
}); | |
//Creating the edges | |
//Creating an index on the companies to speed up the import | |
DROP CONSTRAINT ON (a:COMPANY) ASSERT a.permalink IS UNIQUE; | |
CREATE INDEX ON :COMPANY(permalink); | |
//Companies & Categories | |
MATCH (a:COMPANY) | |
MATCH (b:CATEGORY {name: a.category}) | |
CREATE (a)-[:HAS_CATEGORY]->(b); | |
//Companies & Market | |
MATCH (a:COMPANY) | |
MATCH (b:MARKET {name: a.market}) | |
CREATE (a)-[:HAS_MARKET]->(b); | |
//Companies & Status | |
MATCH (a:COMPANY) | |
MATCH (b:STATUS {name: a.status}) | |
CREATE (a)-[:HAS_STATUS]->(b); | |
//Companies & Country | |
MATCH (a:COMPANY) | |
MATCH (b:COUNTRY {name: a.country}) | |
CREATE (a)-[:HAS_COUNTRY]->(b); | |
//Companies & State | |
MATCH (a:COMPANY) | |
MATCH (b:STATE {name: a.state}) | |
CREATE (a)-[:HAS_STATE]->(b); | |
//Companies & Region | |
MATCH (a:COMPANY) | |
MATCH (b:REGION {name: a.region}) | |
CREATE (a)-[:HAS_REGION]->(b); | |
//Companies & Cities | |
MATCH (a:COMPANY) | |
MATCH (b:CITY {name: a.city}) | |
CREATE (a)-[:HAS_CITY]->(b); | |
//Importing the file Investments.csv | |
//Creating the constraints | |
create constraint on (a:INVESTOR) assert a.permalink is unique; | |
create constraint on (b:INVESTOR_CATEGORY) assert b.name is unique; | |
create constraint on (c:INVESTOR_MARKET) assert c.name is unique; | |
create constraint on (e:COUNTRY) assert e.name is unique; | |
create constraint on (f:STATE) assert f.name is unique; | |
create constraint on (g:REGION) assert g.name is unique; | |
create constraint on (h:CITY) assert h.name is unique; | |
create constraint on (i:FUNDING_ROUND) assert i.permalink is unique; | |
create constraint on (j:ROUND_CODE) assert j.name is unique; | |
create constraint on (k:ROUND_TYPE) assert k.name is unique; | |
//Creating the nodes | |
USING PERIODIC COMMIT 2000 | |
LOAD CSV WITH HEADERS FROM "file:c:/Investments.csv" AS line | |
FIELDTERMINATOR ';' | |
MATCH (a:COMPANY { permalink: line.investor_permalink }) | |
SET a:INVESTOR; | |
USING PERIODIC COMMIT 2000 | |
LOAD CSV WITH HEADERS FROM "file:c:/Investments.csv" AS line | |
FIELDTERMINATOR ';' | |
MERGE (a:INVESTOR { permalink: line.investor_permalink }) | |
ON CREATE SET a.name = line.investor_name, | |
a.category = line.investor_category_list, | |
a.market = line.investor_market, | |
a.country = line.investor_country_code, | |
a.state = line.investor_state_code, | |
a.region = line.investor_region, | |
a.city = line.investor_city, | |
a.logo = 'http://www.crunchbase.com' + line.permalink + '/primary-image/raw', | |
a.url = 'http://www.crunchbase.com' + line.investor_permalink; | |
USING PERIODIC COMMIT 1000 | |
LOAD CSV WITH HEADERS FROM "file:c:/Investments.csv" AS line | |
FIELDTERMINATOR ';' | |
MERGE (b:INVESTOR_CATEGORY {name: line.investor_category_list}) | |
MERGE (c:INVESTOR_MARKET {name: line.investor_market}) | |
MERGE (d:COUNTRY {name: line.investor_country_code}) | |
MERGE (e:STATE {name: line.investor_state_code}) | |
MERGE (f:REGION {name: line.investor_region}) | |
MERGE (g:CITY {name: line.investor_city}) | |
MERGE (h:FUNDING_ROUND {permalink: line.funding_round_permalink}) | |
ON CREATE SET h.funded_at = line.funded_at, h.funded_month = line.funded_month, h.funded_quarter = line.funded_quarter, h.funded_year = toInt(line.funded_year), h.raised_amount_usd = toInt(line.raised_amount_usd), h. funding_round_code = line.funding_round_code, h.funding_round_type = line.funding_round_type, h.url = 'http://www.crunchbase.com' + line.permalink | |
MERGE (i:ROUND_CODE {name: line.funding_round_code}) | |
MERGE (j:ROUND_TYPE {name: line.funding_round_type}); | |
//Creating the edges | |
//Creating an index on the investors and funding rounds to speed up the import | |
DROP CONSTRAINT ON (a:INVESTOR) ASSERT a.permalink IS UNIQUE; | |
CREATE INDEX ON :INVESTOR(permalink); | |
DROP CONSTRAINT ON (a:FUNDING_ROUND) ASSERT a.permalink IS UNIQUE; | |
CREATE INDEX ON :FUNDING_ROUND(permalink); | |
//Investor & Category | |
MATCH (a:INVESTOR) | |
MATCH (b:INVESTOR_CATEGORY { name: a.category}) | |
CREATE (a)-[:HAS_INVESTOR_CATEGORY]->(b); | |
//Investor & Market | |
MATCH (a:INVESTOR) | |
MATCH (b:INVESTOR_MARKET { name: a.market}) | |
CREATE (a)-[:HAS_INVESTOR_MARKET]->(b); | |
//Investor and Country | |
MATCH (a:INVESTOR) | |
MATCH (b:COUNTRY { name: a.country}) | |
CREATE (a)-[:HAS_COUNTRY]->(b); | |
//Investor & State | |
MATCH (a:INVESTOR) | |
MATCH (b:STATE { name: a.state}) | |
CREATE (a)-[:HAS_STATE]->(b); | |
//Investor & Region | |
MATCH (a:INVESTOR) | |
MATCH (b:REGION { name: a.region}) | |
CREATE (a)-[:HAS_REGION]->(b); | |
//Investor & City | |
MATCH (a:INVESTOR) | |
MATCH (b:CITY { name: a.city}) | |
CREATE (a)-[:HAS_CITY]->(b); | |
//Investor & Funding Round | |
USING PERIODIC COMMIT 1000 LOAD CSV WITH HEADERS FROM "file:c:/Investments.csv" AS line | |
FIELDTERMINATOR ';' | |
MATCH (a:INVESTOR { permalink: line.investor_permalink}),(b:FUNDING_ROUND { permalink: line.funding_round_permalink}) | |
CREATE (a)-[:HAS_FUNDED]->(b); | |
//Startup & Funding Round | |
USING PERIODIC COMMIT 1000 LOAD CSV WITH HEADERS FROM "file:c:/Investments.csv" AS line | |
FIELDTERMINATOR ';' | |
MATCH (a:COMPANY { permalink: line.company_permalink}), (b:FUNDING_ROUND { permalink: line.funding_round_permalink}) | |
CREATE (a)-[:HAS_BEEN_FUNDED]->(b); | |
//Funding round and round code | |
MATCH (a:FUNDING_ROUND) | |
MATCH (b:ROUND_CODE { name: a.funding_round_code}) | |
CREATE (a)-[:HAS_ROUND_CODE]->(b); | |
//Funding round and round type | |
MATCH (a:FUNDING_ROUND) | |
MATCH (b:ROUND_TYPE { name: a.funding_round_type}) | |
CREATE (a)-[:HAS_ROUND_TYPE]->(b); | |
//Importing the file Acquisitions.csv | |
USING PERIODIC COMMIT 1000 | |
LOAD CSV WITH HEADERS FROM "file:c:/Acquisitions.csv" AS line | |
FIELDTERMINATOR ';' | |
MATCH (a:COMPANY {permalink: line.company_permalink}), (b:COMPANY {permalink: line.acquirer_permalink}) | |
CREATE (b)-[:HAS_ACQUIRED {acquisition_date : line.acquired_at, month: line.acquired_month, quarter: line.acquired_quarter, year: line.acquired_year, price_amount: | |
toInt(line.price_amount), currency: line.price_currency_code}]->(a); | |
//----------------------- | |
//Cleaning up | |
//----------------------- | |
MATCH (n:COMPANY) | |
WHERE n.state = '' | |
REMOVE n.state; | |
MATCH (n:COMPANY) | |
WHERE n.homepage_url = '' | |
REMOVE n.homepage_url; | |
MATCH (n:COMPANY) | |
WHERE n.funding_total = '' | |
REMOVE n.funding_total; | |
MATCH (n:COMPANY) | |
WHERE n.funding_rounds = '' | |
REMOVE n.funding_rounds; | |
MATCH (n:COMPANY) | |
WHERE n.founded_at = '' | |
REMOVE n.founded_at; | |
MATCH (n:COMPANY) | |
WHERE n.founded_month = '' | |
REMOVE n.founded_month; | |
MATCH (n:COMPANY) | |
WHERE n.founded_quarter = '' | |
REMOVE n.founded_quarter; | |
MATCH (n:COMPANY) | |
WHERE n.founded_year = '' | |
REMOVE n.founded_year; | |
MATCH (n:COMPANY) | |
WHERE n.first_funding_at = '' | |
REMOVE n.first_funding_at; | |
MATCH (n:COMPANY) | |
WHERE n.last_funding_at = '' | |
REMOVE n.last_funding_at; | |
MATCH (n:COMPANY) | |
WHERE n.category = '' | |
REMOVE n.category; | |
MATCH (n:COMPANY) | |
WHERE n.market = '' | |
REMOVE n.market; | |
MATCH (n:COMPANY) | |
WHERE n.status = '' | |
REMOVE n.status; | |
MATCH (n:COMPANY) | |
WHERE n.country = '' | |
REMOVE n.country; | |
MATCH (n:COMPANY) | |
WHERE n.state = '' | |
REMOVE n.state; | |
MATCH (n:COMPANY) | |
WHERE n.region = '' | |
REMOVE n.region; | |
MATCH (n:COMPANY) | |
REMOVE n.city; | |
MATCH (n:INVESTOR) | |
WHERE n.state = '' | |
REMOVE n.state; | |
MATCH (n:INVESTOR) | |
WHERE n.category = '' | |
REMOVE n.category; | |
MATCH (n:INVESTOR) | |
WHERE n.market = '' | |
REMOVE n.market; | |
MATCH (n:INVESTOR) | |
WHERE n.city = '' | |
REMOVE n.city; | |
MATCH (n:INVESTOR) | |
WHERE n.region = '' | |
REMOVE n.region; | |
MATCH (n:INVESTOR) | |
WHERE n.country = '' | |
REMOVE n.country; | |
optional match (n)-[r]-() where n.name = '' | |
DELETE n, r; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment