Created
November 14, 2014 00:52
-
-
Save bkimble/e0cd53702799284484d9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby
# Splits a large contacts CSV into fixed-size chunks and imports each chunk
# into Neo4j by running a Cypher script through neo4j-shell.
#
# Usage: <script> FILENAME LINES [PREFIX]
#   FILENAME - CSV file whose first line is the header row
#   LINES    - maximum number of data rows per chunk (positive integer)
#   PREFIX   - accepted for backward compatibility; not used by this script
#
# For every chunk the script rewrites CHUNK_PATH (header + rows) and
# QUERY_PATH (the Cypher below), runs neo4j-shell, and prints a running total.

# Not used below; kept because the original script required it.
require 'tempfile'

# Chunk file that the LOAD CSV statement reads; overwritten for every batch.
CHUNK_PATH = '/home/ubuntu/csvchunk.csv'
# Cypher script handed to neo4j-shell; created relative to the working directory.
QUERY_PATH = 'queryfile'
# neo4j-shell executable used to run the query file.
NEO4J_SHELL = '/usr/bin/neo4j-shell'

filename, lines, _prefix = ARGV

unless filename && lines
  abort("missing file or lines")
end

# String#to_i maps non-numeric input to 0, which can never form a chunk.
lines = lines.to_i
abort("lines must be a positive integer") unless lines > 0

# Cypher template; %%FILENAME%% is replaced with the chunk path before use.
neo_query = <<-EOF
// load csv
LOAD CSV WITH HEADERS FROM "file:%%FILENAME%%" AS csvLine
// Find or create a user node with an id value of the userId field from the CSV
MERGE (user:User { id: toInt(csvLine.userId) })
// Find or create an alias node with a display name value (need to ensure this is not merging people with same names without taking email and such in to consideration)
MERGE (alias:Alias { name: csvLine.displayName })
// Create the KNOWS relationship between the user we found or created, and an empty contact niode that we are creating
// We set a merge false field so a subsequent process will know that it needs to be worked on
// Then link the empty contact node to the alias node we found or created
CREATE (user)-[:KNOWS]->(contact:Contact { merge: false })-[:ALIAS]->(alias)
// Loop over each Email address
FOREACH (address IN split(replace(replace(replace(replace(csvLine.emails,'[',''),']',''),'"',''),' ',''),',') |
  // Find or create an email address node
  MERGE (email:Email { address: address })
  // If we find an email address (not create) set merge = true so we know how to deal with it later
  ON MATCH SET contact.merge = true
  // finally create the relationship between the contact we created in the above block, and the email we found or created here.
  CREATE (contact)-[:EMAIL]->(email)
)
// Loop over each phone number
FOREACH (number IN split(replace(replace(replace(replace(csvLine.phoneNumbers,'[',''),']',''),'"',''),' ',''),',') |
  // find or create a phone number node
  MERGE (phone:Phone { number: number })
  // If we find a phopne number (not create), set merge = true so we know how to deal with it late
  ON MATCH SET contact.merge = true
  // finally create the relationship between the contact we created in the above block, and the phone we found or created here.
  CREATE (contact)-[:PHONE]->(phone)
);
// Phase 2
// Delete null Email addresses and Phone numbers ( a result in us using FOREACH on an empty array)
MATCH (email:Email { address: '' })<-[e:EMAIL]-()
DELETE e, email;
MATCH (phone:Phone { number: '' })<-[p:PHONE]-()
DELETE p, phone;
// End phase 2
EOF

# The chunk path never changes, so the substitution is done once. The block
# form keeps the replacement text literal (no backslash interpretation).
query = neo_query.gsub('%%FILENAME%%') { CHUNK_PATH }

File.open(filename, 'r') do |f|
  total_imported = 0
  headers = f.gets

  # Read the remaining rows lazily, `lines` at a time, until EOF. The final
  # slice may be shorter; an empty or header-only file yields no slices, so
  # no empty batch is ever sent to Neo4j.
  f.each_line.each_slice(lines) do |rows|
    puts "making file."
    File.open(CHUNK_PATH, 'w') do |chunk|
      chunk.puts headers
      rows.each { |row| chunk.puts row }
    end

    puts "making query file now."
    File.open(QUERY_PATH, 'w') { |query_file| query_file.puts query }

    start = Time.now
    # Backticks capture and discard the shell's stdout.
    # NOTE(review): the trailing /dev/null argument is carried over unchanged;
    # it may have been intended as an output redirect -- confirm.
    `#{NEO4J_SHELL} -file #{QUERY_PATH} /dev/null`
    total_time = Time.now - start
    warn "neo4j-shell exited with status #{$?.exitstatus}" unless $?.success?

    # Count the rows actually written, not the requested chunk size.
    total_imported += rows.size
    puts "#{total_imported} total, last batch: #{total_time}"
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment