Last active
April 9, 2024 15:24
-
-
Save aneeshdurg/19441eff1a315aeec417509ecd0e297d to your computer and use it in GitHub Desktop.
Load LDBC CSV files into JanusGraph
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Usage: bin/gremlin.sh -e load.groovy
// This script loads LDBC CSV files into a JanusGraph graph.
import java.text.SimpleDateFormat | |
// Shared parsers for the date-valued CSV columns. They are deliberately left
// undeclared (no `def`) so they live in the script binding and can be read
// from inside CSVToMap.
//
// Both parsers are pinned to UTC: without an explicit zone SimpleDateFormat
// uses the JVM default, so the loaded Date values would differ from machine to
// machine. NOTE(review): this assumes the LDBC files are serialized in GMT, as
// the LDBC SNB specification describes -- confirm for your dataset.
dateFormat = new SimpleDateFormat("yyyy-MM-dd")
dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"))

// The pattern stops at whole seconds; SimpleDateFormat.parse(String) ignores
// any text after the matched prefix, so a fractional-second / offset suffix in
// the input is dropped rather than rejected.
timestampFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss")
timestampFormat.setTimeZone(TimeZone.getTimeZone("UTC"))
// Creates the schema used by this loader on `graph`: vertex labels, edge
// labels (all Multiplicity.SIMPLE), property keys, and one unique composite
// index on "id" per vertex label.
//
// graph: a graph exposing openManagement() (here: the JanusGraph instance).
//
// Everything is created through a single management transaction that is
// committed at the end. If any step throws, the transaction is rolled back so
// it is not left open, and the original exception is rethrown.
def defineSchemaAndIndices(graph) {
    // `def` keeps the management handle local instead of leaking it into the
    // script binding.
    def mgmt = graph.openManagement()
    try {
        // define vertex labels
        ["Place", "Comment", "Forum", "Person", "Post", "Tag", "TagClass",
         "University", "Company"].each { name ->
            mgmt.makeVertexLabel(name).make()
        }

        // define edge labels
        ["containerOf", "hasCreator", "hasInterest", "hasMember", "hasModerator",
         "hasTag", "isLocatedIn", "isPartOf", "isSubclassOf", "likes", "knows",
         "replyOf", "studyAt", "workAt", "hasType"].each { name ->
            mgmt.makeEdgeLabel(name).multiplicity(Multiplicity.SIMPLE).make()
        }

        // define property keys as [name, data type, cardinality]
        [
            // vertex property keys
            ["id",           Long.class,    Cardinality.SINGLE], // Forum, Post/Comment (Message), Company/University (Organisation), Person, City/Country/Continent (Place), Tag, TagClass
            ["title",        String.class,  Cardinality.SINGLE], // Forum
            ["creationDate", Date.class,    Cardinality.SINGLE], // Forum, Post/Comment (Message), Person, :knows, :likes
            ["browserUsed",  String.class,  Cardinality.SINGLE], // Post/Comment (Message), Person
            ["locationIP",   String.class,  Cardinality.SINGLE], // Post/Comment (Message)
            ["content",      String.class,  Cardinality.SINGLE], // Post/Comment (Message)
            ["length",       Long.class,    Cardinality.SINGLE], // Post/Comment (Message)
            ["name",         String.class,  Cardinality.SINGLE], // Company/University (Organisation), City/Country/Continent (Place), Tag, TagClass
            ["url",          String.class,  Cardinality.SINGLE], // Company/University (Organisation), City/Country/Continent (Place), Tag, TagClass
            ["firstName",    String.class,  Cardinality.SINGLE], // Person
            ["lastName",     String.class,  Cardinality.SINGLE], // Person
            ["gender",       String.class,  Cardinality.SINGLE], // Person
            ["birthday",     Date.class,    Cardinality.SINGLE], // Person
            ["email",        String.class,  Cardinality.LIST],   // Person
            ["language",     String.class,  Cardinality.LIST],   // Person, Post (Message)
            ["imageFile",    String.class,  Cardinality.SINGLE], // Post (Message)
            ["type",         String.class,  Cardinality.SINGLE], // Company/University (Organisation), City/Country/Continent (Place)
            // edge property keys
            ["joinDate",     Date.class,    Cardinality.SINGLE], // :hasMember
            ["classYear",    Integer.class, Cardinality.SINGLE], // :studyAt
            ["workFrom",     Integer.class, Cardinality.SINGLE], // :workAt
        ].each { name, type, cardinality ->
            mgmt.makePropertyKey(name).dataType(type).cardinality(cardinality).make()
        }

        // define graph indexes: by<Label>Id, unique on "id", restricted to that label
        def idKey = mgmt.getPropertyKey("id")
        ["Place", "Comment", "Company", "University", "Forum", "Person", "Post",
         "Tag", "TagClass"].each { name ->
            mgmt.buildIndex("by${name}Id".toString(), Vertex.class)
                .addKey(idKey)
                .indexOnly(mgmt.getVertexLabel(name))
                .unique()
                .buildCompositeIndex()
        }

        // commit schema
        mgmt.commit()
    } catch (Exception e) {
        if (mgmt.isOpen()) {
            mgmt.rollback()
        }
        throw e
    }
}
// Reads a '|'-separated CSV file whose first line is the header and returns a
// list with one map per data line, keyed by header name.
//
// Typed columns (selected by header name):
//   id, length           -> Long
//   creationDate         -> Date, parsed with the script-level `timestampFormat`
//   birthday             -> Date, parsed with the script-level `dateFormat`
//   classYear, workFrom  -> Integer
//   anything else        -> String, unchanged
//
// Empty fields are omitted from the row map. Blank lines and fields beyond the
// header width are ignored. An empty or header-only file yields an empty list.
def CSVToMap(filename) {
    def lines = new File(filename).readLines()
    if (lines.isEmpty()) {
        return []
    }

    // split(regex, -1) keeps empty columns. tokenize("|") silently drops them,
    // which shifts every later value in the line under the wrong header.
    def header = lines[0].split("\\|", -1)

    def rows = []
    // drop(1) is safe when the file contains only the header line.
    lines.drop(1).each { line ->
        if (line.trim().isEmpty()) {
            return // skip blank lines
        }
        def fields = line.split("\\|", -1)
        def row = [:]
        def columns = Math.min(header.length, fields.length)
        for (int i = 0; i < columns; i++) {
            def key = header[i]
            def raw = fields[i]
            if (key.isEmpty() || raw.isEmpty()) {
                continue // no column name or no value: nothing to store
            }
            switch (key) {
                case "id":
                case "length":
                    row.put(key, raw.toLong())
                    break
                case "creationDate":
                    row.put(key, timestampFormat.parse(raw))
                    break
                case "birthday":
                    row.put(key, dateFormat.parse(raw))
                    break
                case "classYear":
                case "workFrom":
                    row.put(key, raw.toInteger())
                    break
                default:
                    row.put(key, raw)
            }
        }
        rows << row
    }
    return rows
}
// Adds one vertex per entry of `allVertices` and copies every key/value of the
// entry onto the new vertex as a property.
//
// g            traversal source used to create the vertices
// label        vertex label to use when typeAsLabel is false
// allVertices  collection of property maps (one per vertex)
// typeAsLabel  when true, each vertex is labelled with its own "type" value
//              instead of `label` (the "type" property is still stored)
//
// Throws IllegalArgumentException if no label can be determined for a row.
def constructVertices(g, label, allVertices, typeAsLabel) {
    allVertices.each { vertexProps ->
        // Resolve the label per row without overwriting the `label` parameter.
        def vertexLabel = typeAsLabel ? vertexProps["type"] : label
        if (vertexLabel == null) {
            throw new IllegalArgumentException("No vertex label for row: " + vertexProps)
        }
        // `def` keeps the vertex local instead of leaking it into the script binding.
        def vertex = g.addV(vertexLabel).next()
        vertexProps.each { key, val ->
            vertex.property(key, val)
        }
    }
}
// Adds one `label` edge per entry of `allEdges`.
//
// For each entry the source vertex is looked up by (fromlabel, "id" == entry[fromKey])
// and the target vertex by (tolabel, "id" == entry[toKey]); every other
// key/value of the entry is stored as a property on the new edge.
//
// g          traversal source
// label      edge label
// fromlabel  vertex label of the source endpoint
// fromKey    key in each entry holding the source vertex id
// tolabel    vertex label of the target endpoint
// toKey      key in each entry holding the target vertex id
// allEdges   collection of maps (one per edge)
//
// A missing endpoint makes the corresponding next() call throw.
def constructEdges(g, label, fromlabel, fromKey, tolabel, toKey, allEdges) {
    // Callers may pass interpolated strings (GStrings). A GString does not hash
    // like the equal String, so it would miss the String keys of the row maps.
    def sourceKey = fromKey.toString()
    def targetKey = toKey.toString()

    allEdges.each { edgeProps ->
        def fromVertex = g.V().has(fromlabel, "id", edgeProps[sourceKey]).next()
        def toVertex = g.V().has(tolabel, "id", edgeProps[targetKey]).next()
        def edge = g.V(fromVertex).addE(label).to(toVertex).next()
        edgeProps.each { key, val ->
            // The endpoint ids only identify the vertices; they are not edge properties.
            if (key != sourceKey && key != targetKey) {
                edge.property(key, val)
            }
        }
    }
}
// Lists the entries directly inside `dir` and returns, as Strings, the paths
// of those whose name ends with ".csv".
def getAllCSVFilesInDirectory(dir) {
    def entries = []
    new File(dir).eachFile { entries << it }
    def csvEntries = entries.findAll { it.name.endsWith(".csv") }
    return csvEntries.collect { it.toString() }
}
// Loads every *.csv file found in <baseDir>/<type>/<label> and creates one
// vertex per data row.
//
// g            traversal source
// type         sub-directory under baseDir (the callers in this script pass
//              "static" or "dynamic")
// label        directory name, and the vertex label when typeAsLabel is false
// typeAsLabel  forwarded to constructVertices: label each vertex by its "type" column
// baseDir      root directory of the dataset; defaults to the previously
//              hard-coded "/mygraph/ldbc"
def loadLDBCVertices(g, type, label, typeAsLabel, baseDir = "/mygraph/ldbc") {
    // Locals are declared with `def` so they do not leak into the script binding.
    def path = "${baseDir}/${type}/${label}".toString()
    getAllCSVFilesInDirectory(path).each { file ->
        def vertices = CSVToMap(file)
        constructVertices(g, label, vertices, typeAsLabel)
    }
    println "Loaded $label vertices"
}
// Loads every *.csv file found in <baseDir>/dynamic/<inlabel>_<label>_<outlabel>
// and creates one `label` edge per data row, from the `inlabel` vertex to the
// `outlabel` vertex.
//
// g         traversal source
// label     edge label
// inlabel   vertex label of the edge's source endpoint
// outlabel  vertex label of the edge's target endpoint
// baseDir   root directory of the dataset; defaults to the previously
//           hard-coded "/mygraph/ldbc"
def loadLDBCEdges(g, label, inlabel, outlabel, baseDir = "/mygraph/ldbc") {
    // Locals are declared with `def` so they do not leak into the script binding.
    def path = "${baseDir}/dynamic/${inlabel}_${label}_${outlabel}".toString()

    // if inlabel is the same as outlabel, then the headers will be ${inlabel}1Id
    // and ${inlabel}2Id, otherwise they will be ${inlabel}Id and ${outlabel}Id.
    // The keys are built as plain Strings: a GString does not hash like the
    // equal String, so it would miss the String keys of the parsed rows.
    def sameEndpoints = (inlabel == outlabel)
    def inKey = sameEndpoints ? (inlabel + "1Id") : (inlabel + "Id")
    def outKey = sameEndpoints ? (inlabel + "2Id") : (outlabel + "Id")

    getAllCSVFilesInDirectory(path).each { file ->
        def edges = CSVToMap(file)
        constructEdges(g, label, inlabel, inKey, outlabel, outKey, edges)
    }
    println "Loaded $label edges"
}
// Defines the schema on `graph`, then loads all vertex files followed by all
// edge files (edges look their endpoints up by id, so vertices must exist
// first). Returns the traversal source used for loading.
def loadLDBC(graph) {
    defineSchemaAndIndices(graph)
    println "Schema and Indices defined"

    // Local on purpose: the caller receives the traversal source via the
    // return value rather than through a script-binding side effect.
    def g = graph.traversal()

    // [directory type, vertex label / directory, typeAsLabel]
    [
        ["static",  "Place",        false],
        ["static",  "Tag",          false],
        ["static",  "TagClass",     false],
        // Organisation can be either Company or University
        ["static",  "Organisation", true],
        ["dynamic", "Forum",        false],
        ["dynamic", "Post",         false],
        ["dynamic", "Comment",      false],
        ["dynamic", "Person",       false],
    ].each { type, label, typeAsLabel ->
        loadLDBCVertices(g, type, label, typeAsLabel)
    }

    // [edge label, source vertex label, target vertex label]
    [
        ["hasTag",      "Comment", "Tag"],
        ["knows",       "Person",  "Person"],
        ["hasMember",   "Forum",   "Person"],
        ["likes",       "Person",  "Post"],
        ["workAt",      "Person",  "Company"],
        ["hasTag",      "Post",    "Tag"],
        ["hasInterest", "Person",  "Tag"],
        ["hasTag",      "Forum",   "Tag"],
        ["studyAt",     "Person",  "University"],
        ["likes",       "Person",  "Comment"],
    ].each { label, inlabel, outlabel ->
        loadLDBCEdges(g, label, inlabel, outlabel)
    }

    return g
}
// Script entry point: open the graph described by the properties file and run
// the full import.
graph = JanusGraphFactory.open('conf/janusgraph-inmemory-server.properties')
g = loadLDBC(graph)

// Smoke check: print the property map of one vertex and of one edge.
vmap = g.V().limit(1).valueMap().next()
println("Vertex: $vmap")
emap = g.E().limit(1).valueMap().next()
println("Edge: $emap")

// Save to Kryo - reading from kryo takes 6s on my machine compared to 15s for loading from CSV
// g.io('/mygraph/ldbc003.kryo').write().iterate()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment