Skip to content

Instantly share code, notes, and snippets.

@habedi
Last active September 2, 2024 16:01
Show Gist options
  • Save habedi/da7a2d88b095870b2cc0e92555c8bc61 to your computer and use it in GitHub Desktop.
Save habedi/da7a2d88b095870b2cc0e92555c8bc61 to your computer and use it in GitHub Desktop.
A script with the commands to load CSV files into a Neo4j graph database. The data is from StackExchange data dump (https://archive.org/details/stackexchange) #cypher #neo4j #stackoverflow_data #csv #graphdb
// Loading the posts
LOAD CSV WITH HEADERS FROM 'file:///posts_all_csv.csv' AS row
WITH toInteger(row[0]) AS postId, row[5] AS postBody, toInteger(row[3]) AS postScore
RETURN count(row);
LOAD CSV WITH HEADERS FROM 'file:///posts_all_csv.csv' AS row FIELDTERMINATOR '\t'
WITH row[0] AS postId, row[3] AS postScore, row[5] AS postBody
MERGE (p:Post {postId: postId})
SET p.postBody = postBody, p.postScore = postScore
RETURN p;
// Deleting all nodes in the database
MATCH (n)
DETACH DELETE n;
// Deleting edges based on their labels
MATCH ()-[r]->()
DELETE r;
// Loading answers
CREATE CONSTRAINT ON (a:Answer) ASSERT a.AnswerId IS UNIQUE;
CREATE CONSTRAINT ON (a:Answer) ASSERT exists(a.AnswerId);
LOAD CSV WITH HEADERS FROM 'file:///answers2_b64.csv' AS row FIELDTERMINATOR '\t'
WITH toInteger(row.Id) AS AnswerId, toInteger(row.ParentId) AS ParentId,
apoc.temporal.format(row.CreationDate, 'dd MMMM yyyy HH:mm:ss') AS CreationDate,
toInteger(row.Score) AS Score, toInteger(row.ViewCount) AS ViewCount,
apoc.text.base64Decode(row.Body) AS Body, toInteger(row.OwnerUserId) AS OwnerUserId,
apoc.temporal.format(row.LastActivityDate, 'dd MMMM yyyy HH:mm:ss') AS LastActivityDate,
toInteger(row.CommentCount) AS CommentCount
MERGE (a:Answer {AnswerId: AnswerId})
SET a.ParentId = ParentId, a.CreationDate = CreationDate, a.Score = Score,
a.ViewCount = ViewCount, a.Body = Body, a.OwnerUserId = OwnerUserId,
a.LastActivityDate = LastActivityDate, a.CommentCount = CommentCount
RETURN a;
// Loading users
CREATE CONSTRAINT ON (u:User) ASSERT u.UserId IS UNIQUE;
CREATE CONSTRAINT ON (u:User) ASSERT exists(u.UserId);
LOAD CSV WITH HEADERS FROM 'file:///users2_b64.csv' AS row FIELDTERMINATOR '\t'
WITH toInteger(row.Id) AS UserId, toInteger(row.Reputation) AS Reputation,
apoc.temporal.format(row.CreationDate, 'dd MMMM yyyy HH:mm:ss') AS CreationDate,
row.DisplayName AS DisplayName,
apoc.temporal.format(row.LastAccessDate, 'dd MMMM yyyy HH:mm:ss') AS LastAccessDate,
apoc.text.base64Decode(coalesce(row.AboutMe, '')) AS AboutMe,
toInteger(row.Views) AS Views, toInteger(row.UpVotes) AS UpVotes,
toInteger(row.DownVotes) AS DownVotes
MERGE (r:User {UserId: UserId})
SET r.Reputation = Reputation, r.CreationDate = CreationDate,
r.DisplayName = DisplayName, r.LastAccessDate = LastAccessDate,
r.AboutMe = AboutMe, r.Views = Views, r.UpVotes = UpVotes,
r.DownVotes = DownVotes
RETURN r;
// Loading questions
CREATE CONSTRAINT ON (q:Question) ASSERT q.QuestionId IS UNIQUE;
CREATE CONSTRAINT ON (q:Question) ASSERT exists(q.QuestionId);
LOAD CSV WITH HEADERS FROM 'file:///questions2_b64.csv' AS row FIELDTERMINATOR '\t'
WITH toInteger(row.Id) AS QuestionId, toInteger(row.AcceptedAnswerId) AS AcceptedAnswerId,
apoc.temporal.format(row.CreationDate, 'dd MMMM yyyy HH:mm:ss') AS CreationDate,
toInteger(row.Score) AS Score, toInteger(row.ViewCount) AS ViewCount,
apoc.text.base64Decode(row.Body) AS Body, toInteger(row.OwnerUserId) AS OwnerUserId,
apoc.temporal.format(row.LastActivityDate, 'dd MMMM yyyy HH:mm:ss') AS LastActivityDate,
apoc.text.base64Decode(row.Title) AS Title, apoc.text.split(row.Tags, ' ') AS Tags,
toInteger(row.AnswerCount) AS AnswerCount, toInteger(row.CommentCount) AS CommentCount,
toInteger(row.FavoriteCount) AS FavoriteCount,
apoc.temporal.format(coalesce(row.ClosedDate, ''), 'dd MMMM yyyy HH:mm:ss') AS ClosedDate
MERGE (q:Question {QuestionId: QuestionId})
SET q.AcceptedAnswerId = AcceptedAnswerId, q.CreationDate = CreationDate,
q.Score = Score, q.ViewCount = ViewCount, q.Body = Body, q.OwnerUserId = OwnerUserId,
q.LastActivityDate = LastActivityDate, q.Title = Title, q.Tags = Tags,
q.AnswerCount = AnswerCount, q.CommentCount = CommentCount, q.FavoriteCount = FavoriteCount,
q.ClosedDate = ClosedDate
RETURN q;
// Loading comments
CREATE CONSTRAINT ON (c:Comment) ASSERT c.CommentId IS UNIQUE;
CREATE CONSTRAINT ON (c:Comment) ASSERT exists(c.CommentId);
LOAD CSV WITH HEADERS FROM 'file:///comments2_b64.csv' AS row FIELDTERMINATOR '\t'
WITH toInteger(row.Id) AS CommentId, toInteger(row.PostId) AS PostId,
apoc.temporal.format(row.CreationDate, 'dd MMMM yyyy HH:mm:ss') AS CreationDate,
toInteger(row.Score) AS Score, apoc.text.base64Decode(row.`Text`) AS Body,
toInteger(row.UserId) AS UserId
MERGE (c:Comment {CommentId: CommentId})
SET c.PostId = PostId, c.CreationDate = CreationDate, c.Score = Score,
c.Body = Body, c.UserId = UserId
RETURN c;
// Loading tags
CREATE CONSTRAINT ON (t:Tag) ASSERT t.TagId IS UNIQUE;
CREATE CONSTRAINT ON (t:Tag) ASSERT exists(t.TagId);
LOAD CSV WITH HEADERS FROM 'file:///tags2_b64.csv' AS row FIELDTERMINATOR '\t'
WITH toInteger(row.Id) AS TagId, toInteger(row.TagCount) AS TagCount,
row.TagName AS TagName
MERGE (t:Tag {TagId: TagId})
SET t.TagName = TagName, t.TagCount = TagCount
RETURN t;
// Loading badges
CREATE CONSTRAINT ON (b:Badge) ASSERT b.BadgeName IS UNIQUE;
CREATE CONSTRAINT ON (b:Badge) ASSERT exists(b.BadgeName);
LOAD CSV WITH HEADERS FROM 'file:///badges2_b64_nodes.csv' AS row FIELDTERMINATOR '\t'
WITH row.Name AS BadgeName, toInteger(row.Class) AS BadgeClass
MERGE (b:Badge {BadgeName: BadgeName})
SET b.BadgeClass = BadgeClass
RETURN b;
// Adding an edge from user to badge
LOAD CSV WITH HEADERS FROM 'file:///badges2_b64_edges.csv' AS row FIELDTERMINATOR '\t'
WITH row.Name AS BadgeName, toInteger(row.UserId) AS UserId,
apoc.temporal.format(row.Date, 'dd MMMM yyyy HH:mm:ss') AS CreationDate
MATCH (u:User), (b:Badge)
WHERE u.UserId = UserId AND b.BadgeName = BadgeName
CREATE (u)-[r:RECEIVES {CreationDate: CreationDate}]->(b)
RETURN r;
// Adding an edge from answer to question
MATCH (a:Answer), (q:Question)
WHERE q.QuestionId = a.ParentId
CREATE (a)-[r:ANSWERS {CreationDate: a.CreationDate}]->(q)
RETURN r;
// Adding an edge between a user and his answer
MATCH (u:User), (a:Answer)
WHERE u.UserId = a.OwnerUserId
CREATE (u)-[r:POSTS {CreationDate: a.CreationDate}]->(a)
RETURN r;
// Adding an edge between a user and his question
MATCH (u:User), (q:Question)
WHERE u.UserId = q.OwnerUserId
CREATE (u)-[r:POSTS {CreationDate: q.CreationDate}]->(q)
RETURN r;
// Adding an edge between a question and its tags
MATCH (q:Question), (t:Tag)
WHERE t.TagName IN q.Tags
CREATE (q)-[r:HAS_TAG]->(t)
RETURN r;
// Adding an edge between an answer and its comment
MATCH (c:Comment), (a:Answer)
WHERE c.PostId = a.AnswerId
CREATE (c)-[r:COMMENTS {CreationDate: c.CreationDate}]->(a)
RETURN r;
// Adding an edge between a question and its comment
MATCH (c:Comment), (q:Question)
WHERE c.PostId = q.QuestionId
CREATE (c)-[r:COMMENTS {CreationDate: c.CreationDate}]->(q)
RETURN r;
// Adding an edge between a user and his comment
MATCH (u:User), (c:Comment)
WHERE c.UserId = u.UserId
CREATE (u)-[r:POSTS {CreationDate: c.CreationDate}]->(c)
RETURN r;
// Adding a link between a user and the answer he accepted
MATCH p = (u:User)-[r1:POSTS]->(q:Question)<-[r2:ANSWERS]-(a
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment