Last active
September 2, 2024 16:01
-
-
Save habedi/da7a2d88b095870b2cc0e92555c8bc61 to your computer and use it in GitHub Desktop.
A script with the commands to load CSV files into a Neo4j graph database. The data is from StackExchange data dump (https://archive.org/details/stackexchange) #cypher #neo4j #stackoverflow_data #csv #graphdb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Loading the posts | |
LOAD CSV WITH HEADERS FROM 'file:///posts_all_csv.csv' AS row | |
WITH toInteger(row[0]) AS postId, row[5] AS postBody, toInteger(row[3]) AS postScore | |
RETURN count(row); | |
LOAD CSV WITH HEADERS FROM 'file:///posts_all_csv.csv' AS row FIELDTERMINATOR '\t' | |
WITH row[0] AS postId, row[3] AS postScore, row[5] AS postBody | |
MERGE (p:Post {postId: postId}) | |
SET p.postBody = postBody, p.postScore = postScore | |
RETURN p; | |
// Deleting all nodes in the database | |
MATCH (n) | |
DETACH DELETE n; | |
// Deleting edges based on their labels | |
MATCH ()-[r]->() | |
DELETE r; | |
// Loading answers | |
CREATE CONSTRAINT ON (a:Answer) ASSERT a.AnswerId IS UNIQUE; | |
CREATE CONSTRAINT ON (a:Answer) ASSERT exists(a.AnswerId); | |
LOAD CSV WITH HEADERS FROM 'file:///answers2_b64.csv' AS row FIELDTERMINATOR '\t' | |
WITH toInteger(row.Id) AS AnswerId, toInteger(row.ParentId) AS ParentId, | |
apoc.temporal.format(row.CreationDate, 'dd MMMM yyyy HH:mm:ss') AS CreationDate, | |
toInteger(row.Score) AS Score, toInteger(row.ViewCount) AS ViewCount, | |
apoc.text.base64Decode(row.Body) AS Body, toInteger(row.OwnerUserId) AS OwnerUserId, | |
apoc.temporal.format(row.LastActivityDate, 'dd MMMM yyyy HH:mm:ss') AS LastActivityDate, | |
toInteger(row.CommentCount) AS CommentCount | |
MERGE (a:Answer {AnswerId: AnswerId}) | |
SET a.ParentId = ParentId, a.CreationDate = CreationDate, a.Score = Score, | |
a.ViewCount = ViewCount, a.Body = Body, a.OwnerUserId = OwnerUserId, | |
a.LastActivityDate = LastActivityDate, a.CommentCount = CommentCount | |
RETURN a; | |
// Loading users | |
CREATE CONSTRAINT ON (u:User) ASSERT u.UserId IS UNIQUE; | |
CREATE CONSTRAINT ON (u:User) ASSERT exists(u.UserId); | |
LOAD CSV WITH HEADERS FROM 'file:///users2_b64.csv' AS row FIELDTERMINATOR '\t' | |
WITH toInteger(row.Id) AS UserId, toInteger(row.Reputation) AS Reputation, | |
apoc.temporal.format(row.CreationDate, 'dd MMMM yyyy HH:mm:ss') AS CreationDate, | |
row.DisplayName AS DisplayName, | |
apoc.temporal.format(row.LastAccessDate, 'dd MMMM yyyy HH:mm:ss') AS LastAccessDate, | |
apoc.text.base64Decode(coalesce(row.AboutMe, '')) AS AboutMe, | |
toInteger(row.Views) AS Views, toInteger(row.UpVotes) AS UpVotes, | |
toInteger(row.DownVotes) AS DownVotes | |
MERGE (r:User {UserId: UserId}) | |
SET r.Reputation = Reputation, r.CreationDate = CreationDate, | |
r.DisplayName = DisplayName, r.LastAccessDate = LastAccessDate, | |
r.AboutMe = AboutMe, r.Views = Views, r.UpVotes = UpVotes, | |
r.DownVotes = DownVotes | |
RETURN r; | |
// Loading questions | |
CREATE CONSTRAINT ON (q:Question) ASSERT q.QuestionId IS UNIQUE; | |
CREATE CONSTRAINT ON (q:Question) ASSERT exists(q.QuestionId); | |
LOAD CSV WITH HEADERS FROM 'file:///questions2_b64.csv' AS row FIELDTERMINATOR '\t' | |
WITH toInteger(row.Id) AS QuestionId, toInteger(row.AcceptedAnswerId) AS AcceptedAnswerId, | |
apoc.temporal.format(row.CreationDate, 'dd MMMM yyyy HH:mm:ss') AS CreationDate, | |
toInteger(row.Score) AS Score, toInteger(row.ViewCount) AS ViewCount, | |
apoc.text.base64Decode(row.Body) AS Body, toInteger(row.OwnerUserId) AS OwnerUserId, | |
apoc.temporal.format(row.LastActivityDate, 'dd MMMM yyyy HH:mm:ss') AS LastActivityDate, | |
apoc.text.base64Decode(row.Title) AS Title, apoc.text.split(row.Tags, ' ') AS Tags, | |
toInteger(row.AnswerCount) AS AnswerCount, toInteger(row.CommentCount) AS CommentCount, | |
toInteger(row.FavoriteCount) AS FavoriteCount, | |
apoc.temporal.format(coalesce(row.ClosedDate, ''), 'dd MMMM yyyy HH:mm:ss') AS ClosedDate | |
MERGE (q:Question {QuestionId: QuestionId}) | |
SET q.AcceptedAnswerId = AcceptedAnswerId, q.CreationDate = CreationDate, | |
q.Score = Score, q.ViewCount = ViewCount, q.Body = Body, q.OwnerUserId = OwnerUserId, | |
q.LastActivityDate = LastActivityDate, q.Title = Title, q.Tags = Tags, | |
q.AnswerCount = AnswerCount, q.CommentCount = CommentCount, q.FavoriteCount = FavoriteCount, | |
q.ClosedDate = ClosedDate | |
RETURN q; | |
// Loading comments | |
CREATE CONSTRAINT ON (c:Comment) ASSERT c.CommentId IS UNIQUE; | |
CREATE CONSTRAINT ON (c:Comment) ASSERT exists(c.CommentId); | |
LOAD CSV WITH HEADERS FROM 'file:///comments2_b64.csv' AS row FIELDTERMINATOR '\t' | |
WITH toInteger(row.Id) AS CommentId, toInteger(row.PostId) AS PostId, | |
apoc.temporal.format(row.CreationDate, 'dd MMMM yyyy HH:mm:ss') AS CreationDate, | |
toInteger(row.Score) AS Score, apoc.text.base64Decode(row.`Text`) AS Body, | |
toInteger(row.UserId) AS UserId | |
MERGE (c:Comment {CommentId: CommentId}) | |
SET c.PostId = PostId, c.CreationDate = CreationDate, c.Score = Score, | |
c.Body = Body, c.UserId = UserId | |
RETURN c; | |
// Loading tags | |
CREATE CONSTRAINT ON (t:Tag) ASSERT t.TagId IS UNIQUE; | |
CREATE CONSTRAINT ON (t:Tag) ASSERT exists(t.TagId); | |
LOAD CSV WITH HEADERS FROM 'file:///tags2_b64.csv' AS row FIELDTERMINATOR '\t' | |
WITH toInteger(row.Id) AS TagId, toInteger(row.TagCount) AS TagCount, | |
row.TagName AS TagName | |
MERGE (t:Tag {TagId: TagId}) | |
SET t.TagName = TagName, t.TagCount = TagCount | |
RETURN t; | |
// Loading badges | |
CREATE CONSTRAINT ON (b:Badge) ASSERT b.BadgeName IS UNIQUE; | |
CREATE CONSTRAINT ON (b:Badge) ASSERT exists(b.BadgeName); | |
LOAD CSV WITH HEADERS FROM 'file:///badges2_b64_nodes.csv' AS row FIELDTERMINATOR '\t' | |
WITH row.Name AS BadgeName, toInteger(row.Class) AS BadgeClass | |
MERGE (b:Badge {BadgeName: BadgeName}) | |
SET b.BadgeClass = BadgeClass | |
RETURN b; | |
// Adding an edge from user to badge | |
LOAD CSV WITH HEADERS FROM 'file:///badges2_b64_edges.csv' AS row FIELDTERMINATOR '\t' | |
WITH row.Name AS BadgeName, toInteger(row.UserId) AS UserId, | |
apoc.temporal.format(row.Date, 'dd MMMM yyyy HH:mm:ss') AS CreationDate | |
MATCH (u:User), (b:Badge) | |
WHERE u.UserId = UserId AND b.BadgeName = BadgeName | |
CREATE (u)-[r:RECEIVES {CreationDate: CreationDate}]->(b) | |
RETURN r; | |
// Adding an edge from answer to question | |
MATCH (a:Answer), (q:Question) | |
WHERE q.QuestionId = a.ParentId | |
CREATE (a)-[r:ANSWERS {CreationDate: a.CreationDate}]->(q) | |
RETURN r; | |
// Adding an edge between a user and his answer | |
MATCH (u:User), (a:Answer) | |
WHERE u.UserId = a.OwnerUserId | |
CREATE (u)-[r:POSTS {CreationDate: a.CreationDate}]->(a) | |
RETURN r; | |
// Adding an edge between a user and his question | |
MATCH (u:User), (q:Question) | |
WHERE u.UserId = q.OwnerUserId | |
CREATE (u)-[r:POSTS {CreationDate: q.CreationDate}]->(q) | |
RETURN r; | |
// Adding an edge between a question and its tags | |
MATCH (q:Question), (t:Tag) | |
WHERE t.TagName IN q.Tags | |
CREATE (q)-[r:HAS_TAG]->(t) | |
RETURN r; | |
// Adding an edge between an answer and its comment | |
MATCH (c:Comment), (a:Answer) | |
WHERE c.PostId = a.AnswerId | |
CREATE (c)-[r:COMMENTS {CreationDate: c.CreationDate}]->(a) | |
RETURN r; | |
// Adding an edge between a question and its comment | |
MATCH (c:Comment), (q:Question) | |
WHERE c.PostId = q.QuestionId | |
CREATE (c)-[r:COMMENTS {CreationDate: c.CreationDate}]->(q) | |
RETURN r; | |
// Adding an edge between a user and his comment | |
MATCH (u:User), (c:Comment) | |
WHERE c.UserId = u.UserId | |
CREATE (u)-[r:POSTS {CreationDate: c.CreationDate}]->(c) | |
RETURN r; | |
// Adding a link between a user and the answer he accepted | |
MATCH p = (u:User)-[r1:POSTS]->(q:Question)<-[r2:ANSWERS]-(a |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment