Skip to content

Instantly share code, notes, and snippets.

@eastlondoner
Last active November 1, 2018 02:40
Show Gist options
  • Save eastlondoner/4503274eebcbec0d4f94101aad81a954 to your computer and use it in GitHub Desktop.
Save eastlondoner/4503274eebcbec0d4f94101aad81a954 to your computer and use it in GitHub Desktop.
Create word2vec-style (skip-gram) co-occurrence graphs using Cypher
// Build a skip-gram style co-occurrence graph from a plain-text corpus.
//
// Statement 1 (the "iterate" query) streams text fragments from the file.
// The double quote is used as the CSV field terminator so that commas in
// the prose do not split a line; each row is a list of fragments that is
// unwound into individual `text` values.
//
// Statement 2 (the "action" query) runs once per batch of 1000 fragments.
// It normalises each fragment to a list of words and, for every pair of
// words that are 1 or 2 positions apart, MERGEs
//   (:Word {name})-[:NEXT {distance, count}]->(:Word {name})
// incrementing `count` each time the same pair/distance is seen again.
//
// parallel:false is required: concurrent MERGEs on the same Word nodes
// would contend for locks and could create duplicates.
// NOTE(review): an index or uniqueness constraint on :Word(name) is not
// created here; consider adding one before running, since every MERGE
// looks words up by name.
CALL apoc.periodic.iterate('
LOAD CSV FROM "https://raw.githubusercontent.com/wess/iotr/master/lotr.txt" AS row FIELDTERMINATOR \'"\'
WITH row
UNWIND row AS text
RETURN text','
// Start with an explicit WITH so this statement does not depend on the
// exact prefix that APOC prepends when binding the batch columns.
WITH text
WHERE text IS NOT NULL
// Lower-case the fragment and strip punctuation characters only.
WITH reduce(t = toLower(text), delim IN [",",".","!","?",\'"\',":",";","\'","-","#","*","(",")","[","]","/","`"] | replace(t, delim, "")) AS normalized
// Keep words longer than 2 characters and drop stop words. Stop words are
// filtered as whole tokens (not as substrings) so that words which merely
// contain one, such as "there" or "hand", are left intact.
WITH [w IN split(normalized, " ") WHERE size(trim(w)) > 2 AND NOT trim(w) IN ["and","the","not","but","for","with","from"] | trim(w)] AS words
// A fragment needs at least two words to yield a pair.
WHERE size(words) > 1
// skipDistance 1 = adjacent words, 2 = one word in between.
UNWIND range(1, 2) AS skipDistance
// range() is empty when the fragment is too short for this skip distance.
UNWIND range(0, size(words) - 1 - skipDistance) AS idx
MERGE (w1:Word {name: words[idx]})
MERGE (w2:Word {name: words[idx + skipDistance]})
MERGE (w1)-[r:NEXT {distance: skipDistance}]->(w2)
ON CREATE SET r.count = 1
ON MATCH SET r.count = r.count + 1
',{batchSize:1000, iterateList:true, parallel:false});
// For up to 10 focus words (`cat`), compute the empirical distribution of
// the words found at each of the four context positions around it:
//   p1: two positions before   (w1)-[:NEXT {distance:2}]->(cat)
//   p2: one position before    (w2)-[:NEXT {distance:1}]->(cat)
//   p3: one position after     (cat)-[:NEXT {distance:1}]->(w3)
//   p4: two positions after    (cat)-[:NEXT {distance:2}]->(w4)
// Each pN is a list of {name, p} maps sorted by descending probability,
// where p = pair count / total count for that position.
//
// The LIMIT has no ORDER BY, so which 10 words are chosen is arbitrary;
// add an ORDER BY before it if a reproducible selection is needed.
// Plain MATCH is used throughout, so a focus word with no relationship at
// one of the four positions is dropped from the result.

// Pass 1: per-position totals (t1..t4) used as the denominators below.
MATCH (w1:Word)-[r:NEXT {distance: 2}]->(cat:Word)
WITH cat, sum(r.count) AS t1
LIMIT 10
MATCH (w2:Word)-[r:NEXT {distance: 1}]->(cat)
WITH cat, t1, sum(r.count) AS t2
MATCH (cat)-[r:NEXT {distance: 1}]->(w3:Word)
WITH cat, t1, t2, sum(r.count) AS t3
MATCH (cat)-[r:NEXT {distance: 2}]->(w4:Word)
WITH cat, t1, t2, t3, sum(r.count) AS t4

// Pass 2: per-position probabilities. Rows are sorted before collect() so
// each list comes out in descending probability order.
MATCH (w1:Word)-[r:NEXT {distance: 2}]->(cat)
WITH cat, t1, t2, t3, t4, w1, toFloat(r.count) / t1 AS prob
ORDER BY prob DESC
WITH cat, t1, t2, t3, t4, collect({name: w1.name, p: prob}) AS p1

MATCH (w2:Word)-[r:NEXT {distance: 1}]->(cat)
WITH cat, t1, t2, t3, t4, p1, w2, toFloat(r.count) / t2 AS prob
ORDER BY prob DESC
WITH cat, t1, t2, t3, t4, p1, collect({name: w2.name, p: prob}) AS p2

MATCH (cat)-[r:NEXT {distance: 1}]->(w3:Word)
WITH cat, t1, t2, t3, t4, p1, p2, w3, toFloat(r.count) / t3 AS prob
ORDER BY prob DESC
WITH cat, t1, t2, t3, t4, p1, p2, collect({name: w3.name, p: prob}) AS p3

MATCH (cat)-[r:NEXT {distance: 2}]->(w4:Word)
WITH cat, p1, p2, p3, w4, toFloat(r.count) / t4 AS prob
ORDER BY prob DESC
WITH cat, p1, p2, p3, collect({name: w4.name, p: prob}) AS p4

RETURN cat.name, p1, p2, p3, p4
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment