Last active
November 1, 2018 02:40
-
-
Save eastlondoner/4503274eebcbec0d4f94101aad81a954 to your computer and use it in GitHub Desktop.
Create word2vec-style graphs using Cypher
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Build a word co-occurrence graph from a remote text file.
//
// Outer statement: streams the file with LOAD CSV, using the double-quote
// character as the field terminator, and emits every field as `text`.
// Inner statement (run in batches of 1000 by apoc.periodic.iterate):
//   1. lower-cases the text and strips punctuation characters;
//   2. splits on spaces, keeping tokens longer than two characters that are
//      not stop words (stop words are matched as WHOLE tokens, so words that
//      merely contain "and", "the", "for", ... are left intact);
//   3. for skip distances 1 and 2, MERGEs both :Word nodes and a
//      :NEXT {distance} relationship, counting occurrences in r.count.
//
// NOTE(review): the MERGEs look up :Word by name; a uniqueness constraint or
// index on :Word(name) is presumably wanted before running this -- confirm.
CALL apoc.periodic.iterate('
LOAD CSV FROM "https://raw.githubusercontent.com/wess/iotr/master/lotr.txt" AS row FIELDTERMINATOR \'"\'
WITH row
UNWIND row AS text
RETURN text','
// Explicit WITH so the filter does not rely on the clause APOC prepends.
WITH text
WHERE text IS NOT NULL
// Strip punctuation characters from the lower-cased text.
WITH reduce(t = toLower(text), symbol IN [",",".","!","?",\'"\',":",";","\'","-","#","*","(",")","[","]","/","`"] | replace(t, symbol, "")) AS normalized
// Drop short tokens (length <= 2) and stop words, matched as whole tokens.
WITH [w IN split(normalized, " ") WHERE size(trim(w)) > 2 AND NOT trim(w) IN ["and","the","not","but","for","with","from"] | trim(w)] AS words
// At least two tokens are needed to form a pair.
WHERE size(words) > 1
UNWIND range(1, 2) AS skipDistance
// Empty range when the token list is too short for this skip distance.
UNWIND range(0, size(words) - 1 - skipDistance) AS idx
MERGE (w1:Word {name: words[idx]})
MERGE (w2:Word {name: words[idx + skipDistance]})
MERGE (w1)-[r:NEXT {distance: skipDistance}]->(w2)
ON CREATE SET r.count = 1
ON MATCH SET r.count = r.count + 1
',{batchSize:1000, iterateList:true, parallel:false})
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// For ten words (`cat`), return the empirical distribution of the words that
// surround them, as lists of {name, p} maps sorted by descending probability:
//   p1 = words two positions before cat   (incoming :NEXT, distance 2)
//   p2 = words one position before cat    (incoming :NEXT, distance 1)
//   p3 = words one position after cat     (outgoing :NEXT, distance 1)
//   p4 = words two positions after cat    (outgoing :NEXT, distance 2)
// Each p is that relationship's count divided by the total count (t1..t4)
// of all relationships of the same direction and distance at cat.
//
// The ten words are the ones with the highest distance-2 incoming total
// (ties broken by name) so the result is deterministic. Because plain MATCH
// is used, a word missing any of the four contexts is dropped from the output.

// Totals per context; the sample of ten words is fixed after the first one.
MATCH (:Word)-[r:NEXT {distance: 2}]->(cat:Word)
WITH cat, sum(r.count) AS t1
ORDER BY t1 DESC, cat.name
LIMIT 10
MATCH (:Word)-[r:NEXT {distance: 1}]->(cat)
WITH cat, t1, sum(r.count) AS t2
MATCH (cat)-[r:NEXT {distance: 1}]->(:Word)
WITH cat, t1, t2, sum(r.count) AS t3
MATCH (cat)-[r:NEXT {distance: 2}]->(:Word)
WITH cat, t1, t2, t3, sum(r.count) AS t4

// Two positions before cat.
MATCH (w1:Word)-[r:NEXT {distance: 2}]->(cat)
WITH cat, t2, t3, t4, w1, toFloat(r.count) / t1 AS p
ORDER BY p DESC
WITH cat, t2, t3, t4, collect({name: w1.name, p: p}) AS p1

// One position before cat.
MATCH (w2:Word)-[r:NEXT {distance: 1}]->(cat)
WITH cat, t3, t4, p1, w2, toFloat(r.count) / t2 AS p
ORDER BY p DESC
WITH cat, t3, t4, p1, collect({name: w2.name, p: p}) AS p2

// One position after cat.
MATCH (cat)-[r:NEXT {distance: 1}]->(w3:Word)
WITH cat, t4, p1, p2, w3, toFloat(r.count) / t3 AS p
ORDER BY p DESC
WITH cat, t4, p1, p2, collect({name: w3.name, p: p}) AS p3

// Two positions after cat.
MATCH (cat)-[r:NEXT {distance: 2}]->(w4:Word)
WITH cat, p1, p2, p3, w4, toFloat(r.count) / t4 AS p
ORDER BY p DESC
WITH cat, p1, p2, p3, collect({name: w4.name, p: p}) AS p4

RETURN cat.name, p1, p2, p3, p4
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment