eastlondoner · November 1, 2018 02:40
diff --git a/create.txt b/create.txt
 // Builds a word co-occurrence graph from the remote text file.
 // First statement: streams every field of every CSV row as `text`
 // (the double-quote character is used as the field terminator).
 // Second statement (batches of 1000, run sequentially): lower-cases the text,
 // strips punctuation, splits on spaces, drops stop words and words of
 // 2 characters or fewer, then MERGEs (:Word)-[:NEXT {distance}]->(:Word)
 // for distances 1 and 2, counting repeated pairs in r.count.
 // NOTE(review): MERGE on :Word(name) will be slow without an index or
 // uniqueness constraint on that property; none is visible here.
 call apoc.periodic.iterate('
 load csv from "https://raw.githubusercontent.com/wess/iotr/master/lotr.txt" as row fieldterminator \'"\'
 with row
 unwind row as text
 RETURN text','
 // NOTE(review): this leading WHERE relies on apoc.periodic.iterate prepending
 // a WITH clause to the action statement; confirm for the APOC version in use.
 WHERE text is not NULL
 // Strip punctuation characters only. Stop words are filtered per token below,
 // so they are no longer cut out of the middle of longer words.
 with reduce(t=tolower(text), delim in [",",".","!","?",\'"\',":",";","\'","-","#","*","(",")","[","]","/","`"] | replace(t,delim,"")) as normalized
 // Keep trimmed words longer than 2 characters that are not stop words
 with [w in split(normalized," ") WHERE w IS NOT NULL and size(trim(w)) > 2 and not trim(w) in ["and","the","not","but","for","with","from"] | trim(w)] as words
 where words is not null and size(words) > 1
 // distance 1 = adjacent word, distance 2 = one word skipped
 unwind range(1,2) as skipDistance
 // range is empty when the list is too short for this distance
 unwind range(0,size(words)-1-skipDistance) as idx
 MERGE (w1:Word {name:words[idx]})
 MERGE (w2:Word {name:words[idx+skipDistance]})
 MERGE (w1)-[r:NEXT {distance:skipDistance}]->(w2)
 ON CREATE SET r.count = 1
 ON MATCH SET r.count = r.count + 1
 ',{batchSize:1000, iterateList:true, parallel:false})
diff --git a/probabilities.txt b/probabilities.txt
 // For the 10 words with the highest total incoming distance-2 count, returns
 // four lists of neighbouring words with their relative frequencies:
 //   p1: words two positions before, p2: words one position before,
 //   p3: words one position after,   p4: words two positions after.
 // Each list is sorted by descending probability. Plain MATCH is used
 // throughout, so a word lacking any of the four neighbour kinds is dropped.

 // Totals per neighbour kind (t1..t4), used as the denominators below.
 match (w1:Word)-[r:NEXT {distance:2}]->(cat:Word)
 with cat, sum(r.count) as t1
 // Explicit ordering makes the LIMIT deterministic (name breaks ties).
 ORDER BY t1 DESC, cat.name
 LIMIT 10
 match (w2:Word)-[r:NEXT {distance:1}]->(cat)
 with cat, t1, sum(r.count) as t2
 match (cat)-[r:NEXT {distance:1}]->(w3:Word)
 with cat, t1, t2, sum(r.count) as t3
 match (cat)-[r:NEXT {distance:2}]->(w4:Word)
 with cat, t1, t2, t3, sum(r.count) as t4
 // p1: predecessors at distance 2
 match (w1:Word)-[r:NEXT {distance:2}]->(cat)
 WITH *, toFloat(r.count)/t1 as p1
 ORDER BY p1 DESCENDING
 WITH cat, t1, t2, t3, t4, collect({name: w1.name, p: p1}) as p1
 // p2: predecessors at distance 1
 match (w2:Word)-[r:NEXT {distance:1}]->(cat)
 WITH *, toFloat(r.count)/t2 as p2
 ORDER BY p2 DESCENDING
 WITH cat, t1, t2, t3, t4, p1, collect({name: w2.name, p: p2}) as p2
 // p3: successors at distance 1
 match (cat)-[r:NEXT {distance:1}]->(w3:Word)
 WITH *, toFloat(r.count)/t3 as p3
 ORDER BY p3 DESCENDING
 WITH cat, t1, t2, t3, t4, p1, p2, collect({name: w3.name, p: p3}) as p3
 // p4: successors at distance 2
 match (cat)-[r:NEXT {distance:2}]->(w4:Word)
 WITH *, toFloat(r.count)/t4 as p4
 ORDER BY p4 DESCENDING
 WITH cat, t1, t2, t3, t4, p1, p2, p3, collect({name: w4.name, p: p4}) as p4
 return cat.name, p1, p2, p3, p4
	// Builds a word co-occurrence graph from the remote text file.
	// First statement: streams every field of every CSV row as `text`
	// (the double-quote character is used as the field terminator).
	// Second statement (batches of 1000, run sequentially): lower-cases the text,
	// strips punctuation, splits on spaces, drops stop words and words of
	// 2 characters or fewer, then MERGEs (:Word)-[:NEXT {distance}]->(:Word)
	// for distances 1 and 2, counting repeated pairs in r.count.
	// NOTE(review): MERGE on :Word(name) will be slow without an index or
	// uniqueness constraint on that property; none is visible here.
	call apoc.periodic.iterate('
	load csv from "https://raw.githubusercontent.com/wess/iotr/master/lotr.txt" as row fieldterminator \'"\'
	with row
	unwind row as text
	RETURN text','
	// NOTE(review): this leading WHERE relies on apoc.periodic.iterate prepending
	// a WITH clause to the action statement; confirm for the APOC version in use.
	WHERE text is not NULL
	// Strip punctuation characters only. Stop words are filtered per token below,
	// so they are no longer cut out of the middle of longer words.
	with reduce(t=tolower(text), delim in [",",".","!","?",\'"\',":",";","\'","-","#","*","(",")","[","]","/","`"] | replace(t,delim,"")) as normalized
	// Keep trimmed words longer than 2 characters that are not stop words
	with [w in split(normalized," ") WHERE w IS NOT NULL and size(trim(w)) > 2 and not trim(w) in ["and","the","not","but","for","with","from"] | trim(w)] as words
	where words is not null and size(words) > 1
	// distance 1 = adjacent word, distance 2 = one word skipped
	unwind range(1,2) as skipDistance
	// range is empty when the list is too short for this distance
	unwind range(0,size(words)-1-skipDistance) as idx
	MERGE (w1:Word {name:words[idx]})
	MERGE (w2:Word {name:words[idx+skipDistance]})
	MERGE (w1)-[r:NEXT {distance:skipDistance}]->(w2)
	ON CREATE SET r.count = 1
	ON MATCH SET r.count = r.count + 1
	',{batchSize:1000, iterateList:true, parallel:false})
	// For the 10 words with the highest total incoming distance-2 count, returns
	// four lists of neighbouring words with their relative frequencies:
	//   p1: words two positions before, p2: words one position before,
	//   p3: words one position after,   p4: words two positions after.
	// Each list is sorted by descending probability. Plain MATCH is used
	// throughout, so a word lacking any of the four neighbour kinds is dropped.

	// Totals per neighbour kind (t1..t4), used as the denominators below.
	match (w1:Word)-[r:NEXT {distance:2}]->(cat:Word)
	with cat, sum(r.count) as t1
	// Explicit ordering makes the LIMIT deterministic (name breaks ties).
	ORDER BY t1 DESC, cat.name
	LIMIT 10
	match (w2:Word)-[r:NEXT {distance:1}]->(cat)
	with cat, t1, sum(r.count) as t2
	match (cat)-[r:NEXT {distance:1}]->(w3:Word)
	with cat, t1, t2, sum(r.count) as t3
	match (cat)-[r:NEXT {distance:2}]->(w4:Word)
	with cat, t1, t2, t3, sum(r.count) as t4
	// p1: predecessors at distance 2
	match (w1:Word)-[r:NEXT {distance:2}]->(cat)
	WITH *, toFloat(r.count)/t1 as p1
	ORDER BY p1 DESCENDING
	WITH cat, t1, t2, t3, t4, collect({name: w1.name, p: p1}) as p1
	// p2: predecessors at distance 1
	match (w2:Word)-[r:NEXT {distance:1}]->(cat)
	WITH *, toFloat(r.count)/t2 as p2
	ORDER BY p2 DESCENDING
	WITH cat, t1, t2, t3, t4, p1, collect({name: w2.name, p: p2}) as p2
	// p3: successors at distance 1
	match (cat)-[r:NEXT {distance:1}]->(w3:Word)
	WITH *, toFloat(r.count)/t3 as p3
	ORDER BY p3 DESCENDING
	WITH cat, t1, t2, t3, t4, p1, p2, collect({name: w3.name, p: p3}) as p3
	// p4: successors at distance 2
	match (cat)-[r:NEXT {distance:2}]->(w4:Word)
	WITH *, toFloat(r.count)/t4 as p4
	ORDER BY p4 DESCENDING
	WITH cat, t1, t2, t3, t4, p1, p2, p3, collect({name: w4.name, p: p4}) as p4
	return cat.name, p1, p2, p3, p4