Last active
January 17, 2016 19:44
-
-
Save rvanbruggen/371e06726a58843a2938 to your computer and use it in GitHub Desktop.
Creating and Querying the Graphblog-Graph
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//Bruggen blog feed | |
//https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=0 | |
create index on :Blog(name); | |
create constraint on (p:Page) assert p.url is unique; | |
create (b:Blog {name:"Bruggen", url:"http://blog.bruggen.com"}); | |
create (n:Blog {name:"Neo4j Blog", url:"http://neo4j.com/blog"}); | |
create (n:Blog {name:"JEXP Blog", url:"http://jexp.de/blog/"}); | |
create (n:Blog {name:"Armbruster-IT Blog", url:"http://blog.armbruster-it.de/"}); | |
create (n:Blog {name:"Max De Marzi's Blog", url:"http://maxdemarzi.com/"}); | |
create (n:Blog {name:"Will Lyon's Blog", url:"http://lyonwj.com/"}); | |
create (n:Blog {name:"Mark Needham's Blog", url:"http://markhneedham.com/blog"}); | |
//create the Bruggen blog entries | |
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=0" as csv | |
match (b:Blog {name:"Bruggen", url:"http://blog.bruggen.com"}) | |
create (p:Page {url: csv.URL, title: csv.Title, created: csv.Date})-[:PART_OF]->(b); | |
//create the link graph | |
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=0" as csv | |
with csv.URL as URL, csv.Links as row | |
unwind row as listoflinks | |
with URL, [l in split(listoflinks,"****") | trim(l)] as links | |
unwind range(0,size(links)-2) as idx | |
MERGE (l:Page {url:links[idx]}) | |
WITH l, URL | |
MATCH (p:Page {url: URL}) | |
MERGE (p)-[:LINKS_TO]->(l); | |
//remove the redundant links | |
//linking to pages with same url (eg. archive pages, label pages...) | |
match (b:Blog {name:"Bruggen"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page) | |
where p2.url starts with "http://blog.bruggen.com" | |
and not ((b)<-[:PART_OF]-(p2)) | |
detach delete p2; | |
//linking to other posts of the same blog | |
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Bruggen"})<-[:PART_OF]-(p2:Page), | |
(p1)-[lt:LINKS_TO]-(p2) | |
delete lt; | |
//linking to itself | |
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Bruggen"}), | |
(p1)-[lt:LINKS_TO]-(p1) | |
delete lt; | |
//linking to the blog provider (Blogger) | |
match (p:Page) | |
where p.url contains "//www.blogger.com" | |
detach delete p; | |
//Neo4j blog feed | |
//https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=999553051 | |
//create the blog entries | |
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=999553051" as csv | |
merge (p:Page {url: csv.URL}) | |
on match set p.title=csv.Title, p.created=csv.Date | |
on create set p.title=csv.Title, p.created=csv.Date | |
with p | |
match (b:Blog {name:"Neo4j Blog", url:"http://neo4j.com/blog"}) | |
create (p)-[:PART_OF]->(b); | |
//create the link graph | |
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=999553051" as csv | |
with csv.URL as URL, csv.Links as row | |
unwind row as linklist | |
with URL, [l in split(linklist,"****") | trim(l)] as links | |
unwind range(0,size(links)-2) as idx | |
MERGE (l:Page {url:links[idx]}) | |
WITH l, URL | |
MATCH (p:Page {url: URL}) | |
CREATE (p)-[:LINKS_TO]->(l); | |
//remove the redundant links | |
match (b:Blog {name:"Neo4j Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page) | |
where p2.url starts with "http://neo4j.com/blog" | |
and not ((b)<-[:PART_OF]-(p2)) | |
detach delete p2; | |
match (b:Blog {name:"Neo4j Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page) | |
where p2.url contains "//neo4j.com/" | |
and not ((b)<-[:PART_OF]-(p2)) | |
detach delete p2; | |
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Neo4j Blog"})<-[:PART_OF]-(p2:Page), | |
(p1)-[lt:LINKS_TO]-(p2) | |
delete lt; | |
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Neo4j Blog"}), | |
(p1)-[lt:LINKS_TO]-(p1) | |
delete lt; | |
//JEXP blog feed | |
//https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=2066848510 | |
//create the blog entries | |
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=2066848510" as csv | |
merge (p:Page {url: csv.URL}) | |
on match set p.title=csv.Title, p.created=csv.Date | |
on create set p.title=csv.Title, p.created=csv.Date | |
with p | |
match (b:Blog {name:"JEXP Blog", url:"http://jexp.de/blog/"}) | |
create (p)-[:PART_OF]->(b); | |
//create the link graph | |
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=2066848510" as csv | |
with csv.URL as URL, csv.Links as row | |
unwind row as linklist | |
with URL, [l in split(linklist,"****") | trim(l)] as links | |
unwind range(0,size(links)-2) as idx | |
MERGE (l:Page {url:links[idx]}) | |
WITH l, URL | |
MATCH (p:Page {url: URL}) | |
CREATE (p)-[:LINKS_TO]->(l); | |
//remove the redundant links | |
match (b:Blog {name:"JEXP Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page) | |
where p2.url starts with "http://jexp.de/blog" | |
and not ((b)<-[:PART_OF]-(p2)) | |
detach delete p2; | |
match (b:Blog {name:"JEXP Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page) | |
where p2.url contains "//jexp.de/" | |
and not ((b)<-[:PART_OF]-(p2)) | |
detach delete p2; | |
match (p1:Page)-[:PART_OF]->(b:Blog {name:"JEXP Blog"})<-[:PART_OF]-(p2:Page), | |
(p1)-[lt:LINKS_TO]-(p2) | |
delete lt; | |
match (p1:Page)-[:PART_OF]->(b:Blog {name:"JEXP Blog"}), | |
(p1)-[lt:LINKS_TO]-(p1) | |
delete lt; | |
//Stefan Armbruster-IT https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1143698530 | |
//create the blog entries | |
load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1143698530" as csv | |
merge (p:Page {url: csv.URL}) | |
on match set p.title=csv.Title, p.created=csv.Date | |
on create set p.title=csv.Title, p.created=csv.Date | |
with p | |
match (b:Blog {name:"Armbruster-IT Blog", url:"http://blog.armbruster-it.de/"}) | |
create (p)-[:PART_OF]->(b); | |
//create the link graph | |
load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1143698530" as csv | |
with csv.URL as URL, csv.Links as row | |
unwind row as linklist | |
with URL, [l in split(linklist,"****") | trim(l)] as links | |
unwind range(0,size(links)-2) as idx | |
MERGE (l:Page {url:links[idx]}) | |
WITH l, URL | |
MATCH (p:Page {url: URL}) | |
CREATE (p)-[:LINKS_TO]->(l); | |
//remove the redundant links | |
match (b:Blog {name:"Armbruster-IT Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page) | |
where p2.url starts with "http://blog.armbruster-it.de" | |
and not ((b)<-[:PART_OF]-(p2)) | |
detach delete p2; | |
match (b:Blog {name:"Armbruster-IT Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page) | |
where p2.url contains "//blog.armbruster-it.de/" | |
and not ((b)<-[:PART_OF]-(p2)) | |
detach delete p2; | |
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Armbruster-IT Blog"})<-[:PART_OF]-(p2:Page), | |
(p1)-[lt:LINKS_TO]-(p2) | |
delete lt; | |
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Armbruster-IT Blog"}), | |
(p1)-[lt:LINKS_TO]-(p1) | |
delete lt; | |
//Max De Marzi | |
https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1473003649 | |
//create the blog entries | |
load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1473003649" as csv | |
merge (p:Page {url: csv.URL}) | |
on match set p.title=csv.Title, p.created=csv.Date | |
on create set p.title=csv.Title, p.created=csv.Date | |
with p | |
match (b:Blog {name:"Max De Marzi's Blog", url:"http://maxdemarzi.com/"}) | |
create (p)-[:PART_OF]->(b); | |
//create the link graph | |
load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1473003649" as csv | |
with csv.URL as URL, csv.Links as row | |
unwind row as linklist | |
with URL, [l in split(linklist,"****") | trim(l)] as links | |
unwind range(0,size(links)-2) as idx | |
MERGE (l:Page {url:links[idx]}) | |
WITH l, URL | |
MATCH (p:Page {url: URL}) | |
CREATE (p)-[:LINKS_TO]->(l); | |
// //remove the redundant links | |
// match (b:Blog {name:"Max De Marzi's Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page) | |
// where p2.url starts with "http://maxdemarzi.com" | |
// and not ((b)<-[:PART_OF]-(p2)) | |
// detach delete p2; | |
match (b:Blog {name:"Max De Marzi's Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page) | |
where p2.url contains "//maxdemarzi.com/" | |
and not ((b)<-[:PART_OF]-(p2)) | |
detach delete p2; | |
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Max De Marzi's Blog"})<-[:PART_OF]-(p2:Page), | |
(p1)-[lt:LINKS_TO]-(p2) | |
delete lt; | |
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Max De Marzi's Blog"}), | |
(p1)-[lt:LINKS_TO]-(p1) | |
delete lt; | |
//Will Lyon | |
https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1326206509 | |
//create the blog entries | |
load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1326206509" as csv | |
merge (p:Page {url: csv.URL}) | |
on match set p.title=csv.Title, p.created="Unknown" | |
on create set p.title=csv.Title, p.created="Unknown" | |
with p | |
match (b:Blog {name:"Will Lyon's Blog", url:"http://lyonwj.com/"}) | |
create (p)-[:PART_OF]->(b); | |
//create the link graph | |
load csv with headers from "https://docs.google.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1326206509" as csv | |
with csv.URL as URL, csv.Links as row | |
unwind row as linklist | |
with URL, [l in split(linklist,"****") | trim(l)] as links | |
unwind range(0,size(links)-2) as idx | |
MERGE (l:Page {url:links[idx]}) | |
WITH l, URL | |
MATCH (p:Page {url: URL}) | |
CREATE (p)-[:LINKS_TO]->(l); | |
// //remove the redundant links | |
match (b:Blog {name:"Will Lyon's Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page) | |
where p2.url contains "//lyonwj.com/" | |
and not ((b)<-[:PART_OF]-(p2)) | |
detach delete p2; | |
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Will Lyon's Blog"})<-[:PART_OF]-(p2:Page), | |
(p1)-[lt:LINKS_TO]-(p2) | |
delete lt; | |
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Will Lyon's Blog"}), | |
(p1)-[lt:LINKS_TO]-(p1) | |
delete lt; | |
//Mark Needham | |
https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1346442478 | |
//create the blog entries | |
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1346442478" as csv | |
merge (p:Page {url: csv.URL}) | |
on match set p.title=csv.Title, p.created="Unknown" | |
on create set p.title=csv.Title, p.created="Unknown" | |
with p | |
match (b:Blog {name:"Mark Needham's Blog"}) | |
create (p)-[:PART_OF]->(b); | |
//create the link graph | |
load csv with headers from "https://docs.google.com/a/neotechnology.com/spreadsheets/d/1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w/export?format=csv&id=1LAQarqQ-id74-zxV6R4SdG7mCq_24xACXO5WNOP-2_w&gid=1346442478" as csv | |
with csv.URL as URL, csv.Links as row | |
unwind row as linklist | |
with URL, [l in split(linklist,"****") | trim(l)] as links | |
unwind range(0,size(links)-2) as idx | |
MERGE (l:Page {url:links[idx]}) | |
WITH l, URL | |
MATCH (p:Page {url: URL}) | |
CREATE (p)-[:LINKS_TO]->(l); | |
// //remove the redundant links | |
match (b:Blog {name:"Mark Needham's Blog"})<-[:PART_OF]-(p1:Page)-[:LINKS_TO]->(p2:Page) | |
where p2.url contains "//www.markhneedham.com/" | |
and not ((b)<-[:PART_OF]-(p2)) | |
detach delete p2; | |
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Mark Needham's Blog"})<-[:PART_OF]-(p2:Page), | |
(p1)-[lt:LINKS_TO]-(p2) | |
delete lt; | |
match (p1:Page)-[:PART_OF]->(b:Blog {name:"Mark Needham's Blog"}), | |
(p1)-[lt:LINKS_TO]-(p1) | |
delete lt; | |
//clear out the relative and incomplete URLs | |
match (p:Page) | |
where p.url starts with "/" | |
detach delete p; | |
match (p:Page) | |
where p.url starts with "#" | |
detach delete p; | |
match (p:Page) | |
where p.url starts with "javascript" | |
detach delete p; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//Query the Bloggraph | |
//Which pages have the most links | |
match (b:Blog)--(p:Page)-[r:LINKS_TO]->(p2:Page) | |
return b.name, p.title, count(r) | |
order by count(r) desc | |
//Which pages are being linked to most | |
match ()-[r:LINKS_TO]->(p:Page) | |
return p.url, count(r) | |
order by count(r) DESC | |
limit 10; | |
//links between blogposts | |
MATCH p=((n1:Blog)--(p1:Page)-[:LINKS_TO]-(p2:Page)--(b2:Blog)) | |
RETURN p | |
//show all the paths between blogs | |
match (b1:Blog {name:"Bruggen"}), (b2:Blog {name:"Neo4j Blog"}), | |
p1 = allshortestpaths((b1)-[*]-(b2)) | |
return p1 as paths | |
union | |
match (b1:Blog {name:"Bruggen"}),(b3:Blog {name:"JEXP Blog"}), | |
p2 = allshortestpaths((b1)-[*]-(b3)) | |
return p2 as paths | |
// union | |
// match (b2:Blog {name:"Neo4j Blog"}),(b3:Blog {name:"JEXP Blog"}), | |
// p3 = allshortestpaths((b2)-[*]-(b3)) | |
// return p3 as paths; | |
//Pagerank ftw | |
//1. calculate the PageRank | |
UNWIND range(1,50) AS round | |
MATCH (n:Page) | |
WHERE rand() < 0.1 | |
MATCH (n:Page)-[:LINKS_TO*..10]->(m:Page) | |
SET m.rank = coalesce(m.rank,0) + 1 | |
//2. Show the PageRank | |
match (n:Page) | |
where n.rank is not null | |
return n.url, n.rank | |
order by n.rank desc | |
limit 10; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment