Created
January 31, 2018 14:03
-
-
Save cheerfulstoic/7d64cc4a3d87162c09562ffe283d813c to your computer and use it in GitHub Desktop.
Queries and R code for Paradise Papers presentation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# R setup | |
library(RNeo4j) | |
graph = startGraph("http://localhost:7474/db/data/") | |
# Queries: | |
# What types of objects are we dealing with? | |
MATCH (n) | |
RETURN labels(n), count(*) | |
ORDER BY count(*) DESC | |
// What is related, and how | |
CALL db.schema() | |
// Average connections (degree) to which different types of nodes are connected | |
MATCH (n) WITH labels(n) AS type, SIZE( (n)--() ) AS degree | |
RETURN type, MAX(degree) AS max, ROUND(AVG(degree)) AS avg, ROUND(STDEV(degree)) AS stdev | |
// Let's explore! | |
// Look at names of officers | |
MATCH (o:Officer) | |
RETURN o.name, count(*) | |
ORDER BY count(*) DESC | |
// They're linked together (probably by the person who imported the data into Neo4j) | |
MATCH (o:Officer {name: 'CLEMENTI LIMITED'}) | |
RETURN * | |
// Find what jurisditions the officer is involved in | |
MATCH (o:Officer {name: 'CLEMENTI LIMITED'})-[r:OFFICER_OF]->(entity:Entity) | |
RETURN entity.jurisdiction_description, count(*) ORDER BY count(*) DESC | |
// How active has this officer's entities been | |
MATCH (o:Officer {name: 'CLEMENTI LIMITED'})-[r:OFFICER_OF]->(entity:Entity) | |
RETURN entity.incorporation_date ORDER BY entity.incorporation_date | |
// Let's aggregate the years | |
MATCH (o:Officer {name: 'CLEMENTI LIMITED'})-[r:OFFICER_OF]->(entity:Entity) | |
RETURN split(entity.incorporation_date, '-')[2] AS year, count(*) ORDER BY year | |
// That might be worth charting | |
query = "MATCH (o:Officer {name: 'CLEMENTI LIMITED'})-[r:OFFICER_OF]->(entity:Entity) | |
RETURN split(entity.incorporation_date, '-')[2] AS year, count(*) AS count ORDER BY year" | |
result <- cypher(graph, query) | |
result$year <- as.integer(result$year) | |
years <- seq(min(result$year), max(result$year)) | |
counts <- sapply(years, function(year) { result[result$year == year, "count"][1] }) | |
plot(years, counts) | |
// Big deal, those are basically just joins... | |
// Shortest distance between Rex Tillerson and the Queen | |
MATCH p=shortestPath((rex:Officer)-[*]-(queen:Officer)) | |
WHERE rex.name = "Tillerson - Rex" AND queen.name = "The Duchy of Lancaster" | |
RETURN p | |
// All shortest distances between Rex Tillerson and the Queen | |
MATCH p=allShortestPaths((rex:Officer)-[*]-(queen:Officer)) | |
WHERE rex.name = "Tillerson - Rex" AND queen.name = "The Duchy of Lancaster" | |
RETURN p | |
// Page Rank slide! | |
CALL algo.pageRank.stream(null, null ,{iterations:5, dampingFactor:0.85}) | |
YIELD node, score | |
WHERE node:Entity | |
RETURN node.name AS name, node.jurisdiction_description AS jurisdiction, score | |
ORDER BY score DESC LIMIT 10; | |
// I prepared this query earlier... | |
// Using page rank algorithm to find central nodes | |
CALL algo.pageRank(null,null,{write:true,writeProperty:'pagerank_g'}) | |
// Use the `pagerank_g` property which was set to find top entities by pagerank score | |
MATCH (e:Entity) WHERE exists(e.pagerank_g) | |
RETURN e.name AS entity, e.jurisdiction_description AS jurisdiction, | |
e.pagerank_g AS pagerank ORDER BY pagerank DESC LIMIT 15 | |
// I could just pull a random entity: | |
MATCH (e:Entity) WITH e LIMIT 1 | |
RETURN e LIMIT 1 | |
// But I could use PageRack to jump to tet most connected... | |
MATCH (e:Entity) | |
RETURN e ORDER BY e.pagerank_g DESC LIMIT 1 | |
// Get the top five most connected entities and find the entities that they are connected to | |
// Using `collect` function to return a data structure that is easy to use in your programming language of choice | |
MATCH (e:Entity) | |
WITH e, e.pagerank_g AS pagerank ORDER BY e.pagerank_g DESC LIMIT 5 | |
MATCH (e)--(through)--(other:Entity) | |
WITH e.name AS entity, through.name AS through, other.name AS other, count(*) AS count ORDER BY count DESC | |
RETURN entity, collect({through: through, other: other, count: count}) | |
// Find linked addresses | |
// Take the top most connect address and find all addresses linked through some intermediate node | |
MATCH (a:Address) | |
WITH a ORDER BY a.pagerank_g DESC LIMIT 1 | |
MATCH (a)--(middle)--(other:Address) | |
WHERE (a) <> other | |
WITH a, middle, other ORDER BY middle.name | |
WITH a.address AS address, other.address AS other, labels(middle)[0] AS label, collect(middle.name) AS middle_names, count(*) AS count ORDER BY count(*) DESC | |
RETURN address, collect({label: label, middle_names: middle_names}), count, other ORDER BY count DESC | |
# Query from PowerPoint: | |
MATCH (o:Officer)-[r1:REGISTERED_ADDRESS]->(a:Address)<-[r2:REGISTERED_ADDRESS]-(i:Intermediary)-[r3:CONNECTED_TO|INTERMEDIARY_OF]->(e:Entity)-[:CONNECTED_TO|OFFICER_OF]-(other:Other), | |
(a)-[rother]-(e) | |
RETURN * LIMIT 1 | |
// Let's visualize that | |
query = "MATCH (a:Address) | |
WITH a ORDER BY a.pagerank_g DESC LIMIT 1 | |
MATCH (a)--()--(other:Address) | |
WHERE (a) <> other | |
WITH other.countries AS country, count(*) AS count ORDER BY count DESC | |
WHERE count > 10 | |
RETURN country, count" | |
result <- cypher(graph, query) | |
result$country <- factor(result$country, levels=result[order(result$count), "country"]) | |
library(ggplot2) | |
ggplot(aes(x=count, y=country), data=result) + geom_point() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment