Skip to content

Instantly share code, notes, and snippets.

@MatMoore
Last active January 12, 2018 16:50
Show Gist options
  • Save MatMoore/b478226c010cc0ad7582956e7ddd3591 to your computer and use it in GitHub Desktop.
Save MatMoore/b478226c010cc0ad7582956e7ddd3591 to your computer and use it in GitHub Desktop.
Building a graph of GOV.UK
-- Run in publishing_api_development.
-- These queries need reviewing to filter out links to non-live content.

-- One row per link: the link target, the resolved source content id, the
-- link type/position, and whether the link hangs off a link set
-- ('document' scope) or a specific edition ('edition' scope).
\copy (select l.target_content_id,coalesce(d.content_id, d2.content_id) as source_content_id, l.link_type, l.position, case when l.edition_id is null then 'document' else 'edition' end as link_scope from links l left outer join link_sets ls on l.link_set_id=ls.id left outer join documents d on ls.content_id=d.content_id left outer join editions e on e.id=l.edition_id left outer join documents d2 on d2.id=e.document_id) to 'content-links.csv' with csv header;

-- Live editions only. base_path is selected because the Cypher node-creation
-- step reads row.base_path; without it every node's basePath is null.
-- NOTE(review): assumes editions has a base_path column — confirm against schema.
\copy (select d.content_id,d.locale,e.title,e.base_path,e.description,e.document_type,e.analytics_identifier,e.state from documents d join editions e on e.document_id=d.id and e.content_store='live') to 'content.csv' with csv header;

-- Map each document to the document that owns it.
\copy (select d.content_id, d2.content_id as owning_content_id from documents d join documents d2 on d.owning_document_id=d2.id) to 'owning_documents.csv' with csv header;
// First copy the CSVs into Neo4j's import directory, e.g.
// /Users/matmoore/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-488c5f78-fb53-4b30-bcbb-a5737ee9f59b/installation-3.3.1/import/

// Create one node per row of content.csv (live content items).
// NOTE(review): basePath is read from row.base_path — make sure the
// content.csv export actually includes a base_path column, otherwise
// every node's basePath will be null.
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM "file:///content.csv" AS row
CREATE (:PublishedContentItem {contentId: row.content_id, locale: row.locale, title: row.title, basePath: row.base_path, description: row.description, documentType: row.document_type, analyticsIdentifier: row.analytics_identifier});
// Create indexes on the properties the import and lookup queries match on.
// (contentId in particular is matched on every relationship import below.)
CREATE INDEX ON :PublishedContentItem(contentId);
CREATE INDEX ON :PublishedContentItem(analyticsIdentifier);
CREATE INDEX ON :PublishedContentItem(basePath);
CREATE INDEX ON :PublishedContentItem(documentType);
// Create some relationships from content-links.csv.
// This may be missing link types.

// taxons: a content item tagged to a taxon.
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM "file:///content-links.csv" AS row
WITH row
WHERE row.link_type = "taxons"
MATCH (target:PublishedContentItem {contentId: row.target_content_id})
MATCH (source:PublishedContentItem {contentId: row.source_content_id})
MERGE (source)-[:BELONGS_TO_TAXON]->(target);
// parent_taxons: link each taxon to its parent in the taxonomy.
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM "file:///content-links.csv" AS row
WITH row WHERE row.link_type = "parent_taxons"
MATCH (child:PublishedContentItem {contentId: row.source_content_id}),
      (parent:PublishedContentItem {contentId: row.target_content_id})
MERGE (child)-[:CHILD_TAXON_OF]->(parent);
// root_taxon(s): attach top-level taxons to the root. The original
// checked both spellings, so presumably both occur in the data; IN
// keeps that behaviour without the lowercase `or` the file otherwise avoids.
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM "file:///content-links.csv" AS row
WITH row
WHERE row.link_type IN ["root_taxon", "root_taxons"]
MATCH (target:PublishedContentItem {contentId: row.target_content_id})
MATCH (source:PublishedContentItem {contentId: row.source_content_id})
MERGE (source)-[:CHILD_TAXON_OF]->(target);
// children: note the reversed direction — the linked item (the child)
// points back at the item that declared the link (its parent).
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM "file:///content-links.csv" AS row
WITH row WHERE row.link_type = "children"
MATCH (parent:PublishedContentItem {contentId: row.source_content_id}),
      (child:PublishedContentItem {contentId: row.target_content_id})
MERGE (child)-[:CHILD_OF]->(parent);
// mainstream_browse_pages: tag content onto a mainstream browse page.
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM "file:///content-links.csv" AS row
WITH row WHERE row.link_type = "mainstream_browse_pages"
MATCH (item:PublishedContentItem {contentId: row.source_content_id}),
      (browsePage:PublishedContentItem {contentId: row.target_content_id})
MERGE (item)-[:BELONGS_TO_BROWSE_PAGE]->(browsePage);
// parent: a content item that is part of a parent item.
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM "file:///content-links.csv" AS row
WITH row WHERE row.link_type = "parent"
MATCH (child:PublishedContentItem {contentId: row.source_content_id}),
      (parent:PublishedContentItem {contentId: row.target_content_id})
MERGE (child)-[:PART_OF]->(parent);
// topics: tag content onto a specialist topic.
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM "file:///content-links.csv" AS row
WITH row WHERE row.link_type = "topics"
MATCH (item:PublishedContentItem {contentId: row.source_content_id}),
      (topic:PublishedContentItem {contentId: row.target_content_id})
MERGE (item)-[:BELONGS_TO_SPECIALIST_TOPIC]->(topic);
// policies: tag content onto a policy.
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM "file:///content-links.csv" AS row
WITH row WHERE row.link_type = "policies"
MATCH (item:PublishedContentItem {contentId: row.source_content_id}),
      (policy:PublishedContentItem {contentId: row.target_content_id})
MERGE (item)-[:BELONGS_TO_POLICY]->(policy);
// NOTE(review): removed a query that was a byte-for-byte duplicate of the
// "policies" import above. Because it used MERGE, re-running it created no
// new relationships — it was a pure no-op.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment