Created
September 1, 2023 08:43
-
-
Save jexp/ec7449889054280d6cb9a5a9c41f4a73 to your computer and use it in GitHub Desktop.
Neo4j Cypher Script to import slideshare data, create embeddings, a vector index, similarity search and extract authors and keywords
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Google Cloud project id; list the available ones with: gcloud projects list
:param projectId => 'xxxx'
// Access token passed to the apoc.ml.vertexai.* calls; print one with: gcloud auth print-access-token
:param apiKey => 'xxx'
// Peek at the first five slideshow records of the JSON export.
CALL apoc.load.json("https://data.neo4j.com/slideshare-neo4j.json") YIELD value
UNWIND value.User.Slideshow AS slide
RETURN slide
LIMIT 5;

// Count the slideshow records contained in the export.
CALL apoc.load.json("https://data.neo4j.com/slideshare-neo4j.json") YIELD value
UNWIND value.User.Slideshow AS slide
RETURN count(*);
// Import every slideshow of the JSON export as a Content node keyed by the slide ID.
CALL apoc.load.json("https://data.neo4j.com/slideshare-neo4j.json") YIELD value
UNWIND value.User.Slideshow AS slide
// MERGE instead of CREATE: re-running the import refreshes the existing node
// rather than adding a second Content node with the same id.
MERGE (s:Content {id: slide.ID})
SET s += {
  description: slide.Description,
  title:       slide.Title,
  url:         slide.URL,
  format:      slide.Format,
  language:    slide.Language,
  thumbnail:   slide.ThumbnailURL,
  // Created / Updated arrive as strings and are converted to temporal values.
  created:     apoc.temporal.toZonedTemporal(slide.Created),
  updated:     apoc.temporal.toZonedTemporal(slide.Updated)
};
// Smoke test: request an embedding for one sample sentence.
CALL apoc.ml.vertexai.embedding(['This is a test'], $apiKey, $projectId);

:auto
// Trial run: embed title + description for at most 10 Content nodes that have no embedding yet.
MATCH (doc:Content)
WHERE doc.embedding IS NULL
WITH *
LIMIT 10
CALL {
  WITH doc
  CALL apoc.ml.vertexai.embedding(
    [coalesce(doc.title, '') + " " + coalesce(doc.description, '')],
    $apiKey, $projectId
  ) YIELD embedding
  SET doc.embedding = embedding
} IN TRANSACTIONS OF 10 ROWS;
// Vector index "content" over the "embedding" property of Content nodes:
// 768-wide vectors compared with the cosine similarity function.
CALL db.index.vector.createNodeIndex(
  "content",    // index name
  "Content",    // node label
  "embedding",  // property key
  768,          // vector width
  "cosine"      // similarity function
);
// Index search for the top 5 similar vectors, enriched with graph matching.
WITH "decks about knowledge graphs and generative AI" AS question
// Generate the question's embedding through the API. Uses the $apiKey / $projectId
// parameters this script defines (the former $apiToken / $project were never set).
CALL apoc.ml.vertexai.embedding([question], $apiKey, $projectId) YIELD embedding
// Look up the 5 nearest Content nodes in the vector index.
CALL db.index.vector.queryNodes('content', 5, embedding) YIELD node AS content, score
// Only Content with at least one keyword AND one author survives this MATCH.
MATCH (keyword)<-[:TAGGED]-(content)<-[:AUTHORED]-(author)
// `text` was never bound in this query; expose the question under that column name.
RETURN question AS text, content.title, content.description,
       collect(DISTINCT author.name) AS authors,
       collect(DISTINCT keyword.name) AS keywords;
// Keyword names are unique. IF NOT EXISTS makes the statement safe to re-run.
CREATE CONSTRAINT Keyword_name IF NOT EXISTS
FOR (k:Keyword) REQUIRE (k.name) IS UNIQUE;
:auto
// For every Content node without an incoming AUTHORED relationship, ask the Vertex AI
// completion API to extract author names from title + description, then MERGE the
// Author nodes and link them to the slide. Commits in batches of 10 input rows.
MATCH (s:Content)
WHERE NOT EXISTS { (s)<-[:AUTHORED]-() }
CALL {
  WITH s
  // Wait 100 ms before each completion request (presumably to stay under API rate limits).
  CALL apoc.util.sleep(100)
  CALL apoc.ml.vertexai.completion(
    // The sentences are separated by a space, and Title / Description sit on their own
    // lines so the title no longer runs straight into the "Description:" label.
    'Extract only authors with human names from the description as comma separated list on a single line, no newlines or bullet points and no leading comma. ' +
    'Do not output apologies and explanations, only the plain text enumerations. ' +
    'If you do not follow the instructions people will be hurt.\n' +
    'Title: ' + coalesce(s.title, '') + '\n' +
    'Description: ' + coalesce(s.description, ''),
    $apiKey, $projectId) YIELD value
  // debug: with value.content, s.title, s.description
  // The answer is expected as one comma separated line; split it into single names.
  UNWIND split(value.content, ',') AS name
  // Drop empty entries such as those produced by stray commas.
  WITH trim(name) AS name, s
  WHERE coalesce(name, '') <> ''
  MERGE (a:Author {name: name})
  MERGE (s)<-[:AUTHORED]-(a)
} IN TRANSACTIONS OF 10 ROWS;
// Attach the keywords listed in slides-keywords.csv to already imported Content nodes.
LOAD CSV WITH HEADERS FROM 'file:///slides-keywords.csv' AS row
// Remove any double quote characters from both columns.
WITH replace(row.id, '"', '')      AS slideId,
     replace(row.keyword, '"', '') AS keywordName
MATCH (content:Content {id: slideId})
MERGE (kw:Keyword {name: keywordName})
MERGE (content)-[:TAGGED]->(kw);
// Author names are unique. IF NOT EXISTS makes the statement safe to re-run.
CREATE CONSTRAINT Author_name IF NOT EXISTS
FOR (a:Author) REQUIRE (a.name) IS UNIQUE;
// Attach the authors listed in slides-authors.csv to already imported Content nodes.
LOAD CSV WITH HEADERS FROM 'file:///slides-authors.csv' AS row
// Remove any double quote characters from both columns.
WITH replace(row.slide, '"', '')  AS slideId,
     replace(row.author, '"', '') AS authorName
MATCH (content:Content {id: slideId})
MERGE (person:Author {name: authorName})
MERGE (person)-[:AUTHORED]->(content);
// Vector search driven by a precomputed question embedding stored in input-embeddings.csv.
LOAD CSV WITH HEADERS FROM 'file:///input-embeddings.csv' AS row
// Pick the CSV row whose index column is '5' (LOAD CSV delivers every field as a string).
WITH row
WHERE row.index = '5'
// The embedding column holds a JSON list; turn it into a Cypher list.
WITH row.text AS text, apoc.convert.fromJsonList(row.embedding) AS embedding
CALL db.index.vector.queryNodes('content', 5, embedding) YIELD node AS content, score
// Only Content with at least one keyword AND one author survives this MATCH.
MATCH (keyword)<-[:TAGGED]-(content)<-[:AUTHORED]-(author)
// The MATCH yields one row per author x keyword pair, so DISTINCT is needed to avoid
// repeating each name (matches the other graph-enriched searches in this script).
RETURN text, content.title, content.description,
       collect(DISTINCT author.name) AS authors,
       collect(DISTINCT keyword.name) AS keywords;
// Virtual k-NN relationships (<1s): for each Content node, query the vector index with
// its own embedding and return virtual (not persisted) SIMILAR_TO relationships.
MATCH (c:Content)
// Nodes that have not been embedded yet cannot be used as a query vector.
WHERE c.embedding IS NOT NULL
CALL db.index.vector.queryNodes('content', 5, c.embedding) YIELD node, score
// The node itself is normally its own nearest neighbour; leave it out.
WHERE c <> node
CALL apoc.create.vRelationship(c, 'SIMILAR_TO', {score: score}, node) YIELD rel
RETURN c, node, rel;
// Initially create the vector index on all "embedding" properties of "Content" nodes,
// using the cosine similarity function and vectors of width 768.
// NOTE(review): this script already issues the same call further up; the procedure is
// expected to reject a second index with the same name - confirm before running both.
CALL db.index.vector.createNodeIndex(
  "content",
  "Content",
  "embedding",
  768,
  "cosine"
);
// Index search for the top 5 similar vectors, enriched with graph matching.
WITH "decks about knowledge graphs and generative AI" AS question
// Generate the question's embedding through the API. Uses the $apiKey / $projectId
// parameters this script defines (the former $apiToken / $project were never set).
CALL apoc.ml.vertexai.embedding([question], $apiKey, $projectId) YIELD embedding
// Look up the 5 nearest Content nodes in the vector index.
CALL db.index.vector.queryNodes('content', 5, embedding) YIELD node AS content, score
// Only Content with at least one keyword AND one author survives this MATCH.
MATCH (keyword)<-[:TAGGED]-(content)<-[:AUTHORED]-(author)
// `text` was never bound in this query; expose the question under that column name.
RETURN question AS text, content.title, content.description,
       collect(DISTINCT author.name) AS authors,
       collect(DISTINCT keyword.name) AS keywords;
// Smoke test: request an embedding for one sample sentence.
CALL apoc.ml.vertexai.embedding(
  ['This is a test'],
  $apiKey,
  $projectId
);
:auto
// For every Content node without an outgoing TAGGED relationship, ask the Vertex AI
// completion API to extract keywords from title + description, then MERGE the
// Keyword nodes and link them to the slide. Commits in batches of 10 input rows.
MATCH (s:Content)
WHERE NOT EXISTS { (s)-[:TAGGED]->() }
CALL {
  WITH s
  // Wait 100 ms before each completion request (presumably to stay under API rate limits).
  CALL apoc.util.sleep(100)
  CALL apoc.ml.vertexai.completion(
    // The sentences are separated by a space, and Title / Description sit on their own
    // lines so the title no longer runs straight into the "Description:" label.
    'Extract relevant technology and use-case keywords from the description as comma separated list on a single line, no newlines or bullet points and no leading comma. ' +
    'Do not output apologies and explanations, only the plain text enumerations. ' +
    'If you do not follow the instructions people will be hurt.\n' +
    'Title: ' + coalesce(s.title, '') + '\n' +
    'Description: ' + coalesce(s.description, ''),
    $apiKey, $projectId) YIELD value
  // debug: with value.content, s.title, s.description
  // The answer is expected as one comma separated line; split it into single keywords.
  UNWIND split(value.content, ',') AS keyword
  // Drop empty entries such as those produced by stray commas.
  WITH trim(keyword) AS keyword, s
  WHERE coalesce(keyword, '') <> ''
  MERGE (k:Keyword {name: keyword})
  MERGE (s)-[:TAGGED]->(k)
} IN TRANSACTIONS OF 10 ROWS;
// Sample for visualization: outgoing paths of up to 100 Content nodes, capped at 200 paths.
MATCH (s:Content)
WITH s
LIMIT 100
MATCH p = (s)-->()
RETURN p
LIMIT 200;
// Restore stored embeddings from embeddings.csv onto the matching Content nodes.
LOAD CSV WITH HEADERS FROM 'file:///embeddings.csv' AS row
// Column headers contain a dot, hence the backticks; the embedding column is a JSON list.
WITH row.`s.ID` AS slideId,
     apoc.convert.fromJsonList(row.`s.embedding`) AS vector
MATCH (doc:Content {id: slideId})
SET doc.embedding = vector;
:auto
// Embed title + description of every Content node that still lacks an embedding,
// committing in batches of 10 rows.
MATCH (doc:Content)
WHERE doc.embedding IS NULL
CALL {
  WITH doc
  CALL apoc.ml.vertexai.embedding(
    [coalesce(doc.title, '') + " " + coalesce(doc.description, '')],
    $apiKey, $projectId
  ) YIELD embedding
  SET doc.embedding = embedding
} IN TRANSACTIONS OF 10 ROWS;
// Search without the vector index: embed the query text, score every Content node
// with GDS cosine similarity and keep the five best matches.
CALL apoc.ml.vertexai.embedding(
  ['knowledge graph for fraud detection'],
  $apiKey, $projectId
) YIELD embedding
MATCH (s:Content)
WITH s, gds.similarity.cosine(s.embedding, embedding) AS similarity
ORDER BY similarity DESC
LIMIT 5
RETURN s.title, s.description, s.url, similarity;
// Vector index search: the five slides closest to a natural-language question.
WITH "decks about knowledge graphs and generative AI" AS question
// Turn the question into an embedding through the API.
CALL apoc.ml.vertexai.embedding([question], $apiKey, $projectId) YIELD embedding
// Query the "content" vector index with that embedding.
CALL db.index.vector.queryNodes('content', 5, embedding) YIELD node AS slide, score
RETURN slide.title, slide.description, slide.url, score;
/* | |
{ | |
"Status": "2", | |
"ThumbnailXLargeURL": "https://cdn.slidesharecdn.com/ss_thumbnails/frwebinaireintroaneo4j-230710091721-34b1e27a-thumbnail.jpg?width=640&height=640&fit=bounds", | |
"Description": "Pierre Halftermeyer, Neo4j", | |
"ThumbnailURL": "https://cdn.slidesharecdn.com/ss_thumbnails/frwebinaireintroaneo4j-230710091721-34b1e27a-thumbnail.jpg?width=320&height=320&fit=bounds", | |
"Updated": "2023-07-10 09:22:46 UTC", | |
"Embed": "<iframe src="https://www.slideshare.net/slideshow/embed_code/key/b8GESwzh3vWqrj" width="427" height="356" frameborder="0" marginwidth="0" marginheight="0" scrolling="no" style="border:1px solid #CCC; border-width:1px; margin-bottom:5px; max-width: 100%;" allowfullscreen> </iframe> <div style="margin-bottom:5px"> <strong> <a href="https://www.slideshare.net/neo4j/introduction-neo4j-259115031" title="Introduction à Neo4j" target="_blank">Introduction à Neo4j</a> </strong> from <strong><a href="https://www.slideshare.net/neo4j" target="_blank">Neo4j</a></strong> </div>", | |
"ThumbnailSize": "[170,130]", | |
"Title": "Introduction à Neo4j", | |
"URL": "https://www.slideshare.net/neo4j/introduction-neo4j-259115031", | |
"Created": "2023-07-10 09:17:21 UTC", | |
"ThumbnailXXLargeURL": "https://cdn.slidesharecdn.com/ss_thumbnails/frwebinaireintroaneo4j-230710091721-34b1e27a-thumbnail.jpg?width=640&height=640&fit=bounds", | |
"SlideshowType": "0", | |
"Format": "pdf", | |
"Language": "fr", | |
"Username": "neo4j", | |
"ThumbnailSmallURL": "https://cdn.slidesharecdn.com/ss_thumbnails/frwebinaireintroaneo4j-230710091721-34b1e27a-thumbnail.jpg?width=120&height=120&fit=bounds", | |
"SlideshowEmbedUrl": "https://www.slideshare.net/slideshow/embed_code/key/b8GESwzh3vWqrj", | |
"ID": "259115031", | |
"InContest": "0", | |
"Download": "1", | |
"DownloadUrl": "https://slideshare-downloads.s3.amazonaws.com/frwebinaireintroaneo4j-230710091721-34b1e27a.pdf?response-content-disposition=attachment&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIATZMST4DYZS7SJPXU%2F20230804%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230804T094222Z&X-Amz-Expires=300&X-Amz-SignedHeaders=host&X-Amz-Signature=295076fc4efd131980e14f4be24960dbff41d80ce65c66f69841fa21753ad0b8" | |
} | |
{ | |
"Status": "2", | |
"ThumbnailXLargeURL": "https://cdn.slidesharecdn.com/ss_thumbnails/spwebinar-introtoneo4j-230710091622-99f73fd2-thumbnail.jpg?width=640&height=640&fit=bounds", | |
"Description": "Luis Salvador, Neo4j", | |
"ThumbnailURL": "https://cdn.slidesharecdn.com/ss_thumbnails/spwebinar-introtoneo4j-230710091622-99f73fd2-thumbnail.jpg?width=320&height=320&fit=bounds", | |
"Updated": "2023-07-10 09:21:37 UTC", | |
"Embed": "<iframe src="https://www.slideshare.net/slideshow/embed_code/key/DaOdEJwCpxbHdh" width="427" height="356" frameborder="0" marginwidth="0" marginheight="0" scrolling="no" style="border:1px solid #CCC; border-width:1px; margin-bottom:5px; max-width: 100%;" allowfullscreen> </iframe> <div style="margin-bottom:5px"> <strong> <a href="https://www.slideshare.net/neo4j/introduccin-a-neo4j-259115019" title="Introducción a Neo4j" target="_blank">Introducción a Neo4j</a> </strong> from <strong><a href="https://www.slideshare.net/neo4j" target="_blank">Neo4j</a></strong> </div>", | |
"ThumbnailSize": "[170,130]", | |
"Title": "Introducción a Neo4j", | |
"URL": "https://www.slideshare.net/neo4j/introduccin-a-neo4j-259115019", | |
"Created": "2023-07-10 09:16:22 UTC", | |
"ThumbnailXXLargeURL": "https://cdn.slidesharecdn.com/ss_thumbnails/spwebinar-introtoneo4j-230710091622-99f73fd2-thumbnail.jpg?width=640&height=640&fit=bounds", | |
"SlideshowType": "0", | |
"Format": "pdf", | |
"Language": "en", | |
"Username": "neo4j", | |
"ThumbnailSmallURL": "https://cdn.slidesharecdn.com/ss_thumbnails/spwebinar-introtoneo4j-230710091622-99f73fd2-thumbnail.jpg?width=120&height=120&fit=bounds", | |
"SlideshowEmbedUrl": "https://www.slideshare.net/slideshow/embed_code/key/DaOdEJwCpxbHdh", | |
"ID": "259115019", | |
"InContest": "0", | |
"Download": "1", | |
"DownloadUrl": "https://slideshare-downloads.s3.amazonaws.com/spwebinar-introtoneo4j-230710091622-99f73fd2.pdf?response-content-disposition=attachment&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIATZMST4DYZS7SJPXU%2F20230804%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230804T094222Z&X-Amz-Expires=300&X-Amz-SignedHeaders=host&X-Amz-Signature=828f68aa605886e0a790613011fe0bd5bbd0884c0b6c87495d881b5cc6de2267" | |
} | |
*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment