Skip to content

Instantly share code, notes, and snippets.

@jermnelson
Last active January 27, 2025 18:35
Show Gist options
  • Save jermnelson/c413177924bc2bcc62f38b80cf051a70 to your computer and use it in GitHub Desktop.
Save jermnelson/c413177924bc2bcc62f38b80cf051a70 to your computer and use it in GitHub Desktop.
BIBFRAME Semantic Search
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "a856656c-2297-4588-918b-83d253daa86c",
"metadata": {},
"source": [
"# Blue Core Data Model with RAG support"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "c8cdf081-9c96-46b6-bf38-c285e6c4fd68",
"metadata": {},
"outputs": [],
"source": [
"import datetime\n",
"import os\n",
"import pathlib\n",
"import sys\n",
"import time\n",
"\n",
"from string import Template\n",
"\n",
"import rdflib\n",
"import torch\n",
"\n",
"from pydantic_ai import Agent, RunContext\n",
"from pydantic_ai.models.openai import OpenAIModel\n",
"from sqlalchemy.orm import sessionmaker\n",
"from sqlalchemy import create_engine, select\n",
"\n",
"import nest_asyncio\n",
"\n",
"nest_asyncio.apply()\n",
"\n",
"sys.path.append(\"/Users/jpnelson/30-39 Sinopia, Blue-Core, FOLIO, and PCC/32.01 Blue Core API/src\")\n",
"\n",
"from bluecore.models import (\n",
" Instance,\n",
" Work,\n",
" Version,\n",
" TripleVectorIndex,\n",
" VECTOR_SIZE\n",
")\n",
"\n",
"from bluecore.helpers.graph import (\n",
" generate_embedding,\n",
" init_graph,\n",
" generate_entity_graph\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b5a53c77-9a07-445c-ad54-17fefebbdd22",
"metadata": {},
"outputs": [],
"source": [
"BF = rdflib.Namespace(\"http://id.loc.gov/ontologies/bibframe/\")\n",
"\n",
"data_path = pathlib.Path(\"/Users/jpnelson/30-39 Sinopia, Blue-Core, FOLIO, and PCC/32.05 Blue Core Prototype Data/data\")\n",
"\n",
"SQLALCHEMY_DATABASE_URL = \"postgresql://bluecore_admin:bluecore_admin@localhost/bluecore\"\n",
"\n",
"engine = create_engine(SQLALCHEMY_DATABASE_URL)\n",
"\n",
"Session = sessionmaker(bind=engine)\n",
"\n",
"session = Session()"
]
},
{
"cell_type": "markdown",
"id": "2fcf4c2a-1d70-4cd5-bd54-ae3f316bf436",
"metadata": {},
"source": [
"## Sample LOC Records\n",
"\n",
"Ingested a random sample of 2,000 records from the LOC test set of 349,141 records. Total ingest time was about 80 minutes (**not great**, but many database optimizations remain that should improve throughput)."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "2e815016-f5e6-446a-bc31-8d8efa1489fd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Works: 1,983\n",
"Instances: 2,667\n",
"Versions: 4,650\n",
"TripleVectorIndex: 318,127\n"
]
}
],
"source": [
"print(f\"\"\"Works: {session.query(Work).count():,}\n",
"Instances: {session.query(Instance).count():,}\n",
"Versions: {session.query(Version).count():,}\n",
"TripleVectorIndex: {session.query(TripleVectorIndex).count():,}\"\"\")"
]
},
{
"cell_type": "markdown",
"id": "cca2c2b3-000b-4a6b-bf4f-bfe4b1071a7f",
"metadata": {},
"source": [
"## Sample Work Object"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c410f14f-a482-44e4-adc6-0bea5e63750b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Graph identifier=N91883a3cc184480484b97a31adcf0d07 (<class 'rdflib.graph.Graph'>)>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample_loc_record = data_path / \"22834861.jsonld\"\n",
"sample_loc_graph = init_graph()\n",
"sample_loc_graph.parse(data=sample_loc_record.read_text(),\n",
" format='json-ld')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "f751970e-74e6-460b-935a-c80a2c61e850",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"http://id.loc.gov/resources/works/22834861\n"
]
}
],
"source": [
"for work in sample_loc_graph.subjects(predicate=rdflib.RDF.type,\n",
" object=BF.Work):\n",
" print(work)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "c71c5899-a8ad-486d-9802-379fec0392bb",
"metadata": {},
"outputs": [],
"source": [
"work_query = session.query(Work).where(Work.uri == \"http://id.loc.gov/resources/works/22834861\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "1f7cd54a-7aa2-4a6e-b72b-cca1f26fd88c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<Work http://id.loc.gov/resources/works/22834861>\n"
]
}
],
"source": [
"work = work_query.first()\n",
"print(work)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "b739cf0a-ff5b-4ec5-aa72-0801d2dc641c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Graph identifier=Nceae7ec24fb34f7cb3491c347a4ecec1 (<class 'rdflib.graph.Graph'>)>"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"work_graph = init_graph()\n",
"work_graph.parse(data=work.data, format='json-ld')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "90658918-9a00-43e5-9a17-e402e6a296e7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"@prefix bf: <http://id.loc.gov/ontologies/bibframe/> .\n",
"@prefix bflc: <http://id.loc.gov/ontologies/bflc/> .\n",
"@prefix mads: <http://www.loc.gov/mads/rdf/v1#> .\n",
"@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n",
"@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n",
"@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .\n",
"\n",
"<http://id.loc.gov/resources/works/22834861> a bf:Monograph,\n",
" bf:Text,\n",
" bf:Work ;\n",
" bflc:aap \"Daignault, Susan A full net\" ;\n",
" bflc:aap-normalized \"daignaultsusanafullnet\" ;\n",
" bf:adminMetadata [ a bf:AdminMetadata ;\n",
" bf:agent <http://id.loc.gov/vocabulary/organizations/mea> ;\n",
" bf:date \"2022-10-19\"^^xsd:date ;\n",
" bf:status <http://id.loc.gov/vocabulary/mstatus/n> ],\n",
" [ a bf:AdminMetadata ;\n",
" bf:date \"2024-08-28T09:39:19\"^^xsd:dateTime ;\n",
" bf:descriptionModifier <http://id.loc.gov/vocabulary/organizations/dlc> ;\n",
" bf:status <http://id.loc.gov/vocabulary/mstatus/c> ],\n",
" [ a bf:AdminMetadata ;\n",
" bf:agent <http://id.loc.gov/vocabulary/organizations/dlcmrc> ;\n",
" bf:date \"2024-08-29T01:38:40.631733-04:00\"^^xsd:dateTime ;\n",
" bf:generationProcess <https://github.com/lcnetdev/marc2bibframe2/releases/tag/v2.7.0> ;\n",
" bf:status <http://id.loc.gov/vocabulary/mstatus/c> ],\n",
" [ a bf:AdminMetadata ;\n",
" bflc:encodingLevel <http://id.loc.gov/vocabulary/menclvl/f> ;\n",
" bf:descriptionAuthentication <http://id.loc.gov/vocabulary/marcauthen/lccopycat> ;\n",
" bf:descriptionConventions <http://id.loc.gov/vocabulary/descriptionConventions/isbd>,\n",
" <http://id.loc.gov/vocabulary/descriptionConventions/rda> ;\n",
" bf:descriptionLanguage <http://id.loc.gov/vocabulary/languages/eng> ;\n",
" bf:descriptionLevel <http://id.loc.gov/ontologies/bibframe-2-3-0/> ;\n",
" bf:identifiedBy [ a bf:Local ;\n",
" bf:assigner <http://id.loc.gov/vocabulary/organizations/dlc> ;\n",
" rdf:value \"22834861\" ] ] ;\n",
" bf:classification [ a bf:ClassificationLcc ;\n",
" bf:assigner <http://id.loc.gov/vocabulary/organizations/dlc> ;\n",
" bf:classificationPortion \"PN6071.F47\" ;\n",
" bf:itemPortion \"D37 2023\" ;\n",
" bf:status <http://id.loc.gov/vocabulary/mstatus/uba> ] ;\n",
" bf:content <http://id.loc.gov/vocabulary/contentTypes/txt> ;\n",
" bf:contribution [ a bf:Contribution,\n",
" bf:PrimaryContribution ;\n",
" bf:agent [ a bf:Agent,\n",
" bf:Person ;\n",
" rdfs:label \"Daignault, Susan\" ] ;\n",
" bf:role <http://id.loc.gov/vocabulary/relators/aut> ] ;\n",
" bf:genreForm <http://id.loc.gov/authorities/genreForms/gf2014026049> ;\n",
" bf:hasInstance <http://id.loc.gov/resources/instances/22834861> ;\n",
" bf:illustrativeContent <http://id.loc.gov/vocabulary/millus/ill> ;\n",
" bf:language <http://id.loc.gov/vocabulary/languages/eng> ;\n",
" bf:subject [ a bf:Agent,\n",
" bf:Person,\n",
" mads:PersonalName ;\n",
" rdfs:label \"Daignault, Susan\" ;\n",
" bflc:aap-normalized \"daignaultsusan\" ;\n",
" bf:source <http://id.loc.gov/authorities/subjects> ;\n",
" mads:authoritativeLabel \"Daignault, Susan\" ;\n",
" mads:isMemberOfMADSScheme <http://id.loc.gov/authorities/subjects> ],\n",
" [ a bf:Topic,\n",
" mads:ComplexSubject ;\n",
" rdfs:label \"Fishing--Maine\" ;\n",
" bflc:aap-normalized \"fishingmaine\" ;\n",
" bf:source <http://id.loc.gov/authorities/subjects> ;\n",
" mads:authoritativeLabel \"Fishing--Maine\" ;\n",
" mads:componentList ( <http://id.loc.gov/authorities/subjects/sh85048830> <http://id.loc.gov/rwo/agents/n79005604-781> ) ;\n",
" mads:isMemberOfMADSScheme <http://id.loc.gov/authorities/subjects> ],\n",
" [ a bf:Topic,\n",
" mads:Topic ;\n",
" rdfs:label \"Histoires de pêche\" ;\n",
" bflc:aap-normalized \"histoiresdepêche\" ;\n",
" bf:source <http://id.loc.gov/vocabulary/subjectSchemes/rvm> ;\n",
" mads:authoritativeLabel \"Histoires de pêche\" ],\n",
" [ a bf:Topic,\n",
" mads:Topic ;\n",
" rdfs:label \"Pêche sportive\" ;\n",
" bflc:aap-normalized \"pêchesportive\" ;\n",
" bf:source <http://id.loc.gov/vocabulary/subjectSchemes/rvm> ;\n",
" mads:authoritativeLabel \"Pêche sportive\" ],\n",
" [ a bf:Topic,\n",
" mads:ComplexSubject ;\n",
" rdfs:label \"Pêche sportive--Maine\" ;\n",
" bflc:aap-normalized \"pêchesportivemaine\" ;\n",
" bf:source <http://id.loc.gov/vocabulary/subjectSchemes/rvm> ;\n",
" mads:authoritativeLabel \"Pêche sportive--Maine\" ;\n",
" mads:componentList ( [ a mads:Topic ;\n",
" mads:authoritativeLabel \"Pêche sportive\" ] <http://id.loc.gov/rwo/agents/n79005604-781> ) ],\n",
" [ a bf:Topic,\n",
" mads:Topic ;\n",
" rdfs:label \"fishing\" ;\n",
" bflc:aap-normalized \"fishing\" ;\n",
" bf:source <http://id.loc.gov/vocabulary/subjectSchemes/aat> ;\n",
" mads:authoritativeLabel \"fishing\" ;\n",
" mads:isMemberOfMADSScheme <http://id.loc.gov/vocabulary/subjectSchemes/aat> ],\n",
" <http://id.loc.gov/authorities/subjects/sh85048830>,\n",
" <http://id.loc.gov/authorities/subjects/sh85048873> ;\n",
" bf:title [ a bf:Title ;\n",
" bflc:nonSortNum \"2\" ;\n",
" bf:mainTitle \"A full net\" ] .\n",
"\n",
"\n"
]
}
],
"source": [
"print(work_graph.serialize(format='turtle'))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "6f421ed5-4f38-4e8a-b570-7b796663e5b0",
"metadata": {},
"outputs": [],
"source": [
"version_query = session.query(Version).where(Version.resource == work)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "0259ccb8-5a94-4717-84ba-4ea2397f6b7c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Version at 2025-01-23 00:21:20.322643 for http://id.loc.gov/resources/works/22834861>"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"version = version_query.first()\n",
"version"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "015297aa-b844-404e-83ad-11d59288b480",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"116"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(version.vector_index)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "094f9f2c-e4a7-42ee-baee-b0b80ac42a0c",
"metadata": {},
"outputs": [],
"source": [
"work_skolemized = work_graph.skolemize(basepath=\"http://id.loc.gov/resources/works/22834861#\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "c374ab14-56ad-4a29-a425-e8cbe673f1cc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"116"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"work_triples = work_skolemized.serialize(format='nt').splitlines()\n",
"len(work_triples)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "16c24a9a-a9f4-40dd-8b72-a348c2244c57",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'<http://id.loc.gov/resources/works/22834861#b471iddOtlocdOtgovresourcesworks22834861> <http://id.loc.gov/ontologies/bibframe/descriptionConventions> <http://id.loc.gov/vocabulary/descriptionConventions/rda> .'"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"work_triples[4]"
]
},
{
"cell_type": "markdown",
"id": "92e8cb38-e486-4240-ac4f-04ca47ea20f9",
"metadata": {},
"source": [
"## RDF Triple Semantic Search using Cosine Distance\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "147948f6-1309-4059-87ec-2a2e63cd623d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"768"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"work_triple_4_vector = generate_embedding(work_triples[4], VECTOR_SIZE)\n",
"len(work_triple_4_vector)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "db4fa330-7424-4b94-b92a-47ecbd022c42",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[-0.790466845035553,\n",
" -0.15182358026504517,\n",
" -0.06164858117699623,\n",
" -0.5249563455581665,\n",
" -0.24001815915107727,\n",
" -0.48603495955467224,\n",
" 0.35869720578193665,\n",
" 0.4758962094783783,\n",
" -0.024324936792254448,\n",
" -0.25119921565055847,\n",
" -0.9543370604515076,\n",
" -0.047265395522117615,\n",
" -0.4069328308105469,\n",
" -0.050489842891693115,\n",
" 0.5017445087432861,\n",
" 0.4112482964992523,\n",
" -0.2937135100364685,\n",
" 0.9913594722747803,\n",
" 0.5083059668540955,\n",
" 0.001985362730920315,\n",
" -0.11200801283121109,\n",
" -0.9448103904724121,\n",
" -0.1506381332874298,\n",
" -0.37201598286628723,\n",
" 0.1083354502916336]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"work_triple_4_vector[0:25]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "263d6cdc-2774-46df-aded-b51d48402f56",
"metadata": {},
"outputs": [],
"source": [
"stmt = select(TripleVectorIndex,\n",
" TripleVectorIndex.vector.cosine_distance(work_triple_4_vector)\n",
" .label(\"distance\")).filter(TripleVectorIndex.vector.cosine_distance(work_triple_4_vector) < 0.2).order_by(\"distance\").limit(5)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "d61d7c0f-ee5c-423a-82e5-22de216e1796",
"metadata": {},
"outputs": [],
"source": [
"query_result = session.execute(stmt)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "11d692cb-8f43-4582-ae3d-0859ae1b3623",
"metadata": {},
"outputs": [],
"source": [
"matched_embeddings = [r for r in query_result]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "c46a4b15-46b1-4069-9d34-e99d3edb0dac",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(<bluecore.models.TripleVectorIndex object at 0x11fb581d0>, 0.0),\n",
" (<bluecore.models.TripleVectorIndex object at 0x12fe78190>, 0.0008967124305808705),\n",
" (<bluecore.models.TripleVectorIndex object at 0x30e3decd0>, 0.000974517533014585),\n",
" (<bluecore.models.TripleVectorIndex object at 0x30e3ded50>, 0.001074582654343681),\n",
" (<bluecore.models.TripleVectorIndex object at 0x11fb4e5d0>, 0.0011666013023269306)]"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"matched_embeddings"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "7bea214c-a741-41a1-a7bf-0f336417c78c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<bluecore.models.TripleVectorIndex at 0x11fb581d0>"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"matched_embeddings[0][0]"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "07d91258-3202-4c66-b5eb-dc3ebba4136e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True, True, True, True, True, True, True,\n",
" True, True, True])"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"matched_embeddings[0][0].vector == work_triple_4_vector"
]
},
{
"cell_type": "markdown",
"id": "42318357-63f8-48f4-8a79-9604663c276e",
"metadata": {},
"source": [
"## Using with an LLM"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "0324da13-14b7-4d41-ada9-e894d7a5f182",
"metadata": {},
"outputs": [],
"source": [
"open_ai_model = OpenAIModel('gpt-4o-mini')"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "7a7b85c7-310f-4389-b7c7-f6f7aab45802",
"metadata": {},
"outputs": [],
"source": [
"rdf_generate_agent = Agent(\n",
" open_ai_model,\n",
" system_prompt=(\n",
" 'From a text prompt generate a BIBFRAME RDF record.'\n",
" 'Question: Create a BIBFRAME Work record for a Monograph Print book, War '\n",
" 'and Peace by Leo Tolstoy '\n",
" 'Answer: <http://id.loc.gov/resources/works/1948901> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://id.loc.gov/ontologies/bibframe/Work> .'\n",
" '<http://id.loc.gov/resources/works/1948901> <http://id.loc.gov/ontologies/bibframe/title> http://id.loc.gov/resources/works/1948901#b76iddOtlocdOtgovresourcesworks1948901 .'\n",
" '<http://id.loc.gov/resources/works/1948901#b76iddOtlocdOtgovresourcesworks1948901> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://id.loc.gov/ontologies/bibframe/Title> .'\n",
" '<http://id.loc.gov/resources/works/1948901#b76iddOtlocdOtgovresourcesworks1948901> <http://id.loc.gov/ontologies/bibframe/mainTitle> \"Leo Tolstoy War and peace\" .'\n",
" '<http://id.loc.gov/resources/works/1948901> <http://id.loc.gov/ontologies/bibframe/contribution> <http://id.loc.gov/resources/works/1948901#b90iddOtlocdOtgovresourcesworks1948901> .'\n",
" '<http://id.loc.gov/resources/works/1948901#b90iddOtlocdOtgovresourcesworks1948901> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://id.loc.gov/ontologies/bibframe/Contribution> .'\n",
" '<http://id.loc.gov/resources/works/1948901#b90iddOtlocdOtgovresourcesworks1948901> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://id.loc.gov/ontologies/bibframe/PrimaryContribution> .'\n",
" '<http://id.loc.gov/resources/works/1948901#b90iddOtlocdOtgovresourcesworks1948901> <http://id.loc.gov/ontologies/bibframe/agent> <http://id.loc.gov/rwo/agents/n79068416> .'\n",
" )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "7978d6e4-8fd8-44b9-a8ab-d37a2d740bab",
"metadata": {},
"outputs": [],
"source": [
"result = rdf_generate_agent.run_sync(\n",
" \"Create a BIBFRAME Work for Neal Stephenson's Snowcrash monograph print published in 1994\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "ba15e516-71d2-46b8-87b6-308bd97562bd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Here is a BIBFRAME Work record for Neal Stephenson's \"Snow Crash\", a monograph print published in 1994:\n",
"\n",
"```\n",
"<http://id.loc.gov/resources/works/1948902> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://id.loc.gov/ontologies/bibframe/Work> .\n",
"<http://id.loc.gov/resources/works/1948902> <http://id.loc.gov/ontologies/bibframe/title> <http://id.loc.gov/resources/works/1948902#b76iddOtlocdOtgovresourcesworks1948902> .\n",
"<http://id.loc.gov/resources/works/1948902#b76iddOtlocdOtgovresourcesworks1948902> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://id.loc.gov/ontologies/bibframe/Title> .\n",
"<http://id.loc.gov/resources/works/1948902#b76iddOtlocdOtgovresourcesworks1948902> <http://id.loc.gov/ontologies/bibframe/mainTitle> \"Snow Crash\" .\n",
"<http://id.loc.gov/resources/works/1948902> <http://id.loc.gov/ontologies/bibframe/contribution> <http://id.loc.gov/resources/works/1948902#b90iddOtlocdOtgovresourcesworks1948902> .\n",
"<http://id.loc.gov/resources/works/1948902#b90iddOtlocdOtgovresourcesworks1948902> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://id.loc.gov/ontologies/bibframe/Contribution> .\n",
"<http://id.loc.gov/resources/works/1948902#b90iddOtlocdOtgovresourcesworks1948902> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://id.loc.gov/ontologies/bibframe/PrimaryContribution> .\n",
"<http://id.loc.gov/resources/works/1948902#b90iddOtlocdOtgovresourcesworks1948902> <http://id.loc.gov/ontologies/bibframe/agent> <http://id.loc.gov/rwo/agents/n91065559> . # Assuming n91065559 is the identifier for Neal Stephenson\n",
"<http://id.loc.gov/resources/works/1948902> <http://id.loc.gov/ontologies/bibframe/publication> \"Published in 1994\" .\n",
"```\n",
"\n",
"Note: The identifier `n91065559` is an example and should be verified for Neal Stephenson to ensure that it matches the correct bibliographic identity. The publication details may also need to be more specific based on the publisher and other metadata as necessary.\n"
]
}
],
"source": [
"print(result.new_messages()[1].parts[0].content)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "c84718e0-347c-41c3-9423-4ffe23e78b3d",
"metadata": {},
"outputs": [],
"source": [
"title_embedding = generate_embedding(\"\"\"<http://id.loc.gov/resources/works/1948902#b76iddOtlocdOtgovresourcesworks1948902> <http://id.loc.gov/ontologies/bibframe/mainTitle> \"Snow Crash\" .\"\"\",\n",
" VECTOR_SIZE)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "fbde1065-715e-4433-ae1e-9187acacc7cc",
"metadata": {},
"outputs": [],
"source": [
"snowcrash_stmt = select(TripleVectorIndex,\n",
" TripleVectorIndex.vector.cosine_distance(title_embedding)\n",
" .label(\"distance\")).filter(TripleVectorIndex.vector.cosine_distance(title_embedding) < 0.2).order_by(\"distance\").limit(5)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "77bb6ff6-a866-4aa2-9023-b5d2d18bdf0a",
"metadata": {},
"outputs": [],
"source": [
"query_result = session.execute(snowcrash_stmt)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "e42b7aba-29f3-471f-8bba-25ca0e57b5a4",
"metadata": {},
"outputs": [],
"source": [
"snowcrash_title_results = [r for r in query_result]"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "0dd45307-0a48-46aa-af5f-f39dba76b17b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(<bluecore.models.TripleVectorIndex object at 0x152406c10>, 0.01885835600303931),\n",
" (<bluecore.models.TripleVectorIndex object at 0x12fe56690>, 0.018882364240791483),\n",
" (<bluecore.models.TripleVectorIndex object at 0x12fe57650>, 0.019057844438853055),\n",
" (<bluecore.models.TripleVectorIndex object at 0x12fe55650>, 0.019086425616838376),\n",
" (<bluecore.models.TripleVectorIndex object at 0x12fe54590>, 0.01959304759398728)]"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"snowcrash_title_results"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "c62dcae6-d395-47f3-94b9-2c14f82a713b",
"metadata": {},
"outputs": [],
"source": [
"work = snowcrash_title_results[0][0].version.resource"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "3adc4f36-d385-48f8-bfd1-d29a7d0098be",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Work http://id.loc.gov/resources/works/23770044>"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"work"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "c1e50ff1-dfeb-4704-b178-83d870d4e5c4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Graph identifier=N30df5bbdd368450ba8769654b49fe81b (<class 'rdflib.graph.Graph'>)>"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"work2_graph = init_graph()\n",
"work2_graph.parse(data=work.data, format='json-ld')"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "96db8d6c-9513-4442-8065-f06292c2db09",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"@prefix bf: <http://id.loc.gov/ontologies/bibframe/> .\n",
"@prefix bflc: <http://id.loc.gov/ontologies/bflc/> .\n",
"@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n",
"@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n",
"@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .\n",
"\n",
"<http://id.loc.gov/resources/works/23770044> a bf:Monograph,\n",
" bf:Text,\n",
" bf:Work ;\n",
" bflc:aap \"Wharton, Edith Twilight Sleep\" ;\n",
" bflc:aap-normalized \"whartonedithtwilightsleep\" ;\n",
" bf:adminMetadata [ a bf:AdminMetadata ;\n",
" bf:agent <http://id.loc.gov/vocabulary/organizations/dlc> ;\n",
" bf:date \"2024-07-10\"^^xsd:date ;\n",
" bf:status <http://id.loc.gov/vocabulary/mstatus/n> ],\n",
" [ a bf:AdminMetadata ;\n",
" bf:date \"2024-07-10T06:44:12\"^^xsd:dateTime ;\n",
" bf:status <http://id.loc.gov/vocabulary/mstatus/c> ],\n",
" [ a bf:AdminMetadata ;\n",
" bf:agent <http://id.loc.gov/vocabulary/organizations/dlcmrc> ;\n",
" bf:date \"2024-08-03T04:31:58.966471-04:00\"^^xsd:dateTime ;\n",
" bf:generationProcess <https://github.com/lcnetdev/marc2bibframe2/releases/tag/v2.7.0> ;\n",
" bf:status <http://id.loc.gov/vocabulary/mstatus/c> ],\n",
" [ a bf:AdminMetadata ;\n",
" bflc:encodingLevel <http://id.loc.gov/vocabulary/menclvl/5> ;\n",
" bf:descriptionAuthentication <http://id.loc.gov/vocabulary/marcauthen/pcc> ;\n",
" bf:descriptionConventions <http://id.loc.gov/vocabulary/descriptionConventions/isbd>,\n",
" <http://id.loc.gov/vocabulary/descriptionConventions/rda> ;\n",
" bf:descriptionLanguage <http://id.loc.gov/vocabulary/languages/eng> ;\n",
" bf:descriptionLevel <http://id.loc.gov/ontologies/bibframe-2-3-0/> ;\n",
" bf:identifiedBy [ a bf:Local ;\n",
" bf:assigner <http://id.loc.gov/vocabulary/organizations/dlc> ;\n",
" rdf:value \"23770044\" ] ] ;\n",
" bf:content <http://id.loc.gov/vocabulary/contentTypes/txt> ;\n",
" bf:contribution [ a bf:Contribution,\n",
" bf:PrimaryContribution ;\n",
" bf:agent [ a bf:Agent,\n",
" bf:Person ;\n",
" rdfs:label \"Wharton, Edith\" ] ;\n",
" bf:role <http://id.loc.gov/vocabulary/relators/aut> ],\n",
" [ a bf:Contribution ;\n",
" bf:agent [ a bf:Agent,\n",
" bf:Person ;\n",
" rdfs:label \"Smith, Allison Miriam\" ] ;\n",
" bf:role <http://id.loc.gov/vocabulary/relators/oth> ],\n",
" [ a bf:Contribution ;\n",
" bf:agent <http://id.loc.gov/rwo/agents/n81101794> ;\n",
" bf:role <http://id.loc.gov/vocabulary/relators/oth> ] ;\n",
" bf:hasInstance <http://id.loc.gov/resources/instances/23770044> ;\n",
" bf:language <http://id.loc.gov/vocabulary/languages/eng> ;\n",
" bf:summary [ a bf:Summary ;\n",
" rdfs:label \"\\\"Mrs. Pauline Manford is a busy woman, as any upstanding New York society lady should be. Parties, dinners, charity luncheons, balls, and strict exercise and beauty regimes fill her daily schedule to an exhausting degree. Her secretary can hardly keep up. To manage a modern household is to hold the family together, staying on trend with all things helpful, but her daughter has horrible taste in married men, her ex-husband is unwell, and her son is struggling to forge a career for himself while his postpartum wife refuses to settle down from lavish partying. Pauline can't decide if she should bob her hair, redecorate, or get a face lift, and she surely doesn't have time to notice her current husband's wandering eye as anything other than harmless flirtations. When a rakish Italian actor bound for Hollywood and a scandal with the local wellness guru threaten to tear her perfectly constructed life apart, Pauline moves on to new spiritually medicinal treatments, and the Manfords must navigate the fraught tensions that bind them together. Hopefully a vacation from NYC's ruthless grind to their quiet country house will deter any further worries. Twilight Sleep, named for the early anesthetic that predates epidural and induces memory loss, is Edith Wharton's oft-forgotten novel of modern motherhood and the pressures that lead women to reconstruct or completely escape their lives. Sharp and humorous, it feels as relevant today as it did in the 1920s\\\"-- Provided by publisher\" ] ;\n",
" bf:title [ a bf:Title ;\n",
" bf:mainTitle \"Twilight Sleep\" ] .\n",
"\n",
"\n"
]
}
],
"source": [
"print(work2_graph.serialize(format='turtle'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "062cff91-ccdf-4133-9ae5-cdd83f614a28",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment