Last active
March 12, 2024 19:15
-
-
Save datadavev/2ca265f34eccc502b28dcc71559f2a9e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "8d483671-3274-4c2f-9a4d-063f601f43ac", | |
"metadata": {}, | |
"source": [ | |
"Connect to Solr and download records, passing them through a streaming JSON parser.\n", | |
"\n", | |
"Needs ijson\n", | |
"\n", | |
"Solr is expected on localhost:8983, e.g., `ssh -L8983:localhost:8983 hyde`\n", | |
"\n", | |
"Create a subfolder \"data\"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "ad59c854-a6d0-4f2b-a825-3865c3b25435", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import json\n", | |
"import time\n", | |
"import urllib.parse\n", | |
"import urllib.request\n", | |
"import ijson\n", | |
"\n", | |
"def transform(d:dict) -> dict:\n", | |
" # simple transformer, but not used here\n", | |
" return {\n", | |
" \"id\":d.get(\"id\"),\n", | |
" \"source\":d.get(\"source\"),\n", | |
" \"label\":d.get(\"label\"),\n", | |
" \"material\":d.get(\"hasMaterialCategory\"),\n", | |
" \"specimen\":d.get(\"hasSpecimenCategory\"),\n", | |
" \"context\":d.get(\"hasContextCategory\"),\n", | |
" \"keywords\":d.get(\"keywords\"),\n", | |
" \"description\":d.get(\"producedBy_description\"),\n", | |
" \"responsibility\":d.get(\"producedBy_responsibility\"),\n", | |
" \"resulttime\":d.get(\"producedBy_resultTime\"),\n", | |
" \"sitelabel\":d.get(\"producedBy_samplingSite_label\"),\n", | |
" \"latitude\":d.get(\"producedBy_samplingSite_location_latitude\"),\n", | |
" \"longitude\":d.get(\"producedBy_samplingSite_location_longitude\"),\n", | |
" \"curation\":d.get(\"curation_responsibility\"),\n", | |
" \"relatedid\":d.get(\"relatedResource_isb_core_id\"),\n", | |
" }\n", | |
"\n", | |
"\n", | |
"fields_long = [\n", | |
" \"id\",\n", | |
" \"source\",\n", | |
" \"label\",\n", | |
" \"hasMaterialCategory\",\n", | |
" \"hasSpecimenCategory\",\n", | |
" \"hasContextCategory\",\n", | |
" \"keywords\",\n", | |
" \"producedBy_description\",\n", | |
" \"producedBy_responsibility\",\n", | |
" \"producedBy_resultTime\",\n", | |
" \"producedBy_samplingSite_label\",\n", | |
" \"producedBy_samplingSite_location_latitude\",\n", | |
" \"producedBy_samplingSite_location_longitude\",\n", | |
" \"curation_responsibility\",\n", | |
" \"relatedResource_isb_core_id\",\n", | |
"]\n", | |
"\n", | |
"fields_short = [\n", | |
" \"id\",\n", | |
" \"relatedResource_isb_core_id\",\n", | |
" \"producedBy_samplingSite_location_latitude\",\n", | |
" \"producedBy_samplingSite_location_longitude\",\n", | |
"]\n", | |
"\n", | |
"fields_full = ['id', 'authorizedBy', 'compliesWith', 'producedBy_samplingSite_location_longitude', \n", | |
" 'producedBy_samplingSite_location_latitude', 'relatedResource_isb_core_id', \n", | |
" 'curation_responsibility', 'curation_location', 'curation_accessContraints', \n", | |
" 'curation_description', 'curation_label', 'samplingPurpose', 'registrant', \n", | |
" 'producedBy_samplingSite_placeName', 'producedBy_samplingSite_location_elevationInMeters', \n", | |
" 'producedBy_samplingSite_label', 'producedBy_samplingSite_description', \n", | |
" 'producedBy_resultTime', 'producedBy_responsibility', 'producedBy_hasFeatureOfInterest', \n", | |
" 'producedBy_description', 'producedBy_label', 'producedBy_isb_core_id', \n", | |
" 'informalClassification', 'keywords', 'hasSpecimenCategory', 'hasMaterialCategory', \n", | |
" 'hasContextCategory', 'description', 'label', 'source']\n", | |
"\n", | |
" \n", | |
"def stream_records(q:str, dest:str, fields:list[str]=fields_long): \n", | |
" params = {\n", | |
" \"q\":q,\n", | |
" \"sort\":\"id asc\",\n", | |
" \"fl\": \",\".join(fields),\n", | |
" \"wt\":\"json\"\n", | |
" }\n", | |
" url = \"http://localhost:8983/solr/isb_core_records/export\"\n", | |
" url = f\"{url}?{urllib.parse.urlencode(params)}\"\n", | |
" #print(url)\n", | |
" src = urllib.request.urlopen(url)\n", | |
" docs = ijson.items(src, \"response.docs.item\", use_float=True)\n", | |
" counter = 0\n", | |
" with open(dest, \"w\") as dst:\n", | |
" for doc in docs:\n", | |
" json.dump(doc, dst)\n", | |
" dst.write(\"\\n\")\n", | |
" if counter % 10000 == 0:\n", | |
" print(f\"{int(counter/1000):03}k\", end=\" \")\n", | |
" counter += 1\n", | |
" return counter\n", | |
"\n", | |
"\n", | |
"def get_source(source:str):\n", | |
" source=source.upper()\n", | |
" q = f\"source:{source}\"\n", | |
" dest = f\"data/{source.lower()}.jsonl\"\n", | |
" t0 = time.time()\n", | |
" total = stream_records(q, dest, fields=fields_full)\n", | |
" t1 = time.time()\n", | |
" print(f\"{source}, {total} records, {t1-t0:.1f} seconds\")\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "0d6b9503-65af-4420-bdff-731132dc77e6", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"000k 010k 020k 030k 040k 050k 060k 070k 080k 090k 100k 110k 120k 130k 140k 150k 160k 170k 180k 190k 200k 210k 220k 230k 240k 250k 260k 270k 280k 290k 300k 310k 320k 330k 340k 350k 360k 370k 380k 390k 400k 410k 420k 430k 440k 450k 460k 470k 480k 490k 500k 510k 520k 530k 540k 550k 560k GEOME, 560604 records, 50.1 seconds\n" | |
] | |
} | |
], | |
"source": [ | |
"get_source(\"GEOME\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "a6065a9d-f4cd-484a-8477-52b3151df32d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"000k 010k 020k 030k 040k 050k 060k 070k 080k 090k 100k 110k 120k 130k 140k 150k 160k 170k 180k 190k 200k 210k 220k 230k 240k 250k 260k 270k 280k 290k 300k 310k 320k 330k 340k 350k 360k 370k 380k 390k 400k 410k 420k 430k 440k 450k 460k 470k 480k 490k 500k 510k 520k 530k 540k 550k 560k 570k 580k 590k 600k 610k 620k 630k 640k 650k 660k 670k 680k 690k 700k 710k 720k 730k 740k 750k 760k 770k 780k 790k 800k 810k 820k 830k 840k 850k 860k 870k 880k OPENCONTEXT, 882128 records, 111.5 seconds\n" | |
] | |
} | |
], | |
"source": [ | |
"get_source(\"OPENCONTEXT\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "0176381b-8785-4743-b444-ffd73fc18d07", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"000k 010k 020k 030k 040k 050k 060k 070k 080k 090k 100k 110k 120k 130k 140k 150k 160k 170k 180k 190k 200k 210k SMITHSONIAN, 213411 records, 40.8 seconds\n" | |
] | |
} | |
], | |
"source": [ | |
"get_source(\"SMITHSONIAN\")" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib.parse | |
import urllib.request | |
import ijson | |
def stream_records(q: str) -> int:
    """Stream the records matching a Solr query and count them.

    Builds a request against the ``isb_core_records`` ``export`` endpoint on
    ``localhost:8983``, asking for a fixed list of fields sorted by ``id``,
    and feeds the HTTP response to ``ijson`` so documents are parsed
    incrementally rather than loading the whole response at once.

    The request URL is printed first, then a zero-padded progress counter is
    printed every 10,000 documents, and finally the total.

    Args:
        q: Solr query string, e.g. ``"source:GEOME"``.

    Returns:
        The number of documents read from the response.

    Raises:
        urllib.error.URLError: If the request to the Solr server fails.
    """
    fields = [
        "id",
        "source",
        "label",
        "hasMaterialCategory",
        "hasSpecimenCategory",
        "hasContextCategory",
        "keywords",
        "producedBy_label",
        "producedBy_description",
        "producedBy_responsibility",
        "producedBy_resultTime",
        "producedBy_samplingSite_label",
        "producedBy_samplingSite_location_latitude",
        "producedBy_samplingSite_location_longitude",
        "curation_responsibility",
        "relatedResource_isb_core_id",
    ]
    params = {
        "q": q,
        "sort": "id asc",
        "fl": ",".join(fields),
        "wt": "json",
    }
    base_url = "http://localhost:8983/solr/isb_core_records/export"
    url = f"{base_url}?{urllib.parse.urlencode(params)}"
    print(url)
    counter = 0
    # The context manager closes the HTTP response even if parsing fails
    # part-way through the stream.
    with urllib.request.urlopen(url) as src:
        # Each element of the "response.docs" array is yielded as it is parsed.
        for _doc in ijson.items(src, "response.docs.item"):
            if counter % 10000 == 0:
                print(f"{counter:07}")
            counter += 1
    print(f"Total = {counter}")
    return counter
if __name__ == "__main__":
    # Only hit the Solr server when run as a script, so that importing this
    # module does not trigger a network request as a side effect.
    q = "source:GEOME"
    stream_records(q)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment