Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save MikeTrizna/3492cfe218a418a765c3aec03925e2b8 to your computer and use it in GitHub Desktop.
Save MikeTrizna/3492cfe218a418a765c3aec03925e2b8 to your computer and use it in GitHub Desktop.
Processing SI Open Access GitHub with Dask
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {
"ExecuteTime": {
"start_time": "2020-11-09T15:12:39.585901Z",
"end_time": "2020-11-09T15:12:40.140110Z"
},
"trusted": true
},
"cell_type": "code",
"source": "import pandas as pd\nimport numpy as np",
"execution_count": 1,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2020-11-09T15:12:40.141571Z",
"end_time": "2020-11-09T15:12:40.435874Z"
},
"trusted": true
},
"cell_type": "code",
"source": "from dask.distributed import Client\nimport dask.bag as db\nimport json",
"execution_count": 2,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2020-11-09T15:12:40.438000Z",
"end_time": "2020-11-09T15:12:42.435853Z"
},
"trusted": true
},
"cell_type": "code",
"source": "client = Client(n_workers=8, threads_per_worker=4)\nclient",
"execution_count": 3,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 3,
"data": {
"text/plain": "<Client: 'tcp://127.0.0.1:59885' processes=8 threads=32, memory=17.18 GB>",
"text/html": "<table style=\"border: 2px solid white;\">\n<tr>\n<td style=\"vertical-align: top; border: 0px solid white\">\n<h3 style=\"text-align: left;\">Client</h3>\n<ul style=\"text-align: left; list-style: none; margin: 0; padding: 0;\">\n <li><b>Scheduler: </b>tcp://127.0.0.1:59885</li>\n <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a></li>\n</ul>\n</td>\n<td style=\"vertical-align: top; border: 0px solid white\">\n<h3 style=\"text-align: left;\">Cluster</h3>\n<ul style=\"text-align: left; list-style:none; margin: 0; padding: 0;\">\n <li><b>Workers: </b>8</li>\n <li><b>Cores: </b>32</li>\n <li><b>Memory: </b>17.18 GB</li>\n</ul>\n</td>\n</tr>\n</table>"
},
"metadata": {}
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2020-11-09T15:12:42.438170Z",
"end_time": "2020-11-09T15:12:42.515390Z"
},
"trusted": true
},
"cell_type": "code",
"source": "b = db.read_text('/Users/triznam/Downloads/OpenAccess/metadata/objects/NMAH/*.txt.bz2',\n compression='bz2').map(json.loads)",
"execution_count": 4,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2020-11-09T15:12:42.516770Z",
"end_time": "2020-11-09T15:12:42.566818Z"
},
"trusted": true
},
"cell_type": "code",
"source": "example = b.take(1)\nprint(json.dumps(example, indent=2))",
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"text": "[\n {\n \"id\": \"edanmdm-nmah_1632369\",\n \"version\": \"\",\n \"unitCode\": \"NMAH\",\n \"linkedId\": \"0\",\n \"type\": \"edanmdm\",\n \"content\": {\n \"descriptiveNonRepeating\": {\n \"record_ID\": \"nmah_1632369\",\n \"unit_code\": \"NMAH\",\n \"title_sort\": \"CERTIFIED PROOF\",\n \"guid\": \"http://n2t.net/ark:/65665/ng49ca746b1-2f39-704b-e053-15f76fa0b4fa\",\n \"title\": {\n \"label\": \"Object Name\",\n \"content\": \"certified proof\"\n },\n \"metadata_usage\": {\n \"access\": \"CC0\"\n },\n \"data_source\": \"National Museum of American History\"\n },\n \"indexedStructured\": {\n \"object_type\": [\n \"Exchange Medium\",\n \"certified proof\"\n ],\n \"name\": [\n \"U.S. Department of the Treasury\"\n ],\n \"online_media_type\": [\n \"Images\"\n ]\n },\n \"freetext\": {\n \"setName\": [\n {\n \"label\": \"See more items in\",\n \"content\": \"Work and Industry: National Numismatic Collection\"\n },\n {\n \"label\": \"See more items in\",\n \"content\": \"Coins, Currency and Medals\"\n }\n ],\n \"identifier\": [\n {\n \"label\": \"ID Number\",\n \"content\": \"NU.297219.167009\"\n },\n {\n \"label\": \"catalog number\",\n \"content\": \"297219.167009\"\n },\n {\n \"label\": \"accession number\",\n \"content\": \"297219\"\n }\n ],\n \"notes\": [\n {\n \"label\": \"Crowdsourcing\",\n \"content\": \"Available for transcription\"\n }\n ],\n \"name\": [\n {\n \"label\": \"issuing authority\",\n \"content\": \"U.S. Department of the Treasury\"\n }\n ],\n \"dataSource\": [\n {\n \"label\": \"Data Source\",\n \"content\": \"National Museum of American History\"\n }\n ],\n \"objectType\": [\n {\n \"label\": \"Object Name\",\n \"content\": \"certified proof\"\n },\n {\n \"label\": \"Object Type\",\n \"content\": \"Exchange Medium\"\n }\n ]\n }\n },\n \"url\": \"edanmdm:nmah_1632369\",\n \"hash\": \"0084dd244488e3bf816709522029745bd21054dc\",\n \"docSignature\": \"6885e817f6acb889644caea91812aa4f62fe61df_59e9ea4b8270170a968ded2fb48b16fc\",\n \"timestamp\": 1594109633,\n \"lastTimeUpdated\": 1594109587,\n \"title\": \"certified proof\"\n }\n]\n",
"name": "stdout"
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2020-11-09T15:12:42.568039Z",
"end_time": "2020-11-09T15:13:12.789164Z"
},
"trusted": true
},
"cell_type": "code",
"source": "b.count().compute()",
"execution_count": 6,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 6,
"data": {
"text/plain": "1314921"
},
"metadata": {}
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2020-11-09T15:13:12.792483Z",
"end_time": "2020-11-09T15:13:14.175226Z"
},
"trusted": true
},
"cell_type": "code",
"source": "s3_b = db.read_text('s3://smithsonian-open-access/metadata/edan/nmah/*.txt',\n storage_options={'anon': True}).map(json.loads)",
"execution_count": 7,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2020-11-09T15:13:14.178147Z",
"end_time": "2020-11-09T15:16:37.933001Z"
},
"trusted": true
},
"cell_type": "code",
"source": "s3_b.count().compute()",
"execution_count": 8,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 8,
"data": {
"text/plain": "1315047"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.8.4",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"gist": {
"id": "",
"data": {
"description": "Processing SI Open Access GitHub with Dask",
"public": true
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment