Created
November 9, 2020 15:17
-
-
Save MikeTrizna/3492cfe218a418a765c3aec03925e2b8 to your computer and use it in GitHub Desktop.
Processing SI Open Access GitHub with Dask
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2020-11-09T15:12:39.585901Z", | |
"end_time": "2020-11-09T15:12:40.140110Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "import pandas as pd\nimport numpy as np", | |
"execution_count": 1, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2020-11-09T15:12:40.141571Z", | |
"end_time": "2020-11-09T15:12:40.435874Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "import json\nimport os\n\nimport dask.bag as db\nfrom dask.distributed import Client", | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2020-11-09T15:12:40.438000Z", | |
"end_time": "2020-11-09T15:12:42.435853Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "client = Client(n_workers=8, threads_per_worker=4)\nclient", | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 3, | |
"data": { | |
"text/plain": "<Client: 'tcp://127.0.0.1:59885' processes=8 threads=32, memory=17.18 GB>", | |
"text/html": "<table style=\"border: 2px solid white;\">\n<tr>\n<td style=\"vertical-align: top; border: 0px solid white\">\n<h3 style=\"text-align: left;\">Client</h3>\n<ul style=\"text-align: left; list-style: none; margin: 0; padding: 0;\">\n <li><b>Scheduler: </b>tcp://127.0.0.1:59885</li>\n <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a></li>\n</ul>\n</td>\n<td style=\"vertical-align: top; border: 0px solid white\">\n<h3 style=\"text-align: left;\">Cluster</h3>\n<ul style=\"text-align: left; list-style:none; margin: 0; padding: 0;\">\n <li><b>Workers: </b>8</li>\n <li><b>Cores: </b>32</li>\n <li><b>Memory: </b>17.18 GB</li>\n</ul>\n</td>\n</tr>\n</table>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2020-11-09T15:12:42.438170Z", | |
"end_time": "2020-11-09T15:12:42.515390Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "b = db.read_text('/Users/triznam/Downloads/OpenAccess/metadata/objects/NMAH/*.txt.bz2',\n compression='bz2').map(json.loads)", | |
"execution_count": 4, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2020-11-09T15:12:42.516770Z", | |
"end_time": "2020-11-09T15:12:42.566818Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "example = b.take(1)\nprint(json.dumps(example, indent=2))", | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "[\n {\n \"id\": \"edanmdm-nmah_1632369\",\n \"version\": \"\",\n \"unitCode\": \"NMAH\",\n \"linkedId\": \"0\",\n \"type\": \"edanmdm\",\n \"content\": {\n \"descriptiveNonRepeating\": {\n \"record_ID\": \"nmah_1632369\",\n \"unit_code\": \"NMAH\",\n \"title_sort\": \"CERTIFIED PROOF\",\n \"guid\": \"http://n2t.net/ark:/65665/ng49ca746b1-2f39-704b-e053-15f76fa0b4fa\",\n \"title\": {\n \"label\": \"Object Name\",\n \"content\": \"certified proof\"\n },\n \"metadata_usage\": {\n \"access\": \"CC0\"\n },\n \"data_source\": \"National Museum of American History\"\n },\n \"indexedStructured\": {\n \"object_type\": [\n \"Exchange Medium\",\n \"certified proof\"\n ],\n \"name\": [\n \"U.S. Department of the Treasury\"\n ],\n \"online_media_type\": [\n \"Images\"\n ]\n },\n \"freetext\": {\n \"setName\": [\n {\n \"label\": \"See more items in\",\n \"content\": \"Work and Industry: National Numismatic Collection\"\n },\n {\n \"label\": \"See more items in\",\n \"content\": \"Coins, Currency and Medals\"\n }\n ],\n \"identifier\": [\n {\n \"label\": \"ID Number\",\n \"content\": \"NU.297219.167009\"\n },\n {\n \"label\": \"catalog number\",\n \"content\": \"297219.167009\"\n },\n {\n \"label\": \"accession number\",\n \"content\": \"297219\"\n }\n ],\n \"notes\": [\n {\n \"label\": \"Crowdsourcing\",\n \"content\": \"Available for transcription\"\n }\n ],\n \"name\": [\n {\n \"label\": \"issuing authority\",\n \"content\": \"U.S. 
Department of the Treasury\"\n }\n ],\n \"dataSource\": [\n {\n \"label\": \"Data Source\",\n \"content\": \"National Museum of American History\"\n }\n ],\n \"objectType\": [\n {\n \"label\": \"Object Name\",\n \"content\": \"certified proof\"\n },\n {\n \"label\": \"Object Type\",\n \"content\": \"Exchange Medium\"\n }\n ]\n }\n },\n \"url\": \"edanmdm:nmah_1632369\",\n \"hash\": \"0084dd244488e3bf816709522029745bd21054dc\",\n \"docSignature\": \"6885e817f6acb889644caea91812aa4f62fe61df_59e9ea4b8270170a968ded2fb48b16fc\",\n \"timestamp\": 1594109633,\n \"lastTimeUpdated\": 1594109587,\n \"title\": \"certified proof\"\n }\n]\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2020-11-09T15:12:42.568039Z", | |
"end_time": "2020-11-09T15:13:12.789164Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "b.count().compute()", | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 6, | |
"data": { | |
"text/plain": "1314921" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2020-11-09T15:13:12.792483Z", | |
"end_time": "2020-11-09T15:13:14.175226Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "s3_b = db.read_text('s3://smithsonian-open-access/metadata/edan/nmah/*.txt',\n storage_options={'anon': True}).map(json.loads)", | |
"execution_count": 7, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2020-11-09T15:13:14.178147Z", | |
"end_time": "2020-11-09T15:16:37.933001Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "s3_b.count().compute()", | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 8, | |
"data": { | |
"text/plain": "1315047" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "", | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3", | |
"language": "python" | |
}, | |
"language_info": { | |
"name": "python", | |
"version": "3.8.4", | |
"mimetype": "text/x-python", | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"pygments_lexer": "ipython3", | |
"nbconvert_exporter": "python", | |
"file_extension": ".py" | |
}, | |
"gist": { | |
"id": "", | |
"data": { | |
"description": "Processing SI Open Access GitHub with Dask", | |
"public": true | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment