Created
February 28, 2020 17:11
-
-
Save MikeTrizna/82aed87971cf6de5b5036fcd1c9e1ca9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<table style=\"border: 2px solid white;\">\n", | |
"<tr>\n", | |
"<td style=\"vertical-align: top; border: 0px solid white\">\n", | |
"<h3>Client</h3>\n", | |
"<ul>\n", | |
" <li><b>Scheduler: </b>tcp://127.0.0.1:43274\n", | |
" <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a>\n", | |
"</ul>\n", | |
"</td>\n", | |
"<td style=\"vertical-align: top; border: 0px solid white\">\n", | |
"<h3>Cluster</h3>\n", | |
"<ul>\n", | |
" <li><b>Workers: </b>8</li>\n", | |
" <li><b>Cores: </b>8</li>\n", | |
" <li><b>Memory: </b>404.34 GB</li>\n", | |
"</ul>\n", | |
"</td>\n", | |
"</tr>\n", | |
"</table>" | |
], | |
"text/plain": [ | |
"<Client: scheduler='tcp://127.0.0.1:43274' processes=8 cores=8>" | |
] | |
}, | |
"execution_count": 1, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from dask.distributed import Client, progress\n", | |
"# Start a local cluster of eight single-threaded workers; the bare name on the\n", | |
"# last line renders the client summary (scheduler address, dashboard link).\n", | |
"worker_layout = dict(n_workers=8, threads_per_worker=1)\n", | |
"client = Client(**worker_layout)\n", | |
"client" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import json\n", | |
"import bz2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import dask.bag as db" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# All units: read every bz2-compressed text file under metadata/objects,\n", | |
"# then parse each line as one JSON record.\n", | |
"raw_lines = db.read_text('metadata/objects/*/*.txt.bz2', compression='bz2')\n", | |
"b = raw_lines.map(json.loads)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Just NASM(141) and FSG(3133): a small subset for quick experiments.\n", | |
"# It is bound to its own name so that running the notebook top to bottom does not\n", | |
"# replace the full bag `b`, which the cells below and their saved outputs rely on.\n", | |
"# To run the later cells on the subset instead, execute `b = b_nasm_fsg` afterwards.\n", | |
"b_nasm_fsg = db.read_text(['metadata/objects/NASM/*.txt.bz2','metadata/objects/FSG/*.txt.bz2'], \n", | |
" compression='bz2').map(json.loads)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"dask.bag<loads-2..., npartitions=6262>" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Show the bag's repr: its task-name prefix and its number of partitions.\n", | |
"b" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"({'id': 'edanmdm-siris_arc_305285',\n", | |
" 'version': '',\n", | |
" 'unitCode': 'ACAH',\n", | |
" 'linkedId': '0',\n", | |
" 'type': 'edanmdm',\n", | |
" 'content': {'descriptiveNonRepeating': {'record_ID': 'siris_arc_305285',\n", | |
" 'unit_code': 'ACAH',\n", | |
" 'title_sort': 'JORDAN MARSH SPACE SCENE COLOR DRAWING',\n", | |
" 'title': {'label': 'Title',\n", | |
" 'content': '[Jordan Marsh -- Space Scene] color drawing]'},\n", | |
" 'metadata_usage': {'access': 'CC0'},\n", | |
" 'data_source': 'Archives Center, National Museum of American History'},\n", | |
" 'indexedStructured': {'date': ['2000s', '1930s'],\n", | |
" 'object_type': ['Archival materials', 'Drawings'],\n", | |
" 'name': [{'type': 'corporate_main', 'content': 'Vaughn Parades, Inc'},\n", | |
" {'type': 'corporate_main', 'content': 'Orange Bowl Committee'}],\n", | |
" 'topic': ['Science fiction', 'Space flight', 'Parade floats'],\n", | |
" 'usage_flag': ['SuppressFromCSC']},\n", | |
" 'freetext': {'date': [{'label': 'Date', 'content': '1932'},\n", | |
" {'label': 'Date', 'content': '2002'}],\n", | |
" 'setName': [{'label': 'See more items in',\n", | |
" 'content': 'Orange Bowl Collection'}],\n", | |
" 'identifier': [{'label': 'Local number',\n", | |
" 'content': 'AC1191-0000179 (AC Scan No.)'}],\n", | |
" 'notes': [{'label': 'Summary',\n", | |
" 'content': 'Design for a parade float. Drawing on brown paper showing rocket ship, etc.'},\n", | |
" {'label': 'Cite as',\n", | |
" 'content': 'Orange Bowl Collection, Archives Center, National Museum of American History'},\n", | |
" {'label': 'Repository Loc.',\n", | |
" 'content': 'Smithsonian Institution, National Museum of American History : Archives Center. P.O. Box 37012, MRC 6 1, Constitution Ave. between 12th and 14th Streets, N.W., Washington, D.C. 20013-7012. Call 202-633-3270 for appointment. Fax 202-786-2453'}],\n", | |
" 'name': [{'label': 'designer', 'content': 'Vaughn Parades, Inc'},\n", | |
" {'label': 'donor', 'content': 'Orange Bowl Committee'}],\n", | |
" 'topic': [{'label': 'Topic', 'content': 'Parade floats'},\n", | |
" {'label': 'Topic', 'content': 'Space flight'},\n", | |
" {'label': 'Topic', 'content': 'Science fiction'}],\n", | |
" 'physicalDescription': [{'label': 'Physical description',\n", | |
" 'content': 'Graphite on paper'},\n", | |
" {'label': 'Physical description', 'content': '1 item, 2.2\" x 2.7\"'}],\n", | |
" 'dataSource': [{'label': 'Data Source',\n", | |
" 'content': 'Archives Center, National Museum of American History'}],\n", | |
" 'objectRights': [{'label': 'Restrictions & Rights',\n", | |
" 'content': 'Fees for commercial use'}],\n", | |
" 'objectType': [{'label': 'Type', 'content': 'Drawings'}]}},\n", | |
" 'url': 'edanmdm:siris_arc_305285',\n", | |
" 'hash': '3f4b2b8ae10d26163b0737bd44203161cac5559d',\n", | |
" 'docSignature': '23c2748afb7e0d447e018d73fd2636a0a4f671fc_736843189add0272712bc99dbf373611',\n", | |
" 'timestamp': 1579857475,\n", | |
" 'lastTimeUpdated': 1579857464,\n", | |
" 'title': '[Jordan Marsh -- Space Scene] color drawing]'},\n", | |
" {'id': 'edanmdm-siris_arc_305286',\n", | |
" 'version': '',\n", | |
" 'unitCode': 'ACAH',\n", | |
" 'linkedId': '0',\n", | |
" 'type': 'edanmdm',\n", | |
" 'content': {'descriptiveNonRepeating': {'record_ID': 'siris_arc_305286',\n", | |
" 'unit_code': 'ACAH',\n", | |
" 'title_sort': 'POWER FOR PEACE FOR 180 YEARS TEXT IN IMAGE DRAWING',\n", | |
" 'title': {'label': 'Title',\n", | |
" 'content': 'Power for Peace / for 180 Years [text in image] [drawing]'},\n", | |
" 'metadata_usage': {'access': 'CC0'},\n", | |
" 'data_source': 'Archives Center, National Museum of American History'},\n", | |
" 'indexedStructured': {'date': ['1950s'],\n", | |
" 'object_type': ['Archival materials', 'Drawings'],\n", | |
" 'name': [{'type': 'corporate_main', 'content': 'Vaughn Parades, Inc'},\n", | |
" {'type': 'corporate_main', 'content': 'Orange Bowl Committee'},\n", | |
" {'type': 'corporate_subj', 'content': 'United States'}],\n", | |
" 'topic': ['Peace', 'Mythology', 'Space flight', 'Chariots'],\n", | |
" 'usage_flag': ['SuppressFromCSC']},\n", | |
" 'freetext': {'date': [{'label': 'Date', 'content': '1950'}],\n", | |
" 'setName': [{'label': 'See more items in',\n", | |
" 'content': 'Orange Bowl Collection'}],\n", | |
" 'identifier': [{'label': 'Local number',\n", | |
" 'content': 'AC1191-0000180.tif (AC Scan No.)'}],\n", | |
" 'notes': [{'label': 'Summary',\n", | |
" 'content': 'Design includes a winged woman driving a horse-drawn chariot, a rocket ship, a dove of peace, etc. With additional text: \"Your / United States / Army\" and \"Sponsored / by / Coca-Cola / Bottling / Co.\"'},\n", | |
" {'label': 'Cite as',\n", | |
" 'content': 'Orange Bowl Collection, Archives Center, National Museum of American History'},\n", | |
" {'label': 'Repository Loc.',\n", | |
" 'content': 'Smithsonian Institution, National Museum of American History : Archives Center. P.O. Box 37012, MRC 6 1, Constitution Ave. between 12th and 14th Streets, N.W., Washington, D.C. 20013-7012. Call 202-633-3270 for appointment. Fax 202-786-2453'}],\n", | |
" 'name': [{'label': 'designer', 'content': 'Vaughn Parades, Inc'},\n", | |
" {'label': 'donor', 'content': 'Orange Bowl Committee'},\n", | |
" {'label': 'Uniform title', 'content': 'Coca-Cola (Trademark)'},\n", | |
" {'label': 'Subject', 'content': 'United States Army'}],\n", | |
" 'topic': [{'label': 'Topic', 'content': 'Space flight'},\n", | |
" {'label': 'Topic', 'content': 'Peace'},\n", | |
" {'label': 'Topic', 'content': 'Mythology'},\n", | |
" {'label': 'Topic', 'content': 'Chariots'}],\n", | |
" 'physicalDescription': [{'label': 'Physical description',\n", | |
" 'content': 'Graphite on paper'},\n", | |
" {'label': 'Physical description', 'content': '1 item, 3.5\" x 2.0\"'}],\n", | |
" 'dataSource': [{'label': 'Data Source',\n", | |
" 'content': 'Archives Center, National Museum of American History'}],\n", | |
" 'objectRights': [{'label': 'Restrictions & Rights',\n", | |
" 'content': 'Fees for commercial use'}],\n", | |
" 'objectType': [{'label': 'Type', 'content': 'Drawings'}]}},\n", | |
" 'url': 'edanmdm:siris_arc_305286',\n", | |
" 'hash': '623576939e5cf1bf02ffce605ec2b63d029c7c77',\n", | |
" 'docSignature': 'cedab45cf8ca953ab2b6d505a6e74a18400fb02d_89d5dbe103bfd95423d0d51ca2a326de',\n", | |
" 'timestamp': 1579857475,\n", | |
" 'lastTimeUpdated': 1579857464,\n", | |
" 'title': 'Power for Peace / for 180 Years [text in image] [drawing]'})" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Peek at two parsed records. npartitions=-1 lets take() draw from all partitions\n", | |
"# rather than only the first one.\n", | |
"b.take(k=2, npartitions=-1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"('edanmdm-siris_arc_305285', 'edanmdm-siris_arc_305286')" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Project each record down to its 'id' field, then look at the first two ids.\n", | |
"record_ids = b.map(lambda rec: rec['id'])\n", | |
"record_ids.take(2, npartitions=-1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 42 s, sys: 3.11 s, total: 45.1 s\n", | |
"Wall time: 2min 19s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"11251019" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"\n", | |
"# Total number of records in the bag; the work happens on compute().\n", | |
"record_total = b.count()\n", | |
"record_total.compute()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[('ACAH', 11), ('ACM', 249), ('CHNDM', 40196), ('FBR', 1517), ('FSA', 26), ('FSG', 3133), ('HAC', 252), ('HMSG', 502), ('HSFA', 77), ('NAA', 8), ('NASM', 141), ('NMAAHC', 2389), ('NMAH', 1292751), ('NMAfA', 136), ('NMNHANTHRO', 479774), ('NMNHBIRDS', 554194), ('NMNHBOTANY', 3614685), ('NMNHEDUCATION', 5799), ('NMNHENTO', 520812), ('NMNHFISHES', 473581), ('NMNHHERPS', 581303), ('NMNHINV', 1926719), ('NMNHMAMMALS', 616251), ('NMNHMINSCI', 443706), ('NMNHPALEO', 626183), ('NPG', 8309), ('NPM', 2547), ('SAAM', 12547), ('SI', 11), ('SIA', 29416), ('SIL', 13794)]\n", | |
"CPU times: user 44.6 s, sys: 3.84 s, total: 48.4 s\n", | |
"Wall time: 2min 23s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# Count records per unitCode with foldby. This is comparatively fast and is meant to\n", | |
"# give the same per-unit counts as the groupby-based cell that follows.\n", | |
"from operator import add\n", | |
"\n", | |
"def incr(tot, _):\n", | |
"    # Fold step: the record itself is ignored; only the running count advances.\n", | |
"    return 1 + tot\n", | |
"\n", | |
"# Within a partition incr counts records per key (starting from 0); add then merges\n", | |
"# the per-partition counts for each key.\n", | |
"per_unit_counts = b.foldby(key='unitCode', binop=incr, initial=0,\n", | |
"                           combine=add, combine_initial=0)\n", | |
"result = per_unit_counts.compute()\n", | |
"print(sorted(result))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"distributed.comm.tcp - WARNING - Closing dangling stream in <TCP local=tcp://127.0.0.1:54597 remote=tcp://127.0.0.1:43274>\n", | |
"distributed.comm.tcp - WARNING - Closing dangling stream in <TCP local=tcp://127.0.0.1:54599 remote=tcp://127.0.0.1:43274>\n", | |
"distributed.comm.tcp - WARNING - Closing dangling stream in <TCP local=tcp://127.0.0.1:54601 remote=tcp://127.0.0.1:43274>\n", | |
"distributed.comm.tcp - WARNING - Closing dangling stream in <TCP local=tcp://127.0.0.1:54603 remote=tcp://127.0.0.1:43274>\n", | |
"distributed.comm.tcp - WARNING - Closing dangling stream in <TCP local=tcp://127.0.0.1:54605 remote=tcp://127.0.0.1:43274>\n", | |
"distributed.comm.tcp - WARNING - Closing dangling stream in <TCP local=tcp://127.0.0.1:54607 remote=tcp://127.0.0.1:43274>\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"\n", | |
"# The same per-unit count written with groupby: collect each unit's records into a\n", | |
"# group, then reduce every (key, records) pair to (key, group size).\n", | |
"by_unit = b.groupby(lambda item: item['unitCode'])\n", | |
"group_sizes = by_unit.starmap(lambda k, v: (k, len(v)))\n", | |
"group_sizes.compute()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment