Skip to content

Instantly share code, notes, and snippets.

@MikeTrizna
Created February 28, 2020 17:11
Show Gist options
  • Save MikeTrizna/82aed87971cf6de5b5036fcd1c9e1ca9 to your computer and use it in GitHub Desktop.
Save MikeTrizna/82aed87971cf6de5b5036fcd1c9e1ca9 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table style=\"border: 2px solid white;\">\n",
"<tr>\n",
"<td style=\"vertical-align: top; border: 0px solid white\">\n",
"<h3>Client</h3>\n",
"<ul>\n",
" <li><b>Scheduler: </b>tcp://127.0.0.1:43274\n",
" <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a>\n",
"</ul>\n",
"</td>\n",
"<td style=\"vertical-align: top; border: 0px solid white\">\n",
"<h3>Cluster</h3>\n",
"<ul>\n",
" <li><b>Workers: </b>8</li>\n",
" <li><b>Cores: </b>8</li>\n",
" <li><b>Memory: </b>404.34 GB</li>\n",
"</ul>\n",
"</td>\n",
"</tr>\n",
"</table>"
],
"text/plain": [
"<Client: scheduler='tcp://127.0.0.1:43274' processes=8 cores=8>"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from dask.distributed import Client, progress\n",
"# Start a Dask distributed client with 8 workers of one thread each.\n",
"client = Client(n_workers=8, threads_per_worker=1)\n",
"# Display the client summary (scheduler address, dashboard link, workers).\n",
"client"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import bz2"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import dask.bag as db"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"#All\n",
"# Build a bag over the bz2-compressed text files of every unit directory and\n",
"# pass each element through json.loads.\n",
"b = db.read_text('metadata/objects/*/*.txt.bz2', \n",
" compression='bz2').map(json.loads)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"#Just NASM(141) and FSG(3133)\n",
"# Optional small sample for quick experiments. It is bound to its own name so that\n",
"# running the notebook top to bottom keeps `b` as the full collection that the\n",
"# saved outputs below were computed from; assign `b = b_sample` by hand to try\n",
"# the later cells on the sample instead.\n",
"b_sample = db.read_text(\n",
"    ['metadata/objects/NASM/*.txt.bz2', 'metadata/objects/FSG/*.txt.bz2'],\n",
"    compression='bz2',\n",
").map(json.loads)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dask.bag<loads-2..., npartitions=6262>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the bag's repr (graph name and partition count).\n",
"b"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"({'id': 'edanmdm-siris_arc_305285',\n",
" 'version': '',\n",
" 'unitCode': 'ACAH',\n",
" 'linkedId': '0',\n",
" 'type': 'edanmdm',\n",
" 'content': {'descriptiveNonRepeating': {'record_ID': 'siris_arc_305285',\n",
" 'unit_code': 'ACAH',\n",
" 'title_sort': 'JORDAN MARSH SPACE SCENE COLOR DRAWING',\n",
" 'title': {'label': 'Title',\n",
" 'content': '[Jordan Marsh -- Space Scene] color drawing]'},\n",
" 'metadata_usage': {'access': 'CC0'},\n",
" 'data_source': 'Archives Center, National Museum of American History'},\n",
" 'indexedStructured': {'date': ['2000s', '1930s'],\n",
" 'object_type': ['Archival materials', 'Drawings'],\n",
" 'name': [{'type': 'corporate_main', 'content': 'Vaughn Parades, Inc'},\n",
" {'type': 'corporate_main', 'content': 'Orange Bowl Committee'}],\n",
" 'topic': ['Science fiction', 'Space flight', 'Parade floats'],\n",
" 'usage_flag': ['SuppressFromCSC']},\n",
" 'freetext': {'date': [{'label': 'Date', 'content': '1932'},\n",
" {'label': 'Date', 'content': '2002'}],\n",
" 'setName': [{'label': 'See more items in',\n",
" 'content': 'Orange Bowl Collection'}],\n",
" 'identifier': [{'label': 'Local number',\n",
" 'content': 'AC1191-0000179 (AC Scan No.)'}],\n",
" 'notes': [{'label': 'Summary',\n",
" 'content': 'Design for a parade float. Drawing on brown paper showing rocket ship, etc.'},\n",
" {'label': 'Cite as',\n",
" 'content': 'Orange Bowl Collection, Archives Center, National Museum of American History'},\n",
" {'label': 'Repository Loc.',\n",
" 'content': 'Smithsonian Institution, National Museum of American History : Archives Center. P.O. Box 37012, MRC 6 1, Constitution Ave. between 12th and 14th Streets, N.W., Washington, D.C. 20013-7012. Call 202-633-3270 for appointment. Fax 202-786-2453'}],\n",
" 'name': [{'label': 'designer', 'content': 'Vaughn Parades, Inc'},\n",
" {'label': 'donor', 'content': 'Orange Bowl Committee'}],\n",
" 'topic': [{'label': 'Topic', 'content': 'Parade floats'},\n",
" {'label': 'Topic', 'content': 'Space flight'},\n",
" {'label': 'Topic', 'content': 'Science fiction'}],\n",
" 'physicalDescription': [{'label': 'Physical description',\n",
" 'content': 'Graphite on paper'},\n",
" {'label': 'Physical description', 'content': '1 item, 2.2\" x 2.7\"'}],\n",
" 'dataSource': [{'label': 'Data Source',\n",
" 'content': 'Archives Center, National Museum of American History'}],\n",
" 'objectRights': [{'label': 'Restrictions & Rights',\n",
" 'content': 'Fees for commercial use'}],\n",
" 'objectType': [{'label': 'Type', 'content': 'Drawings'}]}},\n",
" 'url': 'edanmdm:siris_arc_305285',\n",
" 'hash': '3f4b2b8ae10d26163b0737bd44203161cac5559d',\n",
" 'docSignature': '23c2748afb7e0d447e018d73fd2636a0a4f671fc_736843189add0272712bc99dbf373611',\n",
" 'timestamp': 1579857475,\n",
" 'lastTimeUpdated': 1579857464,\n",
" 'title': '[Jordan Marsh -- Space Scene] color drawing]'},\n",
" {'id': 'edanmdm-siris_arc_305286',\n",
" 'version': '',\n",
" 'unitCode': 'ACAH',\n",
" 'linkedId': '0',\n",
" 'type': 'edanmdm',\n",
" 'content': {'descriptiveNonRepeating': {'record_ID': 'siris_arc_305286',\n",
" 'unit_code': 'ACAH',\n",
" 'title_sort': 'POWER FOR PEACE FOR 180 YEARS TEXT IN IMAGE DRAWING',\n",
" 'title': {'label': 'Title',\n",
" 'content': 'Power for Peace / for 180 Years [text in image] [drawing]'},\n",
" 'metadata_usage': {'access': 'CC0'},\n",
" 'data_source': 'Archives Center, National Museum of American History'},\n",
" 'indexedStructured': {'date': ['1950s'],\n",
" 'object_type': ['Archival materials', 'Drawings'],\n",
" 'name': [{'type': 'corporate_main', 'content': 'Vaughn Parades, Inc'},\n",
" {'type': 'corporate_main', 'content': 'Orange Bowl Committee'},\n",
" {'type': 'corporate_subj', 'content': 'United States'}],\n",
" 'topic': ['Peace', 'Mythology', 'Space flight', 'Chariots'],\n",
" 'usage_flag': ['SuppressFromCSC']},\n",
" 'freetext': {'date': [{'label': 'Date', 'content': '1950'}],\n",
" 'setName': [{'label': 'See more items in',\n",
" 'content': 'Orange Bowl Collection'}],\n",
" 'identifier': [{'label': 'Local number',\n",
" 'content': 'AC1191-0000180.tif (AC Scan No.)'}],\n",
" 'notes': [{'label': 'Summary',\n",
" 'content': 'Design includes a winged woman driving a horse-drawn chariot, a rocket ship, a dove of peace, etc. With additional text: \"Your / United States / Army\" and \"Sponsored / by / Coca-Cola / Bottling / Co.\"'},\n",
" {'label': 'Cite as',\n",
" 'content': 'Orange Bowl Collection, Archives Center, National Museum of American History'},\n",
" {'label': 'Repository Loc.',\n",
" 'content': 'Smithsonian Institution, National Museum of American History : Archives Center. P.O. Box 37012, MRC 6 1, Constitution Ave. between 12th and 14th Streets, N.W., Washington, D.C. 20013-7012. Call 202-633-3270 for appointment. Fax 202-786-2453'}],\n",
" 'name': [{'label': 'designer', 'content': 'Vaughn Parades, Inc'},\n",
" {'label': 'donor', 'content': 'Orange Bowl Committee'},\n",
" {'label': 'Uniform title', 'content': 'Coca-Cola (Trademark)'},\n",
" {'label': 'Subject', 'content': 'United States Army'}],\n",
" 'topic': [{'label': 'Topic', 'content': 'Space flight'},\n",
" {'label': 'Topic', 'content': 'Peace'},\n",
" {'label': 'Topic', 'content': 'Mythology'},\n",
" {'label': 'Topic', 'content': 'Chariots'}],\n",
" 'physicalDescription': [{'label': 'Physical description',\n",
" 'content': 'Graphite on paper'},\n",
" {'label': 'Physical description', 'content': '1 item, 3.5\" x 2.0\"'}],\n",
" 'dataSource': [{'label': 'Data Source',\n",
" 'content': 'Archives Center, National Museum of American History'}],\n",
" 'objectRights': [{'label': 'Restrictions & Rights',\n",
" 'content': 'Fees for commercial use'}],\n",
" 'objectType': [{'label': 'Type', 'content': 'Drawings'}]}},\n",
" 'url': 'edanmdm:siris_arc_305286',\n",
" 'hash': '623576939e5cf1bf02ffce605ec2b63d029c7c77',\n",
" 'docSignature': 'cedab45cf8ca953ab2b6d505a6e74a18400fb02d_89d5dbe103bfd95423d0d51ca2a326de',\n",
" 'timestamp': 1579857475,\n",
" 'lastTimeUpdated': 1579857464,\n",
" 'title': 'Power for Peace / for 180 Years [text in image] [drawing]'})"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Peek at the first two parsed records. npartitions=-1 lets take() draw from all\n",
"# partitions instead of only the first one (the default).\n",
"b.take(2,npartitions=-1)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('edanmdm-siris_arc_305285', 'edanmdm-siris_arc_305286')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Same two-record peek, reduced to each record's 'id' field.\n",
"b.map(lambda record: record['id']).take(2,npartitions=-1)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 42 s, sys: 3.11 s, total: 45.1 s\n",
"Wall time: 2min 19s\n"
]
},
{
"data": {
"text/plain": [
"11251019"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"\n",
"# Count every record in the bag; %%time reports how long the pass takes.\n",
"b.count().compute()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('ACAH', 11), ('ACM', 249), ('CHNDM', 40196), ('FBR', 1517), ('FSA', 26), ('FSG', 3133), ('HAC', 252), ('HMSG', 502), ('HSFA', 77), ('NAA', 8), ('NASM', 141), ('NMAAHC', 2389), ('NMAH', 1292751), ('NMAfA', 136), ('NMNHANTHRO', 479774), ('NMNHBIRDS', 554194), ('NMNHBOTANY', 3614685), ('NMNHEDUCATION', 5799), ('NMNHENTO', 520812), ('NMNHFISHES', 473581), ('NMNHHERPS', 581303), ('NMNHINV', 1926719), ('NMNHMAMMALS', 616251), ('NMNHMINSCI', 443706), ('NMNHPALEO', 626183), ('NPG', 8309), ('NPM', 2547), ('SAAM', 12547), ('SI', 11), ('SIA', 29416), ('SIL', 13794)]\n",
"CPU times: user 44.6 s, sys: 3.84 s, total: 48.4 s\n",
"Wall time: 2min 23s\n"
]
}
],
"source": [
"%%time\n",
"# Count records per unitCode with foldby: `incr` is the binop that bumps a key's\n",
"# counter for each record, and `add` combines the partial counts for the same key.\n",
"# Comparatively fast, and produces the same (unitCode, count) pairs as the\n",
"# groupby cell below.\n",
"from operator import add\n",
"\n",
"\n",
"def incr(tot, _):\n",
"    \"\"\"Return the running count `tot` plus one; the folded record is ignored.\"\"\"\n",
"    return tot + 1\n",
"\n",
"\n",
"result = b.foldby(\n",
"    key='unitCode',\n",
"    binop=incr,\n",
"    initial=0,\n",
"    combine=add,\n",
"    combine_initial=0,\n",
").compute()\n",
"print(sorted(result))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"distributed.comm.tcp - WARNING - Closing dangling stream in <TCP local=tcp://127.0.0.1:54597 remote=tcp://127.0.0.1:43274>\n",
"distributed.comm.tcp - WARNING - Closing dangling stream in <TCP local=tcp://127.0.0.1:54599 remote=tcp://127.0.0.1:43274>\n",
"distributed.comm.tcp - WARNING - Closing dangling stream in <TCP local=tcp://127.0.0.1:54601 remote=tcp://127.0.0.1:43274>\n",
"distributed.comm.tcp - WARNING - Closing dangling stream in <TCP local=tcp://127.0.0.1:54603 remote=tcp://127.0.0.1:43274>\n",
"distributed.comm.tcp - WARNING - Closing dangling stream in <TCP local=tcp://127.0.0.1:54605 remote=tcp://127.0.0.1:43274>\n",
"distributed.comm.tcp - WARNING - Closing dangling stream in <TCP local=tcp://127.0.0.1:54607 remote=tcp://127.0.0.1:43274>\n"
]
}
],
"source": [
"%%time\n",
"\n",
"# Bag.groupby shuffles every element it is given between workers. The saved output\n",
"# of this cell, run on the full record dicts, shows only connection warnings and no\n",
"# result, so reduce each record to its unitCode first: only short strings are\n",
"# shuffled, and the resulting (unitCode, count) pairs are the same.\n",
"(\n",
"    b.map(lambda item: item['unitCode'])\n",
"    .groupby(lambda code: code)\n",
"    .starmap(lambda k, v: (k, len(v)))\n",
"    .compute()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment