Skip to content

Instantly share code, notes, and snippets.

@MikeTrizna
Created January 26, 2021 20:24
Show Gist options
  • Save MikeTrizna/fdd1063924460a9d9d18c1adf05efe5e to your computer and use it in GitHub Desktop.
Save MikeTrizna/fdd1063924460a9d9d18c1adf05efe5e to your computer and use it in GitHub Desktop.
gmu_openaccess/Parsing GitHub Open Access.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-05T19:58:34.744453Z",
"start_time": "2021-01-05T19:58:34.305848Z"
},
"trusted": true
},
"cell_type": "code",
"source": "from dask.distributed import Client, progress",
"execution_count": 1,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-05T19:58:37.085418Z",
"start_time": "2021-01-05T19:58:34.746197Z"
},
"trusted": true
},
"cell_type": "code",
"source": "client = Client(n_workers = 8, threads_per_worker = 2)\nclient",
"execution_count": 2,
"outputs": [
{
"data": {
"text/html": "<table style=\"border: 2px solid white;\">\n<tr>\n<td style=\"vertical-align: top; border: 0px solid white\">\n<h3 style=\"text-align: left;\">Client</h3>\n<ul style=\"text-align: left; list-style: none; margin: 0; padding: 0;\">\n <li><b>Scheduler: </b>tcp://127.0.0.1:58469</li>\n <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a></li>\n</ul>\n</td>\n<td style=\"vertical-align: top; border: 0px solid white\">\n<h3 style=\"text-align: left;\">Cluster</h3>\n<ul style=\"text-align: left; list-style:none; margin: 0; padding: 0;\">\n <li><b>Workers: </b>8</li>\n <li><b>Cores: </b>16</li>\n <li><b>Memory: </b>17.18 GB</li>\n</ul>\n</td>\n</tr>\n</table>",
"text/plain": "<Client: 'tcp://127.0.0.1:58469' processes=8 threads=16, memory=17.18 GB>"
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-05T19:58:37.089761Z",
"start_time": "2021-01-05T19:58:37.087552Z"
},
"trusted": true
},
"cell_type": "code",
"source": "import json\nimport bz2",
"execution_count": 3,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-05T19:58:37.116554Z",
"start_time": "2021-01-05T19:58:37.091499Z"
},
"trusted": true
},
"cell_type": "code",
"source": "import dask.bag as db",
"execution_count": 4,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-05T19:58:37.205333Z",
"start_time": "2021-01-05T19:58:37.118390Z"
},
"trusted": true
},
"cell_type": "code",
"source": "b = db.read_text('~/Downloads/NMAH/*.txt.bz2', compression='bz2').map(json.loads)\nb",
"execution_count": 5,
"outputs": [
{
"data": {
"text/plain": "dask.bag<loads, npartitions=256>"
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-05T19:58:37.270503Z",
"start_time": "2021-01-05T19:58:37.206773Z"
},
"trusted": true
},
"cell_type": "code",
"source": "b.take(1)",
"execution_count": 6,
"outputs": [
{
"data": {
"text/plain": "({'id': 'edanmdm-nmah_1632369',\n 'version': '',\n 'unitCode': 'NMAH',\n 'linkedId': '0',\n 'type': 'edanmdm',\n 'content': {'descriptiveNonRepeating': {'record_ID': 'nmah_1632369',\n 'unit_code': 'NMAH',\n 'title_sort': 'CERTIFIED PROOF',\n 'guid': 'http://n2t.net/ark:/65665/ng49ca746b1-2f39-704b-e053-15f76fa0b4fa',\n 'title': {'label': 'Object Name', 'content': 'certified proof'},\n 'metadata_usage': {'access': 'CC0'},\n 'data_source': 'National Museum of American History'},\n 'indexedStructured': {'object_type': ['Exchange Medium', 'certified proof'],\n 'name': ['U.S. Department of the Treasury'],\n 'online_media_type': ['Images']},\n 'freetext': {'setName': [{'label': 'See more items in',\n 'content': 'Work and Industry: National Numismatic Collection'},\n {'label': 'See more items in', 'content': 'Coins, Currency and Medals'}],\n 'identifier': [{'label': 'ID Number', 'content': 'NU.297219.167009'},\n {'label': 'catalog number', 'content': '297219.167009'},\n {'label': 'accession number', 'content': '297219'}],\n 'notes': [{'label': 'Crowdsourcing',\n 'content': 'Available for transcription'}],\n 'name': [{'label': 'issuing authority',\n 'content': 'U.S. Department of the Treasury'}],\n 'dataSource': [{'label': 'Data Source',\n 'content': 'National Museum of American History'}],\n 'objectType': [{'label': 'Object Name', 'content': 'certified proof'},\n {'label': 'Object Type', 'content': 'Exchange Medium'}]}},\n 'url': 'edanmdm:nmah_1632369',\n 'hash': '0084dd244488e3bf816709522029745bd21054dc',\n 'docSignature': '6885e817f6acb889644caea91812aa4f62fe61df_59e9ea4b8270170a968ded2fb48b16fc',\n 'timestamp': 1594109633,\n 'lastTimeUpdated': 1594109587,\n 'title': 'certified proof'},)"
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-05T19:59:05.466129Z",
"start_time": "2021-01-05T19:58:37.273059Z"
},
"trusted": true
},
"cell_type": "code",
"source": "bongo = b.filter(lambda record: record['id'] == 'edanmdm-nmah_1289708').compute()\nbongo",
"execution_count": 7,
"outputs": [
{
"data": {
"text/plain": "[{'id': 'edanmdm-nmah_1289708',\n 'version': '',\n 'unitCode': 'NMAH',\n 'linkedId': '0',\n 'type': 'edanmdm',\n 'content': {'descriptiveNonRepeating': {'record_ID': 'nmah_1289708',\n 'unit_code': 'NMAH',\n 'title_sort': 'BONGO DRUMS USED BY DESI ARNAZ ON TV SHOW \"I LOVE LUCY\"',\n 'guid': 'http://n2t.net/ark:/65665/ng49ca746b2-9dcb-704b-e053-15f76fa0b4fa',\n 'title': {'label': 'Title',\n 'content': 'bongo drums used by Desi Arnaz on TV show \"I Love Lucy\"'},\n 'metadata_usage': {'access': 'CC0'},\n 'data_source': 'National Museum of American History'},\n 'indexedStructured': {'date': ['1950s'],\n 'object_type': ['drums, bongo, , used by Desi Arnaz on TV show \"I Love Lucy\"',\n 'drums, bongo',\n 'drums, used by Desi Arnaz on TV show \"I Love Lucy\"',\n 'drums, bongo,',\n 'Drums'],\n 'online_media_type': ['Images']},\n 'freetext': {'setName': [{'label': 'See more items in',\n 'content': 'Cultural and Community Life: Entertainment'}],\n 'date': [{'label': 'date made', 'content': '1952-57'}],\n 'identifier': [{'label': 'ID Number', 'content': '2004.0276.02'},\n {'label': 'accession number', 'content': '2004.0276'},\n {'label': 'catalog number', 'content': '2004.0276.02'}],\n 'notes': [{'label': 'Location', 'content': 'Currently not on view'}],\n 'creditLine': [{'label': 'Credit Line',\n 'content': 'Gift of David L. 
Cook'}],\n 'dataSource': [{'label': 'Data Source',\n 'content': 'National Museum of American History'}],\n 'objectType': [{'label': 'Object Name', 'content': 'Drums'},\n {'label': 'Object Name',\n 'content': 'drums, used by Desi Arnaz on TV show \"I Love Lucy\"'},\n {'label': 'Object Name',\n 'content': 'drums, bongo, , used by Desi Arnaz on TV show \"I Love Lucy\"'},\n {'label': 'Object Name', 'content': 'drums, bongo,'},\n {'label': 'Object Name', 'content': 'drums, bongo'}]}},\n 'url': 'edanmdm:nmah_1289708',\n 'hash': 'eb01e52b1fe6df9740de8e7b9195c1ea3797c618',\n 'docSignature': '49e2a303fbb06daba788003b89a0d47675ffd7e9_72bf4e892e5c304f2150c831257906b6',\n 'timestamp': 1594110245,\n 'lastTimeUpdated': 1594110231,\n 'title': 'bongo drums used by Desi Arnaz on TV show \"I Love Lucy\"'}]"
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-05T19:59:05.485322Z",
"start_time": "2021-01-05T19:59:05.479638Z"
},
"trusted": true
},
"cell_type": "code",
"source": "print(json.dumps(bongo, indent=2))",
"execution_count": 8,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "[\n {\n \"id\": \"edanmdm-nmah_1289708\",\n \"version\": \"\",\n \"unitCode\": \"NMAH\",\n \"linkedId\": \"0\",\n \"type\": \"edanmdm\",\n \"content\": {\n \"descriptiveNonRepeating\": {\n \"record_ID\": \"nmah_1289708\",\n \"unit_code\": \"NMAH\",\n \"title_sort\": \"BONGO DRUMS USED BY DESI ARNAZ ON TV SHOW \\\"I LOVE LUCY\\\"\",\n \"guid\": \"http://n2t.net/ark:/65665/ng49ca746b2-9dcb-704b-e053-15f76fa0b4fa\",\n \"title\": {\n \"label\": \"Title\",\n \"content\": \"bongo drums used by Desi Arnaz on TV show \\\"I Love Lucy\\\"\"\n },\n \"metadata_usage\": {\n \"access\": \"CC0\"\n },\n \"data_source\": \"National Museum of American History\"\n },\n \"indexedStructured\": {\n \"date\": [\n \"1950s\"\n ],\n \"object_type\": [\n \"drums, bongo, , used by Desi Arnaz on TV show \\\"I Love Lucy\\\"\",\n \"drums, bongo\",\n \"drums, used by Desi Arnaz on TV show \\\"I Love Lucy\\\"\",\n \"drums, bongo,\",\n \"Drums\"\n ],\n \"online_media_type\": [\n \"Images\"\n ]\n },\n \"freetext\": {\n \"setName\": [\n {\n \"label\": \"See more items in\",\n \"content\": \"Cultural and Community Life: Entertainment\"\n }\n ],\n \"date\": [\n {\n \"label\": \"date made\",\n \"content\": \"1952-57\"\n }\n ],\n \"identifier\": [\n {\n \"label\": \"ID Number\",\n \"content\": \"2004.0276.02\"\n },\n {\n \"label\": \"accession number\",\n \"content\": \"2004.0276\"\n },\n {\n \"label\": \"catalog number\",\n \"content\": \"2004.0276.02\"\n }\n ],\n \"notes\": [\n {\n \"label\": \"Location\",\n \"content\": \"Currently not on view\"\n }\n ],\n \"creditLine\": [\n {\n \"label\": \"Credit Line\",\n \"content\": \"Gift of David L. 
Cook\"\n }\n ],\n \"dataSource\": [\n {\n \"label\": \"Data Source\",\n \"content\": \"National Museum of American History\"\n }\n ],\n \"objectType\": [\n {\n \"label\": \"Object Name\",\n \"content\": \"Drums\"\n },\n {\n \"label\": \"Object Name\",\n \"content\": \"drums, used by Desi Arnaz on TV show \\\"I Love Lucy\\\"\"\n },\n {\n \"label\": \"Object Name\",\n \"content\": \"drums, bongo, , used by Desi Arnaz on TV show \\\"I Love Lucy\\\"\"\n },\n {\n \"label\": \"Object Name\",\n \"content\": \"drums, bongo,\"\n },\n {\n \"label\": \"Object Name\",\n \"content\": \"drums, bongo\"\n }\n ]\n }\n },\n \"url\": \"edanmdm:nmah_1289708\",\n \"hash\": \"eb01e52b1fe6df9740de8e7b9195c1ea3797c618\",\n \"docSignature\": \"49e2a303fbb06daba788003b89a0d47675ffd7e9_72bf4e892e5c304f2150c831257906b6\",\n \"timestamp\": 1594110245,\n \"lastTimeUpdated\": 1594110231,\n \"title\": \"bongo drums used by Desi Arnaz on TV show \\\"I Love Lucy\\\"\"\n }\n]\n"
}
]
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-05T19:59:05.490239Z",
"start_time": "2021-01-05T19:59:05.486932Z"
},
"trusted": true
},
"cell_type": "code",
"source": "def parse_record(record):\n    \"\"\"Summarise one record as its indexed date, title and unit code.\n\n    Reads record['content']['indexedStructured'], record['title'] and\n    record['unitCode']. The 'date' value falls back to the string\n    'NO DATE' when the indexed block has no 'date' key.\n    \"\"\"\n    indexed = record['content']['indexedStructured']\n    return {\n        'date': indexed.get('date', 'NO DATE'),\n        'title': record['title'],\n        'code': record['unitCode'],\n    }",
"execution_count": 9,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-05T19:59:05.496068Z",
"start_time": "2021-01-05T19:59:05.491977Z"
},
"trusted": true
},
"cell_type": "code",
"source": "def flatten(record):\n    \"\"\"Flatten one record into a single-level dict.\n\n    Copies record['id'], record['title'] and record['unitCode'] into the\n    keys 'object_id', 'title' and 'unit_code'. Each entry found under\n    record['content']['freetext']['date'] then adds one key named after the\n    entry's 'label' and holding the entry's 'content'. A repeated label, or\n    a label equal to one of the three fixed keys, overwrites the earlier\n    value.\n    \"\"\"\n    record_dict = {\n        'object_id': record['id'],\n        'title': record['title'],\n        'unit_code': record['unitCode'],\n    }\n    content = record['content']\n    # Records without freetext dates keep only the three fixed keys.\n    if 'freetext' in content and 'date' in content['freetext']:\n        for date_entry in content['freetext']['date']:\n            key = date_entry['label']\n            record_dict[key] = date_entry['content']\n    return record_dict",
"execution_count": 10,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-05T19:59:05.502068Z",
"start_time": "2021-01-05T19:59:05.497254Z"
},
"trusted": true
},
"cell_type": "code",
"source": "flatten(bongo[0])",
"execution_count": 11,
"outputs": [
{
"data": {
"text/plain": "{'object_id': 'edanmdm-nmah_1289708',\n 'title': 'bongo drums used by Desi Arnaz on TV show \"I Love Lucy\"',\n 'unit_code': 'NMAH',\n 'date made': '1952-57'}"
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-05T19:59:35.295874Z",
"start_time": "2021-01-05T19:59:05.503226Z"
},
"trusted": true
},
"cell_type": "code",
"source": "nmah_dates = b.map(flatten).compute()",
"execution_count": 12,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-05T19:59:35.330473Z",
"start_time": "2021-01-05T19:59:35.302807Z"
},
"trusted": true
},
"cell_type": "code",
"source": "len(nmah_dates)",
"execution_count": 13,
"outputs": [
{
"data": {
"text/plain": "1314921"
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-05T19:59:35.374845Z",
"start_time": "2021-01-05T19:59:35.347964Z"
},
"trusted": true
},
"cell_type": "code",
"source": "nmah_dates[0]",
"execution_count": 14,
"outputs": [
{
"data": {
"text/plain": "{'object_id': 'edanmdm-nmah_1632369',\n 'title': 'certified proof',\n 'unit_code': 'NMAH'}"
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-05T19:59:35.385864Z",
"start_time": "2021-01-05T19:59:35.381324Z"
},
"trusted": true
},
"cell_type": "code",
"source": "import pandas as pd",
"execution_count": 15,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-05T19:59:35.706276Z",
"start_time": "2021-01-05T19:59:35.388443Z"
},
"trusted": true
},
"cell_type": "code",
"source": "# Largest number of keys found in any flattened record (0 when there are none).\nmax_keys = max((len(record) for record in nmah_dates), default=0)\n\n# Show the records that reach that maximum; the threshold comes from the data\n# rather than from a literal copied out of an earlier run.\nfor record in nmah_dates:\n    if len(record) == max_keys:\n        print(json.dumps(record, indent=2))\n\nmax_keys",
"execution_count": 16,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "{\n \"object_id\": \"edanmdm-nmah_844187\",\n \"title\": \"Rigged Model of Packet Ship \\\"Shenandoah\\\"\",\n \"unit_code\": \"NMAH\",\n \"Date made\": \"1963\",\n \"original ship built\": \"1840\",\n \"Cope Line operated\": \"1820-1870\",\n \"sailed with the Cope Line\": \"1839-1844\",\n \"sailed for the Dunham & Dimon Liverpool Line out of New York\": \"1845\",\n \"sailed for the Black Diamond Line out of Philadelphia\": \"1846\",\n \"sailed for the New LIne out of Philadelphia\": \"1847\",\n \"abandoned at sea\": \"1854-08\"\n}\n{\n \"object_id\": \"edanmdm-nmah_844188\",\n \"title\": \"Rigged Model, Auxiliary Steamship <I>Savannah</I>\",\n \"unit_code\": \"NMAH\",\n \"Date made\": \"1961\",\n \"cleared Savannah, Georgia\": \"1819-05-22\",\n \"sailed from Liverpool to Stockholm\": \"1819-07-23\",\n \"sailed in the Baltic\": \"1919-08-13\",\n \"returned to Savannah\": \"1819-11\",\n \"sailed from New York to Savannah\": \"1820-10\",\n \"wrecked at Fire Island\": \"1821-11-05\",\n \"President Franklin Delano Roosevelt named May 22 as National Maritime Day in honor of the <I>Savannah</I>\": \"1933-05-22\"\n}\n"
},
{
"data": {
"text/plain": "11"
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-05T19:59:37.097248Z",
"start_time": "2021-01-05T19:59:35.707667Z"
},
"trusted": true
},
"cell_type": "code",
"source": "# Count, for every key, how many flattened records contain it.\nduplicate_keys = {}\nfor record in nmah_dates:\n    for key in record:\n        duplicate_keys[key] = duplicate_keys.get(key, 0) + 1",
"execution_count": 17,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-05T19:59:37.103257Z",
"start_time": "2021-01-05T19:59:37.098970Z"
},
"trusted": true
},
"cell_type": "code",
"source": "len(duplicate_keys.keys())",
"execution_count": 18,
"outputs": [
{
"data": {
"text/plain": "816"
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-05T19:59:37.149453Z",
"start_time": "2021-01-05T19:59:37.104534Z"
},
"trusted": true
},
"cell_type": "code",
"source": "# One single-entry dict per key, ordered by record count (highest first);\n# only the first 50 are printed.\nsorted_keys = [\n    {name: count}\n    for name, count in sorted(\n        duplicate_keys.items(), key=lambda pair: pair[1], reverse=True\n    )\n]\nprint(json.dumps(sorted_keys[:50], indent=2))",
"execution_count": 19,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "[\n {\n \"object_id\": 1314921\n },\n {\n \"title\": 1314921\n },\n {\n \"unit_code\": 1314921\n },\n {\n \"date made\": 221282\n },\n {\n \"Date made\": 147375\n },\n {\n \"associated date\": 91410\n },\n {\n \"plate date\": 45222\n },\n {\n \"BEP certification date\": 40054\n },\n {\n \"series date\": 31108\n },\n {\n \"alternate calendar date\": 10089\n },\n {\n \"associated dates\": 6898\n },\n {\n \"patent date\": 6658\n },\n {\n \"date on object\": 6538\n },\n {\n \"period of administration\": 6346\n },\n {\n \"Associated Date\": 3386\n },\n {\n \"used date\": 2191\n },\n {\n \"date photographed\": 1798\n },\n {\n \"regnal date\": 1498\n },\n {\n \"date cataloged\": 1368\n },\n {\n \"copyright date\": 632\n },\n {\n \"date used\": 272\n },\n {\n \"date printed\": 257\n },\n {\n \"expiration date\": 256\n },\n {\n \"Date Made\": 175\n },\n {\n \"user\": 116\n },\n {\n \"date(s) of previous ownership\": 113\n },\n {\n \"fair dates\": 101\n },\n {\n \"product expiration date\": 100\n },\n {\n \"original issue\": 98\n },\n {\n \"associated\": 91\n },\n {\n \"date purchased\": 87\n },\n {\n \"postmark date\": 86\n },\n {\n \"date published\": 80\n },\n {\n \"print\": 75\n },\n {\n \"date of book publication\": 75\n },\n {\n \"date ordered, given, or borrowed\": 63\n },\n {\n \"negative\": 60\n },\n {\n \"year acquired\": 51\n },\n {\n \"made during\": 50\n },\n {\n \"inscribed date\": 50\n },\n {\n \"associated date; used date\": 49\n },\n {\n \"date patented\": 47\n },\n {\n \"model constructed\": 41\n },\n {\n \"date on coin\": 38\n },\n {\n \"presentation date\": 32\n },\n {\n \"used\": 32\n },\n {\n \"date presented\": 32\n },\n {\n \"date filmed\": 31\n },\n {\n \"print made\": 31\n },\n {\n \"dates used\": 30\n }\n]\n"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.8.5",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"gist": {
"id": "",
"data": {
"description": "gmu_openaccess/Parsing GitHub Open Access.ipynb",
"public": true
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment