Created
September 15, 2017 02:52
-
-
Save cdrini/70c9ff1a58ed1e9234c57df7755881ef to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 88, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"from catharbot import catharbot\n", | |
"bot = catharbot.CatharBot()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"master_olid = 'OL5782001W' # Chosen as master because: Most Editions, On the most Lists, on Staff Picks List\n", | |
"dupe_olids = [\n", | |
" 'OL5781992W',\n", | |
" 'OL17174243W',\n", | |
" 'OL16068777W',\n", | |
" 'OL17348501W',\n", | |
" 'OL16049475W'\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"OL5782001W\n", | |
"OL5781992W\n", | |
"OL17174243W\n", | |
"OL16068777W\n", | |
"OL17348501W\n", | |
"OL16049475W\n" | |
] | |
} | |
], | |
"source": [ | |
"olids = [master_olid] + dupe_olids\n", | |
"works = [bot.load_doc(olid) for olid in olids]\n", | |
"editions = { olid : bot.get_editions_from_work(olid) for olid in olids}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 94, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def flatten_ol_json(json_dict):\n", | |
" def extract_olid(olid):\n", | |
" \"\"\"Convert a string like '/authors/OL1412764A' to just 'OL1412764A'\"\"\"\n", | |
" return olid.split('/')[-1]\n", | |
"\n", | |
" def stringify(val):\n", | |
" if type(val) in [str, unicode]:\n", | |
" return val\n", | |
" elif type(val) in [int]:\n", | |
" return str(val)\n", | |
" elif isinstance(val, list):\n", | |
" return \"; \".join(map(stringify, val))\n", | |
" elif isinstance(val, dict):\n", | |
" if val.keys() == ['key']:\n", | |
" return extract_olid(val['key'])\n", | |
" if val['type'] == '/type/datetime':\n", | |
" return val['value']\n", | |
" if val['type']['key'] == '/type/author_role':\n", | |
" return extract_olid(val['author']['key'])\n", | |
" else:\n", | |
" raise Exception(\"Cannot stringify value '%s'\" % str(val))\n", | |
" else: raise Exception(\"Cannot stringify value '%s'\" % str(val))\n", | |
" \n", | |
" result = { key: stringify(val) for key, val in json_dict.iteritems() }\n", | |
" if 'key' in result:\n", | |
" result['key'] = extract_olid(result['key'])\n", | |
" return result\n", | |
"\n", | |
"flat_works = [flatten_ol_json(work) for work in works]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 95, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"for work in flat_works:\n", | |
" work[u'editions'] = len(editions[work['key']])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 102, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style>\n", | |
" .dataframe thead tr:only-child th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: left;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>authors</th>\n", | |
" <th>covers</th>\n", | |
" <th>created</th>\n", | |
" <th>editions</th>\n", | |
" <th>key</th>\n", | |
" <th>last_modified</th>\n", | |
" <th>latest_revision</th>\n", | |
" <th>revision</th>\n", | |
" <th>subject_places</th>\n", | |
" <th>subjects</th>\n", | |
" <th>title</th>\n", | |
" <th>type</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>OL1412764A</td>\n", | |
" <td>872426</td>\n", | |
" <td>2009-12-10T17:58:21.861697</td>\n", | |
" <td>16</td>\n", | |
" <td>OL5782001W</td>\n", | |
" <td>2017-08-07T19:00:35.407995</td>\n", | |
" <td>24</td>\n", | |
" <td>24</td>\n", | |
" <td>Afghanistan; Kabul (Afganistán); Kabul (Afghan...</td>\n", | |
" <td>Kabul (Afghanistan); Teenage boys; Male friend...</td>\n", | |
" <td>The kite runner</td>\n", | |
" <td>work</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>OL1412764A</td>\n", | |
" <td>1989063</td>\n", | |
" <td>2009-12-10T17:58:21.861697</td>\n", | |
" <td>3</td>\n", | |
" <td>OL5781992W</td>\n", | |
" <td>2011-03-13T19:13:47.417251</td>\n", | |
" <td>4</td>\n", | |
" <td>4</td>\n", | |
" <td>NaN</td>\n", | |
" <td>New York Times bestseller; nyt:trade_fiction_p...</td>\n", | |
" <td>The Kite Runner</td>\n", | |
" <td>work</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>OL1412764A</td>\n", | |
" <td>NaN</td>\n", | |
" <td>2015-07-20T21:28:13.052359</td>\n", | |
" <td>1</td>\n", | |
" <td>OL17174243W</td>\n", | |
" <td>2015-07-20T21:28:13.052359</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>The Kite Runner</td>\n", | |
" <td>work</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>OL1412764A</td>\n", | |
" <td>NaN</td>\n", | |
" <td>2011-08-12T03:01:53.763005</td>\n", | |
" <td>1</td>\n", | |
" <td>OL16068777W</td>\n", | |
" <td>2012-05-18T18:34:13.218995</td>\n", | |
" <td>4</td>\n", | |
" <td>4</td>\n", | |
" <td>Afghanistan; Kābol (Afghanistan)</td>\n", | |
" <td>Male friendship; Fiction; Betrayal; Boys; Soci...</td>\n", | |
" <td>The kite runner</td>\n", | |
" <td>work</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>OL1412764A</td>\n", | |
" <td>NaN</td>\n", | |
" <td>2016-06-18T06:50:06.637518</td>\n", | |
" <td>1</td>\n", | |
" <td>OL17348501W</td>\n", | |
" <td>2016-06-18T06:50:06.637518</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>The Kite Runner</td>\n", | |
" <td>work</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>OL1412764A</td>\n", | |
" <td>7003010</td>\n", | |
" <td>2011-08-11T01:45:18.883705</td>\n", | |
" <td>1</td>\n", | |
" <td>OL16049475W</td>\n", | |
" <td>2017-07-22T09:11:40.161229</td>\n", | |
" <td>4</td>\n", | |
" <td>4</td>\n", | |
" <td>Kabul (Afghanistan); Afghanistan</td>\n", | |
" <td>In library; Male friendship; Accessible book; ...</td>\n", | |
" <td>The kite runner</td>\n", | |
" <td>work</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" authors covers created editions key \\\n", | |
"0 OL1412764A 872426 2009-12-10T17:58:21.861697 16 OL5782001W \n", | |
"1 OL1412764A 1989063 2009-12-10T17:58:21.861697 3 OL5781992W \n", | |
"2 OL1412764A NaN 2015-07-20T21:28:13.052359 1 OL17174243W \n", | |
"3 OL1412764A NaN 2011-08-12T03:01:53.763005 1 OL16068777W \n", | |
"4 OL1412764A NaN 2016-06-18T06:50:06.637518 1 OL17348501W \n", | |
"5 OL1412764A 7003010 2011-08-11T01:45:18.883705 1 OL16049475W \n", | |
"\n", | |
" last_modified latest_revision revision \\\n", | |
"0 2017-08-07T19:00:35.407995 24 24 \n", | |
"1 2011-03-13T19:13:47.417251 4 4 \n", | |
"2 2015-07-20T21:28:13.052359 1 1 \n", | |
"3 2012-05-18T18:34:13.218995 4 4 \n", | |
"4 2016-06-18T06:50:06.637518 1 1 \n", | |
"5 2017-07-22T09:11:40.161229 4 4 \n", | |
"\n", | |
" subject_places \\\n", | |
"0 Afghanistan; Kabul (Afganistán); Kabul (Afghan... \n", | |
"1 NaN \n", | |
"2 NaN \n", | |
"3 Afghanistan; Kābol (Afghanistan) \n", | |
"4 NaN \n", | |
"5 Kabul (Afghanistan); Afghanistan \n", | |
"\n", | |
" subjects title type \n", | |
"0 Kabul (Afghanistan); Teenage boys; Male friend... The kite runner work \n", | |
"1 New York Times bestseller; nyt:trade_fiction_p... The Kite Runner work \n", | |
"2 NaN The Kite Runner work \n", | |
"3 Male friendship; Fiction; Betrayal; Boys; Soci... The kite runner work \n", | |
"4 NaN The Kite Runner work \n", | |
"5 In library; Male friendship; Accessible book; ... The kite runner work " | |
] | |
}, | |
"execution_count": 102, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = pd.DataFrame(flat_works)\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 101, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
",authors,covers,created,editions,key,last_modified,latest_revision,revision,subject_places,subjects,title,type\n", | |
"0,OL1412764A,872426,2009-12-10T17:58:21.861697,16,OL5782001W,2017-08-07T19:00:35.407995,24,24,Afghanistan; Kabul (Afganistán); Kabul (Afghanistan); Kābol (Afghanistan),Kabul (Afghanistan); Teenage boys; Male friendship; Accessible book; Bildungsromans; In library; Fiction; Competitions; OverDrive; Kites; Literature; Betrayal; Boys; Large type books; Open Library Staff Picks; nyt:trade_fiction_paperback=2009-09-26; open_syllabus_project; Social classes; Amistad entre hombres; Protected DAISY; Social conditions; Ficción; Clases sociales; New York Times bestseller,The kite runner,work\n", | |
"1,OL1412764A,1989063,2009-12-10T17:58:21.861697,3,OL5781992W,2011-03-13T19:13:47.417251,4,4,,New York Times bestseller; nyt:trade_fiction_paperback=2008-10-26,The Kite Runner,work\n", | |
"2,OL1412764A,,2015-07-20T21:28:13.052359,1,OL17174243W,2015-07-20T21:28:13.052359,1,1,,,The Kite Runner,work\n", | |
"3,OL1412764A,,2011-08-12T03:01:53.763005,1,OL16068777W,2012-05-18T18:34:13.218995,4,4,Afghanistan; Kābol (Afghanistan),Male friendship; Fiction; Betrayal; Boys; Social classes; New York Times bestseller; nyt:trade_fiction_paperback=2009-09-26,The kite runner,work\n", | |
"4,OL1412764A,,2016-06-18T06:50:06.637518,1,OL17348501W,2016-06-18T06:50:06.637518,1,1,,,The Kite Runner,work\n", | |
"5,OL1412764A,7003010,2011-08-11T01:45:18.883705,1,OL16049475W,2017-07-22T09:11:40.161229,4,4,Kabul (Afghanistan); Afghanistan,In library; Male friendship; Accessible book; Betrayal; Social classes; Boys; Protected DAISY; Popular Print Disabled Books; Fiction,The kite runner,work\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"print df.to_csv(encoding='utf-8')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python [conda env:ol-analysis]", | |
"language": "python", | |
"name": "conda-env-ol-analysis-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.13" | |
}, | |
"toc": { | |
"colors": { | |
"hover_highlight": "#DAA520", | |
"navigate_num": "#000000", | |
"navigate_text": "#333333", | |
"running_highlight": "#c0c0c0", | |
"selected_highlight": "#FFD700", | |
"sidebar_border": "#EEEEEE", | |
"wrapper_background": "#FFFFFF" | |
}, | |
"moveMenuLeft": true, | |
"nav_menu": { | |
"height": "12px", | |
"width": "252px" | |
}, | |
"navigate_menu": true, | |
"number_sections": true, | |
"sideBar": true, | |
"threshold": 4, | |
"toc_cell": false, | |
"toc_section_display": "block", | |
"toc_window_display": false, | |
"widenNotebook": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment