Skip to content

Instantly share code, notes, and snippets.

@cdrini
Created September 15, 2017 02:52
Show Gist options
  • Save cdrini/70c9ff1a58ed1e9234c57df7755881ef to your computer and use it in GitHub Desktop.
Save cdrini/70c9ff1a58ed1e9234c57df7755881ef to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from catharbot import catharbot\n",
"bot = catharbot.CatharBot()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Chosen as master because: Most Editions, On the most Lists, on Staff Picks List\n",
"master_olid = 'OL5782001W'\n",
"\n",
"# Work OLIDs treated as duplicates of the master work.\n",
"dupe_olids = [\n",
"    'OL5781992W',\n",
"    'OL17174243W',\n",
"    'OL16068777W',\n",
"    'OL17348501W',\n",
"    'OL16049475W',\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"OL5782001W\n",
"OL5781992W\n",
"OL17174243W\n",
"OL16068777W\n",
"OL17348501W\n",
"OL16049475W\n"
]
}
],
"source": [
"# Master first, followed by the duplicates.\n",
"olids = [master_olid] + dupe_olids\n",
"\n",
"# Load the document for every OLID, then look up the editions of each work (keyed by OLID).\n",
"works = [bot.load_doc(olid) for olid in olids]\n",
"editions = dict(\n",
"    (olid, bot.get_editions_from_work(olid))\n",
"    for olid in olids\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"def flatten_ol_json(json_dict):\n",
"    \"\"\"Flatten one Open Library JSON record into a dict of plain strings.\n",
"\n",
"    Every value is reduced to a single string:\n",
"      * strings are kept as-is and ints are converted with str()\n",
"      * lists are flattened element by element and joined with '; '\n",
"      * {'key': '/authors/OL1A'} reference dicts become the trailing id\n",
"      * {'type': '/type/datetime' or '/type/text', 'value': v} becomes v\n",
"      * author-role dicts become the id of their 'author' reference\n",
"    The record's own 'key' entry, if present, is shortened to its id too.\n",
"\n",
"    Raises Exception (message starting 'Cannot stringify value') for any\n",
"    other value shape.\n",
"    \"\"\"\n",
"    try:\n",
"        string_types = (str, unicode)  # Python 2\n",
"    except NameError:\n",
"        string_types = (str,)  # Python 3 has no separate unicode type\n",
"\n",
"    def extract_olid(olid):\n",
"        \"\"\"Convert a string like '/authors/OL1412764A' to just 'OL1412764A'\"\"\"\n",
"        return olid.split('/')[-1]\n",
"\n",
"    def stringify(val):\n",
"        \"\"\"Reduce a single JSON value to a string, recursing into lists.\"\"\"\n",
"        if isinstance(val, string_types):\n",
"            return val\n",
"        elif type(val) in [int]:\n",
"            return str(val)\n",
"        elif isinstance(val, list):\n",
"            return \"; \".join(map(stringify, val))\n",
"        elif isinstance(val, dict):\n",
"            # list(...) keeps the comparison meaningful when keys() is a view\n",
"            if list(val.keys()) == ['key']:\n",
"                return extract_olid(val['key'])\n",
"            val_type = val.get('type')\n",
"            # Typed scalar values. '/type/text' (presumably used for fields such\n",
"            # as descriptions) previously crashed with a TypeError when the\n",
"            # string type was indexed with ['key'].\n",
"            if val_type in ('/type/datetime', '/type/text'):\n",
"                return val['value']\n",
"            if isinstance(val_type, dict) and val_type.get('key') == '/type/author_role':\n",
"                return extract_olid(val['author']['key'])\n",
"        # Anything unrecognised, including dicts without a usable 'type'\n",
"        raise Exception(\"Cannot stringify value '%s'\" % str(val))\n",
"\n",
"    result = {key: stringify(val) for key, val in json_dict.items()}\n",
"    if 'key' in result:\n",
"        result['key'] = extract_olid(result['key'])\n",
"    return result\n",
"\n",
"flat_works = [flatten_ol_json(work) for work in works]"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
"for work in flat_works:\n",
" work[u'editions'] = len(editions[work['key']])"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>authors</th>\n",
" <th>covers</th>\n",
" <th>created</th>\n",
" <th>editions</th>\n",
" <th>key</th>\n",
" <th>last_modified</th>\n",
" <th>latest_revision</th>\n",
" <th>revision</th>\n",
" <th>subject_places</th>\n",
" <th>subjects</th>\n",
" <th>title</th>\n",
" <th>type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>OL1412764A</td>\n",
" <td>872426</td>\n",
" <td>2009-12-10T17:58:21.861697</td>\n",
" <td>16</td>\n",
" <td>OL5782001W</td>\n",
" <td>2017-08-07T19:00:35.407995</td>\n",
" <td>24</td>\n",
" <td>24</td>\n",
" <td>Afghanistan; Kabul (Afganistán); Kabul (Afghan...</td>\n",
" <td>Kabul (Afghanistan); Teenage boys; Male friend...</td>\n",
" <td>The kite runner</td>\n",
" <td>work</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>OL1412764A</td>\n",
" <td>1989063</td>\n",
" <td>2009-12-10T17:58:21.861697</td>\n",
" <td>3</td>\n",
" <td>OL5781992W</td>\n",
" <td>2011-03-13T19:13:47.417251</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>NaN</td>\n",
" <td>New York Times bestseller; nyt:trade_fiction_p...</td>\n",
" <td>The Kite Runner</td>\n",
" <td>work</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>OL1412764A</td>\n",
" <td>NaN</td>\n",
" <td>2015-07-20T21:28:13.052359</td>\n",
" <td>1</td>\n",
" <td>OL17174243W</td>\n",
" <td>2015-07-20T21:28:13.052359</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>The Kite Runner</td>\n",
" <td>work</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>OL1412764A</td>\n",
" <td>NaN</td>\n",
" <td>2011-08-12T03:01:53.763005</td>\n",
" <td>1</td>\n",
" <td>OL16068777W</td>\n",
" <td>2012-05-18T18:34:13.218995</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>Afghanistan; Kābol (Afghanistan)</td>\n",
" <td>Male friendship; Fiction; Betrayal; Boys; Soci...</td>\n",
" <td>The kite runner</td>\n",
" <td>work</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>OL1412764A</td>\n",
" <td>NaN</td>\n",
" <td>2016-06-18T06:50:06.637518</td>\n",
" <td>1</td>\n",
" <td>OL17348501W</td>\n",
" <td>2016-06-18T06:50:06.637518</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>The Kite Runner</td>\n",
" <td>work</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>OL1412764A</td>\n",
" <td>7003010</td>\n",
" <td>2011-08-11T01:45:18.883705</td>\n",
" <td>1</td>\n",
" <td>OL16049475W</td>\n",
" <td>2017-07-22T09:11:40.161229</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>Kabul (Afghanistan); Afghanistan</td>\n",
" <td>In library; Male friendship; Accessible book; ...</td>\n",
" <td>The kite runner</td>\n",
" <td>work</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" authors covers created editions key \\\n",
"0 OL1412764A 872426 2009-12-10T17:58:21.861697 16 OL5782001W \n",
"1 OL1412764A 1989063 2009-12-10T17:58:21.861697 3 OL5781992W \n",
"2 OL1412764A NaN 2015-07-20T21:28:13.052359 1 OL17174243W \n",
"3 OL1412764A NaN 2011-08-12T03:01:53.763005 1 OL16068777W \n",
"4 OL1412764A NaN 2016-06-18T06:50:06.637518 1 OL17348501W \n",
"5 OL1412764A 7003010 2011-08-11T01:45:18.883705 1 OL16049475W \n",
"\n",
" last_modified latest_revision revision \\\n",
"0 2017-08-07T19:00:35.407995 24 24 \n",
"1 2011-03-13T19:13:47.417251 4 4 \n",
"2 2015-07-20T21:28:13.052359 1 1 \n",
"3 2012-05-18T18:34:13.218995 4 4 \n",
"4 2016-06-18T06:50:06.637518 1 1 \n",
"5 2017-07-22T09:11:40.161229 4 4 \n",
"\n",
" subject_places \\\n",
"0 Afghanistan; Kabul (Afganistán); Kabul (Afghan... \n",
"1 NaN \n",
"2 NaN \n",
"3 Afghanistan; Kābol (Afghanistan) \n",
"4 NaN \n",
"5 Kabul (Afghanistan); Afghanistan \n",
"\n",
" subjects title type \n",
"0 Kabul (Afghanistan); Teenage boys; Male friend... The kite runner work \n",
"1 New York Times bestseller; nyt:trade_fiction_p... The Kite Runner work \n",
"2 NaN The Kite Runner work \n",
"3 Male friendship; Fiction; Betrayal; Boys; Soci... The kite runner work \n",
"4 NaN The Kite Runner work \n",
"5 In library; Male friendship; Accessible book; ... The kite runner work "
]
},
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(flat_works)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
",authors,covers,created,editions,key,last_modified,latest_revision,revision,subject_places,subjects,title,type\n",
"0,OL1412764A,872426,2009-12-10T17:58:21.861697,16,OL5782001W,2017-08-07T19:00:35.407995,24,24,Afghanistan; Kabul (Afganistán); Kabul (Afghanistan); Kābol (Afghanistan),Kabul (Afghanistan); Teenage boys; Male friendship; Accessible book; Bildungsromans; In library; Fiction; Competitions; OverDrive; Kites; Literature; Betrayal; Boys; Large type books; Open Library Staff Picks; nyt:trade_fiction_paperback=2009-09-26; open_syllabus_project; Social classes; Amistad entre hombres; Protected DAISY; Social conditions; Ficción; Clases sociales; New York Times bestseller,The kite runner,work\n",
"1,OL1412764A,1989063,2009-12-10T17:58:21.861697,3,OL5781992W,2011-03-13T19:13:47.417251,4,4,,New York Times bestseller; nyt:trade_fiction_paperback=2008-10-26,The Kite Runner,work\n",
"2,OL1412764A,,2015-07-20T21:28:13.052359,1,OL17174243W,2015-07-20T21:28:13.052359,1,1,,,The Kite Runner,work\n",
"3,OL1412764A,,2011-08-12T03:01:53.763005,1,OL16068777W,2012-05-18T18:34:13.218995,4,4,Afghanistan; Kābol (Afghanistan),Male friendship; Fiction; Betrayal; Boys; Social classes; New York Times bestseller; nyt:trade_fiction_paperback=2009-09-26,The kite runner,work\n",
"4,OL1412764A,,2016-06-18T06:50:06.637518,1,OL17348501W,2016-06-18T06:50:06.637518,1,1,,,The Kite Runner,work\n",
"5,OL1412764A,7003010,2011-08-11T01:45:18.883705,1,OL16049475W,2017-07-22T09:11:40.161229,4,4,Kabul (Afghanistan); Afghanistan,In library; Male friendship; Accessible book; Betrayal; Social classes; Boys; Protected DAISY; Popular Print Disabled Books; Fiction,The kite runner,work\n",
"\n"
]
}
],
"source": [
"# Parenthesised single-argument print: same output on Python 2.7, and valid on Python 3.\n",
"print(df.to_csv(encoding='utf-8'))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:ol-analysis]",
"language": "python",
"name": "conda-env-ol-analysis-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
},
"toc": {
"colors": {
"hover_highlight": "#DAA520",
"navigate_num": "#000000",
"navigate_text": "#333333",
"running_highlight": "#c0c0c0",
"selected_highlight": "#FFD700",
"sidebar_border": "#EEEEEE",
"wrapper_background": "#FFFFFF"
},
"moveMenuLeft": true,
"nav_menu": {
"height": "12px",
"width": "252px"
},
"navigate_menu": true,
"number_sections": true,
"sideBar": true,
"threshold": 4,
"toc_cell": false,
"toc_section_display": "block",
"toc_window_display": false,
"widenNotebook": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment