Created
March 8, 2019 09:36
-
-
Save mromanello/e3d0bae7b474f917caf7137db71af829 to your computer and use it in GitHub Desktop.
Instructions for querying the VeniceScholar API to find out which of the cited publications are available in full text.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**Description**: Query the VeniceScholar API to find out which of the cited publications are available in full text." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Make sure you have the following packages installed:\n", | |
"\n", | |
"```\n", | |
"pip install pandas tqdm requests\n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 94, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import sys\n", | |
"import pandas as pd\n", | |
"from tqdm import tqdm" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"sys.path.append(\"../codebase/\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from commons.api_pre_caching import *" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"API_BASEURI = \"http://api.venicescholar.eu/v1\"\n", | |
"AUTHOR_ENDPOINT = \"%s/authors/%s\" % (API_BASEURI, \"%s\")\n", | |
"AUTHORS_ENDPOINT = \"%s/authors/\" % API_BASEURI\n", | |
"ARTICLES_ENDPOINT = \"%s/articles/\" % API_BASEURI\n", | |
"ARTICLE_ENDPOINT = \"%s/articles/%s\" % (API_BASEURI, \"%s\")\n", | |
"BOOKS_ENDPOINT = \"%s/books/\" % API_BASEURI\n", | |
"BOOK_ENDPOINT = \"%s/books/%s\" % (API_BASEURI, \"%s\")\n", | |
"PRIMARY_SOURCE_ENDPOINT = \"%s/primary_sources/%s/%s\" % (API_BASEURI, \"%s\", \"%s\")\n", | |
"PRIMARY_SOURCES_ENDPOINT = \"%s/primary_sources/%s\" % (API_BASEURI, \"%s\")\n", | |
"REFERENCES_ENDPOINT = \"%s/references/\" % API_BASEURI\n", | |
"REFERENCE_ENDPOINT = \"%s/references/%s\" % (API_BASEURI, \"%s\")\n", | |
"STATS_ENDPOINT = \"%s/stats/\" % API_BASEURI" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def get_book(book_id):\n", | |
" try:\n", | |
" r = requests.get(BOOK_ENDPOINT % book_id)\n", | |
" code = r.status_code\n", | |
" if code == 404:\n", | |
" LOGGER.debug(r.url, code)\n", | |
" return (r.url, r.json())\n", | |
" except Exception as e:\n", | |
" return ('book', book_id, \"error: %s\" % e)\n", | |
"\n", | |
"def get_books(limit=100):\n", | |
" LOGGER.debug(\"Fetching books from %s\" % BOOKS_ENDPOINT)\n", | |
" offset = 0\n", | |
" response_size = limit\n", | |
" book_ids = []\n", | |
" while(response_size==limit):\n", | |
" LOGGER.debug(\"...fetching %i records (starting from %i)\" % (limit, offset))\n", | |
" r = requests.get(BOOKS_ENDPOINT, params={'offset':offset, 'limit':limit})\n", | |
" response_size = len(r.json())\n", | |
" offset += limit\n", | |
" book_ids += [book['book'][\"id\"] for book in r.json()]\n", | |
" return book_ids" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"book_ids = get_books()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"82225" | |
] | |
}, | |
"execution_count": 19, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(book_ids)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 113, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 82225/82225 [08:00<00:00, 171.01it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"selected_books = []\n", | |
"\n", | |
"for book_id in tqdm(book_ids):\n", | |
" \n", | |
" api_url, book_obj = get_book(book_id)\n", | |
" incoming_citations = len(book_obj['citing']['articles']) + len(book_obj['citing']['books'])\n", | |
" \n", | |
" if incoming_citations == 0:\n", | |
" continue\n", | |
" \n", | |
" record = {\n", | |
" \"is_digitized\": book_obj['book']['is_digitized'],\n", | |
" \"cited_by\": incoming_citations,\n", | |
" \"url\": api_url,\n", | |
" \"local_id\": book_obj[\"book\"]['id'],\n", | |
" \"year\": book_obj[\"book\"][\"year\"]\n", | |
" }\n", | |
" selected_books.append(record)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 114, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"77898" | |
] | |
}, | |
"execution_count": 114, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(selected_books)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 116, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = pd.DataFrame(selected_books)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 117, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(1205, 5)" | |
] | |
}, | |
"execution_count": 117, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df[df.is_digitized].shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 112, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(974, 5)" | |
] | |
}, | |
"execution_count": 112, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 102, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"cited_by 133\n", | |
"is_digitized True\n", | |
"local_id 595f9d26fe7683316b2dc5d7\n", | |
"url http://api.venicescholar.eu/v1/books/595f9d26f...\n", | |
"year 1991\n", | |
"Name: 0, dtype: object" | |
] | |
}, | |
"execution_count": 102, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.loc[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 121, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"16748" | |
] | |
}, | |
"execution_count": 121, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df[df.is_digitized]['cited_by'].sum()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 120, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df[df.is_digitized].to_csv('/home/romanell/Downloads/vscholar_books-with-citations-and-fulltext.csv')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.7" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment