Created
February 23, 2017 08:37
-
-
Save alanzchen/0d7ae52013784b30b100b82dc162ad50 to your computer and use it in GitHub Desktop.
A simple parser for Google Scholar profiles.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from urllib.request import FancyURLopener\n", | |
"import re\n", | |
"from bs4 import BeautifulSoup as bs" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"class GoogleOpener(FancyURLopener):\n", | |
" # useragent\n", | |
" version = \"Mozilla\"\n", | |
"\n", | |
"class Scholar():\n", | |
" def __init__(self, url):\n", | |
" self.url = url\n", | |
" self.fullname = None\n", | |
" self.title = []\n", | |
" self.orgid = None\n", | |
" self.institution = None\n", | |
" self.institution_urls = []\n", | |
" self.keywords = []\n", | |
" try:\n", | |
" self.getProfile(url)\n", | |
" except:\n", | |
" raise\n", | |
" \n", | |
" def getProfile(self, url):\n", | |
" opener = GoogleOpener()\n", | |
" if not url.startswith('https://scholar.google.com/citations?user='):\n", | |
" pos = url.find('user=')\n", | |
" if pos:\n", | |
" url = 'https://scholar.google.com/citations?user=' + url[pos:] + '&hl=en'\n", | |
" else:\n", | |
" raise Exception('Invalid Google Scholar URL')\n", | |
" handle = opener.open(url)\n", | |
" html = bs(handle.readline(), 'html.parser')\n", | |
" handle.close()\n", | |
" prof_attrs = []\n", | |
" self.fullname = html.find_all(id='gsc_prf_in')[0].string\n", | |
" for item in html.find_all(class_='gsc_prf_il'):\n", | |
" prof_attrs.append(item)\n", | |
" self.title = ''.join(i.string for i in prof_attrs[0])\n", | |
" self.orgid = getOrg([i for i in prof_attrs[0].children][1].attrs['href'])\n", | |
" self.keywords = [i.string.lower() for i in prof_attrs[1].children if i != ', ']\n", | |
" self.institution = ''.join([i.string for i in html.find_all(id='gsc_prf_ivh')[0].children])\n", | |
" self.institution_urls = [i.attrs['href'] for i in html.find_all(id='gsc_prf_ivh')[0].children if i.name == 'a']\n", | |
" return 0\n", | |
"\n", | |
"def getOrg(url):\n", | |
" pos = url.find('org=') + 4\n", | |
" orgid = url[pos:]\n", | |
" try:\n", | |
" orgid = int(orgid)\n", | |
" except ValueError:\n", | |
" raise Exception(stry(orgid) + ' is a bad id.')\n", | |
" return orgid" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/opt/conda/lib/python3.5/site-packages/ipykernel/__main__.py:20: DeprecationWarning: GoogleOpener style of invoking requests is deprecated. Use newer urlopen functions/methods\n" | |
] | |
} | |
], | |
"source": [ | |
"AlanWang = Scholar('https://scholar.google.com/citations?user=onKvwjAAAAAJ&hl=en')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Requirement already satisfied (use --upgrade to upgrade): beautifulsoup4 in /opt/conda/lib/python3.5/site-packages\n", | |
"\u001b[33mYou are using pip version 8.1.2, however version 9.0.1 is available.\n", | |
"You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" | |
] | |
} | |
], | |
"source": [ | |
"!pip install beautifulsoup4" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'Verified email at vt.edu - Homepage'" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"AlanWang.institution" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'Associate Professor in Business Information Technology, Pamplin College of Business, Virginia Tech'" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"AlanWang.title" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"17003267462049548924" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"AlanWang.orgid" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['data mining',\n", | |
" 'text mining',\n", | |
" 'information retrieval',\n", | |
" 'business intelligence',\n", | |
" 'social computing']" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"AlanWang.keywords" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'Alan Wang 王刚'" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"AlanWang.fullname" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'https://scholar.google.com/citations?user=onKvwjAAAAAJ&hl=en'" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"AlanWang.url" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/opt/conda/lib/python3.5/site-packages/ipykernel/__main__.py:20: DeprecationWarning: GoogleOpener style of invoking requests is deprecated. Use newer urlopen functions/methods\n" | |
] | |
} | |
], | |
"source": [ | |
"random = Scholar('https://scholar.google.com/citations?user=A-H-yvQAAAAJ&hl=en')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'Phd student of Business Information Technology, Virginia Tech'" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"random.title" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment