Skip to content

Instantly share code, notes, and snippets.

@alanzchen
Created February 23, 2017 08:37
Show Gist options
  • Save alanzchen/0d7ae52013784b30b100b82dc162ad50 to your computer and use it in GitHub Desktop.
Save alanzchen/0d7ae52013784b30b100b82dc162ad50 to your computer and use it in GitHub Desktop.
A simple parser for Google Scholar profiles.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from urllib.request import FancyURLopener\n",
"import re\n",
"from bs4 import BeautifulSoup as bs"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"class GoogleOpener(FancyURLopener):\n",
" # useragent\n",
" version = \"Mozilla\"\n",
"\n",
"class Scholar():\n",
" def __init__(self, url):\n",
" self.url = url\n",
" self.fullname = None\n",
" self.title = []\n",
" self.orgid = None\n",
" self.institution = None\n",
" self.institution_urls = []\n",
" self.keywords = []\n",
" try:\n",
" self.getProfile(url)\n",
" except:\n",
" raise\n",
" \n",
" def getProfile(self, url):\n",
" opener = GoogleOpener()\n",
" if not url.startswith('https://scholar.google.com/citations?user='):\n",
" pos = url.find('user=')\n",
" if pos != -1:\n",
" url = 'https://scholar.google.com/citations?user=' + url[pos + 5:].split('&')[0] + '&hl=en'\n",
" else:\n",
" raise Exception('Invalid Google Scholar URL')\n",
" handle = opener.open(url)\n",
" html = bs(handle.read(), 'html.parser')\n",
" handle.close()\n",
" prof_attrs = []\n",
" self.fullname = html.find_all(id='gsc_prf_in')[0].string\n",
" for item in html.find_all(class_='gsc_prf_il'):\n",
" prof_attrs.append(item)\n",
" self.title = ''.join(i.string for i in prof_attrs[0])\n",
" self.orgid = getOrg([i for i in prof_attrs[0].children][1].attrs['href'])\n",
" self.keywords = [i.string.lower() for i in prof_attrs[1].children if i != ', ']\n",
" self.institution = ''.join([i.string for i in html.find_all(id='gsc_prf_ivh')[0].children])\n",
" self.institution_urls = [i.attrs['href'] for i in html.find_all(id='gsc_prf_ivh')[0].children if i.name == 'a']\n",
" return 0\n",
"\n",
"def getOrg(url):\n",
" pos = url.find('org=') + 4\n",
" orgid = url[pos:]\n",
" try:\n",
" orgid = int(orgid)\n",
" except ValueError:\n",
" raise Exception(str(orgid) + ' is a bad id.')\n",
" return orgid"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.5/site-packages/ipykernel/__main__.py:20: DeprecationWarning: GoogleOpener style of invoking requests is deprecated. Use newer urlopen functions/methods\n"
]
}
],
"source": [
"AlanWang = Scholar('https://scholar.google.com/citations?user=onKvwjAAAAAJ&hl=en')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied (use --upgrade to upgrade): beautifulsoup4 in /opt/conda/lib/python3.5/site-packages\n",
"\u001b[33mYou are using pip version 8.1.2, however version 9.0.1 is available.\n",
"You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n"
]
}
],
"source": [
"!pip install beautifulsoup4"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'Verified email at vt.edu - Homepage'"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"AlanWang.institution"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'Associate Professor in Business Information Technology, Pamplin College of Business, Virginia Tech'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"AlanWang.title"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"17003267462049548924"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"AlanWang.orgid"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['data mining',\n",
" 'text mining',\n",
" 'information retrieval',\n",
" 'business intelligence',\n",
" 'social computing']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"AlanWang.keywords"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'Alan Wang 王刚'"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"AlanWang.fullname"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'https://scholar.google.com/citations?user=onKvwjAAAAAJ&hl=en'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"AlanWang.url"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.5/site-packages/ipykernel/__main__.py:20: DeprecationWarning: GoogleOpener style of invoking requests is deprecated. Use newer urlopen functions/methods\n"
]
}
],
"source": [
"random = Scholar('https://scholar.google.com/citations?user=A-H-yvQAAAAJ&hl=en')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'Phd student of Business Information Technology, Virginia Tech'"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"random.title"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment