#! /usr/bin/env python
# Copyright 2010--2013 Christian Kreibich. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
#
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import optparse
import sys
import codecs
import re
try:
    from urllib.parse import quote
except ImportError:
    from urllib import quote
import requests
# Import BeautifulSoup -- try 4 first, fall back to older
try:
    from bs4 import BeautifulSoup
except ImportError:
    try:
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        print('We need BeautifulSoup, sorry...')
        sys.exit(1)
# Support unicode in both Python 2 and 3. In Python 3, unicode is str.
if sys.version_info[0] == 3:
    unicode = str  # pylint: disable-msg=W0622
    encode = lambda s: s  # pylint: disable-msg=C0103
else:
    encode = lambda s: s.encode('utf-8')  # pylint: disable-msg=C0103

class CookieJar(object):
    # Shared cookie store so consecutive requests reuse Google's cookies.
    COOKIE_JAR = {}

class Article(object):
    def __init__(self):
        self.attrs = {'id':            [None, 'ID',             0],
                      'title':         [None, 'Title',          1],
                      'url':           [None, 'URL',            2],
                      'num_citations': [0,    'Citations',      3],
                      'num_versions':  [0,    'Versions',       4],
                      'url_citations': [None, 'Citations list', 5],
                      'url_versions':  [None, 'Versions list',  6],
                      'url_cite':      [None, 'Cite URL',       7],
                      'url_related':   [None, 'Related list',   8],
                      'year':          [None, 'Year',           9]}

    def __getitem__(self, key):
        if key in self.attrs:
            return self.attrs[key][0]
        return None

    def __len__(self):
        return len(self.attrs)

    def __setitem__(self, key, item):
        if key in self.attrs:
            self.attrs[key][0] = item
        else:
            self.attrs[key] = [item, key, len(self.attrs)]

    def __delitem__(self, key):
        if key in self.attrs:
            del self.attrs[key]

    def as_txt(self):
        # Get items sorted in specified order:
        items = sorted(list(self.attrs.values()), key=lambda item: item[2])
        # Find largest label length:
        max_label_len = max([len(str(item[1])) for item in items])
        fmt = '%%%ds %%s' % max_label_len
        return '\n'.join([fmt % (item[1], item[0]) for item in items])

    def as_csv(self, header=False, sep='|'):
        # Get keys sorted in specified order:
        keys = [pair[0] for pair in
                sorted([(key, val[2]) for key, val in list(self.attrs.items())],
                       key=lambda pair: pair[1])]
        res = []
        if header:
            res.append(sep.join(keys))
        res.append(sep.join([unicode(self.attrs[key][0]) for key in keys]))
        return '\n'.join(res)
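
# A minimal usage sketch (not part of the original script): Article behaves
# like a small ordered mapping, so building and serializing one by hand looks
# roughly like this:
#
#   art = Article()
#   art['title'] = 'Some paper title'
#   art['year'] = '2013'
#   print(art.as_txt())
#   print(art.as_csv(header=True))
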
class ScholarParser(object):
    SCHOLAR_SITE = 'http://scholar.google.com'

    def __init__(self, site=None):
        self.soup = None
        self.article = None
        self.site = site or self.SCHOLAR_SITE
        self.year_re = re.compile(r'\b(?:20|19)\d{2}\b')

    def handle_article(self, art):
        """
        In this base class, the callback does nothing.
        """

    def parse(self, html):
        """
        This method initiates parsing of HTML content.
        """
        self.soup = BeautifulSoup(html)
        for div in self.soup.findAll(ScholarParser._tag_checker):
            self._parse_article(div)

    def _parse_article(self, div):
        self.article = Article()
        for tag in div:
            if not hasattr(tag, 'name'):
                continue
            if tag.name == 'div' and self._tag_has_class(tag, 'gs_rt') and \
               tag.h3 and tag.h3.a:
                self.article['title'] = ''.join(tag.h3.a.findAll(text=True))
                self.article['url'] = self._path2url(tag.h3.a['href'])
            if tag.name == 'font':
                for tag2 in tag:
                    if not hasattr(tag2, 'name'):
                        continue
                    if tag2.name == 'span' and self._tag_has_class(tag2, 'gs_fl'):
                        self._parse_links(tag2)
        if self.article['title']:
            self.handle_article(self.article)

    def _parse_links(self, span):
        for tag in span:
            if not hasattr(tag, 'name'):
                continue
            if tag.name != 'a' or tag.get('href') is None:
                continue
            if tag.get('href').startswith('/scholar?cites'):
                if hasattr(tag, 'string') and tag.string.startswith('Cited by'):
                    self.article['num_citations'] = \
                        self._as_int(tag.string.split()[-1])
                self.article['url_citations'] = self._path2url(tag.get('href'))
            if tag.get('href').startswith('/scholar?cluster'):
                if hasattr(tag, 'string') and tag.string.startswith('All '):
                    self.article['num_versions'] = \
                        self._as_int(tag.string.split()[1])
                self.article['url_versions'] = self._path2url(tag.get('href'))
    @staticmethod
    def _tag_has_class(tag, klass):
        """
        This predicate function checks whether a BeautifulSoup Tag instance
        carries the given CSS class.
        """
        res = tag.get('class') or []
        if type(res) != list:
            # BeautifulSoup 3 can return e.g. 'gs_md_wp gs_ttss',
            # so split -- conveniently produces a list in any case
            res = res.split()
        return klass in res

    @staticmethod
    def _tag_checker(tag):
        return tag.name == 'div' and ScholarParser._tag_has_class(tag, 'gs_r')

    @staticmethod
    def _as_int(obj):
        try:
            return int(obj)
        except ValueError:
            return None

    def _path2url(self, path):
        if path.startswith('http://'):
            return path
        if not path.startswith('/'):
            path = '/' + path
        return self.site + path

class ScholarParser120201(ScholarParser):
    def _parse_article(self, div):
        self.article = Article()
        for tag in div:
            if not hasattr(tag, 'name'):
                continue
            if tag.name == 'h3' and self._tag_has_class(tag, 'gs_rt') and tag.a:
                self.article['title'] = ''.join(tag.a.findAll(text=True))
                self.article['url'] = self._path2url(tag.a['href'])
            if tag.name == 'div' and self._tag_has_class(tag, 'gs_a'):
                year = self.year_re.findall(tag.text)
                self.article['year'] = year[0] if len(year) > 0 else None
            if tag.name == 'div' and self._tag_has_class(tag, 'gs_fl'):
                self._parse_links(tag)
        if self.article['title']:
            self.handle_article(self.article)

class ScholarParser120726(ScholarParser):
    def _parse_article(self, div):
        self.article = Article()
        for tag in div:
            if not hasattr(tag, 'name'):
                continue
            if tag.name == 'div' and self._tag_has_class(tag, 'gs_ri'):
                if tag.a:
                    self.article['title'] = ''.join(tag.a.findAll(text=True))
                    self.article['url'] = self._path2url(tag.a['href'])
                if tag.find('div', {'class': 'gs_a'}):
                    year = self.year_re.findall(tag.find('div', {'class': 'gs_a'}).text)
                    self.article['year'] = year[0] if len(year) > 0 else None
                if tag.find('div', {'class': 'gs_fl'}):
                    ltag = tag.find('div', {'class': 'gs_fl'})
                    self._parse_links(ltag)
                    id_re = re.compile(r'gs_ocit\(event,\'(.*?)\',')
                    id = id_re.findall(str(ltag))
                    self.article['id'] = id[0] if len(id) > 0 else None
                    if self.article['id']:
                        self.article['url_cite'] = self._path2url(
                            '/scholar?q=info:' + self.article['id'] +
                            ':scholar.google.com/&output=cite&scirp=0&hl=en')
        if self.article['title']:
            self.handle_article(self.article)

    def _parse_links(self, span):
        for tag in span:
            if not hasattr(tag, 'name'):
                continue
            if tag.name != 'a' or tag.get('href') is None:
                continue
            if tag.get('href').startswith('/scholar?q=related'):
                self.article['url_related'] = self._path2url(tag.get('href'))
            if tag.get('href').startswith('/scholar?cites'):
                if hasattr(tag, 'string') and tag.string.startswith('Cited by'):
                    self.article['num_citations'] = \
                        self._as_int(tag.string.split()[-1])
                self.article['url_citations'] = self._path2url(tag.get('href'))
            if tag.get('href').startswith('/scholar?cluster'):
                if hasattr(tag, 'string') and tag.string.startswith('All '):
                    self.article['num_versions'] = \
                        self._as_int(tag.string.split()[1])
                self.article['url_versions'] = self._path2url(tag.get('href'))

class CiteParser(ScholarParser):
    def __init__(self, site=None):
        self.site = site or self.SCHOLAR_SITE
        self.soup = None
        self.export = {}
        self.text = {}

    def parse(self, html):
        """
        This method initiates parsing of HTML content.
        """
        self.soup = BeautifulSoup(html)
        self._parse_text(self.soup.find("div", {"id": "gs_citt"}))
        self._parse_export(self.soup.find("div", {"id": "gs_citi"}))

    def _parse_text(self, div):
        for tag in div.findAll('tr'):
            self.text[tag.find('th').text] = tag.find('div').text

    def _parse_export(self, div):
        for tag in div.findAll('a'):
            if not hasattr(tag, 'name'):
                continue
            if tag.name != 'a' or tag.get('href') is None:
                continue
            if tag.get('href').startswith('/scholar'):
                if hasattr(tag, 'string') and tag.string.startswith('Import into'):
                    n = tag.string.split()[-1]
                    self.export[n] = self._path2url(tag.get('href'))
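
# After CiteParser.parse(html) runs on a Scholar citation pop-up, self.text
# maps the style labels from the gs_citt table (e.g. 'APA') to formatted
# citation strings, and self.export maps the export format names taken from
# the 'Import into ...' links (e.g. 'EndNote') to their download URLs.
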
class ScholarQuerier(object):
    SCHOLAR_URL = 'http://scholar.google.be/scholar?hl=en&q=%(query)s+author:%(author)s&btnG=Search&as_subj=eng&as_sdt=1,5&as_ylo=&as_vis=0'
    NOAUTH_URL = 'http://scholar.google.be/scholar?hl=en&q=%(query)s&btnG=Search&as_subj=eng&as_sdt=1,5&as_ylo=&as_vis=0'
    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36'

    class Parser(ScholarParser120726):
        def __init__(self, querier):
            ScholarParser120726.__init__(self)
            self.querier = querier

        def handle_article(self, art):
            self.querier.add_article(art)

    def __init__(self, author='', scholar_url=None, count=0):
        self.articles = []
        self.author = author
        # Clip to 100, as Google doesn't support more anyway
        self.count = min(count, 100)
        if author == '':
            self.scholar_url = self.NOAUTH_URL
        else:
            self.scholar_url = scholar_url or self.SCHOLAR_URL
        if self.count != 0:
            self.scholar_url += '&num=%d' % self.count

    def query(self, search):
        self.clear_articles()
        url = self.scholar_url % {'query': quote(encode(search)), 'author': quote(self.author)}
        r = requests.get(url, headers={'User-Agent': self.USER_AGENT}, cookies=CookieJar.COOKIE_JAR)
        CookieJar.COOKIE_JAR = r.cookies
        if r.status_code != 200:
            self.status = False
            print('*** Google is throttling')
        else:
            self.parse(r.text)

    def parse(self, html):
        parser = self.Parser(self)
        parser.parse(html)

    def add_article(self, art):
        self.articles.append(art)

    def clear_articles(self):
        self.status = True
        self.articles = []
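
# A hedged sketch of programmatic use (results depend on Google Scholar still
# serving the markup the parsers above expect; the list may come back empty,
# or Google may throttle the request):
#
#   querier = ScholarQuerier(author='einstein', count=5)
#   querier.query('quantum')
#   for art in querier.articles:
#       print(art['title'], art['num_citations'])
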
class CiteQuerier(ScholarQuerier):
    CITE_URL = 'http://scholar.google.com/scholar?q=info:%(id)s:scholar.google.com/&output=cite&scirp=0&hl=en'

    class Parser(CiteParser):
        def __init__(self, querier):
            CiteParser.__init__(self)
            self.querier = querier

    def __init__(self, id):
        self.id = id
        self.parser = self.Parser(self)
        self.query()

    def query(self):
        url_cite = self.CITE_URL % {'id': quote(encode(self.id))}
        r = requests.get(url_cite, headers={'User-Agent': self.USER_AGENT}, cookies=CookieJar.COOKIE_JAR)
        CookieJar.COOKIE_JAR = r.cookies
        self.parse(r.text)

    def parse(self, html):
        self.parser.parse(html)

def cite(id):
    querier = CiteQuerier(id)
    r = requests.get(querier.parser.export['EndNote'], headers={'User-Agent': querier.USER_AGENT}, cookies=CookieJar.COOKIE_JAR)
    myFile = codecs.open('end.enw', 'w', 'utf-8')
    myFile.write(r.text)
    myFile.close()
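
# Example (with a hypothetical article id): cite('dfAKif_cVCQJ') fetches the
# citation page for that id, follows the parser's 'EndNote' export link, and
# writes the response to end.enw in the current directory. It raises a
# KeyError if no EndNote export link was found on the page.
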
def articles(query, author, count):
    querier = ScholarQuerier(author=author, count=count)
    querier.query(query)
    articles = querier.articles
    if count > 0:
        articles = articles[:count]
    return articles

def txt(query, author, count):
    querier = ScholarQuerier(author=author, count=count)
    querier.query(query)
    articles = querier.articles
    if count > 0:
        articles = articles[:count]
    c = 1
    for art in articles:
        print('------------' + 'Article #' + str(c) + ':' + '------------')
        c += 1
        print(art.as_txt() + '\n')
    return articles

def csv(query, author, count, header=False, sep='|'):
    querier = ScholarQuerier(author=author, count=count)
    querier.query(query)
    articles = querier.articles
    if count > 0:
        articles = articles[:count]
    for art in articles:
        result = art.as_csv(header=header, sep=sep)
        print(encode(result))
        header = False
    return articles

def main():
    usage = """scholar.py [options] <query string>
A command-line interface to Google Scholar.
Example: scholar.py -c 1 --txt --author einstein quantum"""
    fmt = optparse.IndentedHelpFormatter(max_help_position=50, width=100)
    parser = optparse.OptionParser(usage=usage, formatter=fmt)
    parser.add_option('-a', '--author',
                      help='Author name')
    parser.add_option('--csv', action='store_true',
                      help='Print article data in CSV form (separator is "|")')
    parser.add_option('--csv-header', action='store_true',
                      help='Like --csv, but print header with column names')
    parser.add_option('--txt', action='store_true',
                      help='Print article data in text format')
    parser.add_option('--cite', help='Cite article')
    parser.add_option('-c', '--count', type='int',
                      help='Maximum number of results')
    parser.set_defaults(count=0, author='')
    options, args = parser.parse_args()

    # Show help if we have neither keyword search nor author name
    if len(args) == 0 and options.author == '':
        parser.print_help()
        return 1

    query = ' '.join(args)
    a = []
    if options.csv:
        a = csv(query, author=options.author, count=options.count)
    elif options.csv_header:
        a = csv(query, author=options.author, count=options.count, header=True)
    elif options.txt:
        a = txt(query, author=options.author, count=options.count)

    if options.cite:
        if a:
            # Cite the n-th article of the listing produced above.
            cite(a[int(options.cite) - 1]['id'])
        else:
            # No listing was requested: treat the option value as an article id.
            cite(options.cite)
    return 0

if __name__ == "__main__":
    sys.exit(main())
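
# Example invocations, assuming the file is saved as scholar.py and the
# requests and BeautifulSoup packages are installed:
#
#   python scholar.py -c 1 --txt --author einstein quantum
#   python scholar.py --csv-header -c 10 "support vector machines"
#   python scholar.py --csv --cite 1 --author einstein quantum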