Created February 26, 2014 19:47
#! /usr/bin/env python
# Copyright 2010--2013 Christian Kreibich. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above
#    copyright notice, this list of conditions and the following
#    disclaimer in the documentation and/or other materials provided
#    with the distribution.
#
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import optparse
import sys
import codecs
import re

try:
    from urllib.parse import quote
except ImportError:
    from urllib import quote

import requests

# Import BeautifulSoup -- try 4 first, fall back to older
try:
    from bs4 import BeautifulSoup
except ImportError:
    try:
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        print('We need BeautifulSoup, sorry...')
        sys.exit(1)
# Support unicode in both Python 2 and 3. In Python 3, unicode is str.
if sys.version_info[0] == 3:
    unicode = str  # pylint: disable-msg=W0622
    encode = lambda s: s  # pylint: disable-msg=C0103
else:
    encode = lambda s: s.encode('utf-8')  # pylint: disable-msg=C0103

class CookieJar(object):
    """Shared cookie store so consecutive requests reuse Google's cookies."""
    COOKIE_JAR = {}

class Article(object):
    """
    A class representing a single article listed on Google Scholar. Field
    values are stored as [value, label, sort order] triples in self.attrs.
    """
    def __init__(self):
        self.attrs = {'id':            [None, 'ID',             0],
                      'title':         [None, 'Title',          1],
                      'url':           [None, 'URL',            2],
                      'num_citations': [0,    'Citations',      3],
                      'num_versions':  [0,    'Versions',       4],
                      'url_citations': [None, 'Citations list', 5],
                      'url_versions':  [None, 'Versions list',  6],
                      'url_cite':      [None, 'Cite URL',       7],
                      'url_related':   [None, 'Related list',   8],
                      'year':          [None, 'Year',           9]}

    def __getitem__(self, key):
        if key in self.attrs:
            return self.attrs[key][0]
        return None

    def __len__(self):
        return len(self.attrs)

    def __setitem__(self, key, item):
        if key in self.attrs:
            self.attrs[key][0] = item
        else:
            self.attrs[key] = [item, key, len(self.attrs)]

    def __delitem__(self, key):
        if key in self.attrs:
            del self.attrs[key]

    def as_txt(self):
        # Get items sorted in specified order:
        items = sorted(list(self.attrs.values()), key=lambda item: item[2])
        # Find largest label length:
        max_label_len = max([len(str(item[1])) for item in items])
        fmt = '%%%ds %%s' % max_label_len
        return '\n'.join([fmt % (item[1], item[0]) for item in items])

    def as_csv(self, header=False, sep='|'):
        # Get keys sorted in specified order:
        keys = [pair[0] for pair in
                sorted([(key, val[2]) for key, val in list(self.attrs.items())],
                       key=lambda pair: pair[1])]
        res = []
        if header:
            res.append(sep.join(keys))
        res.append(sep.join([unicode(self.attrs[key][0]) for key in keys]))
        return '\n'.join(res)
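
# A minimal sketch of the Article container above (hypothetical values, not
# fetched from Google Scholar): fields can be read and written like a dict,
# and rendered via as_txt() or as_csv().
#
#     art = Article()
#     art['title'] = 'On the Electrodynamics of Moving Bodies'
#     art['year'] = '1905'
#     print(art.as_txt())
#     print(art.as_csv(header=True))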

class ScholarParser(object):
    """
    ScholarParser can parse HTML document strings obtained from Google
    Scholar. It invokes the handle_article() callback on each article
    it finds.
    """
    SCHOLAR_SITE = 'http://scholar.google.com'

    def __init__(self, site=None):
        self.soup = None
        self.article = None
        self.site = site or self.SCHOLAR_SITE
        self.year_re = re.compile(r'\b(?:20|19)\d{2}\b')

    def handle_article(self, art):
        """
        In this base class, the callback does nothing.
        """

    def parse(self, html):
        """
        This method initiates parsing of HTML content.
        """
        self.soup = BeautifulSoup(html)
        for div in self.soup.findAll(ScholarParser._tag_checker):
            self._parse_article(div)

    def _parse_article(self, div):
        self.article = Article()
        for tag in div:
            if not hasattr(tag, 'name'):
                continue
            if tag.name == 'div' and self._tag_has_class(tag, 'gs_rt') and \
                    tag.h3 and tag.h3.a:
                self.article['title'] = ''.join(tag.h3.a.findAll(text=True))
                self.article['url'] = self._path2url(tag.h3.a['href'])
            if tag.name == 'font':
                for tag2 in tag:
                    if not hasattr(tag2, 'name'):
                        continue
                    if tag2.name == 'span' and self._tag_has_class(tag2, 'gs_fl'):
                        self._parse_links(tag2)
        if self.article['title']:
            self.handle_article(self.article)

    def _parse_links(self, span):
        for tag in span:
            if not hasattr(tag, 'name'):
                continue
            if tag.name != 'a' or tag.get('href') is None:
                continue
            if tag.get('href').startswith('/scholar?cites'):
                if hasattr(tag, 'string') and tag.string.startswith('Cited by'):
                    self.article['num_citations'] = \
                        self._as_int(tag.string.split()[-1])
                self.article['url_citations'] = self._path2url(tag.get('href'))
            if tag.get('href').startswith('/scholar?cluster'):
                if hasattr(tag, 'string') and tag.string.startswith('All '):
                    self.article['num_versions'] = \
                        self._as_int(tag.string.split()[1])
                self.article['url_versions'] = self._path2url(tag.get('href'))
    @staticmethod
    def _tag_has_class(tag, klass):
        """
        This predicate function checks whether a BeautifulSoup Tag instance
        has the given CSS class in its class attribute.
        """
        res = tag.get('class') or []
        if type(res) != list:
            # BeautifulSoup 3 can return e.g. 'gs_md_wp gs_ttss',
            # so split -- conveniently produces a list in any case
            res = res.split()
        return klass in res
    @staticmethod
    def _tag_checker(tag):
        return tag.name == 'div' and ScholarParser._tag_has_class(tag, 'gs_r')

    @staticmethod
    def _as_int(obj):
        try:
            return int(obj)
        except ValueError:
            return None

    def _path2url(self, path):
        if path.startswith('http://'):
            return path
        if not path.startswith('/'):
            path = '/' + path
        return self.site + path

class ScholarParser120201(ScholarParser):
    # Parser variant for an older Scholar results layout (presumably the one
    # in effect around February 2012, going by the class name).
    def _parse_article(self, div):
        self.article = Article()
        for tag in div:
            if not hasattr(tag, 'name'):
                continue
            if tag.name == 'h3' and self._tag_has_class(tag, 'gs_rt') and tag.a:
                self.article['title'] = ''.join(tag.a.findAll(text=True))
                self.article['url'] = self._path2url(tag.a['href'])
            if tag.name == 'div' and self._tag_has_class(tag, 'gs_a'):
                year = self.year_re.findall(tag.text)
                self.article['year'] = year[0] if len(year) > 0 else None
            if tag.name == 'div' and self._tag_has_class(tag, 'gs_fl'):
                self._parse_links(tag)
        if self.article['title']:
            self.handle_article(self.article)

class ScholarParser120726(ScholarParser):
    # Parser variant for the Scholar results layout from around July 2012
    # (going by the class name); this is the variant ScholarQuerier uses below.
    def _parse_article(self, div):
        self.article = Article()
        for tag in div:
            if not hasattr(tag, 'name'):
                continue
            if tag.name == 'div' and self._tag_has_class(tag, 'gs_ri'):
                if tag.a:
                    self.article['title'] = ''.join(tag.a.findAll(text=True))
                    self.article['url'] = self._path2url(tag.a['href'])
                if tag.find('div', {'class': 'gs_a'}):
                    year = self.year_re.findall(tag.find('div', {'class': 'gs_a'}).text)
                    self.article['year'] = year[0] if len(year) > 0 else None
                if tag.find('div', {'class': 'gs_fl'}):
                    ltag = tag.find('div', {'class': 'gs_fl'})
                    self._parse_links(ltag)
                    # Extract the article's Scholar id from the gs_ocit onclick handler.
                    id_re = re.compile(r'gs_ocit\(event,\'(.*?)\',')
                    ids = id_re.findall(str(ltag))
                    self.article['id'] = ids[0] if len(ids) > 0 else None
                    if self.article['id']:
                        self.article['url_cite'] = self._path2url(
                            '/scholar?q=info:' + self.article['id'] +
                            ':scholar.google.com/&output=cite&scirp=0&hl=en')
        if self.article['title']:
            self.handle_article(self.article)
    def _parse_links(self, span):
        for tag in span:
            if not hasattr(tag, 'name'):
                continue
            if tag.name != 'a' or tag.get('href') is None:
                continue
            if tag.get('href').startswith('/scholar?q=related'):
                self.article['url_related'] = self._path2url(tag.get('href'))
            if tag.get('href').startswith('/scholar?cites'):
                if hasattr(tag, 'string') and tag.string.startswith('Cited by'):
                    self.article['num_citations'] = \
                        self._as_int(tag.string.split()[-1])
                self.article['url_citations'] = self._path2url(tag.get('href'))
            if tag.get('href').startswith('/scholar?cluster'):
                if hasattr(tag, 'string') and tag.string.startswith('All '):
                    self.article['num_versions'] = \
                        self._as_int(tag.string.split()[1])
                self.article['url_versions'] = self._path2url(tag.get('href'))
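
# A small sketch of how a parser subclass is meant to consume results
# (hypothetical class; ScholarQuerier.Parser below does the same thing to
# collect Article instances):
#
#     class PrintingParser(ScholarParser120726):
#         def handle_article(self, art):
#             print(art.as_txt())
#
#     # PrintingParser().parse(html) then prints every article found in html.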

class CiteParser(ScholarParser):
    # Parses Scholar's "cite" popup: the formatted citation text and the
    # "Import into ..." export links (BibTeX, EndNote, RefMan, ...).
    def __init__(self, site=None):
        self.site = site or self.SCHOLAR_SITE
        self.soup = None
        self.export = {}
        self.text = {}

    def parse(self, html):
        """
        This method initiates parsing of HTML content.
        """
        self.soup = BeautifulSoup(html)
        self._parse_text(self.soup.find("div", {"id": "gs_citt"}))
        self._parse_export(self.soup.find("div", {"id": "gs_citi"}))

    def _parse_text(self, div):
        for tag in div.findAll('tr'):
            self.text[tag.find('th').text] = tag.find('div').text

    def _parse_export(self, div):
        for tag in div.findAll('a'):
            if not hasattr(tag, 'name'):
                continue
            if tag.name != 'a' or tag.get('href') is None:
                continue
            if tag.get('href').startswith('/scholar'):
                if hasattr(tag, 'string') and tag.string.startswith('Import into'):
                    n = tag.string.split()[-1]
                    self.export[n] = self._path2url(tag.get('href'))

class ScholarQuerier(object):
    """
    ScholarQuerier instances conduct a search on Google Scholar and collect
    the results in the articles member, a list of Article instances.
    """
    SCHOLAR_URL = 'http://scholar.google.be/scholar?hl=en&q=%(query)s+author:%(author)s&btnG=Search&as_subj=eng&as_sdt=1,5&as_ylo=&as_vis=0'
    NOAUTH_URL = 'http://scholar.google.be/scholar?hl=en&q=%(query)s&btnG=Search&as_subj=eng&as_sdt=1,5&as_ylo=&as_vis=0'
    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36'
    class Parser(ScholarParser120726):
        def __init__(self, querier):
            ScholarParser120726.__init__(self)
            self.querier = querier

        def handle_article(self, art):
            self.querier.add_article(art)

    def __init__(self, author='', scholar_url=None, count=0):
        self.articles = []
        self.author = author
        # Clip to 100, as Google doesn't support more anyway
        self.count = min(count, 100)
        if author == '':
            self.scholar_url = self.NOAUTH_URL
        else:
            self.scholar_url = scholar_url or self.SCHOLAR_URL
        if self.count != 0:
            self.scholar_url += '&num=%d' % self.count

    def query(self, search):
        self.clear_articles()
        url = self.scholar_url % {'query': quote(encode(search)), 'author': quote(self.author)}
        r = requests.get(url, headers={'User-Agent': self.USER_AGENT}, cookies=CookieJar.COOKIE_JAR)
        CookieJar.COOKIE_JAR = r.cookies
        if r.status_code != 200:
            self.status = False
            print('*** Google is throttling')
        else:
            self.parse(r.text)

    def parse(self, html):
        parser = self.Parser(self)
        parser.parse(html)

    def add_article(self, art):
        self.articles.append(art)

    def clear_articles(self):
        self.status = True
        self.articles = []

class CiteQuerier(ScholarQuerier):
    CITE_URL = 'http://scholar.google.com/scholar?q=info:%(id)s:scholar.google.com/&output=cite&scirp=0&hl=en'

    class Parser(CiteParser):
        def __init__(self, querier):
            CiteParser.__init__(self)
            self.querier = querier

    def __init__(self, id):
        self.id = id
        self.parser = self.Parser(self)
        self.query()

    def query(self):
        url_cite = self.CITE_URL % {'id': quote(encode(self.id))}
        r = requests.get(url_cite, headers={'User-Agent': self.USER_AGENT}, cookies=CookieJar.COOKIE_JAR)
        CookieJar.COOKIE_JAR = r.cookies
        self.parse(r.text)

    def parse(self, html):
        self.parser.parse(html)


def cite(id):
    # Fetch the EndNote export for the given article id and save it to end.enw.
    querier = CiteQuerier(id)
    r = requests.get(querier.parser.export['EndNote'], headers={'User-Agent': querier.USER_AGENT}, cookies=CookieJar.COOKIE_JAR)
    with codecs.open('end.enw', 'w', 'utf-8') as out_file:
        out_file.write(r.text)

def articles(query, author, count):
    querier = ScholarQuerier(author=author, count=count)
    querier.query(query)
    articles = querier.articles
    if count > 0:
        articles = articles[:count]
    return articles


def txt(query, author, count):
    querier = ScholarQuerier(author=author, count=count)
    querier.query(query)
    articles = querier.articles
    if count > 0:
        articles = articles[:count]
    c = 1
    for art in articles:
        print('------------' + 'Article #' + str(c) + ':' + '------------')
        c += 1
        print(art.as_txt() + '\n')
    return articles


def csv(query, author, count, header=False, sep='|'):
    querier = ScholarQuerier(author=author, count=count)
    querier.query(query)
    articles = querier.articles
    if count > 0:
        articles = articles[:count]
    for art in articles:
        result = art.as_csv(header=header, sep=sep)
        print(encode(result))
        header = False
    return articles
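
# A minimal programmatic sketch of the helpers above (the query and author
# strings are illustrative, not taken from the gist): fetch a few results,
# print them, and export the first one to EndNote.
#
#     results = articles('quantum electrodynamics', author='feynman', count=3)
#     for art in results:
#         print(art.as_txt())
#     if results and results[0]['id']:
#         cite(results[0]['id'])  # writes end.enw in the current directory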

def main():
    usage = """scholar.py [options] <query string>
A command-line interface to Google Scholar.
Example: scholar.py -c 1 --txt --author einstein quantum"""

    fmt = optparse.IndentedHelpFormatter(max_help_position=50, width=100)
    parser = optparse.OptionParser(usage=usage, formatter=fmt)
    parser.add_option('-a', '--author',
                      help='Author name')
    parser.add_option('--csv', action='store_true',
                      help='Print article data in CSV form (separator is "|")')
    parser.add_option('--csv-header', action='store_true',
                      help='Like --csv, but print header with column names')
    parser.add_option('--txt', action='store_true',
                      help='Print article data in text format')
    parser.add_option('--cite',
                      help='Cite article (1-based index into the results, or a Scholar article id)')
    parser.add_option('-c', '--count', type='int',
                      help='Maximum number of results')
    parser.set_defaults(count=0, author='')
    options, args = parser.parse_args()

    # Show help if we have neither keyword search nor author name
    if len(args) == 0 and options.author == '':
        parser.print_help()
        return 1

    query = ' '.join(args)
    a = None
    if options.csv:
        a = csv(query, author=options.author, count=options.count)
    elif options.csv_header:
        a = csv(query, author=options.author, count=options.count, header=True)
    elif options.txt:
        a = txt(query, author=options.author, count=options.count)
    if options.cite:
        if a:
            # Interpret --cite as a 1-based index into the results just printed
            cite(a[int(options.cite) - 1]['id'])
        else:
            # No result list was produced; treat the argument as an article id
            cite(options.cite)
    return 0


if __name__ == "__main__":
    sys.exit(main())
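
# Example invocations (a sketch; the queries are illustrative and output
# depends on what Google Scholar returns -- the script reports throttling
# when Google blocks the request):
#
#     python scholar.py --txt -c 5 "portfolio optimization"
#     python scholar.py --csv-header --author einstein quantum
#     python scholar.py --txt -c 3 --cite 1 "support vector machines"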