formigone · December 3, 2019 17:37 · formigone · Dec 3, 2019
diff --git a/arxiv-abstracts-to-markdown.py b/arxiv-abstracts-to-markdown.py
 '''
 The purpose of this script is to quickly collect a large number of abstracts
 from arxiv.org so as to simplify scanning lots of documents on a single topic
 before committing the time to do an in-depth reading of any particular paper.

 @author formigone
 '''

 import requests
 import re
 from IPython.display import Markdown
 from bs4 import BeautifulSoup

 def fetch(query):
    '''
    Search arxiv.org for `query` and return the results as Markdown text.

    `query` is inserted into the search URL verbatim, so it must already be
    URL-encoded (e.g. 'deep+learning'). Up to 200 results are requested.

    For every result the returned text contains a '## <title>' heading, a
    '[<authors> | <id>]' line and the abstract with runs of whitespace
    collapsed to single spaces. An empty string is returned when the page
    contains no results.
    '''
    args = [
        'query={}'.format(query),
        'searchtype=all',
        'source=header',
        'size=200',
    ]
    url = 'https://arxiv.org/search/?' + '&'.join(args)
    # Without a timeout, requests may wait indefinitely on a stalled server.
    res = requests.get(url, timeout=30).text

    soup = BeautifulSoup(res, 'html.parser')

    doc = []
    for row in soup.find_all(class_='arxiv-result'):
        doc.append('## ' + row.find(class_='title').text.strip())

        authors = [auth.text.strip() for auth in row.find(class_='authors').find_all('a')]
        doc_id = row.find(class_='list-title').find('a').text
        doc.append('[{} | {}]'.format(', '.join(authors), doc_id))

        # Drop the trailing "△ Less" toggle label that is part of the abstract text.
        abstract = row.find(class_='abstract-full').text.strip().replace('△ Less', '').strip()
        # re.sub returns a new string; the result must be assigned back or the
        # whitespace collapsing is silently lost.
        abstract = re.sub(r'\s\s+', ' ', abstract)
        doc.append('\n{}\n'.format(abstract))

    return '\n'.join(doc)

 # Fetch the search results for "deep learning" and wrap the Markdown text in an
 # IPython Markdown object. The query is passed pre-encoded ('+' for the space)
 # because fetch() puts it into the URL verbatim.
 # NOTE(review): presumably meant to be the last expression of a notebook cell
 # so the result is rendered — confirm how the script is run.
 Markdown(fetch('deep+learning'))
	'''
	The purpose of this script is to quickly collect a large number of abstracts
	from arxiv.org so as to simplify scanning lots of documents on a single topic
	before committing the time to do an in-depth reading of any particular paper.

	@author formigone
	'''

	import requests
	import re
	from IPython.display import Markdown
	from bs4 import BeautifulSoup

	def fetch(query):
	    '''
	    Search arxiv.org for `query` and return the results as Markdown text.

	    `query` is inserted into the search URL verbatim, so it must already be
	    URL-encoded (e.g. 'deep+learning'). Up to 200 results are requested.

	    For every result the returned text contains a '## <title>' heading, a
	    '[<authors> | <id>]' line and the abstract with runs of whitespace
	    collapsed to single spaces. An empty string is returned when the page
	    contains no results.
	    '''
	    args = [
	        'query={}'.format(query),
	        'searchtype=all',
	        'source=header',
	        'size=200',
	    ]
	    url = 'https://arxiv.org/search/?' + '&'.join(args)
	    # Without a timeout, requests may wait indefinitely on a stalled server.
	    res = requests.get(url, timeout=30).text

	    soup = BeautifulSoup(res, 'html.parser')

	    doc = []
	    for row in soup.find_all(class_='arxiv-result'):
	        doc.append('## ' + row.find(class_='title').text.strip())

	        authors = [auth.text.strip() for auth in row.find(class_='authors').find_all('a')]
	        doc_id = row.find(class_='list-title').find('a').text
	        # Plain '|' separator: a backslash before it is an invalid escape
	        # sequence in a Python string literal.
	        doc.append('[{} | {}]'.format(', '.join(authors), doc_id))

	        # Drop the trailing "△ Less" toggle label that is part of the abstract text.
	        abstract = row.find(class_='abstract-full').text.strip().replace('△ Less', '').strip()
	        # re.sub returns a new string; the result must be assigned back or the
	        # whitespace collapsing is silently lost.
	        abstract = re.sub(r'\s\s+', ' ', abstract)
	        doc.append('\n{}\n'.format(abstract))

	    return '\n'.join(doc)

	# Fetch the search results for "deep learning" and wrap the Markdown text in an
	# IPython Markdown object. The query is passed pre-encoded ('+' for the space)
	# because fetch() puts it into the URL verbatim.
	# NOTE(review): presumably meant to be the last expression of a notebook cell
	# so the result is rendered — confirm how the script is run.
	Markdown(fetch('deep+learning'))