Last active
October 29, 2024 15:19
-
-
Save jozefg/c2542f51a0b9b3f6efe528fcec90e334 to your computer and use it in GitHub Desktop.
Query the arxiv api for latest articles in a given classification.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
from datetime import date, timedelta
import argparse
import itertools
import time

import feedparser
import requests
# ANSI escape sequences used to style terminal output.
OKBLUE = '\033[94m'
BOLD = '\033[1m'
ENDC = '\033[0m'

# Computer-science subject classes, in the `cs.XX` form used by `cat:` queries.
CS_CLASSES = [
    'cs.' + cat for cat in [
        'AI', 'AR', 'CC', 'CE', 'CG', 'CL', 'CR', 'CV', 'CY', 'DB',
        'DC', 'DL', 'DM', 'DS', 'ET', 'FL', 'GL', 'GR', 'GT', 'HC',
        'IR', 'IT', 'LG', 'LO', 'MA', 'MM', 'MS', 'NA', 'NE', 'NI',
        'OH', 'OS', 'PF', 'PL', 'RO', 'SC', 'SD', 'SE', 'SI', 'SY',
    ]
]

# Mathematics subject classes.  `math-ph` is an archive of its own and must
# not receive the `math.` prefix: `math.math-ph` is not a valid category.
MATH_CLASSES = [
    'math.' + cat for cat in [
        'AC', 'AG', 'AP', 'AT', 'CA', 'CO', 'CT', 'CV', 'DG', 'DS',
        'FA', 'GM', 'GN', 'GR', 'GT', 'HO', 'IT', 'KT', 'LO',
        'MG', 'MP', 'NA', 'NT', 'OA', 'OC', 'PR', 'QA', 'RA',
        'RT', 'SG', 'SP', 'ST',
    ]
] + ['math-ph']

# Which categories do we search
CLASSES = CS_CLASSES + MATH_CLASSES

# Maximum articles from each category
MAX_ARTICLES = 50

# Only show articles since this date.
OLDEST_DATE = date.today() - timedelta(days=7)

# The endpoint for the arxiv api
QUERY_ENDPOINT = 'http://export.arxiv.org/api/query'
def display_article(number, title, authors, link, date, info):
    """Print one article as a numbered, tab-indented entry on stdout.

    Args:
        number: Ordinal printed in front of the title.
        title: Article title; rendered in bold.
        authors: List of author names; only the first five are printed,
            followed by 'et al.' when there are more.
        link: URL printed on its own line.
        date: ``time.struct_time`` formatted as YYYY-MM-DD.  (The parameter
            name shadows the imported ``datetime.date`` inside this function.)
        info: Optional extra line (e.g. a journal reference); skipped when
            falsy.
    """
    if len(authors) > 5:
        authors = authors[0:5] + ['et al.']
    print(f'{number}. ' + BOLD + title + ENDC)
    print('\t' + ', '.join(authors))
    print('\t' + time.strftime('%Y-%m-%d', date))
    print('\t' + link)
    if info:
        print('\t' + info)
def show_class(cls, skip, maximum, since, delay=True):
    """Fetch and print the most recently updated articles of one category.

    Args:
        cls: arXiv category identifier, e.g. ``'cs.PL'``.
        skip: Number of leading results to skip (sent as ``start``).
        maximum: Maximum number of results to request (sent as
            ``max_results``).
        since: ``datetime.date``; listing stops at the first article whose
            last update is older than this date.
        delay: When true, sleep for three seconds after the request.

    A failed request is reported on stdout and the category is skipped;
    no exception is propagated for network or HTTP errors.
    """
    try:
        response = requests.get(
            QUERY_ENDPOINT,
            params=[
                ('search_query', f'cat:{cls}'),
                ('sortBy', 'lastUpdatedDate'),
                ('sortOrder', 'descending'),
                ('start', skip),
                ('max_results', maximum),
            ],
            # requests never times out by default; do not hang forever.
            timeout=30,
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        # Covers HTTP error statuses as well as connection/timeout failures.
        print(f'Failed to scrape {cls}.')
        print(err)
        return
    finally:
        # As requested by arxiv, sleep for 3 seconds between requests,
        # whether or not this one succeeded.
        if delay:
            time.sleep(3)

    cutoff = since.timetuple()
    entries = feedparser.parse(response.content)['entries']
    for number, article in enumerate(entries, start=1):
        # Results are sorted newest first, so everything after the first
        # too-old entry is too old as well.
        if article.updated_parsed < cutoff:
            break
        display_article(
            number=number,
            # Titles are line-wrapped in the feed; collapse the whitespace.
            title=' '.join(article.title.split()),
            authors=[author['name'] for author in article.authors],
            link=article.link,
            date=article.updated_parsed,
            info=article.get('arxiv_journal_ref'),
        )
def show(classes, skip, maximum, since):
    """Print recent articles for every category in ``classes``.

    Args:
        classes: Sequence of arXiv category identifiers; may be empty.
        skip: Forwarded to ``show_class`` for each category.
        maximum: Forwarded to ``show_class`` for each category.
        since: Forwarded to ``show_class`` for each category.
    """
    last_index = len(classes) - 1
    for index, cls in enumerate(classes):
        print(f'{OKBLUE}Fetching articles from {cls}...{ENDC}')
        # Skip the rate-limit pause after the final request so we incur no
        # unnecessary delay before exiting.
        show_class(cls, skip, maximum, since, delay=index < last_index)
def main(argv=None):
    """Parse the command line and run the requested subcommand.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(prog='arxiv_scraper')
    subcommand_parsers = parser.add_subparsers(required=True, dest="cmd")

    # Options and arguments for the show command
    show_parser = subcommand_parsers.add_parser(
        "show", help='Show recent articles'
    )
    show_parser.add_argument(
        "classes",
        nargs='+',
        choices=CLASSES,
        help="List of subject classifications to scrape.",
        metavar='CLASS',
    )
    show_parser.add_argument(
        "--max",
        type=int,
        help=f"Maximum number of articles to display, default is {MAX_ARTICLES}",
        default=MAX_ARTICLES,
    )
    show_parser.add_argument(
        "--skip",
        type=int,
        help="Skip the first SKIP entries, default is 0",
        default=0,
    )
    show_parser.add_argument(
        "--since",
        # Let argparse validate the date so a malformed value produces a
        # usage error instead of a traceback.  String defaults are passed
        # through `type` as well, so the result is always a `date`.
        type=date.fromisoformat,
        help=f"Only show articles since YYYY-MM-DD, default is {OLDEST_DATE}",
        default=OLDEST_DATE.isoformat(),
    )

    args = parser.parse_args(argv)
    if args.cmd == 'show':
        show(
            classes=args.classes,
            maximum=args.max,
            skip=args.skip,
            since=args.since,
        )


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment