Last active
February 14, 2024 12:58
-
-
Save mjpost/201a1b2753d82f6aaf0654e499bbfbcc to your computer and use it in GitHub Desktop.
Uses the Semantic Scholar API (with Anthology support!) to get paper citation counts for an Anthology volume
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
"""Uses the Semantic Scholar API to get citation counts for all papers in | |
an ACL volume. Assumes old-style IDs (e.g., P96-1). | |
Mad props to Semantic Scholar for making this so easy. | |
""" | |
import json | |
import os | |
import shutil | |
import sys | |
import urllib | |
def retrieve_url(remote_url: str, local_path: str):
    """
    Saves a URL to a local path. Can handle cookies, e.g., those
    used downloading PDFs from MIT Press (TACL, CL).

    :param remote_url: The URL to download from. Anything starting with
        "http" is fetched over the network; any other value is treated as a
        local file path and copied.
    :param local_path: Where to save the file to.
    :return: Always True; failures surface as exceptions raised by
        urllib / shutil / open rather than as a False return value.
    """
    if remote_url.startswith("http"):
        # Imported here because the file-level `import urllib` does not
        # load the `urllib.request` submodule.
        import urllib.request

        cookie_processor = urllib.request.HTTPCookieProcessor()
        opener = urllib.request.build_opener(cookie_processor)
        request = urllib.request.Request(
            remote_url, headers={'User-Agent': 'Mozilla/5.0'}
        )
        # The output file is only opened after the request has succeeded, so
        # a failed request does not create or truncate local_path.
        with opener.open(request, timeout=1000) as response, open(
            local_path, mode="wb"
        ) as output_fh:
            # Stream in chunks instead of holding the whole body in memory.
            shutil.copyfileobj(response, output_fh)
    else:
        shutil.copyfile(remote_url, local_path)
    return True
def main(args):
    """
    Prints one tab-separated line (number of citations, influential citation
    count, title) for each paper in a volume.

    Paper numbers 1..999 are appended, zero-padded to three digits, to
    ``args.volume`` to form the Semantic Scholar lookup ID. Each response is
    saved to ``t.json`` in the current directory and then parsed.

    The loop has no separate end-of-volume test: the first paper that cannot
    be downloaded or parsed prints a message to stderr and exits with
    status 1.

    :param args: Object with a ``volume`` attribute (e.g., the argparse
        namespace built by the script entry point).
    """
    for num in range(1, 1000):
        # NOTE(review): this is the legacy /v1/ endpoint; check whether it is
        # still served or whether the Graph API should be used instead.
        url = f"https://api.semanticscholar.org/v1/paper/ACL:{args.volume}{num:03d}"
        try:
            retrieve_url(url, "t.json")
            # Binary mode lets the json module detect the encoding itself
            # instead of relying on the locale default; `with` closes the
            # handle on every iteration.
            with open("t.json", "rb") as json_fh:
                d = json.load(json_fh)

            # Here are all the supported keys:
            #   abstract, arxivId, authors, citationVelocity, citations,
            #   corpusId, doi, fieldsOfStudy, influentialCitationCount,
            #   isOpenAccess, isPublisherLicensed, is_open_access,
            #   is_publisher_licensed, numCitedBy, numCiting, paperId,
            #   references, title, topics, url, venue, year
            print(len(d["citations"]), d["influentialCitationCount"], d["title"], sep="\t")
        except Exception:
            # Deliberately broad (network, JSON and missing-key errors all end
            # the run), but no longer a bare `except:` so that Ctrl-C and
            # SystemExit propagate normally.
            print(f"* couldn't find URL {url}, quitting.", file=sys.stderr)
            sys.exit(1)
if __name__ == "__main__":
    import argparse

    # Command-line entry point: a single positional argument naming the
    # volume, passed through to main() as `args.volume`.
    parser = argparse.ArgumentParser(
        description="Get Semantic Scholar citation counts for all papers in an ACL volume."
    )
    parser.add_argument(
        "volume",
        help="old-style ACL volume ID, e.g., P96-1",
    )
    args = parser.parse_args()
    main(args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Running on volume 1 of ACL 1996 gives the following. The columns are (citations, influential citations, title).