Crawl ICLR 2022 review scores, confidences, titles, and links from OpenReview for visualization.
### ICLR 2022 Scraper

import os.path
import pickle
import urllib.request
import json
from math import ceil
from multiprocessing import Queue, Process

from tqdm import trange, tqdm

# paginated list of all blind submissions, 50 per page
URL_TITLES = "https://api.openreview.net/notes?details=replyCount%2Cinvitation%2Coriginal&offset={start}&limit=50&invitation=ICLR.cc%2F2022%2FConference%2F-%2FBlind_Submission"
# all notes (reviews, comments, decisions) in a single paper's forum
URL_REVIEWS = "https://api.openreview.net/notes?forum={paper_id}&trash=true&details=replyCount%2Cwritable%2Crevisions%2Coriginal%2Coverwriting%2Cinvitation%2Ctags"

TOTAL_SUBMISSIONS = 2855  # from pulling the URL once manually and looking at the count
PROCESSES = 5  # how many workers are pulling reviews
# Step 1: page through the submission list and collect ids, titles, and forum links.
paper_data = {
    "paper_ids": [],
    "paper_links": [],
    "paper_titles": [],
}

pages = ceil(TOTAL_SUBMISSIONS / 50)
for p_no in trange(pages):
    with urllib.request.urlopen(URL_TITLES.format(start=p_no * 50)) as url:
        data = json.loads(url.read().decode())
        for note in data["notes"]:
            paper_id = note["id"]
            paper_title = note["content"]["title"]
            paper_link = f"https://openreview.net/forum?id={paper_id}"
            paper_data["paper_ids"].append(paper_id)
            paper_data["paper_titles"].append(paper_title)
            paper_data["paper_links"].append(paper_link)
# Step 2: worker function -- fetch each paper's forum and extract the review scores.
def pull_reviews(q, paper_ids, paper_titles, paper_links):
    for paper_idx, paper_id in enumerate(paper_ids):
        paper_score = []
        paper_confs = []
        with urllib.request.urlopen(URL_REVIEWS.format(paper_id=paper_id)) as url:
            data = json.loads(url.read().decode())
            for note in data["notes"]:
                # only official reviews carry a "recommendation" field
                if "recommendation" in note["content"]:
                    # both fields are strings like "8: accept, good paper";
                    # keep only the leading number
                    paper_rec = note["content"]["recommendation"]
                    paper_rec = paper_rec.split(":")[0]
                    paper_score.append(int(paper_rec))
                    paper_conf = note["content"]["confidence"]
                    paper_conf = paper_conf.split(":")[0]
                    paper_confs.append(int(paper_conf))
        q.put([paper_id, paper_titles[paper_idx], paper_links[paper_idx], paper_score, paper_confs])
# Step 3: split the paper list into one chunk per worker. ceil() (rather than int())
# guarantees at most PROCESSES chunks even when the count isn't evenly divisible;
# otherwise a leftover chunk would never be assigned and the loop below would hang.
chunk_len = ceil(len(paper_data["paper_ids"]) / PROCESSES)

def make_chunks(lst):
    return [lst[i : i + chunk_len] for i in range(0, len(lst), chunk_len)]

paper_id_chunks = make_chunks(paper_data["paper_ids"])
paper_title_chunks = make_chunks(paper_data["paper_titles"])
paper_link_chunks = make_chunks(paper_data["paper_links"])
# NOTE: on platforms that spawn rather than fork worker processes (macOS, Windows),
# this section and everything below it should live under an
# `if __name__ == "__main__":` guard.
q = Queue()
procs = []
for proc_id in range(PROCESSES):
    p = Process(
        target=pull_reviews,
        args=(q, paper_id_chunks[proc_id], paper_title_chunks[proc_id], paper_link_chunks[proc_id]),
    )
    p.start()
    procs.append(p)
# Aggregated results, filled as workers push finished papers onto the queue.
paper_data2 = {
    "paper_ids": [],
    "paper_links": [],
    "paper_titles": [],
    "paper_scores": [],
    "paper_confidences": [],
    "paper_scores_mean": [],
    "paper_confidences_mean": [],
}
# Step 4: collect results from the queue and periodically checkpoint to disk.
with tqdm(total=TOTAL_SUBMISSIONS) as pbar:
    counter = 0
    while True:
        paper_id, paper_title, paper_link, paper_scores, paper_confs = q.get(block=True)
        paper_data2["paper_ids"].append(paper_id)
        paper_data2["paper_titles"].append(paper_title)
        paper_data2["paper_links"].append(paper_link)
        paper_data2["paper_scores"].append(paper_scores)
        # guard against papers that have no reviews yet (avoids division by zero)
        paper_data2["paper_scores_mean"].append(sum(paper_scores) / len(paper_scores) if paper_scores else float("nan"))
        paper_data2["paper_confidences"].append(paper_confs)
        paper_data2["paper_confidences_mean"].append(sum(paper_confs) / len(paper_confs) if paper_confs else float("nan"))
        counter += 1
        # checkpoint every 100 papers so a crash doesn't lose everything
        if counter % 100 == 0:
            print(len(paper_data2["paper_ids"]))
            with open(os.path.expanduser("~/iclr22-papers.pickle"), "wb") as f:
                pickle.dump(paper_data2, f)
            pbar.update(100)
        if counter == TOTAL_SUBMISSIONS:
            with open(os.path.expanduser("~/iclr22-papers.pickle"), "wb") as f:
                pickle.dump(paper_data2, f)
            print("done")
            break

for p in procs:
    p.join()
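For the "visualization" part mentioned in the description, here is a minimal sketch of how the resulting pickle could be loaded and plotted. The matplotlib usage is an assumption on my part; the gist itself only produces the pickle file.

import math
import os.path
import pickle

import matplotlib.pyplot as plt  # assumed: any plotting library would do

with open(os.path.expanduser("~/iclr22-papers.pickle"), "rb") as f:
    papers = pickle.load(f)

# drop papers without reviews (mean stored as NaN above) before plotting
means = [m for m in papers["paper_scores_mean"] if not math.isnan(m)]

# histogram of mean review scores across all crawled submissions
plt.hist(means, bins=20)
plt.xlabel("mean review score")
plt.ylabel("number of papers")
plt.title("ICLR 2022 mean review scores")
plt.show()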
A bit late, but yeah, that's right. Thanks for pointing that out, @ankitkv! 😄