Last active
January 20, 2022 07:29
-
-
Save yzhangcs/697b88c9f685f31efc00af6295c863eb to your computer and use it in GitHub Desktop.
Script for crawling daily CL/ML/CV papers on arXiv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re | |
from datetime import datetime, timedelta | |
import arxiv | |
# Terms to highlight in paper titles and abstracts. Entries are grouped by
# initial letter; the overall order is the order in which they are applied.
KEYS = [
    # a
    "adversarial", "algebraic", "algebratic", "amr", "auto-encoding", "autoencoder", "autoencoding",
    "autoregressive",
    # b
    "backward", "bayes", "bayesian", "bethe", "bilexical", "bipartite", "bregman",
    # c
    "carlo", "chomsky", "circuit", "clique", "constituency", "constituent", "context-free", "crf",
    # d
    "dependency", "differentiable", "differential", "differentiation", "discrete", "discretized", "duality",
    # e
    "energy", "euclidean", "expectation", "exponential",
    # f
    "fenchel-young", "filter", "flow", "flowseq", "forest", "forward", "frank-wolfe",
    # g
    "gaussian", "generation", "grammar", "gumbel", "gumbel-softmax",
    # h
    "higher-order", "hmm", "hypergraph",
    # i
    "induction", "inside", "invertible",
    # l
    "lagrangian", "latent", "levenshtein", "lexicalized", "low-rank",
    # m
    "marginal", "markov", "masking", "mcmc", "mean-field", "message-passing", "monte", "mutual",
    # n
    "non-projective", "normalizing",
    # o
    "optimal", "outside",
    # p
    "parse", "parser", "parsing", "particle", "partition", "pcfg", "perturb-and-map", "perturb-and-parse",
    "perturbation", "posterior", "probabilistic", "probabilistically", "programming", "projection",
    "prototype", "proximal",
    # r
    "randomized", "ranking", "rao-blackwell", "regularization", "regularized", "relaxation", "reorder",
    "reparameterization",
    # s
    "sample", "sampling", "second-order", "semi-amortized", "semiring", "sequence", "simplex", "sinkhorn",
    "sparse", "sparsemap", "sparsemax", "stochastic", "stochasticity", "struct", "structural", "structure",
    "structured", "sum-product", "syntax",
    # t
    "transformer", "translation", "transport", "tree", "treecrf",
    # v
    "variational", "viterbi",
]

# Author names to highlight in each paper's author list.
AUTHORS = [
    "Alexander M. Rush", "André F. T. Martins",
    "Bailin Wang",
    "Caio Corro", "Chris Dyer",
    "Daniel Gildea", "David Chiang", "David M. Blei",
    "Eduard Hovy",
    "Giorgio Satta", "Graham Neubig",
    "Ivan Titov",
    "Jason Eisner", "Justin T. Chiu",
    "Kevin Gimpel",
    "Lifu Tu", "Lingpeng Kong",
    "Mathieu Blondel", "Michael Collins", "Mirella Lapata",
    "Noah A. Smith",
    "Ryan Cotterell",
    "Shay B. Cohen", "Songlin Yang",
    "Tim Vieira",
    "Vlad Niculae",
    "Xiang Lisa Li", "Xuezhe Ma",
    "Yao Fu", "Yoon Kim", "Yuntian Deng",
]

# arXiv category identifiers; each one is used as a separate search query.
CLASSES = [
    "cs.CL",
    "cs.LG",
    "cs.CV",
]
def red(t):
    """Return *t* wrapped in the ANSI escape codes for bright-red text."""
    template = "\033[91m{}\033[0m"  # 91 = bright red foreground, 0 = reset
    return template.format(t)
def green(t):
    """Return *t* wrapped in the ANSI escape codes for bright-green text."""
    template = "\033[92m{}\033[0m"  # 92 = bright green foreground, 0 = reset
    return template.format(t)
def match(t, keys):
    """Highlight every occurrence of any of *keys* in *t* in red.

    A key matches case-insensitively when it starts at the beginning of the
    text or directly after whitespace or a hyphen. Nothing is required after
    the key, so ``'parse'`` also highlights the start of ``'parses'``.

    Args:
        t: The text to scan.
        keys: Iterable of literal strings to look for. Empty strings are
            ignored.

    Returns:
        A ``(text, matched)`` tuple: the text with each match wrapped by
        ``red``, and a bool telling whether at least one key matched.
    """
    # Longest keys first, so that 'structured' wins over its prefix 'struct'
    # when both could match at the same position.
    ordered = sorted((key for key in keys if key), key=len, reverse=True)
    if not ordered:
        return t, False
    # Keys are literals, not patterns: escape them so e.g. the dots in
    # 'Alexander M. Rush' only match real dots.
    alternatives = '|'.join(re.escape(key) for key in ordered)
    # The negative lookbehind accepts start-of-text as well as any whitespace
    # (tabs between authors, newlines inside abstracts) or a hyphen before
    # the key; a plain `(?<=[- ])` would miss all of those except ' ' and '-'.
    pattern = re.compile(r'(?<![^\s-])(?:' + alternatives + ')', flags=re.I)
    # A single pass over the text keeps one highlight from blocking or
    # splitting another.
    highlighted, count = pattern.subn(lambda m: red(m.group()), t)
    return highlighted, count > 0
# Titles that have already been printed, so a paper returned for more than
# one category is shown only once.
seen_titles = set()

for category in CLASSES:
    # Request up to 50 results for this category, ordered by last-updated date.
    query = arxiv.Search(query=category, max_results=50, sort_by=arxiv.SortCriterion.LastUpdatedDate)
    for entry in query.results():
        if entry.title in seen_titles:
            continue
        # Skip anything last updated more than 3 days ago; "now" is taken in
        # the same timezone as the paper's timestamp.
        cutoff = datetime.now(entry.updated.tzinfo) - timedelta(3)
        if entry.updated < cutoff:
            continue
        seen_titles.add(entry.title)

        author_line = '\t'.join(str(author) for author in entry.authors)
        shown_title, _ = match(entry.title, KEYS)
        shown_authors, _ = match(author_line, AUTHORS)
        shown_abstract, abstract_hit = match(entry.summary, KEYS)

        print('='*120)
        print(green('[TITLE] '), shown_title)
        print(green('[AUTHORS] '), shown_authors)
        # The abstract is printed only when it contains at least one keyword.
        if abstract_hit:
            print(green('[ABSTRACT] '), shown_abstract)
        print(green('[LINK] '), entry.entry_id)
        print(green('[DATE] '), entry.updated)
        print(green('[CATEGORIES] '), '\t'.join(entry.categories))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment