Skip to content

Instantly share code, notes, and snippets.

@yzhangcs
Last active January 20, 2022 07:29
Show Gist options
  • Save yzhangcs/697b88c9f685f31efc00af6295c863eb to your computer and use it in GitHub Desktop.
Save yzhangcs/697b88c9f685f31efc00af6295c863eb to your computer and use it in GitHub Desktop.
Script for crawling daily CL/ML/CV papers on arxiv
# -*- coding: utf-8 -*-
import re
from datetime import datetime, timedelta
import arxiv
# Keywords highlighted in paper titles and abstracts (handed to `match` as its
# `keys` argument). Entries are grouped by initial letter, in alphabetical order.
KEYS = [
    'adversarial', 'algebraic', 'algebratic', 'amr', 'auto-encoding', 'autoencoder', 'autoencoding',
    'autoregressive',
    'backward', 'bayes', 'bayesian', 'bethe', 'bilexical', 'bipartite', 'bregman',
    'carlo', 'chomsky', 'circuit', 'clique', 'constituency', 'constituent', 'context-free', 'crf',
    'dependency', 'differentiable', 'differential', 'differentiation', 'discrete', 'discretized', 'duality',
    'energy', 'euclidean', 'expectation', 'exponential',
    'fenchel-young', 'filter', 'flow', 'flowseq', 'forest', 'forward', 'frank-wolfe',
    'gaussian', 'generation', 'grammar', 'gumbel', 'gumbel-softmax',
    'higher-order', 'hmm', 'hypergraph',
    'induction', 'inside', 'invertible',
    'lagrangian', 'latent', 'levenshtein', 'lexicalized', 'low-rank',
    'marginal', 'markov', 'masking', 'mcmc', 'mean-field', 'message-passing', 'monte', 'mutual',
    'non-projective', 'normalizing',
    'optimal', 'outside',
    'parse', 'parser', 'parsing', 'particle', 'partition', 'pcfg', 'perturb-and-map', 'perturb-and-parse',
    'perturbation', 'posterior', 'probabilistic', 'probabilistically', 'programming', 'projection',
    'prototype', 'proximal',
    'randomized', 'ranking', 'rao-blackwell', 'regularization', 'regularized', 'relaxation', 'reorder',
    'reparameterization',
    'sample', 'sampling', 'second-order', 'semi-amortized', 'semiring', 'sequence', 'simplex', 'sinkhorn',
    'sparse', 'sparsemap', 'sparsemax', 'stochastic', 'stochasticity', 'struct', 'structural', 'structure',
    'structured', 'sum-product', 'syntax',
    'transformer', 'translation', 'transport', 'tree', 'treecrf',
    'variational', 'viterbi',
]

# Author names highlighted in each paper's author list, one per line.
AUTHORS = [
    'Alexander M. Rush',
    'André F. T. Martins',
    'Bailin Wang',
    'Caio Corro',
    'Chris Dyer',
    'Daniel Gildea',
    'David Chiang',
    'David M. Blei',
    'Eduard Hovy',
    'Giorgio Satta',
    'Graham Neubig',
    'Ivan Titov',
    'Jason Eisner',
    'Justin T. Chiu',
    'Kevin Gimpel',
    'Lifu Tu',
    'Lingpeng Kong',
    'Mathieu Blondel',
    'Michael Collins',
    'Mirella Lapata',
    'Noah A. Smith',
    'Ryan Cotterell',
    'Shay B. Cohen',
    'Songlin Yang',
    'Tim Vieira',
    'Vlad Niculae',
    'Xiang Lisa Li',
    'Xuezhe Ma',
    'Yao Fu',
    'Yoon Kim',
    'Yuntian Deng',
]

# arXiv category identifiers; each one is used verbatim as a search query.
CLASSES = [
    'cs.CL',
    'cs.LG',
    'cs.CV',
]
def red(t: object) -> str:
    """Return *t* formatted as text wrapped in the ANSI bright-red escape sequence.

    Args:
        t: Value to colour; it is interpolated with normal f-string formatting.

    Returns:
        ``t`` preceded by ``ESC[91m`` and followed by the ``ESC[0m`` reset code.
    """
    return f"\033[91m{t}\033[0m"
def green(t: object) -> str:
    """Return *t* formatted as text wrapped in the ANSI bright-green escape sequence.

    Args:
        t: Value to colour; it is interpolated with normal f-string formatting.

    Returns:
        ``t`` preceded by ``ESC[92m`` and followed by the ``ESC[0m`` reset code.
    """
    return f"\033[92m{t}\033[0m"
def match(t, keys):
    """Highlight every occurrence of the given keys in *t* using ``red``.

    Matching is case-insensitive. A key is recognised when it begins at the
    start of the text or directly after whitespace or a hyphen; nothing is
    required after the key, so ``'tree'`` also highlights the prefix of
    ``'trees'``.

    Args:
        t: Text to search.
        keys: Iterable of literal strings to highlight. They are escaped, so
            characters such as ``.`` or ``-`` only match themselves.

    Returns:
        A ``(text, matched)`` tuple: the text with each hit wrapped by
        ``red``, and a bool telling whether at least one key was found.
    """
    # Drop empty keys: an empty alternative would "match" at every boundary.
    # Try longer keys first so 'gumbel-softmax' is not pre-empted by 'gumbel'.
    ordered = sorted((key for key in keys if key), key=len, reverse=True)
    if not ordered:
        return t, False
    alternatives = '|'.join(re.escape(key) for key in ordered)
    # `(?<![^-\s])` means "not preceded by anything other than a hyphen or
    # whitespace", which, unlike a positive lookbehind, also holds at the very
    # start of the text and after tabs/newlines.
    pattern = re.compile(rf'(?<![^-\s])(?:{alternatives})', flags=re.I)
    highlighted, hits = pattern.subn(lambda m: red(m.group()), t)
    return highlighted, hits > 0
# Titles already printed, so a paper returned for several categories is shown once.
titles = set()
for name in CLASSES:
    # Ask arXiv for the 50 most recently updated entries matching this category.
    # NOTE(review): newer releases of the `arxiv` package deprecate
    # `Search.results()` in favour of `arxiv.Client().results(search)` - confirm
    # against the installed version before upgrading.
    search = arxiv.Search(query=name, max_results=50, sort_by=arxiv.SortCriterion.LastUpdatedDate)
    for paper in search.results():
        if paper.title in titles:
            continue
        # Only keep papers updated within the last three days. "Now" is taken
        # in the timestamp's own tzinfo so the two datetimes are comparable.
        if paper.updated < datetime.now(paper.updated.tzinfo) - timedelta(days=3):
            continue
        titles.add(paper.title)

        print('='*120)
        title, _ = match(paper.title, KEYS)
        authors, _ = match('\t'.join(str(author) for author in paper.authors), AUTHORS)
        abstract, matched = match(paper.summary, KEYS)
        print(green('[TITLE] '), title)
        print(green('[AUTHORS] '), authors)
        # NOTE(review): the original paste lost its indentation; it is assumed
        # that `if matched` guards only the abstract line, i.e. the abstract is
        # shown only when it contains at least one highlighted keyword.
        if matched:
            print(green('[ABSTRACT] '), abstract)
        print(green('[LINK] '), paper.entry_id)
        print(green('[DATE] '), paper.updated)
        print(green('[CATEGORIES] '), '\t'.join(paper.categories))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment