Last active
January 9, 2020 08:10
-
-
Save amaya382/a437ce5f9bffc79c742b39da731c1e89 to your computer and use it in GitHub Desktop.
fetch and notify papers from arXiv
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# based on https://www.takapy.work/entry/2019/07/15/121436 | |
import os | |
import re | |
import json | |
import urllib.request | |
import datetime | |
def parse_xml(data, tag):
    """Return the inner text of every ``<tag>...</tag>`` element in *data*.

    This is a lightweight regex scan rather than a real XML parser: only
    opening tags written exactly as ``<tag>`` (no attributes) are matched,
    and each match ends at the first following ``</tag>``.

    Args:
        data: Text to scan.
        tag: Literal element name to look for.

    Returns:
        A list with the raw contents (newlines included) of each matching
        element, in document order; an empty list when nothing matches.
    """
    # Escape the tag so regex metacharacters in it (e.g. ".") are matched
    # literally instead of acting as wildcards.
    name = re.escape(tag)
    # Raw strings keep "\s"/"\S" as regex classes without relying on
    # Python's deprecated handling of unknown string escapes.
    pattern = r"<" + name + r">([\s\S]*?)</" + name + r">"
    return re.findall(pattern, data)
def fetch_papers(categories, keywords, base_date, prev_date, max_results=None):
    """Query the arXiv export API and return the raw ``<entry>`` blocks.

    The search is restricted to papers whose ``submittedDate`` lies between
    midnight of *prev_date* and midnight of *base_date*, that belong to ANY
    of *categories* and that match ALL of *keywords*.

    Args:
        categories: Iterable of arXiv category names (e.g. ``"cs.AI"``),
            combined with OR. An empty iterable drops the category filter.
        keywords: Iterable of search terms, combined with AND. An empty
            iterable drops the keyword filter.
        base_date: Object with ``strftime``; upper bound of the date range.
        prev_date: Object with ``strftime``; lower bound of the date range.
        max_results: Optional maximum number of entries to request. When
            omitted the parameter is not sent and the API's own default
            page size applies (documented as 10 entries).

    Returns:
        A list of strings, one per ``<entry>`` element in the response.

    Raises:
        urllib.error.URLError: If the HTTP request fails.
    """
    from urllib.parse import quote

    def clause(field, terms, operator):
        # Percent-encode each term so spaces, quotes and non-ASCII text do
        # not produce an invalid URL. "%" and "+" are left alone so terms
        # that callers already encoded keep working unchanged.
        encoded = [field + ":" + quote(term, safe="%+") for term in terms]
        # "%28" / "%29" are encoded parentheses that group the terms.
        return "%28" + operator.join(encoded) + "%29"

    query_parts = [
        "submittedDate:["
        + prev_date.strftime("%Y%m%d") + "0000+TO+"
        + base_date.strftime("%Y%m%d") + "0000]"
    ]
    # Skip empty groups: an empty "()" clause is not a valid query term.
    if categories:
        query_parts.append(clause("cat", categories, "+OR+"))
    if keywords:
        query_parts.append(clause("all", keywords, "+AND+"))

    url = ("http://export.arxiv.org/api/query?search_query="
           + "+AND+".join(query_parts))
    if max_results is not None:
        url += "&max_results=" + str(int(max_results))

    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        data = response.read().decode("utf-8")
    return parse_xml(data, "entry")
def notify(webhook, papers, interval, keywords):
    """Report the fetched papers to Slack.

    A summary message is posted first. If *papers* is empty that message
    says nothing was found and the function returns; otherwise one further
    message per paper is posted, carrying a single attachment with the
    paper's title, link, authors, abstract and publication date.

    Args:
        webhook: Slack webhook URL handed through to ``post_to_slack``.
        papers: Sequence of raw ``<entry>`` strings.
        interval: Number of days covered; only used in the summary text.
        keywords: Iterable of keyword strings; shown in the summary text.
    """
    keyword_label = ", ".join(keywords)
    paper_count = len(papers)

    if paper_count == 0:
        post_to_slack(
            webhook,
            text=f"直近{interval}日にPublishされた論文はありませんでした [{keyword_label}]",
        )
        return

    post_to_slack(
        webhook,
        text=f"直近{interval}日にPublishされた{paper_count}件の論文が見つかりました [{keyword_label}]",
    )

    def first(entry, tag):
        # First occurrence of <tag> inside this entry.
        return parse_xml(entry, tag)[0]

    def squeeze(text):
        # Collapse newlines and runs of whitespace into single spaces.
        return " ".join(text.split())

    for entry in papers:
        link = first(entry, "id")
        heading = squeeze(first(entry, "title"))
        # Keep only the leading YYYY-MM-DD part of the timestamp.
        published_on = first(entry, "published")[:10]
        authors = ", ".join(parse_xml(entry, "name"))
        abstract = squeeze(first(entry, "summary"))

        fields = [
            {"title": "Abstract", "value": abstract},
            {"title": "Published", "value": published_on},
        ]
        attachment = {
            "title": heading,
            "title_link": link,
            "author_name": authors,
            "fields": fields,
        }
        post_to_slack(webhook, attachments=[attachment])
def post_to_slack(webhook, text=None, attachments=None):
    """Send a JSON payload to a Slack incoming webhook via HTTP POST.

    Args:
        webhook: URL to post to.
        text: Optional message text; omitted from the payload when ``None``.
        attachments: Optional list of attachment dicts; omitted from the
            payload when ``None``.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for a
            non-success status).
    """
    payload = {}
    if text is not None:
        payload["text"] = text
    if attachments is not None:
        payload["attachments"] = attachments

    body = json.dumps(payload).encode("utf-8")
    # Declare the body as JSON; without this header urllib labels it
    # "application/x-www-form-urlencoded", which misdescribes the payload.
    request = urllib.request.Request(
        webhook,
        data=body,
        headers={"Content-Type": "application/json"},
    )
    # Use the response as a context manager so the connection is closed.
    with urllib.request.urlopen(request):
        pass
def lambda_handler(event, context):
    """Fetch recent arXiv papers and post them to Slack.

    Args:
        event: Mapping with the keys
            ``"webhook"`` (Slack webhook URL, e.g.
            ``https://hooks.slack.com/services/xxx/yyy/zzz``),
            ``"interval"`` (number of days to look back),
            ``"categories"`` (list of arXiv categories such as
            ``["cs.AI", "cs.CL", "stat.ML"]``) and
            ``"keywords"`` (list of search keywords).
        context: Unused; accepted to satisfy the handler signature.

    Raises:
        KeyError: If a required key is missing from *event*.
    """
    webhook = event["webhook"]
    interval = event["interval"]  # number of days
    categories = event["categories"]
    keywords = event["keywords"]

    # Take "now" directly in JST (UTC+9) with an aware datetime. Adding nine
    # hours to a naive local time is only correct when the host clock
    # happens to be set to UTC.
    jst = datetime.timezone(datetime.timedelta(hours=9))
    base_date = datetime.datetime.now(jst)
    prev_date = base_date - datetime.timedelta(days=interval)

    papers = fetch_papers(categories, keywords, base_date, prev_date)
    notify(webhook, papers, interval, keywords)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment