Last active
July 19, 2019 21:28
-
-
Save benjaminkreen/4a6bc6c30637dd07fd66876a63e784ff to your computer and use it in GitHub Desktop.
Fetches search results for saved NED search alerts and emails them to subscribers via SendGrid.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# frozen_string_literal: true

require 'httparty'
require 'date'
require 'json' # JSON.parse / #to_json are used below; don't rely on httparty loading it
require 'uri'  # URI.encode_www_form
# This currently just gets all of the alerts and fetches the resulting documents.
# A good way to test this would be to compare the returned count of documents
# against production alert emails; if it's close, that would be great.
#
# All keys observed in the query column of the alerts table:
# ["startPage", "filterSubjects", "unformattedQuery", "query", "eLocationId", "pageSize", "resultView", "volume", "sortValue", "sortKey", "filterKeyword", "filterArticleTypes", "filterSubjectsDisjunction", "filterAuthors", "filterJournals", "id", "sort", "filterArticleType", "filterStartDate", "filterEndDate", "resultsPerPage", "sortOrder", "page", "x", "y", "from", "q", "filterSections"]
# Only the subset below is forwarded to search; the rest is UI state
# (pagination, sorting, click coordinates) that we deliberately drop.
ALLOWED_QUERY_KEYS = %w[
  q filterSubjects volume filterKeyword filterArticleTypes
  filterSubjectsDisjunction filterAuthors filterJournals filterArticleType
  filterSections query unformattedQuery
].freeze
WOMBAT_SEARCH_URL = 'https://collections.plos.org/dynamicSearch'
SOLR_API_URL = 'https://api.plos.org/search'
MAX_ARTICLES = 50 # cap on articles listed inline per email before linking out
EMAIL_URL = 'https://journals.plos.org/plosone/search'
PONE_TEMPLATE_ID = 'd-85c86529aaf24a82b891745f2c76b0e9'
# SECURITY: never hard-code a real key; read it from the environment.
# The 'your-key' default preserves the original placeholder behavior.
SENDGRID_API_KEY = ENV.fetch('SENDGRID_API_KEY', 'your-key')
SENDGRID_API_URL = 'https://api.sendgrid.com/v3/mail/send'
# Why try to deconstruct the wombat abstraction to make it solr compliant again?
# Just use wombat! First get alerts, weekly or monthly (cached to a file for now).
# TODO: implement NED API
alerts = JSON.parse(File.read('weekly_saved_searches.json'))
alerts.select { |a| a['name'] == 'PLoSONE' }.sample(2).each do |alert|
  # The stored query is JSON wrapped in repeated backslash-escape cruft;
  # squeeze("\\") collapses it so it parses. Keep only whitelisted keys
  # carrying a non-empty value.
  original_query = JSON.parse(alert['query'].squeeze("\\"))
  query = original_query.slice(*ALLOWED_QUERY_KEYS).reject { |_k, v| v.empty? }

  # Date window: monthly alerts look back 30 days, everything else 7.
  day_offset = alert['frequency'] == 'monthly' ? 30 : 7
  start_date = Date.today - day_offset
  query['filterStartDate'] = start_date.to_s
  query['filterEndDate'] = Date.today.to_s

  # Several fields can act as "the" query; collect candidates in precedence
  # order and fall back to the match-all query *:*.
  possible_queries = []
  if query['filterSubjectsDisjunction']
    # "Disjunction" is a fancy word for OR. These come from the Akita UI,
    # which stores no wombat query per se, so we build our own OR'd query.
    subjects = query.delete('filterSubjectsDisjunction')
    subject_query = subjects.map { |subj| "subject:\"#{subj}\"" }.join(' OR ')
    # Parenthesize multi-term disjunctions so the OR cannot leak into the
    # surrounding AND chain when embedded in the Solr query string below.
    subject_query = "(#{subject_query})" if subjects.size > 1
    possible_queries.push(subject_query)
  end
  %w[q query unformattedQuery].each { |key| possible_queries.push(query.delete(key)) }
  possible_queries.push('*:*')
  query['q'] = possible_queries.compact.reject(&:empty?).first

  json_request_headers = { 'Accept' => 'application/json' }
  sendgrid_headers = {
    'Authorization' => "Bearer #{SENDGRID_API_KEY}",
    'Content-Type' => 'application/json'
  }
  sendgrid_params = {
    from: { email: '[email protected]' },
    personalizations: [
      {
        to: [
          { email: '[email protected]' } # TODO: fetch correct email
        ],
        dynamic_template_data: {
          start_date: start_date.strftime('%b %d %Y'),
          end_date: Date.today.strftime('%b %d %Y')
        }
      }
    ]
  }
  puts alert

  if alert['name'] == 'PLoSONE'
    # PONE searches hit Solr directly so results can be grouped per subject.
    solr_params = {
      wt: 'json',
      fl: 'title,subject,id,author',
      q: "publication_date:[#{start_date.strftime('%FT%TZ')} TO #{Date.today.strftime('%FT%TZ')}] AND #{query['q']} AND doc_type:\"full\""
    }
    resp = HTTParty.get(SOLR_API_URL + '?' + URI.encode_www_form(solr_params), headers: json_request_headers)
    sleep 1 # avoid hammering the Solr API between alerts
    search_results = resp.parsed_response['response']
    over_max = search_results['numFound'].to_i > MAX_ARTICLES

    sendgrid_params[:template_id] = PONE_TEMPLATE_ID
    template_data = sendgrid_params[:personalizations][0][:dynamic_template_data]
    template_data[:over_max] = over_max
    template_data[:subjects] = []
    # Array() guards against alerts whose query had no subject disjunction;
    # the original raised NoMethodError on nil here.
    Array(original_query['filterSubjectsDisjunction']).each do |subj|
      # Assemble doc data for this subject. Solr may omit the subject/author
      # fields on a doc, hence the Array() guards.
      subj_docs = search_results['docs'].each_with_object([]) do |doc, list|
        next unless Array(doc['subject']).any? { |doc_subj| doc_subj.include?(subj) }
        list << { title: doc['title'], authors: Array(doc['author']).join(', '), doi: doc['id'] }
      end
      # Build a search link when there is nothing to show inline, or the
      # inline list is truncated.
      url = nil
      if subj_docs.empty? || over_max
        url_params = { unformattedQuery: "subject:\"#{subj}\"" }
        # Keep the journal filter only when we had results; with zero
        # results the link widens to all journals.
        url_params.merge!(query.slice('filterJournals')) unless subj_docs.empty?
        # NOTE(review): the original comment said "don't filter by date",
        # but the code ADDS the date window when over_max — presumably so
        # the link mirrors the alert period; confirm intent.
        url_params.merge!(query.slice('filterStartDate', 'filterEndDate')) if over_max
        url = EMAIL_URL + '?' + URI.encode_www_form(url_params)
      end
      # TODO: support subject tier grouping
      template_data[:subjects].push(name: subj, articles: subj_docs, url: url)
    end
    # send email
    sendgrid_resp = HTTParty.post(SENDGRID_API_URL, body: sendgrid_params.to_json, headers: sendgrid_headers)
    puts sendgrid_resp.parsed_response
  else
    # Non-PONE alerts: let wombat execute the whole assembled query.
    resp = HTTParty.get(WOMBAT_SEARCH_URL + '?' + URI.encode_www_form(query), headers: json_request_headers)
    search_results = resp.parsed_response['searchResults']
  end
  # puts search_results
  # puts "Found " + search_results['numFound'].to_i.to_s + " results for #{alert['name']}"
  # search_results['docs'].take(3).each { |doc| puts doc['title'] } # print out top 3
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment