benjaminkreen · July 19, 2019 21:28
diff --git a/get_alert_results.rb b/get_alert_results.rb
 require 'httparty'
 require 'date'

 # This currently just gets all of the alerts and fetches the resulting documents. A good way to test this would be to
 # compare the returned count of documents. If it's close, that would be great.

 # here are all the keys that exist in the query column of the alerts table.
 # ["startPage", "filterSubjects", "unformattedQuery", "query", "eLocationId", "pageSize", "resultView", "volume", "sortValue", "sortKey", "filterKeyword", "filterArticleTypes", "filterSubjectsDisjunction", "filterAuthors", "filterJournals", "id", "sort", "filterArticleType", "filterStartDate", "filterEndDate", "resultsPerPage", "sortOrder", "page", "x", "y", "from", "q", "filterSections"]
 # Keys copied from a saved alert's stored query into the search request; every other stored key is dropped.
 ALLOWED_QUERY_KEYS = %w[
   q filterSubjects volume filterKeyword filterArticleTypes filterSubjectsDisjunction
   filterAuthors filterJournals filterArticleType filterSections query unformattedQuery
 ].freeze
 # Search endpoint used for alerts that are not PLoSONE subject alerts.
 WOMBAT_SEARCH_URL = 'https://collections.plos.org/dynamicSearch'.freeze
 # Solr search API queried for PLoSONE alerts.
 SOLR_API_URL = 'https://api.plos.org/search'.freeze
 # Result-count threshold above which an email is flagged as "over max".
 MAX_ARTICLES = 50
 # Base URL for the "see more results" links placed in the email.
 EMAIL_URL = 'https://journals.plos.org/plosone/search'.freeze
 # SendGrid dynamic template used for PLoSONE alert emails.
 PONE_TEMPLATE_ID = 'd-85c86529aaf24a82b891745f2c76b0e9'.freeze
 # Read the API key from the environment so a real key never has to be committed;
 # falls back to the original placeholder when the variable is unset.
 SENDGRID_API_KEY = ENV.fetch('SENDGRID_API_KEY', 'your-key').freeze
 SENDGRID_API_URL = 'https://api.sendgrid.com/v3/mail/send'.freeze
 # Why try to deconstruct the wombat abstraction to make it Solr-compliant again? Just use wombat!
 # First get alerts, weekly or monthly. I've cached them to a file
 # TODO: implement NED API
 # Load the cached alerts (saved searches), then for each selected alert build a search query,
 # run it, and (for PLoSONE alerts) email the per-subject results through SendGrid.
 alerts = JSON.parse(File.read('weekly_saved_searches.json'))

 # NOTE: only a random sample of two PLoSONE alerts is processed per run.
 alerts.select { |x| x['name'] == 'PLoSONE' }.sample(2).each do |alert|
   # The stored query JSON carries runs of backslashes; collapse them before parsing, then keep
   # only the allowed keys whose values are not blank. nil and non-collection values (e.g. numbers)
   # are handled explicitly so they cannot raise NoMethodError on #empty?.
   # puts alert
   original_query = JSON.parse(alert["query"].squeeze("\\"))
   query = original_query.slice(*ALLOWED_QUERY_KEYS).reject do |_key, value|
     value.nil? || (value.respond_to?(:empty?) && value.empty?)
   end

   # Set the date range. Today's date is read once so every use within this alert agrees.
   end_date = Date.today
   start_date = end_date - (alert['frequency'] == 'monthly' ? 30 : 7)
   query['filterStartDate'] = start_date.to_s
   query['filterEndDate'] = end_date.to_s

   # Query manipulation: there are a few fields that qualify as queries.
   possible_queries = []
   # Here's the only non-wombat friendly thing: Disjunction is a fancy word for 'OR'. Since these come
   # from the akita ui, there is no wombat query per se, so we just build our own OR'd query.
   # An absent key yields an empty string here, which is discarded below.
   subjects = Array(query.delete('filterSubjectsDisjunction'))
   possible_queries.push(subjects.map { |subj| "subject:\"#{subj}\"" }.join(' OR '))
   # Precedence order for the remaining candidates, falling back on *:* (match everything).
   %w[q query unformattedQuery].each { |key| possible_queries.push(query.delete(key)) }
   possible_queries.push('*:*')
   query['q'] = possible_queries.compact.reject { |cand| cand.respond_to?(:empty?) && cand.empty? }.first

   json_request_headers = { 'Accept' => 'application/json' }
   sendgrid_headers = {
     'Authorization' => "Bearer #{SENDGRID_API_KEY}",
     'Content-Type' => 'application/json'
   }
   # NOTE(review): both addresses below look like redacted placeholders; confirm the real values.
   sendgrid_params = {
     from: { email: '[email protected]' },
     personalizations: [
       {
         to: [
           { email: '[email protected]' } # TODO: fetch correct email
         ],
         dynamic_template_data: {
           start_date: start_date.strftime('%b %d %Y'),
           end_date: end_date.strftime('%b %d %Y')
         }
       }
     ]
   }
   puts alert

   if alert['name'] == 'PLoSONE'
     # For PONE searches.
     # The alert query is parenthesized so that an OR inside it cannot re-associate with the
     # surrounding AND clauses. `rows` is set explicitly so up to MAX_ARTICLES documents are
     # returned instead of the server's default page size.
     solr_params = {
       wt: 'json',
       fl: 'title,subject,id,author',
       rows: MAX_ARTICLES,
       q: "publication_date:[#{start_date.strftime('%FT%TZ')} TO #{end_date.strftime('%FT%TZ')}] " \
          "AND (#{query['q']}) AND doc_type:\"full\""
     }

     resp = HTTParty.get(SOLR_API_URL + '?' + URI.encode_www_form(solr_params), headers: json_request_headers)
     sleep 1 # Requesting too fast
     parsed = resp.parsed_response
     search_results = parsed.is_a?(Hash) ? parsed['response'] : nil
     unless search_results
       # An error payload has no 'response' section; skip this alert instead of crashing the run.
       warn "Solr search returned no usable response (HTTP #{resp.code}); skipping alert"
       next
     end

     docs = Array(search_results['docs'])
     over_max = search_results['numFound'].to_i > MAX_ARTICLES
     template_data = sendgrid_params[:personalizations][0][:dynamic_template_data]
     sendgrid_params[:template_id] = PONE_TEMPLATE_ID
     template_data[:over_max] = over_max
     # Array() guards against alerts that have no subject list at all.
     template_data[:subjects] = Array(original_query['filterSubjectsDisjunction']).map do |subj|
       # Assemble doc data for this subject; documents lacking subjects or authors are tolerated.
       matching = docs.select { |doc| Array(doc['subject']).any? { |doc_subj| doc_subj.include?(subj) } }
       subj_docs = matching.map do |doc|
         { title: doc['title'], authors: Array(doc['author']).join(', '), doi: doc['id'] }
       end

       # Assemble url data if necessary: a search link is added when this subject has no
       # articles or when the overall result count exceeded MAX_ARTICLES.
       url = nil
       if subj_docs.empty? || over_max
         url_params = { unformattedQuery: "subject:\"#{subj}\"" }
         # The journal filter is kept only when there were matches; with none, the link is not journal-restricted.
         url_params.merge!(query.slice('filterJournals')) unless subj_docs.empty?
         # The date range is added only in the over-max case.
         url_params.merge!(query.slice('filterStartDate', 'filterEndDate')) if over_max
         url = EMAIL_URL + '?' + URI.encode_www_form(url_params)
       end

       # TODO: support subject tier grouping

       { name: subj, articles: subj_docs, url: url }
     end

     # Send email.
     sendgrid_resp = HTTParty.post(SENDGRID_API_URL, { body: sendgrid_params.to_json, headers: sendgrid_headers })
     puts sendgrid_resp.parsed_response
   else
     resp = HTTParty.get(WOMBAT_SEARCH_URL + '?' + URI.encode_www_form(query), headers: json_request_headers)
     search_results = resp.parsed_response['searchResults']
   end
   # puts search_results
   # puts "Found " + search_results['numFound'].to_i.to_s + " results for #{alert['name']}"
   # search_results['docs'].take(3).each { |doc| puts doc['title'] } # print out top 3
 end
	require 'httparty'
	require 'date'

	# This currently just gets all of the alerts and fetches the resulting documents. A good way to test this would be to
	# compare the returned count of documents. If it's close, that would be great.

	# here are all the keys that exist in the query column of the alerts table.
	# ["startPage", "filterSubjects", "unformattedQuery", "query", "eLocationId", "pageSize", "resultView", "volume", "sortValue", "sortKey", "filterKeyword", "filterArticleTypes", "filterSubjectsDisjunction", "filterAuthors", "filterJournals", "id", "sort", "filterArticleType", "filterStartDate", "filterEndDate", "resultsPerPage", "sortOrder", "page", "x", "y", "from", "q", "filterSections"]
	# Keys copied from a saved alert's stored query into the search request; every other stored key is dropped.
	ALLOWED_QUERY_KEYS = %w[
	  q filterSubjects volume filterKeyword filterArticleTypes filterSubjectsDisjunction
	  filterAuthors filterJournals filterArticleType filterSections query unformattedQuery
	].freeze
	# Search endpoint used for alerts that are not PLoSONE subject alerts.
	WOMBAT_SEARCH_URL = 'https://collections.plos.org/dynamicSearch'.freeze
	# Solr search API queried for PLoSONE alerts.
	SOLR_API_URL = 'https://api.plos.org/search'.freeze
	# Result-count threshold above which an email is flagged as "over max".
	MAX_ARTICLES = 50
	# Base URL for the "see more results" links placed in the email.
	EMAIL_URL = 'https://journals.plos.org/plosone/search'.freeze
	# SendGrid dynamic template used for PLoSONE alert emails.
	PONE_TEMPLATE_ID = 'd-85c86529aaf24a82b891745f2c76b0e9'.freeze
	# Read the API key from the environment so a real key never has to be committed;
	# falls back to the original placeholder when the variable is unset.
	SENDGRID_API_KEY = ENV.fetch('SENDGRID_API_KEY', 'your-key').freeze
	SENDGRID_API_URL = 'https://api.sendgrid.com/v3/mail/send'.freeze
	# Why try to deconstruct the wombat abstraction to make it Solr-compliant again? Just use wombat!
	# First get alerts, weekly or monthly. I've cached them to a file
	# TODO: implement NED API
	# Load the cached alerts (saved searches), then for each selected alert build a search query,
	# run it, and (for PLoSONE alerts) email the per-subject results through SendGrid.
	alerts = JSON.parse(File.read('weekly_saved_searches.json'))

	# NOTE: only a random sample of two PLoSONE alerts is processed per run.
	alerts.select { |x| x['name'] == 'PLoSONE' }.sample(2).each do |alert|
	  # The stored query JSON carries runs of backslashes; collapse them before parsing, then keep
	  # only the allowed keys whose values are not blank. nil and non-collection values (e.g. numbers)
	  # are handled explicitly so they cannot raise NoMethodError on #empty?.
	  # puts alert
	  original_query = JSON.parse(alert["query"].squeeze("\\"))
	  query = original_query.slice(*ALLOWED_QUERY_KEYS).reject do |_key, value|
	    value.nil? || (value.respond_to?(:empty?) && value.empty?)
	  end

	  # Set the date range. Today's date is read once so every use within this alert agrees.
	  end_date = Date.today
	  start_date = end_date - (alert['frequency'] == 'monthly' ? 30 : 7)
	  query['filterStartDate'] = start_date.to_s
	  query['filterEndDate'] = end_date.to_s

	  # Query manipulation: there are a few fields that qualify as queries.
	  possible_queries = []
	  # Here's the only non-wombat friendly thing: Disjunction is a fancy word for 'OR'. Since these come
	  # from the akita ui, there is no wombat query per se, so we just build our own OR'd query.
	  # An absent key yields an empty string here, which is discarded below.
	  subjects = Array(query.delete('filterSubjectsDisjunction'))
	  possible_queries.push(subjects.map { |subj| "subject:\"#{subj}\"" }.join(' OR '))
	  # Precedence order for the remaining candidates, falling back on *:* (match everything).
	  %w[q query unformattedQuery].each { |key| possible_queries.push(query.delete(key)) }
	  possible_queries.push('*:*')
	  query['q'] = possible_queries.compact.reject { |cand| cand.respond_to?(:empty?) && cand.empty? }.first

	  json_request_headers = { 'Accept' => 'application/json' }
	  sendgrid_headers = {
	    'Authorization' => "Bearer #{SENDGRID_API_KEY}",
	    'Content-Type' => 'application/json'
	  }
	  # NOTE(review): both addresses below look like redacted placeholders; confirm the real values.
	  sendgrid_params = {
	    from: { email: '[email protected]' },
	    personalizations: [
	      {
	        to: [
	          { email: '[email protected]' } # TODO: fetch correct email
	        ],
	        dynamic_template_data: {
	          start_date: start_date.strftime('%b %d %Y'),
	          end_date: end_date.strftime('%b %d %Y')
	        }
	      }
	    ]
	  }
	  puts alert

	  if alert['name'] == 'PLoSONE'
	    # For PONE searches.
	    # The alert query is parenthesized so that an OR inside it cannot re-associate with the
	    # surrounding AND clauses. `rows` is set explicitly so up to MAX_ARTICLES documents are
	    # returned instead of the server's default page size.
	    solr_params = {
	      wt: 'json',
	      fl: 'title,subject,id,author',
	      rows: MAX_ARTICLES,
	      q: "publication_date:[#{start_date.strftime('%FT%TZ')} TO #{end_date.strftime('%FT%TZ')}] " \
	         "AND (#{query['q']}) AND doc_type:\"full\""
	    }

	    resp = HTTParty.get(SOLR_API_URL + '?' + URI.encode_www_form(solr_params), headers: json_request_headers)
	    sleep 1 # Requesting too fast
	    parsed = resp.parsed_response
	    search_results = parsed.is_a?(Hash) ? parsed['response'] : nil
	    unless search_results
	      # An error payload has no 'response' section; skip this alert instead of crashing the run.
	      warn "Solr search returned no usable response (HTTP #{resp.code}); skipping alert"
	      next
	    end

	    docs = Array(search_results['docs'])
	    over_max = search_results['numFound'].to_i > MAX_ARTICLES
	    template_data = sendgrid_params[:personalizations][0][:dynamic_template_data]
	    sendgrid_params[:template_id] = PONE_TEMPLATE_ID
	    template_data[:over_max] = over_max
	    # Array() guards against alerts that have no subject list at all.
	    template_data[:subjects] = Array(original_query['filterSubjectsDisjunction']).map do |subj|
	      # Assemble doc data for this subject; documents lacking subjects or authors are tolerated.
	      matching = docs.select { |doc| Array(doc['subject']).any? { |doc_subj| doc_subj.include?(subj) } }
	      subj_docs = matching.map do |doc|
	        { title: doc['title'], authors: Array(doc['author']).join(', '), doi: doc['id'] }
	      end

	      # Assemble url data if necessary: a search link is added when this subject has no
	      # articles or when the overall result count exceeded MAX_ARTICLES.
	      url = nil
	      if subj_docs.empty? || over_max
	        url_params = { unformattedQuery: "subject:\"#{subj}\"" }
	        # The journal filter is kept only when there were matches; with none, the link is not journal-restricted.
	        url_params.merge!(query.slice('filterJournals')) unless subj_docs.empty?
	        # The date range is added only in the over-max case.
	        url_params.merge!(query.slice('filterStartDate', 'filterEndDate')) if over_max
	        url = EMAIL_URL + '?' + URI.encode_www_form(url_params)
	      end

	      # TODO: support subject tier grouping

	      { name: subj, articles: subj_docs, url: url }
	    end

	    # Send email.
	    sendgrid_resp = HTTParty.post(SENDGRID_API_URL, { body: sendgrid_params.to_json, headers: sendgrid_headers })
	    puts sendgrid_resp.parsed_response
	  else
	    resp = HTTParty.get(WOMBAT_SEARCH_URL + '?' + URI.encode_www_form(query), headers: json_request_headers)
	    search_results = resp.parsed_response['searchResults']
	  end
	  # puts search_results
	  # puts "Found " + search_results['numFound'].to_i.to_s + " results for #{alert['name']}"
	  # search_results['docs'].take(3).each { |doc| puts doc['title'] } # print out top 3
	end