saxxi · December 22, 2015 15:49 · phungleson · Sep 12, 2013 · saxxi · Sep 12, 2013
diff --git a/gistfile1.rb b/gistfile1.rb
 # Elastic search grouping solution
 # ElasticSearch does not currently provide a group_by equivalent, so here is my attempt to do it manually.
 # In this example we have articles written by several authors, and I'd like to get relevant docs, but no more than one per author.
 # Assumptions:
 # 
 # 1) I'm looking for relevant content.
 # 2) I assume that the first 300 docs are the relevant ones,
 #    so I consider only this selection, even if many of them come from the same few authors.
 # 3) For my needs I didn't really need pagination; a "show more" button updated through ajax was enough.

 # Reset the demo data by shelling out to curl (backticks run the text as a
 # shell command; the captured output is not used here):
 #   1. delete the "articles" index,
 #   2. recreate it with 1 shard and 0 replicas,
 #   3. index six sample articles spread over user_1, user_2 and user_3,
 #   4. call the index's _refresh endpoint.
 `curl -X DELETE "http://localhost:9200/articles"
 
 curl -X PUT    "http://localhost:9200/articles" -d '{
  "settings": {
    "index": {
      "number_of_shards": 1, "number_of_replicas": 0
    }
  }
 }'
 
 curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 111, "author_id": "user_1", "title": "One bad doc", "findable": true }'
 curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 222, "author_id": "user_2", "title": "Two bad doc", "findable": true }'
 curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 333, "author_id": "user_3", "title": "Three good doc", "findable": true }'
 
 curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 444, "author_id": "user_1", "title": "Four good doc", "findable": true }'
 curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 555, "author_id": "user_2", "title": "Five good doc", "findable": true }'
 curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 666, "author_id": "user_1", "title": "Six good doc", "findable": true }'
 
 
 curl -XPOST 'http://localhost:9200/articles/_refresh'`

 # # Raw test our query
 #
 # curl -X POST "http://localhost:9200/articles/_search?pretty=true" -d '{
 #   "query": {
 #       "bool":{
 #          "must":[{ "query_string":{ "query":"doc", "default_operator":"AND" } }],
 #          "should":[{ "query_string":{ "query":"user_2", "default_operator":"AND", "boost":2000 } }]
 #       }
 #     },
 #   "filter": { "and": [{ "term": { "findable": "true" } }] },
 #   "facets": {
 #     "tags": { "terms": {"field": "owner", "size": 10} }
 #   }
 # }'

 # Pagination inputs: offset of the first doc to show, and docs per page.
 params_start_from = 0
 per_page = 3

 # Bool query: the "must" clause requires a match on "doc"; the "should"
 # clause carries a boost of 2000 for matches on "user_2".
 required_clause = { :query_string => { :query => "doc", :default_operator => "AND" } }
 boosted_clause = { :query_string => { :query => "user_2", :default_operator => "AND", :boost => 2000 } }
 my_query = { :bool => { :must => [required_clause], :should => [boosted_clause] } }

 # Filters that are AND-ed together by the searches below.
 my_and_filters = [{ :term => { :findable => "true" } }]
 
 # FIRST QUERY - fetch only 'id' and 'author_id' for the first 300 matches
 candidates = Tire.search(
   'articles',
   query: my_query,
   filter: { :and => my_and_filters },
   fields: ['id', 'author_id'],
   size: 300
 )

 # Array#uniq with a block keeps the first element seen for each author_id.
 docs = candidates.results.to_a.uniq { |hit| hit['author_id'] }
 # Instance variables: total reported by the results object, and the number
 # of docs left after keeping one per author.
 @total_results_non_unique = candidates.results.total
 @total_results = docs.size
 
 # PAGINATION
 # Clamp the offset at zero: a negative index would make Array#[] count from
 # the end of the list and return the wrong page.
 start_from = [params_start_from.to_i, 0].max # should always be < Settings.research.max_results
 # Slice by (offset, length) instead of a range: with per_page == 0 the range
 # form becomes 0..-1 for the first page, which selects every doc.
 docs = docs[start_from, per_page] # nil when the offset is past the last doc
 doc_ids = (docs || []).map { |doc| doc['id'] }
 
 # SECOND QUERY, FIND BY ID
 # Add an ids filter for the current page to the shared filter list.
 # Note: << mutates my_and_filters in place.
 my_and_filters << { ids: { values: doc_ids } } # TODO: move :highlight to Part 1 and query only by :id

 # Run the same query again with the extended filter list, requesting
 # highlights on 'title' and at most per_page hits.
 res = Tire.search 'articles', query: my_query,
      filter: { :and => my_and_filters },
      highlight: {
        fields: ['title']
      },
      size: per_page
	# Elastic search grouping solution
	# ElasticSearch does not currently provide a group_by equivalent, so here is my attempt to do it manually.
	# In this example we have articles written by several authors, and I'd like to get relevant docs, but no more than one per author.
	# Assumptions:
	#
	# 1) I'm looking for relevant content.
	# 2) I assume that the first 300 docs are the relevant ones,
	# so I consider only this selection, even if many of them come from the same few authors.
	# 3) For my needs I didn't really need pagination; a "show more" button updated through ajax was enough.

	# Reset the demo data by shelling out to curl (backticks run the text as a
	# shell command; the captured output is not used here):
	#   1. delete the "articles" index,
	#   2. recreate it with 1 shard and 0 replicas,
	#   3. index six sample articles spread over user_1, user_2 and user_3,
	#   4. call the index's _refresh endpoint.
	`curl -X DELETE "http://localhost:9200/articles"

	curl -X PUT "http://localhost:9200/articles" -d '{
	"settings": {
	"index": {
	"number_of_shards": 1, "number_of_replicas": 0
	}
	}
	}'

	curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 111, "author_id": "user_1", "title": "One bad doc", "findable": true }'
	curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 222, "author_id": "user_2", "title": "Two bad doc", "findable": true }'
	curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 333, "author_id": "user_3", "title": "Three good doc", "findable": true }'

	curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 444, "author_id": "user_1", "title": "Four good doc", "findable": true }'
	curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 555, "author_id": "user_2", "title": "Five good doc", "findable": true }'
	curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 666, "author_id": "user_1", "title": "Six good doc", "findable": true }'


	curl -XPOST 'http://localhost:9200/articles/_refresh'`

	# # Raw test our query
	#
	# curl -X POST "http://localhost:9200/articles/_search?pretty=true" -d '{
	# "query": {
	# "bool":{
	# "must":[{ "query_string":{ "query":"doc", "default_operator":"AND" } }],
	# "should":[{ "query_string":{ "query":"user_2", "default_operator":"AND", "boost":2000 } }]
	# }
	# },
	# "filter": { "and": [{ "term": { "findable": "true" } }] },
	# "facets": {
	# "tags": { "terms": {"field": "owner", "size": 10} }
	# }
	# }'

	# Pagination inputs: offset of the first doc to show, and docs per page.
	params_start_from = 0
	per_page = 3

	# Bool query: the "must" clause requires a match on "doc"; the "should"
	# clause carries a boost of 2000 for matches on "user_2".
	required_clause = { :query_string => { :query => "doc", :default_operator => "AND" } }
	boosted_clause = { :query_string => { :query => "user_2", :default_operator => "AND", :boost => 2000 } }
	my_query = { :bool => { :must => [required_clause], :should => [boosted_clause] } }

	# Filters that are AND-ed together by the searches below.
	my_and_filters = [{ :term => { :findable => "true" } }]

	# FIRST QUERY - fetch only 'id' and 'author_id' for the first 300 matches
	all_res = Tire.search 'articles', query: my_query,
	  filter: { :and => my_and_filters },
	  fields: ['id', 'author_id'],
	  size: 300

	# Array#uniq with a block keeps the first element seen for each author_id.
	# (Block parameters use plain pipes; backslash-escaped pipes do not parse.)
	docs = all_res.results.to_a.uniq { |hit| hit['author_id'] }
	# Instance variables: total reported by the results object, and the number
	# of docs left after keeping one per author.
	@total_results_non_unique = all_res.results.total
	@total_results = docs.size

	# PAGINATION
	# Clamp the offset at zero: a negative index would make Array#[] count from
	# the end of the list and return the wrong page.
	start_from = [params_start_from.to_i, 0].max # should always be < Settings.research.max_results
	# Slice by (offset, length) instead of a range: with per_page == 0 the range
	# form becomes 0..-1 for the first page, which selects every doc.
	docs = docs[start_from, per_page] # nil when the offset is past the last doc
	doc_ids = (docs || []).map { |doc| doc['id'] }

	# SECOND QUERY, FIND BY ID
	# Add an ids filter for the current page to the shared filter list.
	# Note: << mutates my_and_filters in place.
	my_and_filters << { ids: { values: doc_ids } } # TODO: move :highlight to Part 1 and query only by :id

	# Run the same query again with the extended filter list, requesting
	# highlights on 'title' and at most per_page hits.
	res = Tire.search 'articles', query: my_query,
	  filter: { :and => my_and_filters },
	  highlight: {
	    fields: ['title']
	  },
	  size: per_page