Last active
December 22, 2015 15:49
-
-
Save saxxi/6495116 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Elasticsearch grouping solution
# Elasticsearch does not currently provide a group_by equivalent, so here is my attempt to do it manually.
# In this example we have articles written by several authors, and I'd like to get the relevant docs, but no more than one per author.
# Assumptions:
#
# 1) I'm looking for relevant content.
# 2) I assume that the first 300 docs are the relevant ones,
#    so I consider only this selection, even though many of them may come from the same few authors.
# 3) For my needs I didn't really need pagination; a "show more" button updated through AJAX was enough.
# Seed a throwaway `articles` index on the Elasticsearch node at localhost:9200:
# delete any existing index, recreate it with one shard and no replicas, index
# six sample articles spread over three authors (user_1, user_2, user_3), then
# refresh the index before the searches below run.
# The commands run through the shell via backticks; their output is discarded.
`curl -X DELETE "http://localhost:9200/articles"
curl -X PUT "http://localhost:9200/articles" -d '{
"settings": {
"index": {
"number_of_shards": 1, "number_of_replicas": 0
}
}
}'
curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 111, "author_id": "user_1", "title": "One bad doc", "findable": true }'
curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 222, "author_id": "user_2", "title": "Two bad doc", "findable": true }'
curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 333, "author_id": "user_3", "title": "Three good doc", "findable": true }'
curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 444, "author_id": "user_1", "title": "Four good doc", "findable": true }'
curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 555, "author_id": "user_2", "title": "Five good doc", "findable": true }'
curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 666, "author_id": "user_1", "title": "Six good doc", "findable": true }'
curl -XPOST 'http://localhost:9200/articles/_refresh'`
# Raw test of our query:
#
# curl -X POST "http://localhost:9200/articles/_search?pretty=true" -d '{
#   "query": {
#     "bool": {
#       "must": [{ "query_string": { "query": "doc", "default_operator": "AND" } }],
#       "should": [{ "query_string": { "query": "user_2", "default_operator": "AND", "boost": 2000 } }]
#     }
#   },
#   "filter": { "and": [{ "term": { "findable": "true" } }] },
#   "facets": {
#     "tags": { "terms": { "field": "author_id", "size": 10 } }
#   }
# }'
# Zero-based offset of the first author-unique result to show.
# NOTE(review): hard-coded here; presumably supplied by the request in a real app - confirm.
params_start_from = 0

# How many results each page / "show more" step displays.
per_page = 3

# Relevance query: every hit must match "doc"; hits that also match "user_2"
# receive a large boost so they rank ahead of the rest.
my_query = {
  bool: {
    must: [{ query_string: { query: "doc", default_operator: "AND" } }],
    should: [{ query_string: { query: "user_2", default_operator: "AND", boost: 2000 } }]
  }
}

# Filters that are AND-ed together and applied to both searches.
my_and_filters = [
  { term: { findable: "true" } }
]
# FIRST QUERY - fetch only the id and author_id of the top 300 hits, then keep a
# single hit per author.
all_res = Tire.search 'articles', query: my_query,
                                  filter: { and: my_and_filters },
                                  fields: ['id', 'author_id'],
                                  size: 300

# Array#uniq with a block keeps the first element seen for each key, so each
# author is represented by the earliest hit in the result order.
docs = all_res.results.to_a.uniq { |el| el['author_id'] }

# Instance variables (not globals): total hit count reported by the search, and
# the number of author-unique hits within the 300-hit window.
@total_results_non_unique = all_res.results.total
@total_results = docs.size
# PAGINATION - pick the slice of author-unique docs for the requested page.
# start_from should always be < Settings.research.max_results.
# A negative offset is clamped to 0 so it can never index from the end of the array.
start_from = [params_start_from.to_i, 0].max

# Array#[start, length] returns nil when start is past the end; fall back to an
# empty page in that case.
docs = docs[start_from, per_page] || []
doc_ids = docs.map { |doc| doc['id'] }
# SECOND QUERY - re-run the same query restricted to the ids selected for this
# page, this time asking for highlights on the title.
# TODO: move :highlight to Part 1 and query only by :id
#
# Build a fresh filter list instead of appending to my_and_filters, so the shared
# filters are not polluted with this page's ids if the code runs again.
page_filters = my_and_filters + [{ ids: { values: doc_ids } }]

res = Tire.search 'articles', query: my_query,
                              filter: { and: page_filters },
                              highlight: {
                                fields: ['title']
                              },
                              size: per_page
This is a satisfactory solution for now (it's running on the staging environment), but I would be glad to improve it. Could you please provide a more detailed hint on step 1? Thank you.
If you use a terms facet, you can get the list of all author_id values
sorted by some condition, so you don't have to limit yourself to 300 documents.
http://www.elasticsearch.org/guide/reference/api/search/facets/terms-facet/
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I guess the concept seems to be working correctly (steps 1, 2, 3)?
Step 1 can be replaced by a faceting query instead.