Created
February 4, 2013 18:05
-
-
Save adngdb/4708381 to your computer and use it in GitHub Desktop.
elasticsearch challenge!
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
elasticsearch challenge!

From this data set:

    {'name': 'Adrian', 'sex': 'male',   'age': 'young'},
    {'name': 'Peter',  'sex': 'male',   'age': 'young'},
    {'name': 'Laura',  'sex': 'female', 'age': 'young'},
    {'name': 'Peter',  'sex': 'male',   'age': 'old'},
    {'name': 'Peter',  'sex': 'male',   'age': 'young'}

You need to create this output:

    {
        'adrian': {'count': 1, 'young_count': 1, 'old_count': 0},
        'peter':  {'count': 3, 'young_count': 2, 'old_count': 1}
    }

Below is my current solution, and I'm looking for improvement. Especially, is
it possible to do it without the second query to elasticsearch?

By adrian@mozilla.com
"""

import pyelasticsearch

es = pyelasticsearch.ElasticSearch('http://localhost:9200')
es_index = 'myindex'
es_doctype = 'mytype'

# Wipe whatever a previous run left behind. A "not found" response is
# ignored on purpose: there is simply nothing to delete yet.
try:
    es.delete_all(es_index, es_doctype)
except pyelasticsearch.exceptions.ElasticHttpNotFoundError:
    pass

docs = [
    {'name': 'Adrian', 'sex': 'male', 'age': 'young'},
    {'name': 'Peter', 'sex': 'male', 'age': 'young'},
    {'name': 'Laura', 'sex': 'female', 'age': 'young'},
    {'name': 'Peter', 'sex': 'male', 'age': 'old'},
    {'name': 'Peter', 'sex': 'male', 'age': 'young'},
]

for doc in docs:
    es.index(es_index, es_doctype, doc)

# Refresh before searching so the documents indexed above are queryable.
es.refresh(es_index)

# 'size': 0 -- no hits are requested; only the facet section of the
# response is read below.
query = {
    'size': 0,
    'from': 0,
    'query': {
        'filtered': {
            'query': {'match_all': {}},
            'filter': {'term': {'sex': 'male'}},
        },
    },
    'facets': {
        'names': {'terms': {'field': 'name'}},
    },
}

# First round trip: one facet over `name` yields every distinct name among
# the matching documents, together with its total count.
results = es.search(query, index=es_index, doc_type=es_doctype)
name_terms = results['facets']['names']['terms']

names = {entry['term']: {'count': entry['count']} for entry in name_terms}

# One `age` facet per name, each restricted to that name via facet_filter.
facets = {
    entry['term']: {
        'terms': {'field': 'age'},
        'facet_filter': {'term': {'name': entry['term']}},
    }
    for entry in name_terms
}

# Reuse the same filtered query, swapping in the per-name facets.
query['facets'] = facets

# Second round trip: the age breakdown for each name found above.
results = es.search(query, index=es_index, doc_type=es_doctype)

for name, stats in names.items():
    # Default both buckets to zero so an age that never occurs for this
    # name still shows up in the output.
    stats['young_count'] = 0
    stats['old_count'] = 0
    for age in results['facets'][name]['terms']:
        stats['%s_count' % age['term']] = age['count']

assert names == {
    'adrian': {'count': 1, 'young_count': 1, 'old_count': 0},
    'peter': {'count': 3, 'young_count': 2, 'old_count': 1},
}
Author
You could always do the same thing during indexing and avoid the script altogether.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@mattweber thank you! This looks like an elegant solution to my problem. I'm concerned about
`script_field` using `_source`, as ours is pretty big, but I'll run tests asap and see if your solution performs better than mine.