Created
February 4, 2013 18:05
-
-
Save adngdb/4708381 to your computer and use it in GitHub Desktop.
elasticsearch challenge!
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
elasticsearch challenge!

From this data set:

    {
        'name': 'Adrian',
        'sex': 'male',
        'age': 'young'
    },
    {
        'name': 'Peter',
        'sex': 'male',
        'age': 'young'
    },
    {
        'name': 'Laura',
        'sex': 'female',
        'age': 'young'
    },
    {
        'name': 'Peter',
        'sex': 'male',
        'age': 'old'
    },
    {
        'name': 'Peter',
        'sex': 'male',
        'age': 'young'
    }

You need to create this output:

    {
        'adrian': {
            'count': 1,
            'young_count': 1,
            'old_count': 0
        },
        'peter': {
            'count': 3,
            'young_count': 2,
            'old_count': 1
        }
    }

Below is my current solution, and I'm looking for improvement. Especially, is
it possible to do it without the second query to elasticsearch?

By adrian@mozilla.com
"""

import pyelasticsearch

es = pyelasticsearch.ElasticSearch('http://localhost:9200')
es_index = 'myindex'
es_doctype = 'mytype'

# Wipe whatever a previous run left behind. The not-found error is ignored
# on purpose so the script can also run when there is nothing to delete.
try:
    es.delete_all(es_index, es_doctype)
except pyelasticsearch.exceptions.ElasticHttpNotFoundError:
    pass

# Sample data set from the challenge statement.
docs = [
    {'name': 'Adrian', 'sex': 'male', 'age': 'young'},
    {'name': 'Peter', 'sex': 'male', 'age': 'young'},
    {'name': 'Laura', 'sex': 'female', 'age': 'young'},
    {'name': 'Peter', 'sex': 'male', 'age': 'old'},
    {'name': 'Peter', 'sex': 'male', 'age': 'young'},
]

for doc in docs:
    es.index(es_index, es_doctype, doc)

# Refresh before searching so the documents indexed above can be found.
es.refresh(es_index)

# Only the facets matter here, hence 'size': 0 (no hits requested). The
# search is restricted to male documents and faceted on the 'name' field.
query = {
    'size': 0,
    'from': 0,
    'query': {
        'filtered': {
            'query': {'match_all': {}},
            'filter': {'term': {'sex': 'male'}},
        }
    },
    'facets': {
        'names': {'terms': {'field': 'name'}},
    },
}

# This first query is used to get the list of all names matching the criteria
results = es.search(query, index=es_index, doc_type=es_doctype)

names = {}
facets = {}
for entry in results['facets']['names']['terms']:
    name = entry['term']
    names[name] = {'count': entry['count']}
    # One 'age' terms facet per name, narrowed to that name by a facet filter.
    facets[name] = {
        'terms': {'field': 'age'},
        'facet_filter': {'term': {'name': name}},
    }

# Reuse the same query body, swapping the name facet for the per-name facets.
query['facets'] = facets

# This second query gives us the stats we need on each different name
results = es.search(query, index=es_index, doc_type=es_doctype)

for name, stats in names.items():
    # Start both buckets at zero so an age with no documents is still reported.
    stats['young_count'] = 0
    stats['old_count'] = 0
    for bucket in results['facets'][name]['terms']:
        stats['%s_count' % bucket['term']] = bucket['count']

assert names == {
    'adrian': {
        'count': 1,
        'young_count': 1,
        'old_count': 0
    },
    'peter': {
        'count': 3,
        'young_count': 2,
        'old_count': 1
    }
}
Use a terms script facet to join name + age. This does everything in a single query.
import pyelasticsearch

es = pyelasticsearch.ElasticSearch('http://localhost:9200')
es_index = 'myindex'
es_doctype = 'mytype'

# Remove documents left over from a previous run. The not-found error is
# deliberately ignored so the script also works when there is nothing to
# delete (presumably the index/type does not exist yet on a first run).
try:
    es.delete_all(es_index, es_doctype)
except pyelasticsearch.exceptions.ElasticHttpNotFoundError:
    pass

# Sample data set from the challenge statement.
docs = [
    {
        'name': 'Adrian',
        'sex': 'male',
        'age': 'young'
    },
    {
        'name': 'Peter',
        'sex': 'male',
        'age': 'young'
    },
    {
        'name': 'Laura',
        'sex': 'female',
        'age': 'young'
    },
    {
        'name': 'Peter',
        'sex': 'male',
        'age': 'old'
    },
    {
        'name': 'Peter',
        'sex': 'male',
        'age': 'young'
    }
]

for doc in docs:
    es.index(es_index, es_doctype, doc)

# Refresh before searching so the documents indexed above can be found.
es.refresh(es_index)

# A single request carrying two facets over the male documents:
#   - 'name':        per-name totals
#   - 'age_by_name': a scripted term of the form '<name>|<age>' built from
#                    _source, giving the per-name/per-age totals
query = {
    'size': 0,
    'from': 0,
    'query': {
        'filtered': {
            'query': {
                'match_all': {}
            },
            'filter': {
                'term': {
                    'sex': 'male'
                }
            }
        }
    },
    'facets': {
        'name': {
            'terms': {
                'field': 'name'
            }
        },
        'age_by_name': {
            'terms': {
                'script_field': '_source.name + \'|\' + _source.age',
            }
        }
    }
}

results = es.search(query, index=es_index, doc_type=es_doctype)

names = {}
for name in results['facets']['name']['terms']:
    # Default old_count and young_count to 0 so that a name with no
    # documents in one of the age buckets still reports that bucket.
    names[name['term']] = {'count': name['count'], 'old_count': 0, 'young_count': 0}

for name_age in results['facets']['age_by_name']['terms']:
    # The scripted term keeps the case stored in _source (e.g. 'Peter|young'),
    # so lower-case it to line up with the lower-case keys the expected
    # output uses. Split on the LAST '|' only: a name that itself contains
    # '|' would otherwise yield more than two parts and break the unpacking.
    name, age = name_age['term'].lower().rsplit('|', 1)
    cnt = name_age['count']
    info = names.setdefault(name, {})
    info[age + '_count'] = cnt

assert names == {
    'adrian': {
        'count': 1,
        'young_count': 1,
        'old_count': 0
    },
    'peter': {
        'count': 3,
        'young_count': 2,
        'old_count': 1
    }
}
Author
@mattweber thank you! This looks like an elegant solution to my problem. I'm concerned about script_field using _source as ours is pretty big, but I'll run tests asap and see if your solution performs better than mine.
You could always do the same thing during indexing and avoid the script altogether.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This one can be done in a single query since you have a small number of terms for the second facet and those terms are known ahead of time.
If you didn't know the number of buckets for age, then you could do two queries, one to get a list of all the ages and then use script to bucket them as desired and then generate the desired facets.