Created
August 21, 2012 09:52
-
-
Save markmacgillivray/3414096 to your computer and use it in GitHub Desktop.
Example of setting up an ES index with nested objects
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# | |
# gisted at https://gist.github.com/3414096 | |
# | |
# DO ALL OF THIS FIRST, BEFORE RUNNING THIS SCRIPT | |
# | |
# Download elasticsearch (latest version) and unpack it. More info here: | |
# http://www.elasticsearch.org/download/ | |
# | |
# Then, run it - here is an example: | |
# sudo /opt/elasticsearch/bin/elasticsearch start | |
# | |
# If working, you can find it at http://localhost:9200 | |
# and check the status: http://localhost:9200/_status | |
# | |
# Now, get the OERS and STATS files created by Richard, put them in the same | |
# folder as this script, and ensure they are named as below. | |
# Copies are available at: http://test.cottagelabs.com/jorum | |
# Also the code that generates them is at: https://github.com/richard-jones/random-paradata | |
# | |
# Tidy the column names directly in the CSV (simple solution for now) by removing
# any white space and slashes, and changing the ID column in the stats file to
# AboutOER. Then run this script.
# | |
# Once done, check your index exists with correct mapping. With the indexname
# and typename values set in this script, that is at:
# http://localhost:9200/jorum/test/_mapping
# | |
# More notes (which will become the write-up) are at the bottom of this file. | |
import csv, json, requests | |
# Input CSV files, given as paths relative to the working directory.
oerfile = 'oers.csv'
statfile = 'stats.csv'

# Elasticsearch index and document type that the records are written to.
indexname = 'jorum'
typename = 'test'

# Base URL for every request this script makes: <host>/<index>/<type>/
location = 'http://localhost:9200/' + indexname + '/' + typename + '/'
# clear and prep the index
# Neither response is inspected, so an HTTP error status returned by the
# delete or the refresh does not stop the script.
r = requests.delete(location)
r = requests.post(location + '_refresh')
# I have not decided whether to detect dates or treat them as flat values yet...
# "date_detection" : False,

# define a dynamic mapping with nested stats and send it to the index
mapping = {
    typename: {
        # Any dynamically mapped string field becomes a multi_field with two
        # sub-fields: an analyzed one under the field's own name, and a
        # not_analyzed, stored one called "exact" (addressed as <field>.exact).
        "dynamic_templates" : [
            {
                "default" : {
                    "match" : "*",
                    "match_mapping_type": "string",
                    "mapping" : {
                        "type" : "multi_field",
                        "fields" : {
                            "{name}" : {"type" : "{dynamic_type}", "index" : "analyzed", "store" : "no"},
                            "exact" : {"type" : "{dynamic_type}", "index" : "not_analyzed", "store" : "yes"}
                        }
                    }
                }
            }
        ],
        # "stats" holds the per-OER statistics as nested objects; the two
        # include_* flags ask for the nested fields to be included in the
        # parent and root documents as well.
        "properties": {
            "stats": {
                "type": "nested",
                "include_in_parent": True,
                "include_in_root": True
            }
        }
    }
}
r = requests.put(location + '_mapping', data=json.dumps(mapping))
# read in the OERs from their CSV, do any tidying required, then index them
records = []
with open(oerfile) as oer_csv:  # close the file once every row has been read
    for rec in csv.DictReader(oer_csv):
        # SubjectKeywords arrives as one comma-separated cell; index it as a list
        rec['SubjectKeywords'] = rec['SubjectKeywords'].split(',')
        records.append(rec)

# each OER is stored under the value of its ID column
for rec in records:
    r = requests.put(location + rec['ID'], data=json.dumps(rec))
# read in the stats from their CSV, do any tidying required
with open(statfile) as stat_csv:  # close the file once every row has been read
    records = list(csv.DictReader(stat_csv))

# group the stat rows by the OER they describe (the AboutOER column), so each
# OER is fetched and re-indexed once instead of once per stat row
stats_by_oer = {}
for rec in records:
    stats_by_oer.setdefault(rec['AboutOER'], []).append(rec)

# add each stat to the relevant OER and update the index
for oer_id, oer_stats in stats_by_oer.items():
    r = requests.get(location + oer_id)
    # stop with the HTTP error if the OER could not be fetched, rather than
    # failing later on a missing '_source' key
    r.raise_for_status()
    # Response.json is a method (not a property) in requests 1.0 and later
    oer = r.json()['_source']
    oer.setdefault('stats', []).extend(oer_stats)
    r = requests.put(location + oer_id, data=json.dumps(oer))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Here is an example of a suitable query, with date ranges added to the facets to simulate user provided date constraints. | |
# | |
# This finds the correct records - the first two OERs from an index populated with our test dataset as described above. | |
# | |
# The facet histograms return the right values - every stat that occurred on the result set in November 2011. | |
# | |
# The event counters also operate correctly; each one can be scoped to the nested items or not,
# and restricted by the date ranges or not.
# Search with "size": 0, so only totals and facets are requested, not hits.
# NOTE(review): this queries type "noers", while the indexing script writes to
# typename 'test' - confirm the type in the URL matches the one you indexed.
curl -X GET 'http://localhost:9200/jorum/noers/_search?pretty=true' -d '{
  "query": {
    "bool": {
      "must": [
        { "term" : {"HEFE.exact": "HE"} },
        { "range" : {"RecordCreatedDate" : {"from": "2004-01-01T00:00:00Z" } } },
        { "nested" : {
          "path" : "stats",
          "query" : {
            "bool" : {
              "must" : [
                { "term" : { "StatisticalEvent" : "download" } }
              ],
              "should" : [
                { "term" : { "StatisticalEventIP" : "156.105.48.23" } },
                { "term" : { "StatisticalEventIP" : "157.194.166.213" } }
              ],
              "minimum_number_should_match": 1
            }
          }
        }}
      ],
      "should": [
        { "term" : {"Language.exact": "en-GB"} },
        { "term" : {"Language.exact": "en-US"} },
        { "term" : {"Language.exact": "en"} }
      ]
    }
  },
  "facets": {
    "histo": {
      "date_histogram": {
        "interval": "day",
        "field": "StatisticalEventDate"
      },
      "facet_filter": {
        "range" : {"StatisticalEventDate" : {"from": "2011-10-01T00:00:00Z", "to": "2011-11-30T00:00:00Z" } }
      },
      "nested": "stats"
    },
    "downloads" : {
      "filter" : {
        "term" : { "StatisticalEvent" : "download" }
      },
      "facet_filter": {
        "range" : {"StatisticalEventDate" : {"from": "2011-10-01T00:00:00Z", "to": "2011-11-30T00:00:00Z" } }
      },
      "nested": "stats"
    },
    "views" : {
      "filter" : {
        "term" : { "StatisticalEvent" : "view" }
      },
      "facet_filter": {
        "range" : {"StatisticalEventDate" : {"from": "2011-10-01T00:00:00Z", "to": "2011-11-30T00:00:00Z" } }
      },
      "nested": "stats"
    },
    "creates" : {
      "filter" : {
        "term" : { "StatisticalEvent" : "create" }
      },
      "facet_filter": {
        "range" : {"StatisticalEventDate" : {"from": "2011-10-01T00:00:00Z", "to": "2011-11-30T00:00:00Z" } }
      },
      "nested": "stats"
    }
  },
  "size": 0
}'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment