Skip to content

Instantly share code, notes, and snippets.

@markmacgillivray
Created August 21, 2012 09:52
Show Gist options
  • Save markmacgillivray/3414096 to your computer and use it in GitHub Desktop.
Save markmacgillivray/3414096 to your computer and use it in GitHub Desktop.
Example of setting up an ES index with nested objects
#
# gisted at https://gist.github.com/3414096
#
# DO ALL OF THIS FIRST, BEFORE RUNNING THIS SCRIPT
#
# Download elasticsearch (latest version) and unpack it. More info here:
# http://www.elasticsearch.org/download/
#
# Then, run it - here is an example:
# sudo /opt/elasticsearch/bin/elasticsearch start
#
# If working, you can find it at http://localhost:9200
# and check the status: http://localhost:9200/_status
#
# Now, get the OERS and STATS files created by Richard, put them in the same
# folder as this script, and ensure they are named as below.
# Copies are available at: http://test.cottagelabs.com/jorum
# Also the code that generates them is at: https://github.com/richard-jones/random-paradata
#
# Tidy the column names directly in the CSV (simple solution for now) by removing
# any white space and slashes, and renaming the ID column in the stats file to
# AboutOER. Then run this script.
#
# Once done, check your index exists with correct mapping. For example at:
# http://localhost:9200/jorum/test/_mapping
# (the path is /<indexname>/<typename>/_mapping, using the values set below)
#
# More notes (which will become the write-up) are at the bottom of this file.
import csv, json, requests
# Input CSV files; the header notes say to place them next to this script.
oerfile = 'oers.csv'
statfile = 'stats.csv'
# Index name and document type that the records are stored under.
indexname = 'jorum'
typename = 'test'
# Type-level endpoint: document IDs, '_mapping' and '_refresh' get appended.
location = 'http://localhost:9200/' + '/'.join((indexname, typename, ''))
# Clear and prep the index: send DELETE to the type endpoint, then POST to
# '_refresh' beneath it. Neither response is inspected.
refresh_url = location + '_refresh'
r = requests.delete(location)
r = requests.post(refresh_url)
# I have not decided whether to detect dates or treat them as flat values yet...
# "date_detection" : False,
# Assemble the type mapping from named pieces, then PUT it to the index.
#
# Dynamic template: every field detected as a string is mapped as a
# multi_field with two sub-fields - the default one (named after the field)
# is analyzed, and "exact" keeps the value not_analyzed and stored.
string_subfields = {
    "{name}": {"type": "{dynamic_type}", "index": "analyzed", "store": "no"},
    "exact": {"type": "{dynamic_type}", "index": "not_analyzed", "store": "yes"},
}
default_template = {
    "default": {
        "match": "*",
        "match_mapping_type": "string",
        "mapping": {"type": "multi_field", "fields": string_subfields},
    }
}
# "stats" is declared as a nested type that is also included in the parent
# and root documents.
nested_stats = {
    "type": "nested",
    "include_in_parent": True,
    "include_in_root": True,
}
mapping = {
    typename: {
        "dynamic_templates": [default_template],
        "properties": {"stats": nested_stats},
    }
}
r = requests.put(location + '_mapping', data=json.dumps(mapping))
# Read the OERs from their CSV, tidy them, then index each one under its ID.
# The file is opened in a with-block so the handle is closed as soon as the
# rows have been read, instead of staying open for the rest of the script.
with open(oerfile) as oer_csv:
    records = list(csv.DictReader(oer_csv))
for rec in records:
    # SubjectKeywords arrives as one comma-separated string; index it as a list.
    rec['SubjectKeywords'] = rec['SubjectKeywords'].split(',')
    # PUT to <location><ID> so the value of the ID column is the document ID.
    r = requests.put(location + rec['ID'], data=json.dumps(rec))
# Read the stats from their CSV and group them by the OER they describe
# (the AboutOER column), so each OER is fetched and re-indexed once rather
# than once per stat row. Row order within each OER is preserved.
with open(statfile) as stat_csv:
    stats_by_oer = {}
    for rec in csv.DictReader(stat_csv):
        stats_by_oer.setdefault(rec['AboutOER'], []).append(rec)
# add each group of stats to the relevant OER and update the index
for oer_id, stats in stats_by_oer.items():
    r = requests.get(location + oer_id)
    # Response.json is a method in requests >= 1.0; subscripting the bare
    # attribute raises TypeError there, so it has to be called.
    oer = r.json()['_source']
    # Create the "stats" list on first use, then append this OER's rows.
    oer.setdefault('stats', []).extend(stats)
    r = requests.put(location + oer_id, data=json.dumps(oer))
# Here is an example of a suitable query, with date ranges added to the facets to simulate user provided date constraints.
#
# This finds the correct records - the first two OERs from an index populated with our test dataset as described above.
#
# The facet histograms return the right values - every stat that occurred on the result set within the facet date range (2011-10-01 to 2011-11-30).
#
# The event counters also operate correctly; they can be applied to the nested items or not,
# and restricted by the date ranges or not.
# Example search request.
# Query: a record must have HEFE.exact == "HE", a RecordCreatedDate from
# 2004-01-01 onwards, and a nested "stats" entry that is a "download" event
# from at least one of the two listed IPs. The three Language.exact terms are
# optional "should" clauses.
# Facets: a per-day date histogram on StatisticalEventDate, plus filter facets
# for "download", "view" and "create" events. Each facet is evaluated against
# the nested "stats" objects and limited by facet_filter to the range
# 2011-10-01 .. 2011-11-30.
# "size": 0 asks for no hits, so only the facet results come back.
# NOTE(review): the URL targets type "noers" while the script sets
# typename = 'test' - confirm which one is intended before running.
curl -X GET 'http://localhost:9200/jorum/noers/_search?pretty=true' -d '{
"query": {
"bool": {
"must": [
{ "term" : {"HEFE.exact": "HE"} },
{ "range" : {"RecordCreatedDate" : {"from": "2004-01-01T00:00:00Z" } } },
{ "nested" : {
"path" : "stats",
"query" : {
"bool" : {
"must" : [
{ "term" : { "StatisticalEvent" : "download" } }
],
"should" : [
{ "term" : { "StatisticalEventIP" : "156.105.48.23" } },
{ "term" : { "StatisticalEventIP" : "157.194.166.213" } }
],
"minimum_number_should_match": 1
}
}
}}
],
"should": [
{ "term" : {"Language.exact": "en-GB"} },
{ "term" : {"Language.exact": "en-US"} },
{ "term" : {"Language.exact": "en"} }
]
}
},
"facets": {
"histo": {
"date_histogram": {
"interval": "day",
"field": "StatisticalEventDate"
},
"facet_filter": {
"range" : {"StatisticalEventDate" : {"from": "2011-10-01T00:00:00Z", "to": "2011-11-30T00:00:00Z" } }
},
"nested": "stats"
},
"downloads" : {
"filter" : {
"term" : { "StatisticalEvent" : "download" }
},
"facet_filter": {
"range" : {"StatisticalEventDate" : {"from": "2011-10-01T00:00:00Z", "to": "2011-11-30T00:00:00Z" } }
},
"nested": "stats"
},
"views" : {
"filter" : {
"term" : { "StatisticalEvent" : "view" }
},
"facet_filter": {
"range" : {"StatisticalEventDate" : {"from": "2011-10-01T00:00:00Z", "to": "2011-11-30T00:00:00Z" } }
},
"nested": "stats"
},
"creates" : {
"filter" : {
"term" : { "StatisticalEvent" : "create" }
},
"facet_filter": {
"range" : {"StatisticalEventDate" : {"from": "2011-10-01T00:00:00Z", "to": "2011-11-30T00:00:00Z" } }
},
"nested": "stats"
}
},
"size": 0
}'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment