Last active
September 23, 2017 04:17
-
-
Save brwe/292681b8e4ab2612633f to your computer and use it in GitHub Desktop.
Significant terms examples, meetup "elasticsearch switzerland", June 5 2014
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Terms aggregations | |
POST reuters/_search | |
# aggregate on places - how many articles per country? | |
POST reuters/_search | |
{ | |
"size": 0, | |
"aggregations": { | |
"class": { | |
"terms": { | |
"field": "places", | |
"size": 100 | |
} | |
} | |
} | |
} | |
# aggregate on topics per country - what's the most important topic for each? | |
POST reuters/_search | |
{ | |
"size": 0, | |
"aggregations": { | |
"places": { | |
"terms": { | |
"field": "places", | |
"size": 100 | |
}, | |
"aggs": { | |
"topics": { | |
"terms": { | |
"field": "topics", | |
"size": 10 | |
} | |
} | |
} | |
} | |
} | |
} | |
# Aggregate words in body per country - which words most often used per country in the article? | |
# Not really useful, returns mostly stopwords | |
POST reuters/_search | |
{ | |
"size": 0, | |
"aggregations": { | |
"class": { | |
"terms": { | |
"field": "places" | |
}, | |
"aggs": { | |
"max_freq_terms": { | |
"terms": { | |
"field": "body", | |
"size": 10 | |
} | |
} | |
} | |
} | |
} | |
} | |
# Significant terms in reuters | |
POST reuters/_search | |
{ | |
"size": 0, | |
"aggregations": { | |
"class": { | |
"terms": { | |
"field": "places" | |
}, | |
"aggregations": { | |
"sig_terms": { | |
"significant_terms": { | |
"field": "body" | |
} | |
} | |
} | |
} | |
} | |
} | |
# Significant terms for movie reviews | |
POST movie-reviews/_search | |
# what are the most often used words in positive and negative reviews? | |
# Not really useful, returns mostly stopwords | |
POST movie-reviews/_search | |
{ | |
"size": 0, | |
"aggregations": { | |
"class": { | |
"terms": { | |
"field": "class" | |
}, | |
"aggregations": { | |
"sig_terms": { | |
"terms": { | |
"field": "text" | |
} | |
} | |
} | |
} | |
} | |
} | |
# Now, use significant terms for that | |
POST movie-reviews/_search | |
{ | |
"size": 0, | |
"aggregations": { | |
"class": { | |
"terms": { | |
"field": "class" | |
}, | |
"aggregations": { | |
"sig_terms": { | |
"significant_terms": { | |
"field": "text" | |
} | |
} | |
} | |
} | |
} | |
} | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:4c65fdd997494b29a974f57180c6e458344c63376bd1dbf0f5cfd20a6a609cbc" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# We'll be using naive bayes with multinomial model, because our text is long (see Information Retrieval, chapter 13, Table 13.7)\n", | |
"\n", | |
"# Create index and mapping. We need:\n", | |
"# - text to be stored as token_count type because for naive bayes with multinomial model we need the word count\n", | |
    "# - the number of shards to be 1 because there is only a little data (2000 docs) and statistics for significant terms might be spoiled if we distribute this (TODO: DFS for significant terms works?)\n", | |
"\n", | |
"import json, requests, pprint, random, math, operator, datetime, sys, optparse, time, elasticsearch\n", | |
"def create_index():\n", | |
" es = elasticsearch.Elasticsearch()\n", | |
" try:\n", | |
" es.indices.delete(\"reuters\")\n", | |
" except Exception as e:\n", | |
" print e\n", | |
" \n", | |
" # We have to index test and training data into two different types, because significant terms can currently not filter out docs from the background set\n", | |
" mapping = {\n", | |
" \"mappings\": {\n", | |
" \"article\": {\n", | |
" \"properties\": {\n", | |
" \"body\": {\n", | |
" \"type\": \"string\"\n", | |
" },\n", | |
" \"places\": {\n", | |
" \"type\": \"string\",\n", | |
" \"index\": \"not_analyzed\" \n", | |
"\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" },\n", | |
" \"settings\": {\n", | |
" \"index.number_of_shards\": 1\n", | |
" }\n", | |
" }\n", | |
" es.indices.create(index=\"reuters\",body=mapping)\n", | |
"\n", | |
" " | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
    "# read all filenames that have the data \n", | |
    "# data can be downloaded from https://github.com/fergiemcdowall/reuters-21578-json \n", | |
    "# Change the filename below!\n", | |
"\n", | |
"import os\n", | |
"\n", | |
"def gather_filenames():\n", | |
" allFiles = []\n", | |
" for r,d,f in os.walk('/Users/britta/devrepo/design/ml/naive_bayes/data/reuters-21578'):\n", | |
" for files in f:\n", | |
" if \"json\" in files:\n", | |
" allFiles.append(os.path.join(r,files))\n", | |
" return allFiles" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# create a bulk request from the filenames\n", | |
"import random, elasticsearch, json\n", | |
"\n", | |
"def indexDocsBulk(filenames):\n", | |
" bulk_string = ''\n", | |
" random.seed()\n", | |
" es = elasticsearch.Elasticsearch()\n", | |
" for filename in filenames : \n", | |
" f = open(filename, 'r')\n", | |
" docs = json.load(f)\n", | |
" for docId, doc in docs.iteritems():\n", | |
" if (\"places\" in doc.keys()) and (\"body\" in doc.keys()):\n", | |
" #header for bulk request\n", | |
" header = \"{ \\\"index\\\" : { \\\"_index\\\" : \\\"reuters\\\", \\\"_type\\\" : \\\"article\\\", \\\"_id\\\": \\\"\"+str(docId)+\"\\\"} }\"\n", | |
" \n", | |
" #add to the bulk request\n", | |
" bulk_string += (header + \"\\n\")\n", | |
" bulk_string += (json.dumps(doc) + \"\\n\") \n", | |
" response = es.bulk(body=bulk_string, refresh=True)\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"create_index()\n", | |
"indexDocsBulk(gather_filenames())" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Significant terms for movie recommendations were missing the background filter. Added this in revision 2.