Skip to content

Instantly share code, notes, and snippets.

@brwe
Last active September 23, 2017 04:17
Show Gist options
  • Save brwe/292681b8e4ab2612633f to your computer and use it in GitHub Desktop.
Save brwe/292681b8e4ab2612633f to your computer and use it in GitHub Desktop.
Significant terms examples, meetup "elasticsearch switzerland", June 5 2014
# Terms aggregations
POST reuters/_search
# aggregate on places - how many articles per country?
POST reuters-test/_search
{
"size": 0,
"aggregations": {
"class": {
"terms": {
"field": "places",
"size": 100
}
}
}
}
# aggregate on topics per country - what's the most important topic for each?
POST reuters/_search
{
"size": 0,
"aggregations": {
"places": {
"terms": {
"field": "places",
"size": 100
},
"aggs": {
"topics": {
"terms": {
"field": "topics",
"size": 10
}
}
}
}
}
}
# Aggregate words in body per country - which words are used most often in each country's articles?
# Not really useful: returns mostly stopwords
POST reuters/_search
{
"size": 0,
"aggregations": {
"class": {
"terms": {
"field": "places"
},
"aggs": {
"max_freq_terms": {
"terms": {
"field": "body",
"size": 10
}
}
}
}
}
}
# Significant terms in reuters
POST reuters/_search
{
"size": 0,
"aggregations": {
"class": {
"terms": {
"field": "places"
},
"aggregations": {
"sig_terms": {
"significant_terms": {
"field": "body"
}
}
}
}
}
}
# Significant terms for movie reviews
POST movie-reviews/_search
# Which words are used most often in positive and negative reviews?
# Not really useful: returns mostly stopwords
POST movie-reviews/_search
{
"size": 0,
"aggregations": {
"class": {
"terms": {
"field": "class"
},
"aggregations": {
"sig_terms": {
"terms": {
"field": "text"
}
}
}
}
}
}
# Now, use significant terms for that
POST movie-reviews/_search
{
"size": 0,
"aggregations": {
"class": {
"terms": {
"field": "class"
},
"aggregations": {
"sig_terms": {
"significant_terms": {
"field": "text"
}
}
}
}
}
}
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:4c65fdd997494b29a974f57180c6e458344c63376bd1dbf0f5cfd20a6a609cbc"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"# We'll be using naive bayes with multinomial model, because our text is long (see Information Retrieval, chapter 13, Table 13.7)\n",
"\n",
"# Create index and mapping. We need:\n",
"# - text to be stored as token_count type because for naive bayes with multinomial model we need the word count\n",
"# - the number of shards to be 1, because there is only a little data (2000 docs) and the statistics for significant terms might be skewed if we distribute it across shards (TODO: does DFS work for significant terms?)\n",
"\n",
"import json, requests, pprint, random, math, operator, datetime, sys, optparse, time, elasticsearch\n",
"def create_index():\n",
" es = elasticsearch.Elasticsearch()\n",
" try:\n",
" es.indices.delete(\"reuters\")\n",
" except Exception as e:\n",
" print e\n",
" \n",
" # We have to index test and training data into two different types, because significant terms currently cannot filter out docs from the background set\n",
" mapping = {\n",
" \"mappings\": {\n",
" \"article\": {\n",
" \"properties\": {\n",
" \"body\": {\n",
" \"type\": \"string\"\n",
" },\n",
" \"places\": {\n",
" \"type\": \"string\",\n",
" \"index\": \"not_analyzed\" \n",
"\n",
" }\n",
" }\n",
" }\n",
" },\n",
" \"settings\": {\n",
" \"index.number_of_shards\": 1\n",
" }\n",
" }\n",
" es.indices.create(index=\"reuters\",body=mapping)\n",
"\n",
" "
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Collect the names of all files that contain the data.\n",
"# The data can be downloaded from https://github.com/fergiemcdowall/reuters-21578-json\n",
"# Change the directory path below to match your machine!\n",
"\n",
"import os\n",
"\n",
"def gather_filenames():\n",
" allFiles = []\n",
" for r,d,f in os.walk('/Users/britta/devrepo/design/ml/naive_bayes/data/reuters-21578'):\n",
" for files in f:\n",
" if \"json\" in files:\n",
" allFiles.append(os.path.join(r,files))\n",
" return allFiles"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# create a bulk request from the filenames\n",
"import random, elasticsearch, json\n",
"\n",
"def indexDocsBulk(filenames):\n",
" bulk_string = ''\n",
" random.seed()\n",
" es = elasticsearch.Elasticsearch()\n",
" for filename in filenames : \n",
" f = open(filename, 'r')\n",
" docs = json.load(f)\n",
" for docId, doc in docs.iteritems():\n",
" if (\"places\" in doc.keys()) and (\"body\" in doc.keys()):\n",
" #header for bulk request\n",
" header = \"{ \\\"index\\\" : { \\\"_index\\\" : \\\"reuters\\\", \\\"_type\\\" : \\\"article\\\", \\\"_id\\\": \\\"\"+str(docId)+\"\\\"} }\"\n",
" \n",
" #add to the bulk request\n",
" bulk_string += (header + \"\\n\")\n",
" bulk_string += (json.dumps(doc) + \"\\n\") \n",
" response = es.bulk(body=bulk_string, refresh=True)\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"create_index()\n",
"indexDocsBulk(gather_filenames())"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
@brwe
Copy link
Author

brwe commented Jul 5, 2014

Significant terms for movie recommendations were missing the background filter. Added this in revision 2.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment