Significant terms examples, meetup "elasticsearch switzerland", June 5 2014

# Terms aggregations
POST reuters/_search

# aggregate on places - how many articles per country?
POST reuters-test/_search
{
  "size": 0,
  "aggregations": {
    "class": {
      "terms": {
        "field": "places",
        "size": 100
      }
    }
  }
}

# aggregate on topics per country - what's the most important topic for each?
POST reuters/_search
{
  "size": 0,
  "aggregations": {
    "places": {
      "terms": {
        "field": "places",
        "size": 100
      },
      "aggs": {
        "topics": {
          "terms": {
            "field": "topics",
            "size": 10
          }
        }
      }
    }
  }
}

# Aggregate words in body per country - which words are used most often in the articles for each country?
# Not really useful, returns mostly stopwords
POST reuters/_search
{
  "size": 0,
  "aggregations": {
    "class": {
      "terms": {
        "field": "places"
      },
      "aggs": {
        "max_freq_terms": {
          "terms": {
            "field": "body",
            "size": 10
          }
        }
      }
    }
  }
}

# Significant terms in reuters
POST reuters/_search
{
  "size": 0,
  "aggregations": {
    "class": {
      "terms": {
        "field": "places"
      },
      "aggregations": {
        "sig_terms": {
          "significant_terms": {
            "field": "body"
          }
        }
      }
    }
  }
}
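
# The foreground for significant_terms can also come from a query instead of a terms bucket,
# e.g. significant words in the articles about a single country. Not part of the original gist,
# just a sketch; the place value "uk" is only an example
POST reuters/_search
{
  "size": 0,
  "query": {
    "match": {
      "places": "uk"
    }
  },
  "aggregations": {
    "sig_terms": {
      "significant_terms": {
        "field": "body"
      }
    }
  }
}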

# Significant terms for movie reviews
POST movie-reviews/_search

# which words are used most often in positive and negative reviews?
# Not really useful, returns mostly stopwords
POST movie-reviews/_search
{
  "size": 0,
  "aggregations": {
    "class": {
      "terms": {
        "field": "class"
      },
      "aggregations": {
        "max_freq_terms": {
          "terms": {
            "field": "text"
          }
        }
      }
    }
  }
}

# Now, use significant terms for that
POST movie-reviews/_search
{
  "size": 0,
  "aggregations": {
    "class": {
      "terms": {
        "field": "class"
      },
      "aggregations": {
        "sig_terms": {
          "significant_terms": {
            "field": "text"
          }
        }
      }
    }
  }
}

IPython notebook used to index the movie review data into the movie-reviews index:
{
 "metadata": {
  "name": "",
  "signature": "sha256:e9b9ae004d6696888c28ee6d6dd422d055d79849413ea45f0d24c87a3e6abc32"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "\n",
      "import json, requests, pprint, random, math, operator, datetime, sys, optparse, time, elasticsearch\n",
      "def create_index():\n",
      "    es = elasticsearch.Elasticsearch()\n",
" es.indices.delete(\"movie-reviews\")\n", | |
" mapping = {\n", | |
" \"mappings\": {\n", | |
" \"review\": {\n", | |
" \"properties\": {\n", | |
" \"text\": {\n", | |
" \"type\": \"string\"\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" },\n", | |
" \"settings\": {\n", | |
" \"index.number_of_shards\": 1\n", | |
" }\n", | |
" }\n", | |
" es.indices.create(index=\"movie-reviews\",body=mapping)\n", | |
"\n", | |
" " | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 43 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# download data from: http://www.cs.cornell.edu/people/pabo/movie-review-data/ \n", | |
"# Need to change the path to the data below\n", | |
"\n", | |
"import os\n", | |
"\n", | |
"def gather_filenames():\n", | |
" allPosFiles = []\n", | |
" for r,d,f in os.walk('/Users/britta/Documents/naive_bayes/data/review_polarity/txt_sentoken/pos'):\n", | |
" for files in f:\n", | |
" allPosFiles.append(os.path.join(r,files))\n", | |
"\n", | |
" allNegFiles = []\n", | |
" for r,d,f in os.walk('/Users/britta/Documents/naive_bayes/data/review_polarity/txt_sentoken/neg'):\n", | |
" for files in f:\n", | |
" allNegFiles.append(os.path.join(r,files))\n", | |
" return [allPosFiles, allNegFiles]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 44 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# create a bulk request from the filenames\n", | |
"\n", | |
"def indexDocsBulk(filenames, classlabels, docIdStart):\n", | |
" bulk_string = ''\n", | |
" random.seed()\n", | |
" docId = docIdStart;\n", | |
" es = elasticsearch.Elasticsearch()\n", | |
" for filename in filenames :\n", | |
" f = open(filename, 'r')\n", | |
" #header for bulk request\n", | |
" header = \"{ \\\"index\\\" : { \\\"_index\\\" : \\\"movie-reviews\\\", \\\"_type\\\" : \\\"review\\\", \\\"_id\\\": \\\"\"+str(docId)+\"\\\"} }\"\n", | |
" # text process: remove all newlines and secial characters\n", | |
" text = f.read().replace('\\n', ' ').replace('\"', ' ').replace(\"\\\\\",\" \")\n", | |
" text = \"\".join([i for i in text if 31 < ord(i) < 127])\n", | |
" #create the document text\n", | |
" doc = \"{\\\"text\\\": \\\"\" + text + \"\\\",\\\"class\\\": \\\"\"+ classlabels + \"\\\"}\"\n", | |
" #add to the bulk request\n", | |
" bulk_string += (header + \"\\n\")\n", | |
" bulk_string += (doc + \"\\n\") \n", | |
" docId += 1;\n", | |
" response = es.bulk(body=bulk_string, refresh=True)\n", | |
" print \"Bulk took \" + str(float(response['took'])/1000.0) + \" s\"\n", | |
" return docId\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 45 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# index data\n", | |
"def index_docs(allPosFiles, allNegFiles):\n", | |
" nextId = indexDocsBulk(allPosFiles, \"pos\", 1)\n", | |
" indexDocsBulk(allNegFiles, \"neg\", nextId)\n", | |
" \n", | |
" " | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 46 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"create_index()\n", | |
"[pos_files, neg_files] = gather_filenames()\n", | |
"index_docs(pos_files, neg_files)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Bulk took 0.969 s\n", | |
"Bulk took 0.817 s" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"prompt_number": 47 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
The significant terms example for movie recommendations was missing the background filter; this was added in revision 2.
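
For reference, the background filter mentioned above is the background_filter option of the significant_terms aggregation; it restricts the background set that the foreground is compared against, instead of using the whole index. A minimal sketch against the movie-reviews index from this gist (the query term "alien" and the restriction to positive reviews are only example values, not the actual revision-2 recommendation query):

POST movie-reviews/_search
{
  "size": 0,
  "query": {
    "bool": {
      "must": [
        { "match": { "class": "pos" } },
        { "match": { "text": "alien" } }
      ]
    }
  },
  "aggregations": {
    "sig_terms": {
      "significant_terms": {
        "field": "text",
        "background_filter": {
          "term": { "class": "pos" }
        }
      }
    }
  }
}

Here the foreground is positive reviews mentioning "alien", and the background is all positive reviews rather than the whole index, so the returned terms are those that distinguish these reviews from other positive reviews.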