Last active
September 23, 2017 04:17
-
-
Save brwe/292681b8e4ab2612633f to your computer and use it in GitHub Desktop.
Significant terms examples, meetup "elasticsearch switzerland", June 5 2014
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Terms aggregations | |
POST reuters/_search | |
# aggregate on places - how many articles per country? | |
POST reuters/_search | |
{ | |
"size": 0, | |
"aggregations": { | |
"class": { | |
"terms": { | |
"field": "places", | |
"size": 100 | |
} | |
} | |
} | |
} | |
# aggregate on topics per country - what's the most important topic for each? | |
POST reuters/_search | |
{ | |
"size": 0, | |
"aggregations": { | |
"places": { | |
"terms": { | |
"field": "places", | |
"size": 100 | |
}, | |
"aggs": { | |
"topics": { | |
"terms": { | |
"field": "topics", | |
"size": 10 | |
} | |
} | |
} | |
} | |
} | |
} | |
# Aggregate words in body per country - which words most often used per country in the article? | |
# Not really useful, returns mostly stopwords | |
POST reuters/_search | |
{ | |
"size": 0, | |
"aggregations": { | |
"class": { | |
"terms": { | |
"field": "places" | |
}, | |
"aggs": { | |
"max_freq_terms": { | |
"terms": { | |
"field": "body", | |
"size": 10 | |
} | |
} | |
} | |
} | |
} | |
} | |
# Significant terms in reuters | |
POST reuters/_search | |
{ | |
"size": 0, | |
"aggregations": { | |
"class": { | |
"terms": { | |
"field": "places" | |
}, | |
"aggregations": { | |
"sig_terms": { | |
"significant_terms": { | |
"field": "body" | |
} | |
} | |
} | |
} | |
} | |
} | |
# Significant terms for movie reviews | |
POST movie-reviews/_search | |
# what are the most often used words in positive and negative reviews? | |
# Not really useful, returns mostly stopwords | |
POST movie-reviews/_search | |
{ | |
"size": 0, | |
"aggregations": { | |
"class": { | |
"terms": { | |
"field": "class" | |
}, | |
"aggregations": { | |
"sig_terms": { | |
"terms": { | |
"field": "text" | |
} | |
} | |
} | |
} | |
} | |
} | |
# Now, use significant terms for that | |
POST movie-reviews/_search | |
{ | |
"size": 0, | |
"aggregations": { | |
"class": { | |
"terms": { | |
"field": "class" | |
}, | |
"aggregations": { | |
"sig_terms": { | |
"significant_terms": { | |
"field": "text" | |
} | |
} | |
} | |
} | |
} | |
} | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:4c65fdd997494b29a974f57180c6e458344c63376bd1dbf0f5cfd20a6a609cbc" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# We'll be using naive bayes with multinomial model, because our text is long (see Information Retrieval, chapter 13, Table 13.7)\n", | |
"\n", | |
"# Create index and mapping. We need:\n", | |
"# - text to be stored as token_count type because for naive bayes with multinomial model we need the word count\n", | |
    "# - the number of shards to be 1 because there is only a little data (2000 docs) and statistics for significant terms might be spoiled if we distribute this (TODO: DFS for significant terms works?)\n", | |
"\n", | |
"import json, requests, pprint, random, math, operator, datetime, sys, optparse, time, elasticsearch\n", | |
"def create_index():\n", | |
" es = elasticsearch.Elasticsearch()\n", | |
" try:\n", | |
" es.indices.delete(\"reuters\")\n", | |
" except Exception as e:\n", | |
" print e\n", | |
" \n", | |
" # We have to index test and training data into two different types, because significant terms can currently not filter out docs from the background set\n", | |
" mapping = {\n", | |
" \"mappings\": {\n", | |
" \"article\": {\n", | |
" \"properties\": {\n", | |
" \"body\": {\n", | |
" \"type\": \"string\"\n", | |
" },\n", | |
" \"places\": {\n", | |
" \"type\": \"string\",\n", | |
" \"index\": \"not_analyzed\" \n", | |
"\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" },\n", | |
" \"settings\": {\n", | |
" \"index.number_of_shards\": 1\n", | |
" }\n", | |
" }\n", | |
" es.indices.create(index=\"reuters\",body=mapping)\n", | |
"\n", | |
" " | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
    "# read all filenames that have the data \n", | |
    "# data can be downloaded from https://github.com/fergiemcdowall/reuters-21578-json \n", | |
    "# Change the filename below!\n", | |
"\n", | |
"import os\n", | |
"\n", | |
"def gather_filenames():\n", | |
" allFiles = []\n", | |
" for r,d,f in os.walk('/Users/britta/devrepo/design/ml/naive_bayes/data/reuters-21578'):\n", | |
" for files in f:\n", | |
" if \"json\" in files:\n", | |
" allFiles.append(os.path.join(r,files))\n", | |
" return allFiles" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# create a bulk request from the filenames\n", | |
"import random, elasticsearch, json\n", | |
"\n", | |
"def indexDocsBulk(filenames):\n", | |
" bulk_string = ''\n", | |
" random.seed()\n", | |
" es = elasticsearch.Elasticsearch()\n", | |
" for filename in filenames : \n", | |
" f = open(filename, 'r')\n", | |
" docs = json.load(f)\n", | |
" for docId, doc in docs.iteritems():\n", | |
" if (\"places\" in doc.keys()) and (\"body\" in doc.keys()):\n", | |
" #header for bulk request\n", | |
" header = \"{ \\\"index\\\" : { \\\"_index\\\" : \\\"reuters\\\", \\\"_type\\\" : \\\"article\\\", \\\"_id\\\": \\\"\"+str(docId)+\"\\\"} }\"\n", | |
" \n", | |
" #add to the bulk request\n", | |
" bulk_string += (header + \"\\n\")\n", | |
" bulk_string += (json.dumps(doc) + \"\\n\") \n", | |
" response = es.bulk(body=bulk_string, refresh=True)\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"create_index()\n", | |
"indexDocsBulk(gather_filenames())" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Significant terms for movie recommendations were missing the background filter. Added this in revision 2.