Significant terms examples, meetup "elasticsearch switzerland", June 5 2014

# Terms aggregations
POST reuters/_search

# aggregate on places - how many articles per country?
POST reuters-test/_search
{
  "size": 0,
  "aggregations": {
    "class": {
      "terms": {
        "field": "places",
        "size": 100
      }
    }
  }
}

# aggregate on topics per country - what's the most important topic for each?
POST reuters/_search
{
  "size": 0,
  "aggregations": {
    "places": {
      "terms": {
        "field": "places",
        "size": 100
      },
      "aggs": {
        "topics": {
          "terms": {
            "field": "topics",
            "size": 10
          }
        }
      }
    }
  }
}

# Aggregate words in body per country - which words are used most often in the articles for each country?
# Not really useful, returns mostly stopwords
POST reuters/_search
{
  "size": 0,
  "aggregations": {
    "class": {
      "terms": {
        "field": "places"
      },
      "aggs": {
        "max_freq_terms": {
          "terms": {
            "field": "body",
            "size": 10
          }
        }
      }
    }
  }
}

# Significant terms in reuters
POST reuters/_search
{
  "size": 0,
  "aggregations": {
    "class": {
      "terms": {
        "field": "places"
      },
      "aggregations": {
        "sig_terms": {
          "significant_terms": {
            "field": "body"
          }
        }
      }
    }
  }
}
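
# The foreground for significant_terms can also come from a query instead of a terms bucket,
# e.g. significant words in the articles about a single country. Not part of the original gist,
# just a sketch; the place value "uk" is only an example
POST reuters/_search
{
  "size": 0,
  "query": {
    "match": {
      "places": "uk"
    }
  },
  "aggregations": {
    "sig_terms": {
      "significant_terms": {
        "field": "body"
      }
    }
  }
}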

# Significant terms for movie reviews
POST movie-reviews/_search

# which words are used most often in positive and negative reviews?
# Not really useful, returns mostly stopwords
POST movie-reviews/_search
{
  "size": 0,
  "aggregations": {
    "class": {
      "terms": {
        "field": "class"
      },
      "aggregations": {
        "max_freq_terms": {
          "terms": {
            "field": "text"
          }
        }
      }
    }
  }
}

# Now, use significant terms for that
POST movie-reviews/_search
{
  "size": 0,
  "aggregations": {
    "class": {
      "terms": {
        "field": "class"
      },
      "aggregations": {
        "sig_terms": {
          "significant_terms": {
            "field": "text"
          }
        }
      }
    }
  }
}

IPython notebook used to index the movie review data into the movie-reviews index:
{
 "metadata": {
  "name": "",
  "signature": "sha256:e9b9ae004d6696888c28ee6d6dd422d055d79849413ea45f0d24c87a3e6abc32"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "\n",
      "import json, requests, pprint, random, math, operator, datetime, sys, optparse, time, elasticsearch\n",
      "def create_index():\n",
      "    es = elasticsearch.Elasticsearch()\n",
" es.indices.delete(\"movie-reviews\")\n", | |
" mapping = {\n", | |
" \"mappings\": {\n", | |
" \"review\": {\n", | |
" \"properties\": {\n", | |
" \"text\": {\n", | |
" \"type\": \"string\"\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" },\n", | |
" \"settings\": {\n", | |
" \"index.number_of_shards\": 1\n", | |
" }\n", | |
" }\n", | |
" es.indices.create(index=\"movie-reviews\",body=mapping)\n", | |
"\n", | |
" " | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 43 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# download data from: http://www.cs.cornell.edu/people/pabo/movie-review-data/ \n", | |
"# Need to change the path to the data below\n", | |
"\n", | |
"import os\n", | |
"\n", | |
"def gather_filenames():\n", | |
" allPosFiles = []\n", | |
" for r,d,f in os.walk('/Users/britta/Documents/naive_bayes/data/review_polarity/txt_sentoken/pos'):\n", | |
" for files in f:\n", | |
" allPosFiles.append(os.path.join(r,files))\n", | |
"\n", | |
" allNegFiles = []\n", | |
" for r,d,f in os.walk('/Users/britta/Documents/naive_bayes/data/review_polarity/txt_sentoken/neg'):\n", | |
" for files in f:\n", | |
" allNegFiles.append(os.path.join(r,files))\n", | |
" return [allPosFiles, allNegFiles]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 44 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# create a bulk request from the filenames\n", | |
"\n", | |
"def indexDocsBulk(filenames, classlabels, docIdStart):\n", | |
" bulk_string = ''\n", | |
" random.seed()\n", | |
" docId = docIdStart;\n", | |
" es = elasticsearch.Elasticsearch()\n", | |
" for filename in filenames :\n", | |
" f = open(filename, 'r')\n", | |
" #header for bulk request\n", | |
" header = \"{ \\\"index\\\" : { \\\"_index\\\" : \\\"movie-reviews\\\", \\\"_type\\\" : \\\"review\\\", \\\"_id\\\": \\\"\"+str(docId)+\"\\\"} }\"\n", | |
" # text process: remove all newlines and secial characters\n", | |
" text = f.read().replace('\\n', ' ').replace('\"', ' ').replace(\"\\\\\",\" \")\n", | |
" text = \"\".join([i for i in text if 31 < ord(i) < 127])\n", | |
" #create the document text\n", | |
" doc = \"{\\\"text\\\": \\\"\" + text + \"\\\",\\\"class\\\": \\\"\"+ classlabels + \"\\\"}\"\n", | |
" #add to the bulk request\n", | |
" bulk_string += (header + \"\\n\")\n", | |
" bulk_string += (doc + \"\\n\") \n", | |
" docId += 1;\n", | |
" response = es.bulk(body=bulk_string, refresh=True)\n", | |
" print \"Bulk took \" + str(float(response['took'])/1000.0) + \" s\"\n", | |
" return docId\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 45 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# index data\n", | |
"def index_docs(allPosFiles, allNegFiles):\n", | |
" nextId = indexDocsBulk(allPosFiles, \"pos\", 1)\n", | |
" indexDocsBulk(allNegFiles, \"neg\", nextId)\n", | |
" \n", | |
" " | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 46 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"create_index()\n", | |
"[pos_files, neg_files] = gather_filenames()\n", | |
"index_docs(pos_files, neg_files)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Bulk took 0.969 s\n", | |
"Bulk took 0.817 s" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"prompt_number": 47 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
The significant terms example for movie recommendations was missing the background filter; this was added in revision 2.
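
For reference, the background filter mentioned above is the background_filter option of the significant_terms aggregation; it restricts the background set that the foreground is compared against, instead of using the whole index. A minimal sketch against the movie-reviews index from this gist (the query term "alien" and the restriction to positive reviews are only example values, not the actual revision-2 recommendation query):

POST movie-reviews/_search
{
  "size": 0,
  "query": {
    "bool": {
      "must": [
        { "match": { "class": "pos" } },
        { "match": { "text": "alien" } }
      ]
    }
  },
  "aggregations": {
    "sig_terms": {
      "significant_terms": {
        "field": "text",
        "background_filter": {
          "term": { "class": "pos" }
        }
      }
    }
  }
}

Here the foreground is positive reviews mentioning "alien", and the background is all positive reviews rather than the whole index, so the returned terms are those that distinguish these reviews from other positive reviews.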