brwe · September 23, 2017 04:17 · brwe · Jul 5, 2014
diff --git a/example-queries b/example-queries
 # Terms aggregations

 POST reuters/_search

 # aggregate on places - how many articles per country?
 # (uses the "reuters" index, like every other Reuters request in this file)
 POST reuters/_search
 {
   "size": 0,
   "aggregations": {
      "class": {
         "terms": {
            "field": "places",
            "size": 100
         }
      }
   }
 }

 # aggregate on topics per country - what's the most important topic for each?

 POST reuters/_search
 {
   "size": 0,
   "aggregations": {
      "places": {
         "terms": {
            "field": "places",
            "size": 100
         },
         "aggs": {
           "topics": {
             "terms": {
               "field": "topics",
               "size": 10
             }
           }
         }
      }
   }
 }


 # Aggregate words in body per country - which words are used most often in the articles of each country?
 # Not really useful, returns mostly stopwords
 POST reuters/_search
 {
   "size": 0,
   "aggregations": {
      "class": {
         "terms": {
            "field": "places"
         },
         "aggs": {
           "max_freq_terms": {
             "terms": {
               "field": "body",
               "size": 10
             }
           }
         }
      }
   }
 }




 # Significant terms in reuters


 POST reuters/_search
 {
   "size": 0,
   "aggregations": {
      "class": {
         "terms": {
            "field": "places"
         },
         "aggregations": {
            "sig_terms": {
               "significant_terms": {
                  "field": "body"
               }
            }
         }
      }
   }
 }


 # Significant terms for movie reviews

 POST movie-reviews/_search

 # What are the most frequently used words in positive and negative reviews?
 # Not really useful, returns mostly stopwords
 # plain terms aggregation per class; the sub-aggregation is named after what it
 # computes (most frequent terms), it is not a significant_terms aggregation
 POST movie-reviews/_search
 {
   "size": 0,
   "aggregations": {
      "class": {
         "terms": {
            "field": "class"
         },
         "aggregations": {
            "max_freq_terms": {
               "terms": {
                  "field": "text"
               }
            }
         }
      }
   }
 }

 # Now, use significant terms for that

 POST movie-reviews/_search
 {
   "size": 0,
   "aggregations": {
      "class": {
         "terms": {
            "field": "class"
         },
         "aggregations": {
            "sig_terms": {
               "significant_terms": {
                  "field": "text"
               }
            }
         }
      }
   }
 }

diff --git a/movie-reviews-index.ipynb b/movie-reviews-index.ipynb
diff --git a/recommender-movielens.ipynb b/recommender-movielens.ipynb
 {
 "metadata": {
  "name": "",
  "signature": "sha256:56be5f029dc5a19c773d50a811ebf1e94f817849631321b230f6e85c187e1658"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# create mapping for user\n",
      "\n",
      "import json, requests, pprint, random, math, operator, datetime, sys, optparse, time, elasticsearch\n",
      "def create_index():\n",
      "    \"\"\"(Re)create the 'movielens' index with the mapping for the 'user' type.\n",
      "\n",
      "    An existing 'movielens' index is deleted first. A missing index (the normal\n",
      "    case on a first run) is only reported; any other error is raised instead of\n",
      "    being printed and ignored.\n",
      "    \"\"\"\n",
      "    es = elasticsearch.Elasticsearch()\n",
      "    try:\n",
      "        es.indices.delete(index=\"movielens\")\n",
      "    except elasticsearch.NotFoundError as e:\n",
      "        print(e)\n",
      "\n",
      "    # mapping for user, contains list of movie ids for positive rated movies (>3) and negative (all others)\n",
      "    mapping = {\n",
      "       \"mappings\": {\n",
      "          \"user\": {\n",
      "             \"properties\": {\n",
      "                \"pos\": {\n",
      "                     \"type\": \"string\"\n",
      "                },\n",
      "                \"neg\": {\n",
      "                     \"type\": \"string\"\n",
      "                }\n",
      "             }\n",
      "          }\n",
      "       },\n",
      "       \"settings\": {\n",
      "          \"index.number_of_shards\": 1\n",
      "       }\n",
      "    }\n",
      "    es.indices.create(index=\"movielens\",body=mapping)\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "create_index()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stderr",
       "text": [
        "WARNING:elasticsearch:DELETE /movielens [status:404 request:0.004s]\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "TransportError(404, u'IndexMissingException[[movielens] missing]')\n"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# index users\n",
      "import elasticsearch\n",
      "\n",
      "# needs the movielens dataset, which can be downloaded here: http://grouplens.org/datasets/movielens/ \n",
      "# Change the path below!\n",
      "users_path = \"/Users/britta/Downloads/ml-1m/users.dat\"\n",
      "\n",
      "es = elasticsearch.Elasticsearch()\n",
      "# the with-block closes the file even if indexing stops with an error\n",
      "with open(users_path, 'r') as f:\n",
      "    for line in f:\n",
      "        # strip the line break, otherwise it is stored as part of the zipcode\n",
      "        [UserID, Gender, Age, Occupation, Zipcode] = line.rstrip('\\r\\n').split('::')\n",
      "        doc = {\"gender\": Gender, \"age\": Age, \"occupation\": Occupation, \"zipcode\": Zipcode, \"pos\": \"\", \"neg\": \"\"}\n",
      "        es.index(index=\"movielens\", doc_type='user', id=UserID, body=doc)\n",
      "# make the new documents searchable; only the index written to needs the refresh\n",
      "es.indices.refresh(index=\"movielens\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 3,
       "text": [
        "{u'_shards': {u'failed': 0, u'successful': 33, u'total': 58}}"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# index movies\n",
      "import elasticsearch\n",
      "\n",
      "# Change the path below!\n",
      "movies_path = \"/Users/britta/Downloads/ml-1m/movies.dat\"\n",
      "\n",
      "es = elasticsearch.Elasticsearch()\n",
      "# the with-block closes the file even if indexing stops with an error\n",
      "with open(movies_path, 'r') as f:\n",
      "    for line in f:\n",
      "        # strip the line break, otherwise it stays attached to the last genre\n",
      "        [MovieID, Title, Genres] = line.rstrip('\\r\\n').split('::')\n",
      "        # the title is decoded as latin-1 before it is sent to Elasticsearch\n",
      "        doc = {\"title\": Title.decode('latin-1'), \"genres\": Genres.split('|')}\n",
      "        es.index(index=\"movielens\", doc_type='movie', id=MovieID, body=doc)\n",
      "# make the new documents searchable; only the index written to needs the refresh\n",
      "es.indices.refresh(index=\"movielens\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 4,
       "text": [
        "{u'_shards': {u'failed': 0, u'successful': 33, u'total': 58}}"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# index ratings but also update users ratings field (pos/neg)\n",
      "# This can take a while...\n",
      "\n",
      "import elasticsearch\n",
      "\n",
      "# Change the path below!\n",
      "ratings_path = \"/Users/britta/Downloads/ml-1m/ratings.dat\"\n",
      "\n",
      "es = elasticsearch.Elasticsearch()\n",
      "# the with-block closes the file even if indexing stops with an error\n",
      "with open(ratings_path, 'r') as f:\n",
      "    for line in f:\n",
      "        # strip the line break, otherwise it is stored as part of the timestamp\n",
      "        [UserID, MovieID, Rating, Timestamp] = line.rstrip('\\r\\n').split('::')\n",
      "\n",
      "        # ratings above 3 count as positive, all others as negative\n",
      "        if int(Rating) > 3:\n",
      "            label = \"pos\"\n",
      "        else:\n",
      "            label = \"neg\"\n",
      "        ratingDoc = {\"userId\": UserID, \"movieId\": MovieID, \"rating\": Rating, \"timestamp\": Timestamp}\n",
      "        # append the movie id to the user's pos/neg field; the id is passed as a\n",
      "        # script parameter so the script text is the same for every rating\n",
      "        updatebody = {\n",
      "            \"script\": \"ctx._source.\" + label + \" += ' ' + movie_id\",\n",
      "            \"params\": {\"movie_id\": MovieID}\n",
      "        }\n",
      "        try:\n",
      "            es.update(index=\"movielens\", doc_type=\"user\", id=UserID, body=updatebody)\n",
      "        except elasticsearch.TransportError as e:\n",
      "            # keep going with the next rating, but report what actually went wrong\n",
      "            print(\"Update of user \" + str(UserID) + \" failed: \" + str(e) + \". Request was \" + str(updatebody))\n",
      "\n",
      "        es.index(index=\"movielens\", doc_type='rating', body=ratingDoc)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Indexing done"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Get the movie id for a specific title\n",
      "\n",
      "import elasticsearch\n",
      "es = elasticsearch.Elasticsearch()\n",
      "\n",
      "# full-text match on the title field of the movie documents\n",
      "query = {\n",
      "    \"query\": {\n",
      "        \"match\": {\n",
      "           \"title\": \"matrix\"\n",
      "        }\n",
      "    }\n",
      "}\n",
      "movies = es.search(index=\"movielens\", doc_type=\"movie\", body=query)\n",
      "# print id and title of every hit\n",
      "for movie in movies['hits']['hits']:\n",
      "    print(movie['_id'] + '   ' + movie['_source']['title'])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "2571   Matrix, The (1999)\n"
       ]
      }
     ],
     "prompt_number": 51
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Get recommendations for a movie\n",
      "import elasticsearch\n",
      "\n",
      "# movie id from before\n",
      "movieId = \"2571\"\n",
      "es = elasticsearch.Elasticsearch()\n",
      "movie = es.get(index=\"movielens\", doc_type=\"movie\", id=movieId)\n",
      "print('recommendations for \"' + movie['_source']['title'] + '\":')\n",
      "print('--------------------------------------------------')\n",
      "\n",
      "# significant terms of the 'pos' field, with the background set restricted to\n",
      "# documents of type 'user'\n",
      "significant_pos = {\n",
      "    \"significant_terms\": {\n",
      "        \"field\": \"pos\",\n",
      "        \"min_doc_count\": 10,\n",
      "        \"size\": 30,\n",
      "        \"background_filter\": {\"type\": {\"value\": \"user\"}}\n",
      "    }\n",
      "}\n",
      "# foreground set: users whose 'pos' field contains the movie id\n",
      "aggregation_query = {\n",
      "    \"size\": 0,\n",
      "    \"aggregations\": {\n",
      "        \"rec_movie\": {\n",
      "            \"filter\": {\"term\": {\"pos\": movieId}},\n",
      "            \"aggs\": {\"m\": significant_pos}\n",
      "        }\n",
      "    }\n",
      "}\n",
      "\n",
      "result = es.search(index=\"movielens\", doc_type=\"user\", body=aggregation_query)\n",
      "\n",
      "for bucket in result['aggregations']['rec_movie']['m']['buckets']:\n",
      "    # do not recommend the movie that was asked for\n",
      "    if bucket['key'] == movieId:\n",
      "        continue\n",
      "    movie = es.get(index=\"movielens\", doc_type=\"movie\", id=bucket['key'])\n",
      "    print(movie['_source']['title'])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "recommendations for \"Matrix, The (1999)\":\n",
        "--------------------------------------------------\n",
        "Terminator 2: Judgment Day (1991)\n",
        "Total Recall (1990)\n",
        "Terminator, The (1984)\n",
        "Aliens (1986)"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "Fifth Element, The (1997)\n",
        "Star Wars: Episode V - The Empire Strikes Back (1980)\n",
        "Fugitive, The (1993)\n",
        "Star Wars: Episode IV - A New Hope (1977)\n",
        "Twelve Monkeys (1995)\n",
        "Men in Black (1997)\n",
        "Alien (1979)\n",
        "Independence Day (ID4) (1996)\n",
        "Jurassic Park (1993)\n",
        "Hunt for Red October, The (1990)\n",
        "Face/Off (1997)\n",
        "Star Wars: Episode VI - Return of the Jedi (1983)\n",
        "Blade Runner (1982)\n",
        "Die Hard (1988)\n",
        "Indiana Jones and the Last Crusade (1989)\n",
        "Braveheart (1995)\n",
        "Saving Private Ryan (1998)\n",
        "Star Trek: First Contact (1996)\n",
        "Star Wars: Episode I - The Phantom Menace (1999)\n",
        "Predator (1987)\n",
        "Raiders of the Lost Ark (1981)"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "Gattaca (1997)\n",
        "Rock, The (1996)\n",
        "Star Trek: The Wrath of Khan (1982)\n",
        "Contact (1997)\n"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Get recommendations per gender\n",
      "import elasticsearch\n",
      "\n",
      "print('popular m/f:')\n",
      "print('--------------------------------------------------')\n",
      "\n",
      "# significant terms of the 'pos' field, with the background set restricted to\n",
      "# documents of type 'user'\n",
      "significant_pos = {\n",
      "    \"significant_terms\": {\n",
      "        \"field\": \"pos\",\n",
      "        \"min_doc_count\": 10,\n",
      "        \"size\": 10,\n",
      "        \"background_filter\": {\"type\": {\"value\": \"user\"}}\n",
      "    }\n",
      "}\n",
      "# one bucket per value of the 'gender' field, each with its own significant movies\n",
      "aggregation_query = {\n",
      "    \"size\": 0,\n",
      "    \"aggregations\": {\n",
      "        \"rec_movie\": {\n",
      "            \"terms\": {\"field\": \"gender\"},\n",
      "            \"aggs\": {\"movies\": significant_pos}\n",
      "        }\n",
      "    }\n",
      "}\n",
      "\n",
      "result = es.search(index=\"movielens\", doc_type=\"user\", body=aggregation_query)\n",
      "\n",
      "for gender_bucket in result['aggregations']['rec_movie']['buckets']:\n",
      "    print('--------------------------------------------------')\n",
      "    print(gender_bucket['key'] + ':')\n",
      "    print('--------------------------------------------------')\n",
      "    for movie_bucket in gender_bucket['movies']['buckets']:\n",
      "        # the bucket key is a movie id; fetch the movie document to show its title\n",
      "        movie = es.get(index=\"movielens\", doc_type=\"movie\", id=movie_bucket['key'])\n",
      "        print(movie['_source']['title'])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "popular m/f:\n",
        "--------------------------------------------------\n",
        "--------------------------------------------------"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "m:\n",
        "--------------------------------------------------\n",
        "Terminator 2: Judgment Day (1991)\n",
        "Terminator, The (1984)\n",
        "Alien (1979)\n",
        "Aliens (1986)\n",
        "Matrix, The (1999)\n",
        "Star Wars: Episode IV - A New Hope (1977)\n",
        "Star Wars: Episode V - The Empire Strikes Back (1980)\n",
        "Saving Private Ryan (1998)\n",
        "Blade Runner (1982)\n",
        "Total Recall (1990)\n",
        "--------------------------------------------------\n",
        "f:\n",
        "--------------------------------------------------\n",
        "Sense and Sensibility (1995)\n",
        "Emma (1996)\n",
        "My Fair Lady (1964)\n",
        "Gone with the Wind (1939)\n",
        "Dirty Dancing (1987)\n",
        "Breakfast at Tiffany's (1961)\n",
        "Like Water for Chocolate (Como agua para chocolate) (1992)\n",
        "Roman Holiday (1953)\n",
        "Strictly Ballroom (1992)\n",
        "Circle of Friends (1995)\n"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Get recommendations per occupation\n",
      "import elasticsearch\n",
      "\n",
      "es = elasticsearch.Elasticsearch()\n",
      "\n",
      "print('popular per occupation:')\n",
      "print('--------------------------------------------------')\n",
      "# NOTE: the outer terms aggregation sets no 'size', so Elasticsearch only returns\n",
      "# its default number of occupation buckets - not necessarily all occupations\n",
      "aggregation_query = {\n",
      "  \"size\": 0,\n",
      "  \"aggregations\": {\n",
      "    \"rec_movie\": {\n",
      "      \"terms\": {\n",
      "        \"field\": \"occupation\"\n",
      "      },\n",
      "      \"aggs\": {\n",
      "        \"movies\": {\n",
      "          \"significant_terms\": {\n",
      "            \"field\": \"pos\",\n",
      "            \"min_doc_count\": 10,\n",
      "            \"shard_min_doc_count\": 10,\n",
      "            \"size\": 10,\n",
      "            \"background_filter\": {\n",
      "              \"type\": {\n",
      "                \"value\": \"user\"\n",
      "              }\n",
      "            }\n",
      "          }\n",
      "        }\n",
      "      }\n",
      "    }\n",
      "  }\n",
      "}\n",
      "\n",
      "result = es.search(index=\"movielens\", doc_type=\"user\", body=aggregation_query)\n",
      "# labels for the codes stored in the 'occupation' field\n",
      "# (presumably taken from the MovieLens README - verify against the dataset)\n",
      "occupations = {\n",
      "    \"0\":  \"other  or not specified\",\n",
      "    \"1\":  \"academic/educator\",\n",
      "    \"2\":  \"artist\",\n",
      "    \"3\":  \"clerical/admin\",\n",
      "    \"4\":  \"college/grad student\",\n",
      "    \"5\":  \"customer service\",\n",
      "    \"6\":  \"doctor/health care\",\n",
      "    \"7\":  \"executive/managerial\",\n",
      "    \"8\":  \"farmer\",\n",
      "    \"9\":  \"homemaker\",\n",
      "    \"10\":  \"K-12 student\",\n",
      "    \"11\":  \"lawyer\",\n",
      "    \"12\":  \"programmer\",\n",
      "    \"13\":  \"retired\",\n",
      "    \"14\":  \"sales/marketing\",\n",
      "    \"15\":  \"scientist\",\n",
      "    \"16\":  \"self-employed\",\n",
      "    \"17\":  \"technician/engineer\",\n",
      "    \"18\":  \"tradesman/craftsman\",\n",
      "    \"19\":  \"unemployed\",\n",
      "    \"20\":  \"writer\"}\n",
      "for term in result['aggregations']['rec_movie']['buckets']:\n",
      "    print('--------------------------------------------------')\n",
      "    # fall back to the raw code if it has no label in the table above\n",
      "    print(occupations.get(term['key'], term['key']) + ':')\n",
      "\n",
      "    print('--------------------------------------------------')\n",
      "    for bucket in term['movies']['buckets']:\n",
      "        # the bucket key is a movie id; fetch the movie document to show its title\n",
      "        movie = es.get(index=\"movielens\", doc_type=\"movie\", id=bucket['key'])\n",
      "        print(movie['_source']['title'])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "popular m/f:\n",
        "--------------------------------------------------\n",
        "--------------------------------------------------"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "college/grad student:\n",
        "--------------------------------------------------\n",
        "Fight Club (1999)\n",
        "South Park: Bigger, Longer and Uncut (1999)\n",
        "American Beauty (1999)\n",
        "X-Men (2000)\n",
        "Gladiator (2000)\n",
        "Clerks (1994)\n",
        "American Pie (1999)\n",
        "American History X (1998)\n",
        "Pulp Fiction (1994)\n",
        "Austin Powers: The Spy Who Shagged Me (1999)\n",
        "--------------------------------------------------\n",
        "other  or not specified:\n",
        "--------------------------------------------------\n",
        "Nightmare on Elm Street 3: Dream Warriors, A (1987)\n",
        "Toy Story (1995)\n",
        "Nightmare Before Christmas, The (1993)\n",
        "Sid and Nancy (1986)\n",
        "There's Something About Mary (1998)\n",
        "Gilda (1946)\n",
        "Babe (1995)\n",
        "Boogie Nights (1997)\n",
        "Dumb & Dumber (1994)\n",
        "Doors, The (1991)"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "--------------------------------------------------\n",
        "executive/managerial:\n",
        "--------------------------------------------------\n",
        "From Russia with Love (1963)\n",
        "Hunt for Red October, The (1990)\n",
        "Die Hard 2 (1990)\n",
        "Dirty Dozen, The (1967)\n",
        "Fatal Attraction (1987)\n",
        "Patriot Games (1992)\n",
        "Thelma & Louise (1991)\n",
        "African Queen, The (1951)\n",
        "Longest Day, The (1962)\n",
        "Lethal Weapon (1987)\n",
        "--------------------------------------------------\n",
        "academic/educator:\n",
        "--------------------------------------------------\n",
        "Annie Hall (1977)\n",
        "North by Northwest (1959)\n",
        "Shakespeare in Love (1998)\n",
        "Postino, Il (The Postman) (1994)\n",
        "Sense and Sensibility (1995)\n",
        "Philadelphia Story, The (1940)\n",
        "Manhattan (1979)\n",
        "Maltese Falcon, The (1941)\n",
        "Like Water for Chocolate (Como agua para chocolate) (1992)\n",
        "To Kill a Mockingbird (1962)"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "--------------------------------------------------\n",
        "technician/engineer:\n",
        "--------------------------------------------------\n",
        "Terminator 2: Judgment Day (1991)\n",
        "Total Recall (1990)\n",
        "Terminator, The (1984)\n",
        "Matrix, The (1999)\n",
        "Star Trek: The Wrath of Khan (1982)\n",
        "Aliens (1986)\n",
        "Star Trek IV: The Voyage Home (1986)\n",
        "Galaxy Quest (1999)\n",
        "Predator (1987)\n",
        "Stargate (1994)\n",
        "--------------------------------------------------\n",
        "programmer:\n",
        "--------------------------------------------------\n",
        "Gattaca (1997)\n",
        "Brazil (1985)\n",
        "Matrix, The (1999)\n",
        "Star Wars: Episode V - The Empire Strikes Back (1980)\n",
        "Alien (1979)\n",
        "Terminator 2: Judgment Day (1991)"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "Blade Runner (1982)\n",
        "Terminator, The (1984)\n",
        "Star Trek: The Wrath of Khan (1982)\n",
        "Star Wars: Episode VI - Return of the Jedi (1983)\n",
        "--------------------------------------------------\n",
        "sales/marketing:\n",
        "--------------------------------------------------\n",
        "Fugitive, The (1993)\n",
        "My Cousin Vinny (1992)\n",
        "Tin Cup (1996)\n",
        "Silence of the Lambs, The (1991)\n",
        "About Last Night... (1986)\n",
        "Air Force One (1997)\n",
        "Swingers (1996)\n",
        "Lethal Weapon (1987)\n",
        "Fast Times at Ridgemont High (1982)\n",
        "Basic Instinct (1992)\n",
        "--------------------------------------------------\n",
        "writer:\n",
        "--------------------------------------------------\n",
        "Grifters, The (1990)\n",
        "Manhattan (1979)\n",
        "Living in Oblivion (1995)\n",
        "Crumb (1994)\n",
        "M (1931)\n",
        "Being John Malkovich (1999)\n",
        "Touch of Evil (1958)\n",
        "Crimes and Misdemeanors (1989)\n",
        "Out of Sight (1998)\n",
        "Big Sleep, The (1946)\n",
        "--------------------------------------------------\n",
        "artist:\n",
        "--------------------------------------------------\n",
        "Heavenly Creatures (1994)\n",
        "Badlands (1973)"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "Fisher King, The (1991)\n",
        "Thirty-Two Short Films About Glenn Gould (1993)\n",
        "Basquiat (1996)\n",
        "Ed Wood (1994)\n",
        "Nightmare Before Christmas, The (1993)\n",
        "Vanya on 42nd Street (1994)\n",
        "Do the Right Thing (1989)\n",
        "Sweet Hereafter, The (1997)\n",
        "--------------------------------------------------\n",
        "self-employed:\n",
        "--------------------------------------------------\n",
        "King Kong (1933)\n",
        "Boat, The (Das Boot) (1981)\n",
        "Three Days of the Condor (1975)\n",
        "Tender Mercies (1983)\n",
        "Local Hero (1983)\n",
        "Breaker Morant (1980)\n",
        "Butch Cassidy and the Sundance Kid (1969)\n",
        "Serpico (1973)\n",
        "Anatomy of a Murder (1959)\n",
        "Great Escape, The (1963)\n"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
 }
diff --git a/reuters-sig-terms-index.ipynb b/reuters-sig-terms-index.ipynb
	# Terms aggregations

	POST reuters/_search

	# aggregate on places - how many articles per country?
	# (uses the "reuters" index, like every other Reuters request in this file)
	POST reuters/_search
	{
	  "size": 0,
	  "aggregations": {
	     "class": {
	        "terms": {
	           "field": "places",
	           "size": 100
	        }
	     }
	  }
	}

	# aggregate on topics per country - what's the most important topic for each?

	POST reuters/_search
	{
	"size": 0,
	"aggregations": {
	"places": {
	"terms": {
	"field": "places",
	"size": 100
	},
	"aggs": {
	"topics": {
	"terms": {
	"field": "topics",
	"size": 10
	}
	}
	}
	}
	}
	}


	# Aggregate words in body per country - which words are used most often in the articles of each country?
	# Not really useful, returns mostly stopwords
	POST reuters/_search
	{
	"size": 0,
	"aggregations": {
	"class": {
	"terms": {
	"field": "places"
	},
	"aggs": {
	"max_freq_terms": {
	"terms": {
	"field": "body",
	"size": 10
	}
	}
	}
	}
	}
	}




	# Significant terms in reuters


	POST reuters/_search
	{
	"size": 0,
	"aggregations": {
	"class": {
	"terms": {
	"field": "places"
	},
	"aggregations": {
	"sig_terms": {
	"significant_terms": {
	"field": "body"
	}
	}
	}
	}
	}
	}


	# Significant terms for movie reviews

	POST movie-reviews/_search

	# What are the most frequently used words in positive and negative reviews?
	# Not really useful, returns mostly stopwords
	# plain terms aggregation per class; the sub-aggregation is named after what it
	# computes (most frequent terms), it is not a significant_terms aggregation
	POST movie-reviews/_search
	{
	  "size": 0,
	  "aggregations": {
	     "class": {
	        "terms": {
	           "field": "class"
	        },
	        "aggregations": {
	           "max_freq_terms": {
	              "terms": {
	                 "field": "text"
	              }
	           }
	        }
	     }
	  }
	}

	# Now, use significant terms for that

	POST movie-reviews/_search
	{
	"size": 0,
	"aggregations": {
	"class": {
	"terms": {
	"field": "class"
	},
	"aggregations": {
	"sig_terms": {
	"significant_terms": {
	"field": "text"
	}
	}
	}
	}
	}
	}
	{
	"metadata": {
	"name": "",
	"signature": "sha256:56be5f029dc5a19c773d50a811ebf1e94f817849631321b230f6e85c187e1658"
	},
	"nbformat": 3,
	"nbformat_minor": 0,
	"worksheets": [
	{
	"cells": [
	{
	 "cell_type": "code",
	 "collapsed": false,
	 "input": [
	  "# create mapping for user\n",
	  "\n",
	  "import json, requests, pprint, random, math, operator, datetime, sys, optparse, time, elasticsearch\n",
	  "def create_index():\n",
	  "    \"\"\"(Re)create the 'movielens' index with the mapping for the 'user' type.\n",
	  "\n",
	  "    An existing 'movielens' index is deleted first. A missing index (the normal\n",
	  "    case on a first run) is only reported; any other error is raised instead of\n",
	  "    being printed and ignored.\n",
	  "    \"\"\"\n",
	  "    es = elasticsearch.Elasticsearch()\n",
	  "    try:\n",
	  "        es.indices.delete(index=\"movielens\")\n",
	  "    except elasticsearch.NotFoundError as e:\n",
	  "        print(e)\n",
	  "\n",
	  "    # mapping for user, contains list of movie ids for positive rated movies (>3) and negative (all others)\n",
	  "    mapping = {\n",
	  "       \"mappings\": {\n",
	  "          \"user\": {\n",
	  "             \"properties\": {\n",
	  "                \"pos\": {\n",
	  "                     \"type\": \"string\"\n",
	  "                },\n",
	  "                \"neg\": {\n",
	  "                     \"type\": \"string\"\n",
	  "                }\n",
	  "             }\n",
	  "          }\n",
	  "       },\n",
	  "       \"settings\": {\n",
	  "          \"index.number_of_shards\": 1\n",
	  "       }\n",
	  "    }\n",
	  "    es.indices.create(index=\"movielens\",body=mapping)\n"
	 ],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 1
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"create_index()"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stderr",
	"text": [
	"WARNING:elasticsearch:DELETE /movielens [status:404 request:0.004s]\n"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"TransportError(404, u'IndexMissingException[[movielens] missing]')\n"
	]
	}
	],
	"prompt_number": 2
	},
	{
	 "cell_type": "code",
	 "collapsed": false,
	 "input": [
	  "# index users\n",
	  "import elasticsearch\n",
	  "\n",
	  "# needs the movielens dataset, which can be downloaded here: http://grouplens.org/datasets/movielens/ \n",
	  "# Change the path below!\n",
	  "users_path = \"/Users/britta/Downloads/ml-1m/users.dat\"\n",
	  "\n",
	  "es = elasticsearch.Elasticsearch()\n",
	  "# the with-block closes the file even if indexing stops with an error\n",
	  "with open(users_path, 'r') as f:\n",
	  "    for line in f:\n",
	  "        # strip the line break, otherwise it is stored as part of the zipcode\n",
	  "        [UserID, Gender, Age, Occupation, Zipcode] = line.rstrip('\\r\\n').split('::')\n",
	  "        doc = {\"gender\": Gender, \"age\": Age, \"occupation\": Occupation, \"zipcode\": Zipcode, \"pos\": \"\", \"neg\": \"\"}\n",
	  "        es.index(index=\"movielens\", doc_type='user', id=UserID, body=doc)\n",
	  "# make the new documents searchable; only the index written to needs the refresh\n",
	  "es.indices.refresh(index=\"movielens\")"
	 ],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 3,
	"text": [
	"{u'_shards': {u'failed': 0, u'successful': 33, u'total': 58}}"
	]
	}
	],
	"prompt_number": 3
	},
	{
	 "cell_type": "code",
	 "collapsed": false,
	 "input": [
	  "# index movies\n",
	  "import elasticsearch\n",
	  "\n",
	  "# Change the path below!\n",
	  "movies_path = \"/Users/britta/Downloads/ml-1m/movies.dat\"\n",
	  "\n",
	  "es = elasticsearch.Elasticsearch()\n",
	  "# the with-block closes the file even if indexing stops with an error\n",
	  "with open(movies_path, 'r') as f:\n",
	  "    for line in f:\n",
	  "        # strip the line break, otherwise it stays attached to the last genre\n",
	  "        [MovieID, Title, Genres] = line.rstrip('\\r\\n').split('::')\n",
	  "        # the title is decoded as latin-1 before it is sent to Elasticsearch\n",
	  "        doc = {\"title\": Title.decode('latin-1'), \"genres\": Genres.split('|')}\n",
	  "        es.index(index=\"movielens\", doc_type='movie', id=MovieID, body=doc)\n",
	  "# make the new documents searchable; only the index written to needs the refresh\n",
	  "es.indices.refresh(index=\"movielens\")"
	 ],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 4,
	"text": [
	"{u'_shards': {u'failed': 0, u'successful': 33, u'total': 58}}"
	]
	}
	],
	"prompt_number": 4
	},
	{
	 "cell_type": "code",
	 "collapsed": false,
	 "input": [
	  "# index ratings but also update users ratings field (pos/neg)\n",
	  "# This can take a while...\n",
	  "\n",
	  "import elasticsearch\n",
	  "\n",
	  "# Change the path below!\n",
	  "ratings_path = \"/Users/britta/Downloads/ml-1m/ratings.dat\"\n",
	  "\n",
	  "es = elasticsearch.Elasticsearch()\n",
	  "# the with-block closes the file even if indexing stops with an error\n",
	  "with open(ratings_path, 'r') as f:\n",
	  "    for line in f:\n",
	  "        # strip the line break, otherwise it is stored as part of the timestamp\n",
	  "        [UserID, MovieID, Rating, Timestamp] = line.rstrip('\\r\\n').split('::')\n",
	  "\n",
	  "        # ratings above 3 count as positive, all others as negative\n",
	  "        if int(Rating) > 3:\n",
	  "            label = \"pos\"\n",
	  "        else:\n",
	  "            label = \"neg\"\n",
	  "        ratingDoc = {\"userId\": UserID, \"movieId\": MovieID, \"rating\": Rating, \"timestamp\": Timestamp}\n",
	  "        # append the movie id to the user's pos/neg field; the id is passed as a\n",
	  "        # script parameter so the script text is the same for every rating\n",
	  "        updatebody = {\n",
	  "            \"script\": \"ctx._source.\" + label + \" += ' ' + movie_id\",\n",
	  "            \"params\": {\"movie_id\": MovieID}\n",
	  "        }\n",
	  "        try:\n",
	  "            es.update(index=\"movielens\", doc_type=\"user\", id=UserID, body=updatebody)\n",
	  "        except elasticsearch.TransportError as e:\n",
	  "            # keep going with the next rating, but report what actually went wrong\n",
	  "            print(\"Update of user \" + str(UserID) + \" failed: \" + str(e) + \". Request was \" + str(updatebody))\n",
	  "\n",
	  "        es.index(index=\"movielens\", doc_type='rating', body=ratingDoc)"
	 ],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 5
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# Indexing done"
	],
	"language": "python",
	"metadata": {},
	"outputs": []
	},
	{
	 "cell_type": "code",
	 "collapsed": false,
	 "input": [
	  "# Get the movie id for a specific title\n",
	  "\n",
	  "import elasticsearch\n",
	  "es = elasticsearch.Elasticsearch()\n",
	  "\n",
	  "# full-text match on the title field of the movie documents\n",
	  "query = {\n",
	  "    \"query\": {\n",
	  "        \"match\": {\n",
	  "           \"title\": \"matrix\"\n",
	  "        }\n",
	  "    }\n",
	  "}\n",
	  "movies = es.search(index=\"movielens\", doc_type=\"movie\", body=query)\n",
	  "# print id and title of every hit\n",
	  "for movie in movies['hits']['hits']:\n",
	  "    print(movie['_id'] + ' ' + movie['_source']['title'])"
	 ],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"2571 Matrix, The (1999)\n"
	]
	}
	],
	"prompt_number": 51
	},
	{
	 "cell_type": "code",
	 "collapsed": false,
	 "input": [
	  "# Get recommendations for a movie\n",
	  "import elasticsearch\n",
	  "\n",
	  "# movie id from before\n",
	  "movieId = \"2571\"\n",
	  "es = elasticsearch.Elasticsearch()\n",
	  "movie = es.get(index=\"movielens\", doc_type=\"movie\", id=movieId)\n",
	  "print('recommendations for \"' + movie['_source']['title'] + '\":')\n",
	  "print('--------------------------------------------------')\n",
	  "\n",
	  "# significant terms of the 'pos' field, with the background set restricted to\n",
	  "# documents of type 'user'\n",
	  "significant_pos = {\n",
	  "    \"significant_terms\": {\n",
	  "        \"field\": \"pos\",\n",
	  "        \"min_doc_count\": 10,\n",
	  "        \"size\": 30,\n",
	  "        \"background_filter\": {\"type\": {\"value\": \"user\"}}\n",
	  "    }\n",
	  "}\n",
	  "# foreground set: users whose 'pos' field contains the movie id\n",
	  "aggregation_query = {\n",
	  "    \"size\": 0,\n",
	  "    \"aggregations\": {\n",
	  "        \"rec_movie\": {\n",
	  "            \"filter\": {\"term\": {\"pos\": movieId}},\n",
	  "            \"aggs\": {\"m\": significant_pos}\n",
	  "        }\n",
	  "    }\n",
	  "}\n",
	  "\n",
	  "result = es.search(index=\"movielens\", doc_type=\"user\", body=aggregation_query)\n",
	  "\n",
	  "for bucket in result['aggregations']['rec_movie']['m']['buckets']:\n",
	  "    # do not recommend the movie that was asked for\n",
	  "    if bucket['key'] == movieId:\n",
	  "        continue\n",
	  "    movie = es.get(index=\"movielens\", doc_type=\"movie\", id=bucket['key'])\n",
	  "    print(movie['_source']['title'])"
	 ],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"recommendations for \"Matrix, The (1999)\":\n",
	"--------------------------------------------------\n",
	"Terminator 2: Judgment Day (1991)\n",
	"Total Recall (1990)\n",
	"Terminator, The (1984)\n",
	"Aliens (1986)"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\n",
	"Fifth Element, The (1997)\n",
	"Star Wars: Episode V - The Empire Strikes Back (1980)\n",
	"Fugitive, The (1993)\n",
	"Star Wars: Episode IV - A New Hope (1977)\n",
	"Twelve Monkeys (1995)\n",
	"Men in Black (1997)\n",
	"Alien (1979)\n",
	"Independence Day (ID4) (1996)\n",
	"Jurassic Park (1993)\n",
	"Hunt for Red October, The (1990)\n",
	"Face/Off (1997)\n",
	"Star Wars: Episode VI - Return of the Jedi (1983)\n",
	"Blade Runner (1982)\n",
	"Die Hard (1988)\n",
	"Indiana Jones and the Last Crusade (1989)\n",
	"Braveheart (1995)\n",
	"Saving Private Ryan (1998)\n",
	"Star Trek: First Contact (1996)\n",
	"Star Wars: Episode I - The Phantom Menace (1999)\n",
	"Predator (1987)\n",
	"Raiders of the Lost Ark (1981)"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\n",
	"Gattaca (1997)\n",
	"Rock, The (1996)\n",
	"Star Trek: The Wrath of Khan (1982)\n",
	"Contact (1997)\n"
	]
	}
	],
	"prompt_number": 2
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# Get recommendations per gender\n",
	"import elasticsearch\n",
	"\n",
	"print 'popular m/f:'\n",
	"print '--------------------------------------------------'\n",
	"aggregation_query = {\n",
	" \"size\": 0,\n",
	" \"aggregations\": {\n",
	" \"rec_movie\": {\n",
	" \"terms\": {\n",
	" \"field\": \"gender\"\n",
	" },\n",
	" \"aggs\": {\n",
	" \"movies\": {\n",
	" \"significant_terms\": {\n",
	" \"field\": \"pos\",\n",
	" \"min_doc_count\": 10,\n",
	" \"size\": 10,\n",
	" \"background_filter\": {\n",
	" \"type\": {\n",
	" \"value\": \"user\"\n",
	" }\n",
	" }\n",
	" }\n",
	" }\n",
	" }\n",
	" }\n",
	" }\n",
	"}\n",
	"\n",
	"result = es.search(index=\"movielens\", doc_type=\"user\", body=aggregation_query)\n",
	"\n",
	"\n",
	"for term in result['aggregations']['rec_movie']['buckets']:\n",
	" print '--------------------------------------------------'\n",
	" print term['key'] + ':'\n",
	" print '--------------------------------------------------'\n",
	" for movie in term['movies']['buckets']:\n",
	" movie = es.get(index=\"movielens\", doc_type=\"movie\", id=movie['key'])\n",
	" print movie['_source']['title']"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"popular m/f:\n",
	"--------------------------------------------------\n",
	"--------------------------------------------------"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\n",
	"m:\n",
	"--------------------------------------------------\n",
	"Terminator 2: Judgment Day (1991)\n",
	"Terminator, The (1984)\n",
	"Alien (1979)\n",
	"Aliens (1986)\n",
	"Matrix, The (1999)\n",
	"Star Wars: Episode IV - A New Hope (1977)\n",
	"Star Wars: Episode V - The Empire Strikes Back (1980)\n",
	"Saving Private Ryan (1998)\n",
	"Blade Runner (1982)\n",
	"Total Recall (1990)\n",
	"--------------------------------------------------\n",
	"f:\n",
	"--------------------------------------------------\n",
	"Sense and Sensibility (1995)\n",
	"Emma (1996)\n",
	"My Fair Lady (1964)\n",
	"Gone with the Wind (1939)\n",
	"Dirty Dancing (1987)\n",
	"Breakfast at Tiffany's (1961)\n",
	"Like Water for Chocolate (Como agua para chocolate) (1992)\n",
	"Roman Holiday (1953)\n",
	"Strictly Ballroom (1992)\n",
	"Circle of Friends (1995)\n"
	]
	}
	],
	"prompt_number": 3
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# Get recommendations per occupation\n",
	"import elasticsearch\n",
	"\n",
	"print 'popular m/f:'\n",
	"print '--------------------------------------------------'\n",
	"aggregation_query = {\n",
	" \"size\": 0,\n",
	" \"aggregations\": {\n",
	" \"rec_movie\": {\n",
	" \"terms\": {\n",
	" \"field\": \"occupation\"\n",
	" },\n",
	" \"aggs\": {\n",
	" \"movies\": {\n",
	" \"significant_terms\": {\n",
	" \"field\": \"pos\",\n",
	" \"min_doc_count\": 10,\n",
	" \"shard_min_doc_count\": 10,\n",
	" \"size\": 10,\n",
	" \"background_filter\": {\n",
	" \"type\": {\n",
	" \"value\": \"user\"\n",
	" }\n",
	" }\n",
	" }\n",
	" }\n",
	" }\n",
	" }\n",
	" }\n",
	"}\n",
	"\n",
	"result = es.search(index=\"movielens\", doc_type=\"user\", body=aggregation_query)\n",
	"occupations = {\n",
	"\"0\": \"other or not specified\",\n",
	"\"1\": \"academic/educator\",\n",
	"\"2\": \"artist\",\n",
	" \"3\": \"clerical/admin\",\n",
	" \"4\": \"college/grad student\",\n",
	" \"5\": \"customer service\",\n",
	" \"6\": \"doctor/health care\",\n",
	" \"7\": \"executive/managerial\",\n",
	" \"8\": \"farmer\",\n",
	" \"9\": \"homemaker\",\n",
	" \"10\": \"K-12 student\",\n",
	" \"11\": \"lawyer\",\n",
	" \"12\": \"programmer\",\n",
	" \"13\": \"retired\",\n",
	" \"14\": \"sales/marketing\",\n",
	" \"15\": \"scientist\",\n",
	" \"16\": \"self-employed\",\n",
	" \"17\": \"technician/engineer\",\n",
	" \"18\": \"tradesman/craftsman\",\n",
	" \"19\": \"unemployed\",\n",
	" \"20\": \"writer\"}\n",
	"for term in result['aggregations']['rec_movie']['buckets']:\n",
	" print '--------------------------------------------------'\n",
	" print occupations[term['key']] + ':'\n",
	" \n",
	" print '--------------------------------------------------'\n",
	" for movie in term['movies']['buckets']:\n",
	" movie = es.get(index=\"movielens\", doc_type=\"movie\", id=movie['key'])\n",
	" print movie['_source']['title']"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"popular m/f:\n",
	"--------------------------------------------------\n",
	"--------------------------------------------------"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\n",
	"college/grad student:\n",
	"--------------------------------------------------\n",
	"Fight Club (1999)\n",
	"South Park: Bigger, Longer and Uncut (1999)\n",
	"American Beauty (1999)\n",
	"X-Men (2000)\n",
	"Gladiator (2000)\n",
	"Clerks (1994)\n",
	"American Pie (1999)\n",
	"American History X (1998)\n",
	"Pulp Fiction (1994)\n",
	"Austin Powers: The Spy Who Shagged Me (1999)\n",
	"--------------------------------------------------\n",
	"other or not specified:\n",
	"--------------------------------------------------\n",
	"Nightmare on Elm Street 3: Dream Warriors, A (1987)\n",
	"Toy Story (1995)\n",
	"Nightmare Before Christmas, The (1993)\n",
	"Sid and Nancy (1986)\n",
	"There's Something About Mary (1998)\n",
	"Gilda (1946)\n",
	"Babe (1995)\n",
	"Boogie Nights (1997)\n",
	"Dumb & Dumber (1994)\n",
	"Doors, The (1991)"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\n",
	"--------------------------------------------------\n",
	"executive/managerial:\n",
	"--------------------------------------------------\n",
	"From Russia with Love (1963)\n",
	"Hunt for Red October, The (1990)\n",
	"Die Hard 2 (1990)\n",
	"Dirty Dozen, The (1967)\n",
	"Fatal Attraction (1987)\n",
	"Patriot Games (1992)\n",
	"Thelma & Louise (1991)\n",
	"African Queen, The (1951)\n",
	"Longest Day, The (1962)\n",
	"Lethal Weapon (1987)\n",
	"--------------------------------------------------\n",
	"academic/educator:\n",
	"--------------------------------------------------\n",
	"Annie Hall (1977)\n",
	"North by Northwest (1959)\n",
	"Shakespeare in Love (1998)\n",
	"Postino, Il (The Postman) (1994)\n",
	"Sense and Sensibility (1995)\n",
	"Philadelphia Story, The (1940)\n",
	"Manhattan (1979)\n",
	"Maltese Falcon, The (1941)\n",
	"Like Water for Chocolate (Como agua para chocolate) (1992)\n",
	"To Kill a Mockingbird (1962)"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\n",
	"--------------------------------------------------\n",
	"technician/engineer:\n",
	"--------------------------------------------------\n",
	"Terminator 2: Judgment Day (1991)\n",
	"Total Recall (1990)\n",
	"Terminator, The (1984)\n",
	"Matrix, The (1999)\n",
	"Star Trek: The Wrath of Khan (1982)\n",
	"Aliens (1986)\n",
	"Star Trek IV: The Voyage Home (1986)\n",
	"Galaxy Quest (1999)\n",
	"Predator (1987)\n",
	"Stargate (1994)\n",
	"--------------------------------------------------\n",
	"programmer:\n",
	"--------------------------------------------------\n",
	"Gattaca (1997)\n",
	"Brazil (1985)\n",
	"Matrix, The (1999)\n",
	"Star Wars: Episode V - The Empire Strikes Back (1980)\n",
	"Alien (1979)\n",
	"Terminator 2: Judgment Day (1991)"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\n",
	"Blade Runner (1982)\n",
	"Terminator, The (1984)\n",
	"Star Trek: The Wrath of Khan (1982)\n",
	"Star Wars: Episode VI - Return of the Jedi (1983)\n",
	"--------------------------------------------------\n",
	"sales/marketing:\n",
	"--------------------------------------------------\n",
	"Fugitive, The (1993)\n",
	"My Cousin Vinny (1992)\n",
	"Tin Cup (1996)\n",
	"Silence of the Lambs, The (1991)\n",
	"About Last Night... (1986)\n",
	"Air Force One (1997)\n",
	"Swingers (1996)\n",
	"Lethal Weapon (1987)\n",
	"Fast Times at Ridgemont High (1982)\n",
	"Basic Instinct (1992)\n",
	"--------------------------------------------------\n",
	"writer:\n",
	"--------------------------------------------------\n",
	"Grifters, The (1990)\n",
	"Manhattan (1979)\n",
	"Living in Oblivion (1995)\n",
	"Crumb (1994)\n",
	"M (1931)\n",
	"Being John Malkovich (1999)\n",
	"Touch of Evil (1958)\n",
	"Crimes and Misdemeanors (1989)\n",
	"Out of Sight (1998)\n",
	"Big Sleep, The (1946)\n",
	"--------------------------------------------------\n",
	"artist:\n",
	"--------------------------------------------------\n",
	"Heavenly Creatures (1994)\n",
	"Badlands (1973)"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\n",
	"Fisher King, The (1991)\n",
	"Thirty-Two Short Films About Glenn Gould (1993)\n",
	"Basquiat (1996)\n",
	"Ed Wood (1994)\n",
	"Nightmare Before Christmas, The (1993)\n",
	"Vanya on 42nd Street (1994)\n",
	"Do the Right Thing (1989)\n",
	"Sweet Hereafter, The (1997)\n",
	"--------------------------------------------------\n",
	"self-employed:\n",
	"--------------------------------------------------\n",
	"King Kong (1933)\n",
	"Boat, The (Das Boot) (1981)\n",
	"Three Days of the Condor (1975)\n",
	"Tender Mercies (1983)\n",
	"Local Hero (1983)\n",
	"Breaker Morant (1980)\n",
	"Butch Cassidy and the Sundance Kid (1969)\n",
	"Serpico (1973)\n",
	"Anatomy of a Murder (1959)\n",
	"Great Escape, The (1963)\n"
	]
	}
	],
	"prompt_number": 4
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [],
	"language": "python",
	"metadata": {},
	"outputs": []
	}
	],
	"metadata": {}
	}
	]
	}