qpleple · June 14, 2015 21:54
diff --git a/Refactoring __vectorize_nested_feature.ipynb b/Refactoring __vectorize_nested_feature.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from pymongo import MongoClient\n",
    "from pandas import DataFrame\n",
    "\n",
    "db = MongoClient().lerni_dev\n",
    "dataset = [doc['_id'] for doc in db.documents.find({}, ['_id'])]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Interface\n",
    "- input : `dataset` une liste d'ids de textes et `features` une liste de sélecteurs d'indicateurs  \n",
    "- output : `X` la matrice numpy des indicateurs et `labels` les labels des colonnes de `X`"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    ">>> dataset = ['short_1', 'short_2', 'short_3']\n",
    ">>> features = ['avg_z_score', 'Bow_a89c6d269683','dialogs','punctuation.punct_count','writing_mistakes.rule_id']\n",
    ">>> X, labels = vectorize(dataset, features)\n",
    ">>> X\n",
    "[[1.2, 3.4, 0.0, ...],\n",
    " [12.1, 0.0, 0.0, ...],\n",
    " ...]\n",
    ">>> labels\n",
    "['avg_z_score', 'Bow_a89c6d269683__manger', 'Bow_a89c6d269683__poisson', 'dialogs__dialog_count', ...]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "On récupère toutes les valeurs dans Mongo avec une seule requête (tous les docs, tous les indicateurs). L'objet retourné est en fait quasiment ce qu'on cherche, il ne reste plus qu'à le mettre à plat."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Bow_a89c6d269683</th>\n",
       "      <th>avg_z_score</th>\n",
       "      <th>dialogs</th>\n",
       "      <th>punctuation__punct_count</th>\n",
       "      <th>writing_mistakes__rule_id</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>short_159</th>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "      <td>-0.244209</td>\n",
       "      <td>{u'dialog_count': 0, u'ratio_dialog': 0.0}</td>\n",
       "      <td>240</td>\n",
       "      <td>{u'uppercase_sentence_start': 1, u'accord_genr...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>short_416</th>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "      <td>0.556336</td>\n",
       "      <td>{u'dialog_count': 0, u'ratio_dialog': 0.0}</td>\n",
       "      <td>159</td>\n",
       "      <td>{u'accord_nombre': 1, u'comma_parenthesis_whit...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>short_461</th>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "      <td>-0.017605</td>\n",
       "      <td>{u'dialog_count': 0, u'ratio_dialog': 0.0}</td>\n",
       "      <td>28</td>\n",
       "      <td>{u'french_whitespace': 1, u'hunspell_no_sugges...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>short_478</th>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "      <td>-0.337611</td>\n",
       "      <td>{u'dialog_count': 0, u'ratio_dialog': 0.0}</td>\n",
       "      <td>41</td>\n",
       "      <td>{u'uppercase_sentence_start': 1, u'hunspell_no...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>short_487</th>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "      <td>0.060998</td>\n",
       "      <td>{u'dialog_count': 0, u'ratio_dialog': 0.0}</td>\n",
       "      <td>77</td>\n",
       "      <td>{u'french_whitespace': 2, u'hunspell_no_sugges...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            Bow_a89c6d269683  avg_z_score  \\\n",
       "_id                                                                         \n",
       "short_159  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...    -0.244209   \n",
       "short_416  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...     0.556336   \n",
       "short_461  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...    -0.017605   \n",
       "short_478  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...    -0.337611   \n",
       "short_487  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...     0.060998   \n",
       "\n",
       "                                              dialogs  \\\n",
       "_id                                                     \n",
       "short_159  {u'dialog_count': 0, u'ratio_dialog': 0.0}   \n",
       "short_416  {u'dialog_count': 0, u'ratio_dialog': 0.0}   \n",
       "short_461  {u'dialog_count': 0, u'ratio_dialog': 0.0}   \n",
       "short_478  {u'dialog_count': 0, u'ratio_dialog': 0.0}   \n",
       "short_487  {u'dialog_count': 0, u'ratio_dialog': 0.0}   \n",
       "\n",
       "           punctuation__punct_count  \\\n",
       "_id                                   \n",
       "short_159                       240   \n",
       "short_416                       159   \n",
       "short_461                        28   \n",
       "short_478                        41   \n",
       "short_487                        77   \n",
       "\n",
       "                                   writing_mistakes__rule_id  \n",
       "_id                                                           \n",
       "short_159  {u'uppercase_sentence_start': 1, u'accord_genr...  \n",
       "short_416  {u'accord_nombre': 1, u'comma_parenthesis_whit...  \n",
       "short_461  {u'french_whitespace': 1, u'hunspell_no_sugges...  \n",
       "short_478  {u'uppercase_sentence_start': 1, u'hunspell_no...  \n",
       "short_487  {u'french_whitespace': 2, u'hunspell_no_sugges...  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "features = [\n",
    "    'avg_z_score',              # indicateur à plat contenant une valeur\n",
    "    'Bow_a89c6d269683',         # indicateur à plat contenant une liste\n",
    "    'dialogs',                  # indicateur à plat contenant un dictionnaire\n",
    "    'punctuation.punct_count',  # indicateur nested contenant une valeur\n",
    "    'writing_mistakes.rule_id'  # indicateur nested contenant un dictionnaire\n",
    "]\n",
    "\n",
    "cursor = db.documents.aggregate([\n",
    "  {'$match': {'_id': {'$in': dataset}}},\n",
    "  {'$project': {f.replace('.', '__'): '$'+f for f in features}}\n",
    "])\n",
    "\n",
    "df = DataFrame(list(cursor)).set_index('_id')\n",
    "df[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X = df.values      # array numpy des valeurs seulement\n",
    "labels = list(df)  # liste des labels des colonnes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Quand on ne connait pas les colonnes à l'avance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Exemple : on ne connait pas toutes les RULE_ID des fautes\n",
    "# c'est pas grave, on n'a pas besoin d'avoir la liste des colonnes\n",
    "data = [\n",
    "    {'_id': 'short_1', 'rule_1': 12},\n",
    "    {'_id': 'short_2', 'rule_1': 2, 'rule_2': 12},\n",
    "    {'_id': 'short_3', 'rule_3': 1},\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rule_1</th>\n",
       "      <th>rule_2</th>\n",
       "      <th>rule_3</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>short_1</th>\n",
       "      <td>12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>short_2</th>\n",
       "      <td>2</td>\n",
       "      <td>12</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>short_3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         rule_1  rule_2  rule_3\n",
       "_id                            \n",
       "short_1      12     NaN     NaN\n",
       "short_2       2      12     NaN\n",
       "short_3     NaN     NaN       1"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# les colonnes sont créées à la volée\n",
    "# lorsqu'on n'a pas de valeur pour un texte, on récupère un NaN\n",
    "df = DataFrame(data).set_index('_id')\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rule_1</th>\n",
       "      <th>rule_2</th>\n",
       "      <th>rule_3</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>short_1</th>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>short_2</th>\n",
       "      <td>2</td>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>short_3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         rule_1  rule_2  rule_3\n",
       "_id                            \n",
       "short_1      12       0       0\n",
       "short_2       2      12       0\n",
       "short_3       0       0       1"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# on peut ensuite mettre à zéro les NaN, et on a gagné\n",
    "df = df.fillna(0)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 12.,   0.,   0.],\n",
       "       [  2.,  12.,   0.],\n",
       "       [  0.,   0.,   1.]])"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# la matrice des indicateurs\n",
    "df.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['rule_1', 'rule_2', 'rule_3']"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# la liste des colonnes\n",
    "list(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prédiction\n",
    "Pour la prédiction, il faut savoir reconstruire la représentation vectorielle d'un texte.  \n",
    "La liste des colonnes de la matrice devrait être la seule information dont on a besoin pour le faire.\n",
    "\n",
    "On ne devrait même pas avoir à utiliser la collection `dataset` pour vectoriser un texte ou un dataset... si ?"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    ">>> new_text_ids = ['short_4', 'short_5']\n",
    ">>> X, labels = vectorize(new_text_ids, features)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Piste pour mettre à plat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'a': {'b': {'d': 1, 'f': [1, 2, 3]}}, 'i': 6, 'f': {'g': 5}}\n",
      "{'f__g': 5, 'a__b__d': 1, 'i': 6, 'a__b__f__2': 3, 'a__b__f__1': 2, 'a__b__f__0': 1}\n"
     ]
    }
   ],
   "source": [
    "def flatten(d):\n",
    "    \"\"\"Flatten a nested dict in place, joining nested keys with '__'.\n",
    "\n",
    "    Lists are first converted to dicts keyed by item position, so\n",
    "    {'a': {'b': [7, 8]}} becomes {'a__b__0': 7, 'a__b__1': 8}.\n",
    "    A key whose value is an empty dict or list is dropped entirely.\n",
    "\n",
    "    d is modified in place and nothing is returned.\n",
    "    \"\"\"\n",
    "    # Iterate over a snapshot of the keys: entries are added and removed\n",
    "    # inside the loop, which a live key view does not tolerate (Python 3).\n",
    "    for k in list(d.keys()):\n",
    "        v = d[k]\n",
    "        if isinstance(v, list):\n",
    "            v = dict(enumerate(v))\n",
    "\n",
    "        if isinstance(v, dict):\n",
    "            flatten(v)\n",
    "            del d[k]\n",
    "            # '%s' formatting, unlike str(), accepts non-ASCII unicode keys\n",
    "            # on Python 2.\n",
    "            for k2, v2 in v.items():\n",
    "                d['%s__%s' % (k, k2)] = v2\n",
    "    \n",
    "# small nested example: show the dict before and after in-place flattening\n",
    "d = {'a': {'b': {'d': 1, 'f': [1, 2, 3]}}, 'f': {'g': 5}, 'i': 6}\n",
    "print(d)\n",
    "flatten(d)\n",
    "print(d)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Bow_a89c6d269683__0</th>\n",
       "      <th>Bow_a89c6d269683__1</th>\n",
       "      <th>Bow_a89c6d269683__10</th>\n",
       "      <th>Bow_a89c6d269683__100</th>\n",
       "      <th>Bow_a89c6d269683__1000</th>\n",
       "      <th>Bow_a89c6d269683__1001</th>\n",
       "      <th>Bow_a89c6d269683__1002</th>\n",
       "      <th>Bow_a89c6d269683__1003</th>\n",
       "      <th>Bow_a89c6d269683__1004</th>\n",
       "      <th>Bow_a89c6d269683__1005</th>\n",
       "      <th>...</th>\n",
       "      <th>writing_mistakes__rule_id__trait_union</th>\n",
       "      <th>writing_mistakes__rule_id__trait_union_inversion</th>\n",
       "      <th>writing_mistakes__rule_id__trait_union_nombre</th>\n",
       "      <th>writing_mistakes__rule_id__tres</th>\n",
       "      <th>writing_mistakes__rule_id__un_espece_de</th>\n",
       "      <th>writing_mistakes__rule_id__uppercase_sentence_start</th>\n",
       "      <th>writing_mistakes__rule_id__virgule</th>\n",
       "      <th>writing_mistakes__rule_id__voire_meme</th>\n",
       "      <th>writing_mistakes__rule_id__whitespace_rule</th>\n",
       "      <th>writing_mistakes__rule_id__xxieme</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>short_159</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>11</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>short_416</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>15</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>short_461</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>short_478</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>short_487</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 8466 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           Bow_a89c6d269683__0  Bow_a89c6d269683__1  Bow_a89c6d269683__10  \\\n",
       "_id                                                                         \n",
       "short_159                    0                    0                     0   \n",
       "short_416                    0                    0                     0   \n",
       "short_461                    0                    0                     0   \n",
       "short_478                    0                    0                     0   \n",
       "short_487                    0                    0                     0   \n",
       "\n",
       "           Bow_a89c6d269683__100  Bow_a89c6d269683__1000  \\\n",
       "_id                                                        \n",
       "short_159                      0                       0   \n",
       "short_416                      0                       0   \n",
       "short_461                      0                       0   \n",
       "short_478                      0                       0   \n",
       "short_487                      0                       0   \n",
       "\n",
       "           Bow_a89c6d269683__1001  Bow_a89c6d269683__1002  \\\n",
       "_id                                                         \n",
       "short_159                       0                       0   \n",
       "short_416                       0                       0   \n",
       "short_461                       0                       0   \n",
       "short_478                       0                       0   \n",
       "short_487                       0                       0   \n",
       "\n",
       "           Bow_a89c6d269683__1003  Bow_a89c6d269683__1004  \\\n",
       "_id                                                         \n",
       "short_159                       0                       0   \n",
       "short_416                       0                       0   \n",
       "short_461                       0                       0   \n",
       "short_478                       0                       0   \n",
       "short_487                       0                       0   \n",
       "\n",
       "           Bow_a89c6d269683__1005                ...                  \\\n",
       "_id                                              ...                   \n",
       "short_159                       0                ...                   \n",
       "short_416                       0                ...                   \n",
       "short_461                       0                ...                   \n",
       "short_478                       0                ...                   \n",
       "short_487                       0                ...                   \n",
       "\n",
       "           writing_mistakes__rule_id__trait_union  \\\n",
       "_id                                                 \n",
       "short_159                                       0   \n",
       "short_416                                       0   \n",
       "short_461                                       0   \n",
       "short_478                                       0   \n",
       "short_487                                       0   \n",
       "\n",
       "           writing_mistakes__rule_id__trait_union_inversion  \\\n",
       "_id                                                           \n",
       "short_159                                                 0   \n",
       "short_416                                                 0   \n",
       "short_461                                                 0   \n",
       "short_478                                                 0   \n",
       "short_487                                                 0   \n",
       "\n",
       "           writing_mistakes__rule_id__trait_union_nombre  \\\n",
       "_id                                                        \n",
       "short_159                                              0   \n",
       "short_416                                              0   \n",
       "short_461                                              0   \n",
       "short_478                                              0   \n",
       "short_487                                              0   \n",
       "\n",
       "           writing_mistakes__rule_id__tres  \\\n",
       "_id                                          \n",
       "short_159                                0   \n",
       "short_416                                0   \n",
       "short_461                                0   \n",
       "short_478                                0   \n",
       "short_487                                0   \n",
       "\n",
       "           writing_mistakes__rule_id__un_espece_de  \\\n",
       "_id                                                  \n",
       "short_159                                        0   \n",
       "short_416                                        0   \n",
       "short_461                                        0   \n",
       "short_478                                        0   \n",
       "short_487                                        0   \n",
       "\n",
       "           writing_mistakes__rule_id__uppercase_sentence_start  \\\n",
       "_id                                                              \n",
       "short_159                                                  1     \n",
       "short_416                                                 15     \n",
       "short_461                                                  0     \n",
       "short_478                                                  1     \n",
       "short_487                                                  0     \n",
       "\n",
       "           writing_mistakes__rule_id__virgule  \\\n",
       "_id                                             \n",
       "short_159                                   0   \n",
       "short_416                                   1   \n",
       "short_461                                   0   \n",
       "short_478                                   0   \n",
       "short_487                                   0   \n",
       "\n",
       "           writing_mistakes__rule_id__voire_meme  \\\n",
       "_id                                                \n",
       "short_159                                      0   \n",
       "short_416                                      0   \n",
       "short_461                                      0   \n",
       "short_478                                      0   \n",
       "short_487                                      0   \n",
       "\n",
       "           writing_mistakes__rule_id__whitespace_rule  \\\n",
       "_id                                                     \n",
       "short_159                                          11   \n",
       "short_416                                           0   \n",
       "short_461                                           1   \n",
       "short_478                                           0   \n",
       "short_487                                           7   \n",
       "\n",
       "           writing_mistakes__rule_id__xxieme  \n",
       "_id                                           \n",
       "short_159                                  0  \n",
       "short_416                                  0  \n",
       "short_461                                  0  \n",
       "short_478                                  2  \n",
       "short_487                                  0  \n",
       "\n",
       "[5 rows x 8466 columns]"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "features = ['avg_z_score', 'Bow_a89c6d269683', 'dialogs', 'punctuation.punct_count', 'writing_mistakes.rule_id']\n",
    "\n",
    "cursor = db.documents.aggregate([\n",
    "  {'$match': {'_id': {'$in': dataset}}},\n",
    "  {'$project': {f.replace('.', '__'): '$'+f for f in features}}\n",
    "])\n",
    "\n",
    "rows = list(cursor)\n",
    "for row in rows:\n",
    "    flatten(row)\n",
    "\n",
    "df = DataFrame(rows).set_index('_id').fillna(0)\n",
    "df[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X, labels = df.values, list(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(40, 8466)"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Bow_a89c6d269683__0', 'Bow_a89c6d269683__1', 'Bow_a89c6d269683__10', 'Bow_a89c6d269683__100', 'Bow_a89c6d269683__1000', 'Bow_a89c6d269683__1001', 'Bow_a89c6d269683__1002', 'Bow_a89c6d269683__1003', 'Bow_a89c6d269683__1004', 'Bow_a89c6d269683__1005']\n",
      "['writing_mistakes__rule_id__trait_union', 'writing_mistakes__rule_id__trait_union_inversion', 'writing_mistakes__rule_id__trait_union_nombre', 'writing_mistakes__rule_id__tres', 'writing_mistakes__rule_id__un_espece_de', 'writing_mistakes__rule_id__uppercase_sentence_start', 'writing_mistakes__rule_id__virgule', 'writing_mistakes__rule_id__voire_meme', 'writing_mistakes__rule_id__whitespace_rule', 'writing_mistakes__rule_id__xxieme']\n"
     ]
    }
   ],
   "source": [
    "# first ten and last ten column labels\n",
    "print(labels[:10])\n",
    "print(labels[-10:])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Reconstruire une colonne à partir de son nom"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "writing_mistakes__rule_id__xxieme\n"
     ]
    }
   ],
   "source": [
    "# here is one of the matrix columns (the last one)\n",
    "col = labels[-1]\n",
    "print(col)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{u'_id': u'short_159'},\n",
       " {u'_id': u'short_416'},\n",
       " {u'_id': u'short_461'},\n",
       " {u'_id': u'short_478', u'_value': 2},\n",
       " {u'_id': u'short_487'},\n",
       " {u'_id': u'short_490'},\n",
       " {u'_id': u'short_493'},\n",
       " {u'_id': u'short_495'},\n",
       " {u'_id': u'short_497'},\n",
       " {u'_id': u'short_498'},\n",
       " {u'_id': u'short_499'},\n",
       " {u'_id': u'short_500'},\n",
       " {u'_id': u'short_501'},\n",
       " {u'_id': u'short_502'},\n",
       " {u'_id': u'short_503'},\n",
       " {u'_id': u'short_511'},\n",
       " {u'_id': u'short_512'},\n",
       " {u'_id': u'short_513'},\n",
       " {u'_id': u'short_516'},\n",
       " {u'_id': u'short_520'},\n",
       " {u'_id': u'short_521'},\n",
       " {u'_id': u'short_524'},\n",
       " {u'_id': u'short_533'},\n",
       " {u'_id': u'short_534'},\n",
       " {u'_id': u'short_535'},\n",
       " {u'_id': u'short_536'},\n",
       " {u'_id': u'short_537'},\n",
       " {u'_id': u'short_540'},\n",
       " {u'_id': u'short_541', u'_value': 1},\n",
       " {u'_id': u'short_542'},\n",
       " {u'_id': u'short_550'},\n",
       " {u'_id': u'short_551'},\n",
       " {u'_id': u'short_555'},\n",
       " {u'_id': u'short_557'},\n",
       " {u'_id': u'short_559'},\n",
       " {u'_id': u'short_563'},\n",
       " {u'_id': u'short_573'},\n",
       " {u'_id': u'short_574'},\n",
       " {u'_id': u'short_575', u'_value': 1},\n",
       " {u'_id': u'short_576', u'_value': 1}]"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# en changeant les \"__\" en \".\" dans le nom de la colonne, on obtient un sélecteur\n",
    "# mongo pour cette valeur\n",
    "# fait comme ça, ça ne marche pas pour les listes : 'Bow_a89c6d269683.10' n'est pas compris\n",
    "# par mongo comme étant l'item 10 de la liste Bow_a89c6d269683\n",
    "\n",
    "cursor = db.documents.aggregate([\n",
    "  {'$match': {'_id': {'$in': dataset}}},\n",
    "  {'$project': {'_value': '$'+col.replace('__', '.')}}\n",
    "])\n",
    "\n",
    "list(cursor)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }