Created
June 14, 2015 21:54
-
-
Save qpleple/e28dabeaf2801267388c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"from pymongo import MongoClient\n", | |
"from pandas import DataFrame\n", | |
"\n", | |
"db = MongoClient().lerni_dev\n", | |
"dataset = [doc['_id'] for doc in db.documents.find({}, ['_id'])]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Interface\n", | |
"- input : `dataset` une liste d'ids de textes et `features` une liste de selecteurs d'indicateurs \n", | |
"- output : `X` la matrice numpy des indicateurs et `labels` les labels des colonnes de `X`" | |
] | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
">>> dataset = ['short_1', 'short_2', 'short_3']\n", | |
">>> features = ['avg_z_score', 'Bow_a89c6d269683','dialogs','punctuation.punct_count','writing_mistakes.rule_id']\n", | |
">>> X, labels = vectorize(dataset, features)\n", | |
">>> X\n", | |
"[[1.2, 3.4, 0.0, ...],\n", | |
" [12.1, 0.0, 0.0, ...],\n", | |
" ...]\n", | |
">>> labels\n", | |
"['avg_z_score', 'Bow_a89c6d269683__manger', 'Bow_a89c6d269683__poisson', 'dialogs__dialog_count', ...]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"On récupère toutes les valeurs dans Mongo avec une seule requête (tous les docs, tous les indicateurs). L'objet retourné est en fait quasiment ce qu'on cherche, il ne reste plus que de le mettre à plat." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Bow_a89c6d269683</th>\n", | |
" <th>avg_z_score</th>\n", | |
" <th>dialogs</th>\n", | |
" <th>punctuation__punct_count</th>\n", | |
" <th>writing_mistakes__rule_id</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>_id</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>short_159</th>\n", | |
" <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n", | |
" <td>-0.244209</td>\n", | |
" <td>{u'dialog_count': 0, u'ratio_dialog': 0.0}</td>\n", | |
" <td>240</td>\n", | |
" <td>{u'uppercase_sentence_start': 1, u'accord_genr...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>short_416</th>\n", | |
" <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n", | |
" <td>0.556336</td>\n", | |
" <td>{u'dialog_count': 0, u'ratio_dialog': 0.0}</td>\n", | |
" <td>159</td>\n", | |
" <td>{u'accord_nombre': 1, u'comma_parenthesis_whit...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>short_461</th>\n", | |
" <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n", | |
" <td>-0.017605</td>\n", | |
" <td>{u'dialog_count': 0, u'ratio_dialog': 0.0}</td>\n", | |
" <td>28</td>\n", | |
" <td>{u'french_whitespace': 1, u'hunspell_no_sugges...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>short_478</th>\n", | |
" <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n", | |
" <td>-0.337611</td>\n", | |
" <td>{u'dialog_count': 0, u'ratio_dialog': 0.0}</td>\n", | |
" <td>41</td>\n", | |
" <td>{u'uppercase_sentence_start': 1, u'hunspell_no...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>short_487</th>\n", | |
" <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n", | |
" <td>0.060998</td>\n", | |
" <td>{u'dialog_count': 0, u'ratio_dialog': 0.0}</td>\n", | |
" <td>77</td>\n", | |
" <td>{u'french_whitespace': 2, u'hunspell_no_sugges...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Bow_a89c6d269683 avg_z_score \\\n", | |
"_id \n", | |
"short_159 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... -0.244209 \n", | |
"short_416 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 0.556336 \n", | |
"short_461 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... -0.017605 \n", | |
"short_478 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... -0.337611 \n", | |
"short_487 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 0.060998 \n", | |
"\n", | |
" dialogs \\\n", | |
"_id \n", | |
"short_159 {u'dialog_count': 0, u'ratio_dialog': 0.0} \n", | |
"short_416 {u'dialog_count': 0, u'ratio_dialog': 0.0} \n", | |
"short_461 {u'dialog_count': 0, u'ratio_dialog': 0.0} \n", | |
"short_478 {u'dialog_count': 0, u'ratio_dialog': 0.0} \n", | |
"short_487 {u'dialog_count': 0, u'ratio_dialog': 0.0} \n", | |
"\n", | |
" punctuation__punct_count \\\n", | |
"_id \n", | |
"short_159 240 \n", | |
"short_416 159 \n", | |
"short_461 28 \n", | |
"short_478 41 \n", | |
"short_487 77 \n", | |
"\n", | |
" writing_mistakes__rule_id \n", | |
"_id \n", | |
"short_159 {u'uppercase_sentence_start': 1, u'accord_genr... \n", | |
"short_416 {u'accord_nombre': 1, u'comma_parenthesis_whit... \n", | |
"short_461 {u'french_whitespace': 1, u'hunspell_no_sugges... \n", | |
"short_478 {u'uppercase_sentence_start': 1, u'hunspell_no... \n", | |
"short_487 {u'french_whitespace': 2, u'hunspell_no_sugges... " | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"features = [\n", | |
" 'avg_z_score', # indicateur à plat contenant une valeur\n", | |
" 'Bow_a89c6d269683', # indicateur à plat contenant une liste\n", | |
" 'dialogs', # indicateur à plat contenant un dictionnaire\n", | |
" 'punctuation.punct_count', # indicateur nested contenant une valeur\n", | |
" 'writing_mistakes.rule_id' # indicateur nested contenant un dictionnaire\n", | |
"]\n", | |
"\n", | |
"cursor = db.documents.aggregate([\n", | |
" {'$match': {'_id': {'$in': dataset}}},\n", | |
" {'$project': {f.replace('.', '__'): '$'+f for f in features}}\n", | |
"])\n", | |
"\n", | |
"df = DataFrame(list(cursor)).set_index('_id')\n", | |
"df[:5]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"X = df.values # array numpy des valeurs seulement\n", | |
"labels = list(df) # liste des labels des colonnes" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Quand on ne connait pas les colonnes à l'avance" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Exemple : on ne connait pas toutes les RULE_ID des fautes\n", | |
"# c'est pas grave, on n'a pas besoin d'avoir la liste des colonnes\n", | |
"data = [\n", | |
" {'_id': 'short_1', 'rule_1': 12},\n", | |
" {'_id': 'short_2', 'rule_1': 2, 'rule_2': 12},\n", | |
" {'_id': 'short_3', 'rule_3': 1},\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>rule_1</th>\n", | |
" <th>rule_2</th>\n", | |
" <th>rule_3</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>_id</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>short_1</th>\n", | |
" <td>12</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>short_2</th>\n", | |
" <td>2</td>\n", | |
" <td>12</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>short_3</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" rule_1 rule_2 rule_3\n", | |
"_id \n", | |
"short_1 12 NaN NaN\n", | |
"short_2 2 12 NaN\n", | |
"short_3 NaN NaN 1" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# les colonnes sont créées à la volée\n", | |
"# lorsqu'on n'a pas de valeur pour un texte, on récupère un NaN\n", | |
"df = DataFrame(data).set_index('_id')\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>rule_1</th>\n", | |
" <th>rule_2</th>\n", | |
" <th>rule_3</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>_id</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>short_1</th>\n", | |
" <td>12</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>short_2</th>\n", | |
" <td>2</td>\n", | |
" <td>12</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>short_3</th>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" rule_1 rule_2 rule_3\n", | |
"_id \n", | |
"short_1 12 0 0\n", | |
"short_2 2 12 0\n", | |
"short_3 0 0 1" | |
] | |
}, | |
"execution_count": 25, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# on peut ensuite mettre à zéro les NaN, et on a gagné\n", | |
"df = df.fillna(0)\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[ 12., 0., 0.],\n", | |
" [ 2., 12., 0.],\n", | |
" [ 0., 0., 1.]])" | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# la matrices des indicateu\n", | |
"df.values" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['rule_1', 'rule_2', 'rule_3']" | |
] | |
}, | |
"execution_count": 28, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# la liste des colonnes\n", | |
"list(df)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Prédiction\n", | |
"Pour la prédiction, il faut savoir reconstruire la représentation vectorielle d'un texte. \n", | |
"La liste des colonnes de la matrice devrait être la seule information dont on a besoin pour le faire.\n", | |
"\n", | |
"On ne devrait même pas avoir à utiliser la collection `dataset` pour vectoriser un texte ou un dataset... si ?" | |
] | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
">>> new_text_ids = ['short_4', 'short_5']\n", | |
">>> X, labels = vectorize(new_text_ids, features)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Piste pour mettre à plat" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 69, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'a': {'b': {'d': 1, 'f': [1, 2, 3]}}, 'i': 6, 'f': {'g': 5}}\n", | |
"{'f__g': 5, 'a__b__d': 1, 'i': 6, 'a__b__f__2': 3, 'a__b__f__1': 2, 'a__b__f__0': 1}\n" | |
] | |
} | |
], | |
"source": [ | |
"def flatten(d):\n", | |
" for k in d.keys():\n", | |
" if type(d[k]) is list:\n", | |
" d[k] = dict(enumerate(d[k]))\n", | |
" \n", | |
" if type(d[k]) is dict:\n", | |
" flatten(d[k])\n", | |
" d.update({str(k)+'__'+str(k2):v2 for k2, v2 in d[k].items()})\n", | |
" del d[k]\n", | |
" \n", | |
"d = {'a': {'b': {'d': 1, 'f': [1, 2, 3]}}, 'f': {'g': 5}, 'i': 6}\n", | |
"print d\n", | |
"flatten(d)\n", | |
"print d" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 75, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Bow_a89c6d269683__0</th>\n", | |
" <th>Bow_a89c6d269683__1</th>\n", | |
" <th>Bow_a89c6d269683__10</th>\n", | |
" <th>Bow_a89c6d269683__100</th>\n", | |
" <th>Bow_a89c6d269683__1000</th>\n", | |
" <th>Bow_a89c6d269683__1001</th>\n", | |
" <th>Bow_a89c6d269683__1002</th>\n", | |
" <th>Bow_a89c6d269683__1003</th>\n", | |
" <th>Bow_a89c6d269683__1004</th>\n", | |
" <th>Bow_a89c6d269683__1005</th>\n", | |
" <th>...</th>\n", | |
" <th>writing_mistakes__rule_id__trait_union</th>\n", | |
" <th>writing_mistakes__rule_id__trait_union_inversion</th>\n", | |
" <th>writing_mistakes__rule_id__trait_union_nombre</th>\n", | |
" <th>writing_mistakes__rule_id__tres</th>\n", | |
" <th>writing_mistakes__rule_id__un_espece_de</th>\n", | |
" <th>writing_mistakes__rule_id__uppercase_sentence_start</th>\n", | |
" <th>writing_mistakes__rule_id__virgule</th>\n", | |
" <th>writing_mistakes__rule_id__voire_meme</th>\n", | |
" <th>writing_mistakes__rule_id__whitespace_rule</th>\n", | |
" <th>writing_mistakes__rule_id__xxieme</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>_id</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>short_159</th>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>...</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>11</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>short_416</th>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>...</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>15</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>short_461</th>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>...</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>short_478</th>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>...</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>short_487</th>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>...</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>7</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 8466 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Bow_a89c6d269683__0 Bow_a89c6d269683__1 Bow_a89c6d269683__10 \\\n", | |
"_id \n", | |
"short_159 0 0 0 \n", | |
"short_416 0 0 0 \n", | |
"short_461 0 0 0 \n", | |
"short_478 0 0 0 \n", | |
"short_487 0 0 0 \n", | |
"\n", | |
" Bow_a89c6d269683__100 Bow_a89c6d269683__1000 \\\n", | |
"_id \n", | |
"short_159 0 0 \n", | |
"short_416 0 0 \n", | |
"short_461 0 0 \n", | |
"short_478 0 0 \n", | |
"short_487 0 0 \n", | |
"\n", | |
" Bow_a89c6d269683__1001 Bow_a89c6d269683__1002 \\\n", | |
"_id \n", | |
"short_159 0 0 \n", | |
"short_416 0 0 \n", | |
"short_461 0 0 \n", | |
"short_478 0 0 \n", | |
"short_487 0 0 \n", | |
"\n", | |
" Bow_a89c6d269683__1003 Bow_a89c6d269683__1004 \\\n", | |
"_id \n", | |
"short_159 0 0 \n", | |
"short_416 0 0 \n", | |
"short_461 0 0 \n", | |
"short_478 0 0 \n", | |
"short_487 0 0 \n", | |
"\n", | |
" Bow_a89c6d269683__1005 ... \\\n", | |
"_id ... \n", | |
"short_159 0 ... \n", | |
"short_416 0 ... \n", | |
"short_461 0 ... \n", | |
"short_478 0 ... \n", | |
"short_487 0 ... \n", | |
"\n", | |
" writing_mistakes__rule_id__trait_union \\\n", | |
"_id \n", | |
"short_159 0 \n", | |
"short_416 0 \n", | |
"short_461 0 \n", | |
"short_478 0 \n", | |
"short_487 0 \n", | |
"\n", | |
" writing_mistakes__rule_id__trait_union_inversion \\\n", | |
"_id \n", | |
"short_159 0 \n", | |
"short_416 0 \n", | |
"short_461 0 \n", | |
"short_478 0 \n", | |
"short_487 0 \n", | |
"\n", | |
" writing_mistakes__rule_id__trait_union_nombre \\\n", | |
"_id \n", | |
"short_159 0 \n", | |
"short_416 0 \n", | |
"short_461 0 \n", | |
"short_478 0 \n", | |
"short_487 0 \n", | |
"\n", | |
" writing_mistakes__rule_id__tres \\\n", | |
"_id \n", | |
"short_159 0 \n", | |
"short_416 0 \n", | |
"short_461 0 \n", | |
"short_478 0 \n", | |
"short_487 0 \n", | |
"\n", | |
" writing_mistakes__rule_id__un_espece_de \\\n", | |
"_id \n", | |
"short_159 0 \n", | |
"short_416 0 \n", | |
"short_461 0 \n", | |
"short_478 0 \n", | |
"short_487 0 \n", | |
"\n", | |
" writing_mistakes__rule_id__uppercase_sentence_start \\\n", | |
"_id \n", | |
"short_159 1 \n", | |
"short_416 15 \n", | |
"short_461 0 \n", | |
"short_478 1 \n", | |
"short_487 0 \n", | |
"\n", | |
" writing_mistakes__rule_id__virgule \\\n", | |
"_id \n", | |
"short_159 0 \n", | |
"short_416 1 \n", | |
"short_461 0 \n", | |
"short_478 0 \n", | |
"short_487 0 \n", | |
"\n", | |
" writing_mistakes__rule_id__voire_meme \\\n", | |
"_id \n", | |
"short_159 0 \n", | |
"short_416 0 \n", | |
"short_461 0 \n", | |
"short_478 0 \n", | |
"short_487 0 \n", | |
"\n", | |
" writing_mistakes__rule_id__whitespace_rule \\\n", | |
"_id \n", | |
"short_159 11 \n", | |
"short_416 0 \n", | |
"short_461 1 \n", | |
"short_478 0 \n", | |
"short_487 7 \n", | |
"\n", | |
" writing_mistakes__rule_id__xxieme \n", | |
"_id \n", | |
"short_159 0 \n", | |
"short_416 0 \n", | |
"short_461 0 \n", | |
"short_478 2 \n", | |
"short_487 0 \n", | |
"\n", | |
"[5 rows x 8466 columns]" | |
] | |
}, | |
"execution_count": 75, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"features = ['avg_z_score', 'Bow_a89c6d269683', 'dialogs', 'punctuation.punct_count', 'writing_mistakes.rule_id']\n", | |
"\n", | |
"cursor = db.documents.aggregate([\n", | |
" {'$match': {'_id': {'$in': dataset}}},\n", | |
" {'$project': {f.replace('.', '__'): '$'+f for f in features}}\n", | |
"])\n", | |
"\n", | |
"rows = list(cursor)\n", | |
"for row in rows:\n", | |
" flatten(row)\n", | |
"\n", | |
"df = DataFrame(rows).set_index('_id').fillna(0)\n", | |
"df[:5]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 77, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"X, labels = df.values, list(df)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 78, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(40, 8466)" | |
] | |
}, | |
"execution_count": 78, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"X.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 81, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"['Bow_a89c6d269683__0', 'Bow_a89c6d269683__1', 'Bow_a89c6d269683__10', 'Bow_a89c6d269683__100', 'Bow_a89c6d269683__1000', 'Bow_a89c6d269683__1001', 'Bow_a89c6d269683__1002', 'Bow_a89c6d269683__1003', 'Bow_a89c6d269683__1004', 'Bow_a89c6d269683__1005']\n", | |
"['writing_mistakes__rule_id__trait_union', 'writing_mistakes__rule_id__trait_union_inversion', 'writing_mistakes__rule_id__trait_union_nombre', 'writing_mistakes__rule_id__tres', 'writing_mistakes__rule_id__un_espece_de', 'writing_mistakes__rule_id__uppercase_sentence_start', 'writing_mistakes__rule_id__virgule', 'writing_mistakes__rule_id__voire_meme', 'writing_mistakes__rule_id__whitespace_rule', 'writing_mistakes__rule_id__xxieme']\n" | |
] | |
} | |
], | |
"source": [ | |
"print labels[:10]\n", | |
"print labels[-10:]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Reconstruire une colonne à partir de son nom" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 91, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"writing_mistakes__rule_id__xxieme\n" | |
] | |
} | |
], | |
"source": [ | |
"# voici une des colonnes de la matrice (la dernière)\n", | |
"col = labels[-1]\n", | |
"print col" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 92, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[{u'_id': u'short_159'},\n", | |
" {u'_id': u'short_416'},\n", | |
" {u'_id': u'short_461'},\n", | |
" {u'_id': u'short_478', u'_value': 2},\n", | |
" {u'_id': u'short_487'},\n", | |
" {u'_id': u'short_490'},\n", | |
" {u'_id': u'short_493'},\n", | |
" {u'_id': u'short_495'},\n", | |
" {u'_id': u'short_497'},\n", | |
" {u'_id': u'short_498'},\n", | |
" {u'_id': u'short_499'},\n", | |
" {u'_id': u'short_500'},\n", | |
" {u'_id': u'short_501'},\n", | |
" {u'_id': u'short_502'},\n", | |
" {u'_id': u'short_503'},\n", | |
" {u'_id': u'short_511'},\n", | |
" {u'_id': u'short_512'},\n", | |
" {u'_id': u'short_513'},\n", | |
" {u'_id': u'short_516'},\n", | |
" {u'_id': u'short_520'},\n", | |
" {u'_id': u'short_521'},\n", | |
" {u'_id': u'short_524'},\n", | |
" {u'_id': u'short_533'},\n", | |
" {u'_id': u'short_534'},\n", | |
" {u'_id': u'short_535'},\n", | |
" {u'_id': u'short_536'},\n", | |
" {u'_id': u'short_537'},\n", | |
" {u'_id': u'short_540'},\n", | |
" {u'_id': u'short_541', u'_value': 1},\n", | |
" {u'_id': u'short_542'},\n", | |
" {u'_id': u'short_550'},\n", | |
" {u'_id': u'short_551'},\n", | |
" {u'_id': u'short_555'},\n", | |
" {u'_id': u'short_557'},\n", | |
" {u'_id': u'short_559'},\n", | |
" {u'_id': u'short_563'},\n", | |
" {u'_id': u'short_573'},\n", | |
" {u'_id': u'short_574'},\n", | |
" {u'_id': u'short_575', u'_value': 1},\n", | |
" {u'_id': u'short_576', u'_value': 1}]" | |
] | |
}, | |
"execution_count": 92, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# on changeant les \"__\" en \".\" dans le nom de la colonne, on obtient un sélecteur\n", | |
"# mongo pour cette valeur\n", | |
"# fait comme ça, ça ne marche pas pour les listes : 'Bow_a89c6d269683.10' n'est pas compris\n", | |
"# par mongo comme étant l'item 10 de la liste Bow_a89c6d269683\n", | |
"\n", | |
"cursor = db.documents.aggregate([\n", | |
" {'$match': {'_id': {'$in': dataset}}},\n", | |
" {'$project': {'_value': '$'+col.replace('__', '.')}}\n", | |
"])\n", | |
"\n", | |
"list(cursor)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment