Last active
December 26, 2019 05:16
-
-
Save d2207197/5dcd462ee06aa053b1a23a6eb5518c2c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"toc": true | |
}, | |
"source": [ | |
"<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n", | |
"<div class=\"toc\"><ul class=\"toc-item\"></ul></div>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-12-26T05:14:04.868076Z", | |
"start_time": "2019-12-26T05:14:04.569093Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"import sklearn" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-12-26T05:14:12.436029Z", | |
"start_time": "2019-12-26T05:14:12.422009Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'0.21.3'" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sklearn.__version__\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2019-12-26T05:15:17.626Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/joe/.asdf/installs/python/miniconda3-latest/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:469: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n", | |
" \"this warning.\", FutureWarning)\n", | |
"/home/joe/.asdf/installs/python/miniconda3-latest/lib/python3.7/site-packages/sklearn/model_selection/_split.py:657: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=3.\n", | |
" % (min_groups, self.n_splits)), Warning)\n" | |
] | |
} | |
], | |
"source": [ | |
"import pandas as pd\n", | |
"from sklearn.compose import ColumnTransformer\n", | |
"from sklearn.pipeline import Pipeline\n", | |
"from sklearn.linear_model import LogisticRegressionCV\n", | |
"from sklearn.preprocessing import LabelEncoder\n", | |
"from sklearn.feature_extraction.text import CountVectorizer\n", | |
"import catboost as cb\n", | |
"\n", | |
"csv_path = '/home/ryanchao2012/projects/grandchallenge/data/ques-class/v2.csv'\n", | |
"df = pd.read_csv(csv_path)\n", | |
"\n", | |
"le = LabelEncoder()\n", | |
"y = le.fit_transform(df['LABEL'])\n", | |
"\n", | |
"column_trans = ColumnTransformer([\n", | |
" ('ngrams', CountVectorizer(token_pattern=r'[a-zA-Z]+|[0-9.]+|\\S', ngram_range=(1,2)), 'QTEXT')], \n", | |
" remainder='drop')\n", | |
"\n", | |
"clf = LogisticRegressionCV(cv=3, random_state=0)\n", | |
"\n", | |
"pipe = Pipeline(steps=[\n", | |
" ('feature extraction', column_trans),\n", | |
" ('clf', clf)\n", | |
"])\n", | |
"pipe.fit(df, y)\n", | |
"pipe.score(df, y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2019-12-26T05:15:49.539Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"pred_label = le.inverse_transform(pipe.predict(df))\n", | |
"pred_label\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 75, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-12-25T09:56:17.191964Z", | |
"start_time": "2019-12-25T09:56:17.187507Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"df['pred_label'] = pred_label" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 76, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-12-25T09:56:17.864171Z", | |
"start_time": "2019-12-25T09:56:17.850706Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>QID</th>\n", | |
" <th>QTEXT</th>\n", | |
" <th>ANSWER</th>\n", | |
" <th>LABEL</th>\n", | |
" <th>pred_label</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td>0</td>\n", | |
" <td>D001Q01</td>\n", | |
" <td>蘇東坡在中國歷史上,是哪一個朝代的人?</td>\n", | |
" <td>北宋</td>\n", | |
" <td>ERA</td>\n", | |
" <td>ERA</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>1</td>\n", | |
" <td>D001Q02</td>\n", | |
" <td>蘇東坡是中國哪個省份的人?</td>\n", | |
" <td>四川省</td>\n", | |
" <td>LOCATION</td>\n", | |
" <td>LOCATION</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>2</td>\n", | |
" <td>D001Q03</td>\n", | |
" <td>蘇東坡的爸爸叫什麼名字?</td>\n", | |
" <td>蘇洵</td>\n", | |
" <td>PERSON</td>\n", | |
" <td>PERSON</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>3</td>\n", | |
" <td>D001Q04</td>\n", | |
" <td>蘇文忠公指的是誰?</td>\n", | |
" <td>蘇軾</td>\n", | |
" <td>PERSON</td>\n", | |
" <td>PERSON</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>4</td>\n", | |
" <td>D001Q05</td>\n", | |
" <td>《蘇文忠公全集》是由何人編纂?</td>\n", | |
" <td>王宗稷</td>\n", | |
" <td>WHO</td>\n", | |
" <td>WHO</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>1172</td>\n", | |
" <td>D308Q03</td>\n", | |
" <td>『池裡不見水,地上沒有泥』,猜一個字,答案為何?</td>\n", | |
" <td>也</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>1173</td>\n", | |
" <td>D308Q04</td>\n", | |
" <td>提供『七十二小時』謎語的同學是誰?</td>\n", | |
" <td>周偉</td>\n", | |
" <td>WHO</td>\n", | |
" <td>WHO</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>1174</td>\n", | |
" <td>D308Q05</td>\n", | |
" <td>答對『也』字謎語的同學叫什麼名字?</td>\n", | |
" <td>于佩佩</td>\n", | |
" <td>NAME</td>\n", | |
" <td>NAME</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>1175</td>\n", | |
" <td>D308Q06</td>\n", | |
" <td>『七十二小時』,猜一個字,答案為何?</td>\n", | |
" <td>晶</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>1176</td>\n", | |
" <td>D308Q07</td>\n", | |
" <td>猜對『根在水中央,身材細又長,皮膚白又嫩,好吃又營養。』答案是豆芽的是哪一位同學?</td>\n", | |
" <td>丁小芹</td>\n", | |
" <td>PERSON</td>\n", | |
" <td>PERSON</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>1177 rows × 5 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" QID QTEXT ANSWER LABEL \\\n", | |
"0 D001Q01 蘇東坡在中國歷史上,是哪一個朝代的人? 北宋 ERA \n", | |
"1 D001Q02 蘇東坡是中國哪個省份的人? 四川省 LOCATION \n", | |
"2 D001Q03 蘇東坡的爸爸叫什麼名字? 蘇洵 PERSON \n", | |
"3 D001Q04 蘇文忠公指的是誰? 蘇軾 PERSON \n", | |
"4 D001Q05 《蘇文忠公全集》是由何人編纂? 王宗稷 WHO \n", | |
"... ... ... ... ... \n", | |
"1172 D308Q03 『池裡不見水,地上沒有泥』,猜一個字,答案為何? 也 DOMAIN_TERM \n", | |
"1173 D308Q04 提供『七十二小時』謎語的同學是誰? 周偉 WHO \n", | |
"1174 D308Q05 答對『也』字謎語的同學叫什麼名字? 于佩佩 NAME \n", | |
"1175 D308Q06 『七十二小時』,猜一個字,答案為何? 晶 DOMAIN_TERM \n", | |
"1176 D308Q07 猜對『根在水中央,身材細又長,皮膚白又嫩,好吃又營養。』答案是豆芽的是哪一位同學? 丁小芹 PERSON \n", | |
"\n", | |
" pred_label \n", | |
"0 ERA \n", | |
"1 LOCATION \n", | |
"2 PERSON \n", | |
"3 PERSON \n", | |
"4 WHO \n", | |
"... ... \n", | |
"1172 DOMAIN_TERM \n", | |
"1173 WHO \n", | |
"1174 NAME \n", | |
"1175 DOMAIN_TERM \n", | |
"1176 PERSON \n", | |
"\n", | |
"[1177 rows x 5 columns]" | |
] | |
}, | |
"execution_count": 76, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 94, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-12-25T10:02:50.743927Z", | |
"start_time": "2019-12-25T10:02:50.671317Z" | |
}, | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>LABEL</th>\n", | |
" <th>n</th>\n", | |
" <th>pred_n</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td>19</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>165</td>\n", | |
" <td>313.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>67</td>\n", | |
" <td>YESNO</td>\n", | |
" <td>114</td>\n", | |
" <td>114.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>50</td>\n", | |
" <td>RELATIVE</td>\n", | |
" <td>86</td>\n", | |
" <td>86.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>63</td>\n", | |
" <td>WHERE</td>\n", | |
" <td>63</td>\n", | |
" <td>63.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>48</td>\n", | |
" <td>QUANTITY</td>\n", | |
" <td>54</td>\n", | |
" <td>54.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>64</td>\n", | |
" <td>WHO</td>\n", | |
" <td>50</td>\n", | |
" <td>50.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>65</td>\n", | |
" <td>YEAR</td>\n", | |
" <td>49</td>\n", | |
" <td>49.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>12</td>\n", | |
" <td>COUNTRY</td>\n", | |
" <td>48</td>\n", | |
" <td>48.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>1</td>\n", | |
" <td>ALIAS</td>\n", | |
" <td>43</td>\n", | |
" <td>43.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>44</td>\n", | |
" <td>PERSON</td>\n", | |
" <td>43</td>\n", | |
" <td>43.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>43</td>\n", | |
" <td>PERIOD</td>\n", | |
" <td>33</td>\n", | |
" <td>33.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>56</td>\n", | |
" <td>SUMMARY</td>\n", | |
" <td>29</td>\n", | |
" <td>4.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>62</td>\n", | |
" <td>WHEN</td>\n", | |
" <td>26</td>\n", | |
" <td>26.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>0</td>\n", | |
" <td>AGE</td>\n", | |
" <td>25</td>\n", | |
" <td>25.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>8</td>\n", | |
" <td>CHOICE</td>\n", | |
" <td>22</td>\n", | |
" <td>22.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>13</td>\n", | |
" <td>DATE</td>\n", | |
" <td>22</td>\n", | |
" <td>22.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>38</td>\n", | |
" <td>NAME</td>\n", | |
" <td>17</td>\n", | |
" <td>17.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>33</td>\n", | |
" <td>LOCATION</td>\n", | |
" <td>17</td>\n", | |
" <td>17.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>9</td>\n", | |
" <td>CITY</td>\n", | |
" <td>17</td>\n", | |
" <td>17.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>26</td>\n", | |
" <td>GROUP</td>\n", | |
" <td>16</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>20</td>\n", | |
" <td>ERA</td>\n", | |
" <td>15</td>\n", | |
" <td>15.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>27</td>\n", | |
" <td>HEIGHT</td>\n", | |
" <td>15</td>\n", | |
" <td>15.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>5</td>\n", | |
" <td>ART_WORK</td>\n", | |
" <td>14</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>16</td>\n", | |
" <td>DISEASE</td>\n", | |
" <td>12</td>\n", | |
" <td>12.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>11</td>\n", | |
" <td>COMPANY</td>\n", | |
" <td>12</td>\n", | |
" <td>12.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>22</td>\n", | |
" <td>FLOWER</td>\n", | |
" <td>12</td>\n", | |
" <td>12.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>7</td>\n", | |
" <td>BUILDING</td>\n", | |
" <td>11</td>\n", | |
" <td>11.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>51</td>\n", | |
" <td>RIVER</td>\n", | |
" <td>10</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>32</td>\n", | |
" <td>LENGTH</td>\n", | |
" <td>9</td>\n", | |
" <td>9.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>54</td>\n", | |
" <td>SEASON</td>\n", | |
" <td>7</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>18</td>\n", | |
" <td>DOLLAR</td>\n", | |
" <td>7</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>57</td>\n", | |
" <td>TEMPLE</td>\n", | |
" <td>7</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>30</td>\n", | |
" <td>JOB</td>\n", | |
" <td>7</td>\n", | |
" <td>7.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>60</td>\n", | |
" <td>VEHICLE</td>\n", | |
" <td>7</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>10</td>\n", | |
" <td>COLOR</td>\n", | |
" <td>7</td>\n", | |
" <td>7.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>40</td>\n", | |
" <td>ORDER</td>\n", | |
" <td>6</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>17</td>\n", | |
" <td>DISTANCE</td>\n", | |
" <td>6</td>\n", | |
" <td>6.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>53</td>\n", | |
" <td>SCHOOL</td>\n", | |
" <td>5</td>\n", | |
" <td>5.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>58</td>\n", | |
" <td>TIME</td>\n", | |
" <td>5</td>\n", | |
" <td>5.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>34</td>\n", | |
" <td>METAL</td>\n", | |
" <td>5</td>\n", | |
" <td>5.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>4</td>\n", | |
" <td>AREA</td>\n", | |
" <td>5</td>\n", | |
" <td>5.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>6</td>\n", | |
" <td>BODY</td>\n", | |
" <td>5</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>2</td>\n", | |
" <td>ANIMAL</td>\n", | |
" <td>5</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>41</td>\n", | |
" <td>PEN</td>\n", | |
" <td>4</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>52</td>\n", | |
" <td>ROAD</td>\n", | |
" <td>4</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>59</td>\n", | |
" <td>TOOL</td>\n", | |
" <td>3</td>\n", | |
" <td>3.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>23</td>\n", | |
" <td>FOOD</td>\n", | |
" <td>3</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>36</td>\n", | |
" <td>MOUNTAIN</td>\n", | |
" <td>3</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>35</td>\n", | |
" <td>MONTH</td>\n", | |
" <td>2</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>14</td>\n", | |
" <td>DAY</td>\n", | |
" <td>2</td>\n", | |
" <td>2.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>29</td>\n", | |
" <td>ISLAND</td>\n", | |
" <td>2</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>42</td>\n", | |
" <td>PERCENT</td>\n", | |
" <td>2</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>39</td>\n", | |
" <td>OCEAN</td>\n", | |
" <td>2</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>3</td>\n", | |
" <td>ARE</td>\n", | |
" <td>1</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>61</td>\n", | |
" <td>WEIGHT</td>\n", | |
" <td>1</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>15</td>\n", | |
" <td>DIRECTION</td>\n", | |
" <td>1</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>66</td>\n", | |
" <td>YES</td>\n", | |
" <td>1</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>37</td>\n", | |
" <td>MOVIE</td>\n", | |
" <td>1</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>25</td>\n", | |
" <td>GENDER</td>\n", | |
" <td>1</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>24</td>\n", | |
" <td>FRUIT</td>\n", | |
" <td>1</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>55</td>\n", | |
" <td>SPEED</td>\n", | |
" <td>1</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>28</td>\n", | |
" <td>IDIOM</td>\n", | |
" <td>1</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>31</td>\n", | |
" <td>LAKE</td>\n", | |
" <td>1</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>49</td>\n", | |
" <td>RACE</td>\n", | |
" <td>1</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>68</td>\n", | |
" <td>YESNO_ALT</td>\n", | |
" <td>1</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>47</td>\n", | |
" <td>PORT</td>\n", | |
" <td>1</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>46</td>\n", | |
" <td>PLANT</td>\n", | |
" <td>1</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>45</td>\n", | |
" <td>PHONE</td>\n", | |
" <td>1</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>21</td>\n", | |
" <td>FISH</td>\n", | |
" <td>1</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>69</td>\n", | |
" <td>YESON</td>\n", | |
" <td>1</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" LABEL n pred_n\n", | |
"19 DOMAIN_TERM 165 313.0\n", | |
"67 YESNO 114 114.0\n", | |
"50 RELATIVE 86 86.0\n", | |
"63 WHERE 63 63.0\n", | |
"48 QUANTITY 54 54.0\n", | |
"64 WHO 50 50.0\n", | |
"65 YEAR 49 49.0\n", | |
"12 COUNTRY 48 48.0\n", | |
"1 ALIAS 43 43.0\n", | |
"44 PERSON 43 43.0\n", | |
"43 PERIOD 33 33.0\n", | |
"56 SUMMARY 29 4.0\n", | |
"62 WHEN 26 26.0\n", | |
"0 AGE 25 25.0\n", | |
"8 CHOICE 22 22.0\n", | |
"13 DATE 22 22.0\n", | |
"38 NAME 17 17.0\n", | |
"33 LOCATION 17 17.0\n", | |
"9 CITY 17 17.0\n", | |
"26 GROUP 16 0.0\n", | |
"20 ERA 15 15.0\n", | |
"27 HEIGHT 15 15.0\n", | |
"5 ART_WORK 14 0.0\n", | |
"16 DISEASE 12 12.0\n", | |
"11 COMPANY 12 12.0\n", | |
"22 FLOWER 12 12.0\n", | |
"7 BUILDING 11 11.0\n", | |
"51 RIVER 10 0.0\n", | |
"32 LENGTH 9 9.0\n", | |
"54 SEASON 7 0.0\n", | |
"18 DOLLAR 7 0.0\n", | |
"57 TEMPLE 7 0.0\n", | |
"30 JOB 7 7.0\n", | |
"60 VEHICLE 7 0.0\n", | |
"10 COLOR 7 7.0\n", | |
"40 ORDER 6 0.0\n", | |
"17 DISTANCE 6 6.0\n", | |
"53 SCHOOL 5 5.0\n", | |
"58 TIME 5 5.0\n", | |
"34 METAL 5 5.0\n", | |
"4 AREA 5 5.0\n", | |
"6 BODY 5 0.0\n", | |
"2 ANIMAL 5 0.0\n", | |
"41 PEN 4 0.0\n", | |
"52 ROAD 4 0.0\n", | |
"59 TOOL 3 3.0\n", | |
"23 FOOD 3 0.0\n", | |
"36 MOUNTAIN 3 0.0\n", | |
"35 MONTH 2 0.0\n", | |
"14 DAY 2 2.0\n", | |
"29 ISLAND 2 0.0\n", | |
"42 PERCENT 2 0.0\n", | |
"39 OCEAN 2 0.0\n", | |
"3 ARE 1 0.0\n", | |
"61 WEIGHT 1 0.0\n", | |
"15 DIRECTION 1 0.0\n", | |
"66 YES 1 0.0\n", | |
"37 MOVIE 1 0.0\n", | |
"25 GENDER 1 0.0\n", | |
"24 FRUIT 1 0.0\n", | |
"55 SPEED 1 0.0\n", | |
"28 IDIOM 1 0.0\n", | |
"31 LAKE 1 0.0\n", | |
"49 RACE 1 0.0\n", | |
"68 YESNO_ALT 1 0.0\n", | |
"47 PORT 1 0.0\n", | |
"46 PLANT 1 0.0\n", | |
"45 PHONE 1 0.0\n", | |
"21 FISH 1 0.0\n", | |
"69 YESON 1 0.0" | |
] | |
}, | |
"execution_count": 94, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pd.options.display.max_rows = 100\n", | |
"\n", | |
"label_summary_df = (\n", | |
" df >> group_by('LABEL') >> summarize(n = n(X.LABEL))\n", | |
")\n", | |
"label_summary_df >>= left_join(\n", | |
" other=(df >> group_by('pred_label') >> summarize(pred_n = n(X.LABEL)) >> rename(LABEL=X.pred_label)),\n", | |
" by='LABEL'\n", | |
")\n", | |
"label_summary_df = label_summary_df.fillna(0)\n", | |
"label_summary_df >> arrange(desc(X.n))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 100, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-12-25T10:06:13.596151Z", | |
"start_time": "2019-12-25T10:06:13.506650Z" | |
}, | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>LABEL</th>\n", | |
" <th>label_n</th>\n", | |
" <th>pred_label</th>\n", | |
" <th>pair_n</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td>20</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>165</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>165</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>70</td>\n", | |
" <td>YESNO</td>\n", | |
" <td>114</td>\n", | |
" <td>YESNO</td>\n", | |
" <td>114</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>53</td>\n", | |
" <td>RELATIVE</td>\n", | |
" <td>86</td>\n", | |
" <td>RELATIVE</td>\n", | |
" <td>86</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>66</td>\n", | |
" <td>WHERE</td>\n", | |
" <td>63</td>\n", | |
" <td>WHERE</td>\n", | |
" <td>63</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>51</td>\n", | |
" <td>QUANTITY</td>\n", | |
" <td>54</td>\n", | |
" <td>QUANTITY</td>\n", | |
" <td>54</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>67</td>\n", | |
" <td>WHO</td>\n", | |
" <td>50</td>\n", | |
" <td>WHO</td>\n", | |
" <td>50</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>68</td>\n", | |
" <td>YEAR</td>\n", | |
" <td>49</td>\n", | |
" <td>YEAR</td>\n", | |
" <td>49</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>12</td>\n", | |
" <td>COUNTRY</td>\n", | |
" <td>48</td>\n", | |
" <td>COUNTRY</td>\n", | |
" <td>48</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>1</td>\n", | |
" <td>ALIAS</td>\n", | |
" <td>43</td>\n", | |
" <td>ALIAS</td>\n", | |
" <td>43</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>47</td>\n", | |
" <td>PERSON</td>\n", | |
" <td>43</td>\n", | |
" <td>PERSON</td>\n", | |
" <td>43</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>46</td>\n", | |
" <td>PERIOD</td>\n", | |
" <td>33</td>\n", | |
" <td>PERIOD</td>\n", | |
" <td>33</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>59</td>\n", | |
" <td>SUMMARY</td>\n", | |
" <td>29</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>29</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>65</td>\n", | |
" <td>WHEN</td>\n", | |
" <td>26</td>\n", | |
" <td>WHEN</td>\n", | |
" <td>26</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>0</td>\n", | |
" <td>AGE</td>\n", | |
" <td>25</td>\n", | |
" <td>AGE</td>\n", | |
" <td>25</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>8</td>\n", | |
" <td>CHOICE</td>\n", | |
" <td>22</td>\n", | |
" <td>CHOICE</td>\n", | |
" <td>22</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>13</td>\n", | |
" <td>DATE</td>\n", | |
" <td>22</td>\n", | |
" <td>DATE</td>\n", | |
" <td>22</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>9</td>\n", | |
" <td>CITY</td>\n", | |
" <td>17</td>\n", | |
" <td>CITY</td>\n", | |
" <td>17</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>34</td>\n", | |
" <td>LOCATION</td>\n", | |
" <td>17</td>\n", | |
" <td>LOCATION</td>\n", | |
" <td>17</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>39</td>\n", | |
" <td>NAME</td>\n", | |
" <td>17</td>\n", | |
" <td>NAME</td>\n", | |
" <td>17</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>27</td>\n", | |
" <td>GROUP</td>\n", | |
" <td>16</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>16</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>21</td>\n", | |
" <td>ERA</td>\n", | |
" <td>15</td>\n", | |
" <td>ERA</td>\n", | |
" <td>15</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>28</td>\n", | |
" <td>HEIGHT</td>\n", | |
" <td>15</td>\n", | |
" <td>HEIGHT</td>\n", | |
" <td>15</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>5</td>\n", | |
" <td>ART_WORK</td>\n", | |
" <td>14</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>14</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>11</td>\n", | |
" <td>COMPANY</td>\n", | |
" <td>12</td>\n", | |
" <td>COMPANY</td>\n", | |
" <td>12</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>16</td>\n", | |
" <td>DISEASE</td>\n", | |
" <td>12</td>\n", | |
" <td>DISEASE</td>\n", | |
" <td>12</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>23</td>\n", | |
" <td>FLOWER</td>\n", | |
" <td>12</td>\n", | |
" <td>FLOWER</td>\n", | |
" <td>12</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>7</td>\n", | |
" <td>BUILDING</td>\n", | |
" <td>11</td>\n", | |
" <td>BUILDING</td>\n", | |
" <td>11</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>54</td>\n", | |
" <td>RIVER</td>\n", | |
" <td>10</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>10</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>33</td>\n", | |
" <td>LENGTH</td>\n", | |
" <td>9</td>\n", | |
" <td>LENGTH</td>\n", | |
" <td>9</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>10</td>\n", | |
" <td>COLOR</td>\n", | |
" <td>7</td>\n", | |
" <td>COLOR</td>\n", | |
" <td>7</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>31</td>\n", | |
" <td>JOB</td>\n", | |
" <td>7</td>\n", | |
" <td>JOB</td>\n", | |
" <td>7</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>57</td>\n", | |
" <td>SEASON</td>\n", | |
" <td>7</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>7</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>60</td>\n", | |
" <td>TEMPLE</td>\n", | |
" <td>7</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>7</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>63</td>\n", | |
" <td>VEHICLE</td>\n", | |
" <td>7</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>7</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>18</td>\n", | |
" <td>DOLLAR</td>\n", | |
" <td>7</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>6</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>19</td>\n", | |
" <td>DOLLAR</td>\n", | |
" <td>7</td>\n", | |
" <td>SUMMARY</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>17</td>\n", | |
" <td>DISTANCE</td>\n", | |
" <td>6</td>\n", | |
" <td>DISTANCE</td>\n", | |
" <td>6</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>41</td>\n", | |
" <td>ORDER</td>\n", | |
" <td>6</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>42</td>\n", | |
" <td>ORDER</td>\n", | |
" <td>6</td>\n", | |
" <td>SUMMARY</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>2</td>\n", | |
" <td>ANIMAL</td>\n", | |
" <td>5</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>4</td>\n", | |
" <td>AREA</td>\n", | |
" <td>5</td>\n", | |
" <td>AREA</td>\n", | |
" <td>5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>6</td>\n", | |
" <td>BODY</td>\n", | |
" <td>5</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>35</td>\n", | |
" <td>METAL</td>\n", | |
" <td>5</td>\n", | |
" <td>METAL</td>\n", | |
" <td>5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>56</td>\n", | |
" <td>SCHOOL</td>\n", | |
" <td>5</td>\n", | |
" <td>SCHOOL</td>\n", | |
" <td>5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>61</td>\n", | |
" <td>TIME</td>\n", | |
" <td>5</td>\n", | |
" <td>TIME</td>\n", | |
" <td>5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>43</td>\n", | |
" <td>PEN</td>\n", | |
" <td>4</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>55</td>\n", | |
" <td>ROAD</td>\n", | |
" <td>4</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>24</td>\n", | |
" <td>FOOD</td>\n", | |
" <td>3</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>37</td>\n", | |
" <td>MOUNTAIN</td>\n", | |
" <td>3</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>62</td>\n", | |
" <td>TOOL</td>\n", | |
" <td>3</td>\n", | |
" <td>TOOL</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>14</td>\n", | |
" <td>DAY</td>\n", | |
" <td>2</td>\n", | |
" <td>DAY</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>30</td>\n", | |
" <td>ISLAND</td>\n", | |
" <td>2</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>36</td>\n", | |
" <td>MONTH</td>\n", | |
" <td>2</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>40</td>\n", | |
" <td>OCEAN</td>\n", | |
" <td>2</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>44</td>\n", | |
" <td>PERCENT</td>\n", | |
" <td>2</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>45</td>\n", | |
" <td>PERCENT</td>\n", | |
" <td>2</td>\n", | |
" <td>SUMMARY</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>3</td>\n", | |
" <td>ARE</td>\n", | |
" <td>1</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>15</td>\n", | |
" <td>DIRECTION</td>\n", | |
" <td>1</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>22</td>\n", | |
" <td>FISH</td>\n", | |
" <td>1</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>25</td>\n", | |
" <td>FRUIT</td>\n", | |
" <td>1</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>26</td>\n", | |
" <td>GENDER</td>\n", | |
" <td>1</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>29</td>\n", | |
" <td>IDIOM</td>\n", | |
" <td>1</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>32</td>\n", | |
" <td>LAKE</td>\n", | |
" <td>1</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>38</td>\n", | |
" <td>MOVIE</td>\n", | |
" <td>1</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>48</td>\n", | |
" <td>PHONE</td>\n", | |
" <td>1</td>\n", | |
" <td>SUMMARY</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>49</td>\n", | |
" <td>PLANT</td>\n", | |
" <td>1</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>50</td>\n", | |
" <td>PORT</td>\n", | |
" <td>1</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>52</td>\n", | |
" <td>RACE</td>\n", | |
" <td>1</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>58</td>\n", | |
" <td>SPEED</td>\n", | |
" <td>1</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>64</td>\n", | |
" <td>WEIGHT</td>\n", | |
" <td>1</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>69</td>\n", | |
" <td>YES</td>\n", | |
" <td>1</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>71</td>\n", | |
" <td>YESNO_ALT</td>\n", | |
" <td>1</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>72</td>\n", | |
" <td>YESON</td>\n", | |
" <td>1</td>\n", | |
" <td>DOMAIN_TERM</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" LABEL label_n pred_label pair_n\n", | |
"20 DOMAIN_TERM 165 DOMAIN_TERM 165\n", | |
"70 YESNO 114 YESNO 114\n", | |
"53 RELATIVE 86 RELATIVE 86\n", | |
"66 WHERE 63 WHERE 63\n", | |
"51 QUANTITY 54 QUANTITY 54\n", | |
"67 WHO 50 WHO 50\n", | |
"68 YEAR 49 YEAR 49\n", | |
"12 COUNTRY 48 COUNTRY 48\n", | |
"1 ALIAS 43 ALIAS 43\n", | |
"47 PERSON 43 PERSON 43\n", | |
"46 PERIOD 33 PERIOD 33\n", | |
"59 SUMMARY 29 DOMAIN_TERM 29\n", | |
"65 WHEN 26 WHEN 26\n", | |
"0 AGE 25 AGE 25\n", | |
"8 CHOICE 22 CHOICE 22\n", | |
"13 DATE 22 DATE 22\n", | |
"9 CITY 17 CITY 17\n", | |
"34 LOCATION 17 LOCATION 17\n", | |
"39 NAME 17 NAME 17\n", | |
"27 GROUP 16 DOMAIN_TERM 16\n", | |
"21 ERA 15 ERA 15\n", | |
"28 HEIGHT 15 HEIGHT 15\n", | |
"5 ART_WORK 14 DOMAIN_TERM 14\n", | |
"11 COMPANY 12 COMPANY 12\n", | |
"16 DISEASE 12 DISEASE 12\n", | |
"23 FLOWER 12 FLOWER 12\n", | |
"7 BUILDING 11 BUILDING 11\n", | |
"54 RIVER 10 DOMAIN_TERM 10\n", | |
"33 LENGTH 9 LENGTH 9\n", | |
"10 COLOR 7 COLOR 7\n", | |
"31 JOB 7 JOB 7\n", | |
"57 SEASON 7 DOMAIN_TERM 7\n", | |
"60 TEMPLE 7 DOMAIN_TERM 7\n", | |
"63 VEHICLE 7 DOMAIN_TERM 7\n", | |
"18 DOLLAR 7 DOMAIN_TERM 6\n", | |
"19 DOLLAR 7 SUMMARY 1\n", | |
"17 DISTANCE 6 DISTANCE 6\n", | |
"41 ORDER 6 DOMAIN_TERM 5\n", | |
"42 ORDER 6 SUMMARY 1\n", | |
"2 ANIMAL 5 DOMAIN_TERM 5\n", | |
"4 AREA 5 AREA 5\n", | |
"6 BODY 5 DOMAIN_TERM 5\n", | |
"35 METAL 5 METAL 5\n", | |
"56 SCHOOL 5 SCHOOL 5\n", | |
"61 TIME 5 TIME 5\n", | |
"43 PEN 4 DOMAIN_TERM 4\n", | |
"55 ROAD 4 DOMAIN_TERM 4\n", | |
"24 FOOD 3 DOMAIN_TERM 3\n", | |
"37 MOUNTAIN 3 DOMAIN_TERM 3\n", | |
"62 TOOL 3 TOOL 3\n", | |
"14 DAY 2 DAY 2\n", | |
"30 ISLAND 2 DOMAIN_TERM 2\n", | |
"36 MONTH 2 DOMAIN_TERM 2\n", | |
"40 OCEAN 2 DOMAIN_TERM 2\n", | |
"44 PERCENT 2 DOMAIN_TERM 1\n", | |
"45 PERCENT 2 SUMMARY 1\n", | |
"3 ARE 1 DOMAIN_TERM 1\n", | |
"15 DIRECTION 1 DOMAIN_TERM 1\n", | |
"22 FISH 1 DOMAIN_TERM 1\n", | |
"25 FRUIT 1 DOMAIN_TERM 1\n", | |
"26 GENDER 1 DOMAIN_TERM 1\n", | |
"29 IDIOM 1 DOMAIN_TERM 1\n", | |
"32 LAKE 1 DOMAIN_TERM 1\n", | |
"38 MOVIE 1 DOMAIN_TERM 1\n", | |
"48 PHONE 1 SUMMARY 1\n", | |
"49 PLANT 1 DOMAIN_TERM 1\n", | |
"50 PORT 1 DOMAIN_TERM 1\n", | |
"52 RACE 1 DOMAIN_TERM 1\n", | |
"58 SPEED 1 DOMAIN_TERM 1\n", | |
"64 WEIGHT 1 DOMAIN_TERM 1\n", | |
"69 YES 1 DOMAIN_TERM 1\n", | |
"71 YESNO_ALT 1 DOMAIN_TERM 1\n", | |
"72 YESON 1 DOMAIN_TERM 1" | |
] | |
}, | |
"execution_count": 100, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df >> group_by('LABEL', 'pred_label') >> summarise(pair_n = n(X.LABEL)) >> left_join(\n", | |
" df >> group_by('LABEL') >> summarise(label_n=n(X.LABEL)), by='LABEL'\n", | |
") >> arrange(desc(X.label_n), desc(X.pair_n)) >> select(X.LABEL, X.label_n, X.pred_label, X.pair_n)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 48, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-12-25T09:38:19.309065Z", | |
"start_time": "2019-12-25T09:38:16.717369Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"https://gist.github.com/5dcd462ee06aa053b1a23a6eb5518c2c\r\n" | |
] | |
} | |
], | |
"source": [ | |
"!/home/joe/.gem/ruby/2.5.0/bin/gist -u 5dcd462ee06aa053b1a23a6eb5518c2c grandchallenge-q-cls.ipynb" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "miniconda", | |
"language": "python", | |
"name": "miniconda" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
}, | |
"toc": { | |
"base_numbering": 1, | |
"nav_menu": {}, | |
"number_sections": true, | |
"sideBar": true, | |
"skip_h1_title": false, | |
"title_cell": "Table of Contents", | |
"title_sidebar": "Contents", | |
"toc_cell": true, | |
"toc_position": {}, | |
"toc_section_display": true, | |
"toc_window_display": true | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment