Created
May 8, 2017 22:32
-
-
Save peterk87/2c0a7be7f6418703daab410655f141c3 to your computer and use it in GitHub Desktop.
Implementing Adjusted Wallace Coefficient in Python with Numpy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df_test = pd.read_table('../tests/data/comparing-partitions-microbial-typing-test-data.tsv')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>T type</th>\n", | |
" <th>emm type</th>\n", | |
" <th>PFGE Sma80</th>\n", | |
" <th>PFGE SFi68</th>\n", | |
" <th>T type + emm type</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>4</td>\n", | |
" <td>4</td>\n", | |
" <td>8</td>\n", | |
" <td>10</td>\n", | |
" <td>44</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>12</td>\n", | |
" <td>22</td>\n", | |
" <td>9</td>\n", | |
" <td>3</td>\n", | |
" <td>1222</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>NT</td>\n", | |
" <td>22</td>\n", | |
" <td>9</td>\n", | |
" <td>13</td>\n", | |
" <td>NT22</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>12</td>\n", | |
" <td>22</td>\n", | |
" <td>9</td>\n", | |
" <td>13</td>\n", | |
" <td>1222</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>12</td>\n", | |
" <td>22</td>\n", | |
" <td>9</td>\n", | |
" <td>13</td>\n", | |
" <td>1222</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" T type emm type PFGE Sma80 PFGE SFi68 T type + emm type\n", | |
"0 4 4 8 10 44\n", | |
"1 12 22 9 3 1222\n", | |
"2 NT 22 9 13 NT22\n", | |
"3 12 22 9 13 1222\n", | |
"4 12 22 9 13 1222" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_test.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"ct = pd.crosstab(df_test['T type'], df_test['emm type'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th>emm type</th>\n", | |
" <th>1</th>\n", | |
" <th>2</th>\n", | |
" <th>4</th>\n", | |
" <th>6</th>\n", | |
" <th>9</th>\n", | |
" <th>11</th>\n", | |
" <th>12</th>\n", | |
" <th>22</th>\n", | |
" <th>28</th>\n", | |
" <th>75</th>\n", | |
" <th>77</th>\n", | |
" <th>89</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>T type</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>38</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>26</td>\n", | |
" <td>127</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25</th>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>12</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>28</th>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>33</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>33</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5/27/44</th>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>8</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>B3264</th>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>NT</th>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>6</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
"emm type 1 2 4 6 9 11 12 22 28 75 77 89\n", | |
"T type \n", | |
"1 38 0 2 0 0 0 0 0 0 0 0 0\n", | |
"12 0 1 2 0 1 2 26 127 0 0 0 0\n", | |
"13 2 0 3 0 0 1 1 3 0 0 3 0\n", | |
"2 0 3 1 0 0 0 1 0 0 0 0 0\n", | |
"25 0 0 0 0 0 0 0 1 0 12 0 0\n", | |
"28 0 0 0 0 0 0 0 0 33 0 0 0\n", | |
"4 0 0 33 0 0 0 0 0 0 0 0 0\n", | |
"5/27/44 0 0 2 0 0 0 0 0 0 0 0 0\n", | |
"6 0 0 1 1 0 0 0 0 0 1 0 0\n", | |
"9 0 0 0 0 8 0 0 0 0 0 0 0\n", | |
"B3264 0 0 2 0 0 0 0 0 0 0 0 2\n", | |
"NT 2 0 1 0 0 0 6 1 2 0 0 0" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ct" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"emm type\n", | |
"1 42\n", | |
"2 4\n", | |
"4 47\n", | |
"6 1\n", | |
"9 9\n", | |
"11 3\n", | |
"12 34\n", | |
"22 132\n", | |
"28 35\n", | |
"75 13\n", | |
"77 3\n", | |
"89 2\n", | |
"dtype: int64" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ct.sum(axis=0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"T type\n", | |
"1 40\n", | |
"12 159\n", | |
"13 13\n", | |
"2 5\n", | |
"25 13\n", | |
"28 33\n", | |
"4 33\n", | |
"5/27/44 2\n", | |
"6 3\n", | |
"9 8\n", | |
"B3264 4\n", | |
"NT 12\n", | |
"dtype: int64" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ct.sum(axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"arr = np.array(ct)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[ 38, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", | |
" [ 0, 1, 2, 0, 1, 2, 26, 127, 0, 0, 0, 0],\n", | |
" [ 2, 0, 3, 0, 0, 1, 1, 3, 0, 0, 3, 0],\n", | |
" [ 0, 3, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0],\n", | |
" [ 0, 0, 0, 0, 0, 0, 0, 1, 0, 12, 0, 0],\n", | |
" [ 0, 0, 0, 0, 0, 0, 0, 0, 33, 0, 0, 0],\n", | |
" [ 0, 0, 33, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", | |
" [ 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", | |
" [ 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0],\n", | |
" [ 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0],\n", | |
" [ 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2],\n", | |
" [ 2, 0, 1, 0, 0, 0, 6, 1, 2, 0, 0, 0]])" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"arr" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"a = np.sum(arr * (arr - 1) / 2.0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"10215.0" | |
] | |
}, | |
"execution_count": 27, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"a" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"col_sum = arr.sum(axis=0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ 42, 4, 47, 1, 9, 3, 34, 132, 35, 13, 3, 2])" | |
] | |
}, | |
"execution_count": 19, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"col_sum" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"row_sum = arr.sum(axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ 40, 159, 13, 5, 13, 33, 33, 2, 3, 8, 4, 12])" | |
] | |
}, | |
"execution_count": 21, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"row_sum" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"a1 = np.sum(col_sum * (col_sum - 1) / 2.0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"11871.0" | |
] | |
}, | |
"execution_count": 25, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"a1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"b = a1 - a" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1656.0" | |
] | |
}, | |
"execution_count": 29, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"b" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"n = np.sum(arr)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"325" | |
] | |
}, | |
"execution_count": 32, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"c = np.sum(row_sum * (row_sum - 1) / 2.0) - a" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"4452.0" | |
] | |
}, | |
"execution_count": 34, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"c" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"d = ((n * (n - 1)) / 2.0) - a1 - c" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"36327.0" | |
] | |
}, | |
"execution_count": 36, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"d" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 43, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def abcdn(s1, s2):\n", | |
" ct = pd.crosstab(s1, s2)\n", | |
" arr = np.array(ct)\n", | |
" col_sum = arr.sum(axis=0)\n", | |
" row_sum = arr.sum(axis=1)\n", | |
" n = float(np.sum(arr))\n", | |
" a = np.sum(arr * (arr - 1) / 2.0)\n", | |
" a1 = np.sum(col_sum * (col_sum - 1) / 2.0)\n", | |
" b = a1 - a\n", | |
" c = np.sum(row_sum * (row_sum - 1) / 2.0) - a\n", | |
" d = ((n * (n - 1)) / 2.0) - a1 - c\n", | |
" \n", | |
" return (a, b, c, d, n)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"a,b,c,d,n = abcdn(df_test[df_test.columns[0]], df_test[df_test.columns[1]])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10215.0 1656.0 4452.0 36327.0 325.0\n" | |
] | |
} | |
], | |
"source": [ | |
"print a, b, c, d, n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def rand(a, b, c, d):\n", | |
" return (a + d) / (a + b + c + d)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.88398860398860402" | |
] | |
}, | |
"execution_count": 47, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"rand(a, b, c, d)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 120, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def wallace(a, b, c):\n", | |
" w_ab = 0.0 if a + b <= 0 else a / (a + b)\n", | |
" w_ba = 0.0 if a + c <= 0 else a / (a + c)\n", | |
" return w_ba, w_ab" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 121, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(0.69646144405808963, 0.86050037907505683)" | |
] | |
}, | |
"execution_count": 121, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"wallace(a, b, c)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 61, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"s = df_test[df_test.columns[0]]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 62, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0 4\n", | |
"1 12\n", | |
"2 NT\n", | |
"3 12\n", | |
"4 12\n", | |
"5 12\n", | |
"6 4\n", | |
"7 4\n", | |
"8 12\n", | |
"9 12\n", | |
"10 12\n", | |
"11 12\n", | |
"12 12\n", | |
"13 12\n", | |
"14 12\n", | |
"15 12\n", | |
"16 12\n", | |
"17 12\n", | |
"18 12\n", | |
"19 12\n", | |
"20 12\n", | |
"21 12\n", | |
"22 12\n", | |
"23 12\n", | |
"24 12\n", | |
"25 12\n", | |
"26 12\n", | |
"27 12\n", | |
"28 28\n", | |
"29 13\n", | |
" ... \n", | |
"295 4\n", | |
"296 NT\n", | |
"297 12\n", | |
"298 12\n", | |
"299 13\n", | |
"300 1\n", | |
"301 1\n", | |
"302 25\n", | |
"303 4\n", | |
"304 12\n", | |
"305 1\n", | |
"306 12\n", | |
"307 4\n", | |
"308 28\n", | |
"309 2\n", | |
"310 28\n", | |
"311 2\n", | |
"312 12\n", | |
"313 4\n", | |
"314 12\n", | |
"315 5/27/44\n", | |
"316 4\n", | |
"317 4\n", | |
"318 B3264\n", | |
"319 12\n", | |
"320 4\n", | |
"321 4\n", | |
"322 4\n", | |
"323 4\n", | |
"324 25\n", | |
"Name: T type, dtype: object" | |
] | |
}, | |
"execution_count": 62, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"s" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 65, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"ename": "TypeError", | |
"evalue": "crosstab() takes at least 2 arguments (1 given)", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-65-6b20577531ea>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcrosstab\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[0;31mTypeError\u001b[0m: crosstab() takes at least 2 arguments (1 given)" | |
] | |
} | |
], | |
"source": [ | |
"pd.crosstab(s)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 68, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"12 159\n", | |
"1 40\n", | |
"4 33\n", | |
"28 33\n", | |
"13 13\n", | |
"25 13\n", | |
"NT 12\n", | |
"9 8\n", | |
"2 5\n", | |
"B3264 4\n", | |
"6 3\n", | |
"5/27/44 2\n", | |
"Name: T type, dtype: int64" | |
] | |
}, | |
"execution_count": 68, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pd.value_counts(s)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 132, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def simpsons(s1):\n", | |
" assert isinstance(s1, pd.Series)\n", | |
" n = float(s1.size)\n", | |
" ct = pd.value_counts(s1)\n", | |
" arr = np.array(ct)\n", | |
" sid = 1.0\n", | |
" if n * (n - 1) > 0:\n", | |
" sid = 1.0 - np.sum(arr * (arr - 1)) / (n * (n - 1.0))\n", | |
" s2 = 4.0 / n * (np.sum((arr / n)**3) - np.sum((arr / n)**2)**2)\n", | |
" s2_sqrt = np.sqrt(s2)\n", | |
" low = sid - 2.0 * s2_sqrt\n", | |
" high = sid + 2.0 * s2_sqrt\n", | |
" return sid, low, high, n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 70, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(0.72142450142450143, 0.67573245702169915, 0.76711654582730371, 325.0)" | |
] | |
}, | |
"execution_count": 70, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"simpsons(df_test[df_test.columns[0]])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 73, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th>emm type</th>\n", | |
" <th>1</th>\n", | |
" <th>2</th>\n", | |
" <th>4</th>\n", | |
" <th>6</th>\n", | |
" <th>9</th>\n", | |
" <th>11</th>\n", | |
" <th>12</th>\n", | |
" <th>22</th>\n", | |
" <th>28</th>\n", | |
" <th>75</th>\n", | |
" <th>77</th>\n", | |
" <th>89</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>T type</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>38</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>26</td>\n", | |
" <td>127</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25</th>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>12</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>28</th>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>33</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>33</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5/27/44</th>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>8</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>B3264</th>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>NT</th>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>6</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
"emm type 1 2 4 6 9 11 12 22 28 75 77 89\n", | |
"T type \n", | |
"1 38 0 2 0 0 0 0 0 0 0 0 0\n", | |
"12 0 1 2 0 1 2 26 127 0 0 0 0\n", | |
"13 2 0 3 0 0 1 1 3 0 0 3 0\n", | |
"2 0 3 1 0 0 0 1 0 0 0 0 0\n", | |
"25 0 0 0 0 0 0 0 1 0 12 0 0\n", | |
"28 0 0 0 0 0 0 0 0 33 0 0 0\n", | |
"4 0 0 33 0 0 0 0 0 0 0 0 0\n", | |
"5/27/44 0 0 2 0 0 0 0 0 0 0 0 0\n", | |
"6 0 0 1 1 0 0 0 0 0 1 0 0\n", | |
"9 0 0 0 0 8 0 0 0 0 0 0 0\n", | |
"B3264 0 0 2 0 0 0 0 0 0 0 0 2\n", | |
"NT 2 0 1 0 0 0 6 1 2 0 0 0" | |
] | |
}, | |
"execution_count": 73, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ct" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 74, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ 40, 159, 13, 5, 13, 33, 33, 2, 3, 8, 4, 12])" | |
] | |
}, | |
"execution_count": 74, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"row_sum" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 75, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ 42, 4, 47, 1, 9, 3, 34, 132, 35, 13, 3, 2])" | |
] | |
}, | |
"execution_count": 75, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"col_sum" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 76, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"row_sum2 = row_sum[row_sum > 1]\n", | |
"col_sum2 = col_sum[col_sum > 1]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 77, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ 40, 159, 13, 5, 13, 33, 33, 2, 3, 8, 4, 12])" | |
] | |
}, | |
"execution_count": 77, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"row_sum2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 94, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"col_sum2 = col_sum2.astype(np.float64)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 95, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ 42., 4., 47., 9., 3., 34., 132., 35., 13.,\n", | |
" 3., 2.])" | |
] | |
}, | |
"execution_count": 95, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"col_sum2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 87, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[ 38, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", | |
" [ 0, 1, 2, 0, 1, 2, 26, 127, 0, 0, 0, 0],\n", | |
" [ 2, 0, 3, 0, 0, 1, 1, 3, 0, 0, 3, 0],\n", | |
" [ 0, 3, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0],\n", | |
" [ 0, 0, 0, 0, 0, 0, 0, 1, 0, 12, 0, 0],\n", | |
" [ 0, 0, 0, 0, 0, 0, 0, 0, 33, 0, 0, 0],\n", | |
" [ 0, 0, 33, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", | |
" [ 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", | |
" [ 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0],\n", | |
" [ 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0],\n", | |
" [ 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2],\n", | |
" [ 2, 0, 1, 0, 0, 0, 6, 1, 2, 0, 0, 0]])" | |
] | |
}, | |
"execution_count": 87, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"arr" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 88, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"arr_col = arr[:,col_sum > 1]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 89, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[ 38, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0],\n", | |
" [ 0, 1, 2, 1, 2, 26, 127, 0, 0, 0, 0],\n", | |
" [ 2, 0, 3, 0, 1, 1, 3, 0, 0, 3, 0],\n", | |
" [ 0, 3, 1, 0, 0, 1, 0, 0, 0, 0, 0],\n", | |
" [ 0, 0, 0, 0, 0, 0, 1, 0, 12, 0, 0],\n", | |
" [ 0, 0, 0, 0, 0, 0, 0, 33, 0, 0, 0],\n", | |
" [ 0, 0, 33, 0, 0, 0, 0, 0, 0, 0, 0],\n", | |
" [ 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0],\n", | |
" [ 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0],\n", | |
" [ 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0],\n", | |
" [ 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2],\n", | |
" [ 2, 0, 1, 0, 0, 6, 1, 2, 0, 0, 0]])" | |
] | |
}, | |
"execution_count": 89, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"arr_col" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 101, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def x_y_2(x):\n", | |
" return (x / col_sum2)**2.0\n", | |
"\n", | |
"def x_y_3(x):\n", | |
" return (x / col_sum2)**3.0" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 96, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ 42., 4., 47., 9., 3., 34., 132., 35., 13.,\n", | |
" 3., 2.])" | |
] | |
}, | |
"execution_count": 96, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"col_sum2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 100, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ 0.82312925, 0.625 , 0.50565867, 0.80246914, 0.55555556,\n", | |
" 0.61764706, 0.92630854, 0.8922449 , 0.85798817, 1. , 1. ])" | |
] | |
}, | |
"execution_count": 100, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"np.sum(np.apply_along_axis(x_y_2, 1, arr_col), axis=0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 102, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"csumFc2 = np.sum(np.apply_along_axis(x_y_2, 1, arr_col), axis=0)\n", | |
"csumFc3 = np.sum(np.apply_along_axis(x_y_3, 1, arr_col), axis=0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 103, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ 0.82312925, 0.625 , 0.50565867, 0.80246914, 0.55555556,\n", | |
" 0.61764706, 0.92630854, 0.8922449 , 0.85798817, 1. , 1. ])" | |
] | |
}, | |
"execution_count": 103, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"csumFc2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 104, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ 0.74084872, 0.4375 , 0.34673435, 0.7037037 , 0.33333333,\n", | |
" 0.45272746, 0.8906263 , 0.83836735, 0.78698225, 1. , 1. ])" | |
] | |
}, | |
"execution_count": 104, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"csumFc3" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 105, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.0" | |
] | |
}, | |
"execution_count": 105, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"np.clip(-.6, 0.0, 1.0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 106, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1.0" | |
] | |
}, | |
"execution_count": 106, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"np.clip(1.6, 0.0, 1.0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 133, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def awc(s1, s2):\n", | |
" assert isinstance(s1, pd.Series)\n", | |
" assert isinstance(s2, pd.Series)\n", | |
" assert s1.size == s2.size\n", | |
" ct = pd.crosstab(s1, s2)\n", | |
" a,b,c,d,n = abcdn(s1, s2)\n", | |
" arr = np.array(ct, dtype=np.float64)\n", | |
" col_sum = arr.sum(axis=0)\n", | |
" row_sum = arr.sum(axis=1)\n", | |
" col_sum2 = col_sum[col_sum > 1].astype(np.float64)\n", | |
" row_sum2 = row_sum[row_sum > 1].astype(np.float64)\n", | |
" arr_col = arr[:,col_sum > 1]\n", | |
" arr_row = arr[:,row_sum > 1]\n", | |
" \n", | |
" csumFc2 = np.sum(np.apply_along_axis(lambda x: (x / col_sum2)**2.0, 1, arr_col), axis=0)\n", | |
" csumFc3 = np.sum(np.apply_along_axis(lambda x: (x / col_sum2)**3.0, 1, arr_col), axis=0)\n", | |
" rsumFc2 = np.sum(np.apply_along_axis(lambda x: (x / row_sum2)**2.0, 0, arr_row), axis=1)\n", | |
" rsumFc3 = np.sum(np.apply_along_axis(lambda x: (x / row_sum2)**3.0, 0, arr_row), axis=1)\n", | |
" print csumFc2\n", | |
" print csumFc3\n", | |
" print rsumFc2\n", | |
" print rsumFc3\n", | |
" rSumW1 = np.sum((row_sum2 * (row_sum2 - 1.0))**2.0 * \n", | |
" (((4.0 * row_sum2 * (row_sum2 - 1.0) * (row_sum2 - 2.0) * rsumFc3) + \n", | |
" (2.0 * row_sum2 * (row_sum2 - 1.0) * rsumFc2) - \n", | |
" (2.0 * row_sum2 * (row_sum2 - 1.0) * (2.0 * row_sum2 - 3.0) * (rsumFc2**2.0))) / \n", | |
" ((row_sum2 * (row_sum2 - 1.0))**2.0)))\n", | |
" rSumW2 = np.sum(row_sum * (row_sum - 1.0))\n", | |
" cSumW1 = np.sum((col_sum2 * (col_sum2 - 1.0))**2.0 * \n", | |
" (((4.0 * col_sum2 * (col_sum2 - 1.0) * (col_sum2 - 2.0) * csumFc3) + \n", | |
" (2.0 * col_sum2 * (col_sum2 - 1.0) * csumFc2) - \n", | |
" (2.0 * col_sum2 * (col_sum2 - 1.0) * (2.0 * col_sum2 - 3.0) * (csumFc2**2.0))) / \n", | |
" ((col_sum2 * (col_sum2 - 1.0))**2.0)))\n", | |
" cSumW2 = np.sum(col_sum * (col_sum - 1.0))\n", | |
" \n", | |
" var_w1 = 0.0 if rSumW2 <= 0 else (rSumW1 / (rSumW2**2.0))\n", | |
" var_w2 = 0.0 if cSumW2 <= 0 else (cSumW1 / (cSumW2**2.0))\n", | |
" w_ab, w_ba = wallace(a, b, c)\n", | |
" sid1, sid1_low, sid1_high, sid1_n = simpsons(s1)\n", | |
" sid2, sid2_low, sid2_high, sid2_n = simpsons(s2)\n", | |
" print sid1, sid2\n", | |
" wi_ab = 1.0 - sid1\n", | |
" wi_ba = 1.0 - sid2\n", | |
" \n", | |
" awc_ab = (w_ab - wi_ba) / (1.0 - wi_ba)\n", | |
" awc_ab_ci = 2.0 * (1.0 / (1.0 - wi_ba)) * np.sqrt(var_w1)\n", | |
" awc_ab_low = np.clip(awc_ab - awc_ab_ci, 0.0, 1.0)\n", | |
" awc_ab_high = np.clip(awc_ab + awc_ab_ci, 0.0, 1.0)\n", | |
" \n", | |
" awc_ba = (w_ba - wi_ab) / (1.0 - wi_ab)\n", | |
" awc_ba_ci = 2.0 * (1.0 / (1.0 - wi_ab)) * np.sqrt(var_w2)\n", | |
" awc_ba_low = np.clip(awc_ba - awc_ba_ci, 0.0, 1.0)\n", | |
" awc_ba_high = np.clip(awc_ba + awc_ba_ci, 0.0, 1.0)\n", | |
" \n", | |
" return {'ab': {'awc': [awc_ab, awc_ab_low, awc_ab_high, awc_ab_ci],\n", | |
" 'wallace': w_ab, },\n", | |
" 'ba': {'awc': [awc_ba, awc_ba_low, awc_ba_high, awc_ba_ci],\n", | |
" 'wallace': w_ba, }, }\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 134, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"c1 = df_test[df_test.columns[0]]\n", | |
"c2 = df_test[df_test.columns[1]]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 135, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Index([u'T type', u'emm type', u'PFGE Sma80', u'PFGE SFi68',\n", | |
" u'T type + emm type'],\n", | |
" dtype='object')" | |
] | |
}, | |
"execution_count": 135, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_test.columns" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 136, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[ 0.82312925 0.625 0.50565867 0.80246914 0.55555556 0.61764706\n", | |
" 0.92630854 0.8922449 0.85798817 1. 1. ]\n", | |
"[ 0.74084872 0.4375 0.34673435 0.7037037 0.33333333 0.45272746\n", | |
" 0.8906263 0.83836735 0.78698225 1. 1. ]\n", | |
"[ 0.905 0.66512401 0.19526627 0.44 0.85798817 1. 1.\n", | |
" 1. 0.33333333 1. 0.5 0.31944444]\n", | |
"[ 0.8575 0.51396567 0.04142012 0.232 0.78698225 1. 1.\n", | |
" 1. 0.11111111 1. 0.25 0.13541667]\n", | |
"0.721424501425 0.77452991453\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"{'ab': {'awc': [0.60809963534315259,\n", | |
" 0.51367864215706394,\n", | |
" 0.70252062852924124,\n", | |
" 0.094420993186088681],\n", | |
" 'wallace': 0.69646144405808963},\n", | |
" 'ba': {'awc': [0.80663309792016802,\n", | |
" 0.73580497749026319,\n", | |
" 0.87746121835007285,\n", | |
" 0.07082812042990487],\n", | |
" 'wallace': 0.86050037907505683}}" | |
] | |
}, | |
"execution_count": 136, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"awc(c1, c2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python2", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment