Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save peterk87/2c0a7be7f6418703daab410655f141c3 to your computer and use it in GitHub Desktop.
Save peterk87/2c0a7be7f6418703daab410655f141c3 to your computer and use it in GitHub Desktop.
Implementing Adjusted Wallace Coefficient in Python with Numpy
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df_test = pd.read_table('../tests/data/comparing-partitions-microbial-typing-test-data.tsv')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>T type</th>\n",
" <th>emm type</th>\n",
" <th>PFGE Sma80</th>\n",
" <th>PFGE SFi68</th>\n",
" <th>T type + emm type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>8</td>\n",
" <td>10</td>\n",
" <td>44</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>12</td>\n",
" <td>22</td>\n",
" <td>9</td>\n",
" <td>3</td>\n",
" <td>1222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>NT</td>\n",
" <td>22</td>\n",
" <td>9</td>\n",
" <td>13</td>\n",
" <td>NT22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>12</td>\n",
" <td>22</td>\n",
" <td>9</td>\n",
" <td>13</td>\n",
" <td>1222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>12</td>\n",
" <td>22</td>\n",
" <td>9</td>\n",
" <td>13</td>\n",
" <td>1222</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" T type emm type PFGE Sma80 PFGE SFi68 T type + emm type\n",
"0 4 4 8 10 44\n",
"1 12 22 9 3 1222\n",
"2 NT 22 9 13 NT22\n",
"3 12 22 9 13 1222\n",
"4 12 22 9 13 1222"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_test.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"ct = pd.crosstab(df_test['T type'], df_test['emm type'])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>emm type</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>4</th>\n",
" <th>6</th>\n",
" <th>9</th>\n",
" <th>11</th>\n",
" <th>12</th>\n",
" <th>22</th>\n",
" <th>28</th>\n",
" <th>75</th>\n",
" <th>77</th>\n",
" <th>89</th>\n",
" </tr>\n",
" <tr>\n",
" <th>T type</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>38</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>26</td>\n",
" <td>127</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>12</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>33</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>33</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5/27/44</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>B3264</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>NT</th>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"emm type 1 2 4 6 9 11 12 22 28 75 77 89\n",
"T type \n",
"1 38 0 2 0 0 0 0 0 0 0 0 0\n",
"12 0 1 2 0 1 2 26 127 0 0 0 0\n",
"13 2 0 3 0 0 1 1 3 0 0 3 0\n",
"2 0 3 1 0 0 0 1 0 0 0 0 0\n",
"25 0 0 0 0 0 0 0 1 0 12 0 0\n",
"28 0 0 0 0 0 0 0 0 33 0 0 0\n",
"4 0 0 33 0 0 0 0 0 0 0 0 0\n",
"5/27/44 0 0 2 0 0 0 0 0 0 0 0 0\n",
"6 0 0 1 1 0 0 0 0 0 1 0 0\n",
"9 0 0 0 0 8 0 0 0 0 0 0 0\n",
"B3264 0 0 2 0 0 0 0 0 0 0 0 2\n",
"NT 2 0 1 0 0 0 6 1 2 0 0 0"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ct"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"emm type\n",
"1 42\n",
"2 4\n",
"4 47\n",
"6 1\n",
"9 9\n",
"11 3\n",
"12 34\n",
"22 132\n",
"28 35\n",
"75 13\n",
"77 3\n",
"89 2\n",
"dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ct.sum(axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"T type\n",
"1 40\n",
"12 159\n",
"13 13\n",
"2 5\n",
"25 13\n",
"28 33\n",
"4 33\n",
"5/27/44 2\n",
"6 3\n",
"9 8\n",
"B3264 4\n",
"NT 12\n",
"dtype: int64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ct.sum(axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"arr = np.array(ct)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 38, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 0, 1, 2, 0, 1, 2, 26, 127, 0, 0, 0, 0],\n",
" [ 2, 0, 3, 0, 0, 1, 1, 3, 0, 0, 3, 0],\n",
" [ 0, 3, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 1, 0, 12, 0, 0],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 33, 0, 0, 0],\n",
" [ 0, 0, 33, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0],\n",
" [ 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2],\n",
" [ 2, 0, 1, 0, 0, 0, 6, 1, 2, 0, 0, 0]])"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"arr"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"a = np.sum(arr * (arr - 1) / 2.0)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"10215.0"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"col_sum = arr.sum(axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 42, 4, 47, 1, 9, 3, 34, 132, 35, 13, 3, 2])"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"col_sum"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"row_sum = arr.sum(axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 40, 159, 13, 5, 13, 33, 33, 2, 3, 8, 4, 12])"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"row_sum"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"a1 = np.sum(col_sum * (col_sum - 1) / 2.0)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"11871.0"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a1"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"b = a1 - a"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1656.0"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"b"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"n = np.sum(arr)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"325"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"c = np.sum(row_sum * (row_sum - 1) / 2.0) - a"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"4452.0"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"c"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"d = ((n * (n - 1)) / 2.0) - a1 - c"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"36327.0"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def abcdn(s1, s2):\n",
" ct = pd.crosstab(s1, s2)\n",
" arr = np.array(ct)\n",
" col_sum = arr.sum(axis=0)\n",
" row_sum = arr.sum(axis=1)\n",
" n = float(np.sum(arr))\n",
" a = np.sum(arr * (arr - 1) / 2.0)\n",
" a1 = np.sum(col_sum * (col_sum - 1) / 2.0)\n",
" b = a1 - a\n",
" c = np.sum(row_sum * (row_sum - 1) / 2.0) - a\n",
" d = ((n * (n - 1)) / 2.0) - a1 - c\n",
" \n",
" return (a, b, c, d, n)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"a,b,c,d,n = abcdn(df_test[df_test.columns[0]], df_test[df_test.columns[1]])"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10215.0 1656.0 4452.0 36327.0 325.0\n"
]
}
],
"source": [
"print a, b, c, d, n"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def rand(a, b, c, d):\n",
" return (a + d) / (a + b + c + d)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.88398860398860402"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rand(a, b, c, d)"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def wallace(a, b, c):\n",
" w_ab = 0.0 if a + b <= 0 else a / (a + b)\n",
" w_ba = 0.0 if a + c <= 0 else a / (a + c)\n",
" return w_ba, w_ab"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(0.69646144405808963, 0.86050037907505683)"
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wallace(a, b, c)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"s = df_test[df_test.columns[0]]"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 4\n",
"1 12\n",
"2 NT\n",
"3 12\n",
"4 12\n",
"5 12\n",
"6 4\n",
"7 4\n",
"8 12\n",
"9 12\n",
"10 12\n",
"11 12\n",
"12 12\n",
"13 12\n",
"14 12\n",
"15 12\n",
"16 12\n",
"17 12\n",
"18 12\n",
"19 12\n",
"20 12\n",
"21 12\n",
"22 12\n",
"23 12\n",
"24 12\n",
"25 12\n",
"26 12\n",
"27 12\n",
"28 28\n",
"29 13\n",
" ... \n",
"295 4\n",
"296 NT\n",
"297 12\n",
"298 12\n",
"299 13\n",
"300 1\n",
"301 1\n",
"302 25\n",
"303 4\n",
"304 12\n",
"305 1\n",
"306 12\n",
"307 4\n",
"308 28\n",
"309 2\n",
"310 28\n",
"311 2\n",
"312 12\n",
"313 4\n",
"314 12\n",
"315 5/27/44\n",
"316 4\n",
"317 4\n",
"318 B3264\n",
"319 12\n",
"320 4\n",
"321 4\n",
"322 4\n",
"323 4\n",
"324 25\n",
"Name: T type, dtype: object"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "TypeError",
"evalue": "crosstab() takes at least 2 arguments (1 given)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-65-6b20577531ea>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcrosstab\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m: crosstab() takes at least 2 arguments (1 given)"
]
}
],
"source": [
"pd.crosstab(s)"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"12 159\n",
"1 40\n",
"4 33\n",
"28 33\n",
"13 13\n",
"25 13\n",
"NT 12\n",
"9 8\n",
"2 5\n",
"B3264 4\n",
"6 3\n",
"5/27/44 2\n",
"Name: T type, dtype: int64"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.value_counts(s)"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def simpsons(s1):\n",
" assert isinstance(s1, pd.Series)\n",
" n = float(s1.size)\n",
" ct = pd.value_counts(s1)\n",
" arr = np.array(ct)\n",
" sid = 1.0\n",
" if n * (n - 1) > 0:\n",
" sid = 1.0 - np.sum(arr * (arr - 1)) / (n * (n - 1.0))\n",
" s2 = 4.0 / n * (np.sum((arr / n)**3) - np.sum((arr / n)**2)**2)\n",
" s2_sqrt = np.sqrt(s2)\n",
" low = sid - 2.0 * s2_sqrt\n",
" high = sid + 2.0 * s2_sqrt\n",
" return sid, low, high, n"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(0.72142450142450143, 0.67573245702169915, 0.76711654582730371, 325.0)"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"simpsons(df_test[df_test.columns[0]])"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>emm type</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>4</th>\n",
" <th>6</th>\n",
" <th>9</th>\n",
" <th>11</th>\n",
" <th>12</th>\n",
" <th>22</th>\n",
" <th>28</th>\n",
" <th>75</th>\n",
" <th>77</th>\n",
" <th>89</th>\n",
" </tr>\n",
" <tr>\n",
" <th>T type</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>38</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>26</td>\n",
" <td>127</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>12</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>33</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>33</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5/27/44</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>B3264</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>NT</th>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"emm type 1 2 4 6 9 11 12 22 28 75 77 89\n",
"T type \n",
"1 38 0 2 0 0 0 0 0 0 0 0 0\n",
"12 0 1 2 0 1 2 26 127 0 0 0 0\n",
"13 2 0 3 0 0 1 1 3 0 0 3 0\n",
"2 0 3 1 0 0 0 1 0 0 0 0 0\n",
"25 0 0 0 0 0 0 0 1 0 12 0 0\n",
"28 0 0 0 0 0 0 0 0 33 0 0 0\n",
"4 0 0 33 0 0 0 0 0 0 0 0 0\n",
"5/27/44 0 0 2 0 0 0 0 0 0 0 0 0\n",
"6 0 0 1 1 0 0 0 0 0 1 0 0\n",
"9 0 0 0 0 8 0 0 0 0 0 0 0\n",
"B3264 0 0 2 0 0 0 0 0 0 0 0 2\n",
"NT 2 0 1 0 0 0 6 1 2 0 0 0"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ct"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 40, 159, 13, 5, 13, 33, 33, 2, 3, 8, 4, 12])"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"row_sum"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 42, 4, 47, 1, 9, 3, 34, 132, 35, 13, 3, 2])"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"col_sum"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"row_sum2 = row_sum[row_sum > 1]\n",
"col_sum2 = col_sum[col_sum > 1]"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 40, 159, 13, 5, 13, 33, 33, 2, 3, 8, 4, 12])"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"row_sum2"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"col_sum2 = col_sum2.astype(np.float64)"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 42., 4., 47., 9., 3., 34., 132., 35., 13.,\n",
" 3., 2.])"
]
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"col_sum2"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 38, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 0, 1, 2, 0, 1, 2, 26, 127, 0, 0, 0, 0],\n",
" [ 2, 0, 3, 0, 0, 1, 1, 3, 0, 0, 3, 0],\n",
" [ 0, 3, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 1, 0, 12, 0, 0],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 0, 33, 0, 0, 0],\n",
" [ 0, 0, 33, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0],\n",
" [ 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2],\n",
" [ 2, 0, 1, 0, 0, 0, 6, 1, 2, 0, 0, 0]])"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"arr"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"arr_col = arr[:,col_sum > 1]"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 38, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 0, 1, 2, 1, 2, 26, 127, 0, 0, 0, 0],\n",
" [ 2, 0, 3, 0, 1, 1, 3, 0, 0, 3, 0],\n",
" [ 0, 3, 1, 0, 0, 1, 0, 0, 0, 0, 0],\n",
" [ 0, 0, 0, 0, 0, 0, 1, 0, 12, 0, 0],\n",
" [ 0, 0, 0, 0, 0, 0, 0, 33, 0, 0, 0],\n",
" [ 0, 0, 33, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0],\n",
" [ 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2],\n",
" [ 2, 0, 1, 0, 0, 6, 1, 2, 0, 0, 0]])"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"arr_col"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def x_y_2(x):\n",
" return (x / col_sum2)**2.0\n",
"\n",
"def x_y_3(x):\n",
" return (x / col_sum2)**3.0"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 42., 4., 47., 9., 3., 34., 132., 35., 13.,\n",
" 3., 2.])"
]
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"col_sum2"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.82312925, 0.625 , 0.50565867, 0.80246914, 0.55555556,\n",
" 0.61764706, 0.92630854, 0.8922449 , 0.85798817, 1. , 1. ])"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.sum(np.apply_along_axis(x_y_2, 1, arr_col), axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"csumFc2 = np.sum(np.apply_along_axis(x_y_2, 1, arr_col), axis=0)\n",
"csumFc3 = np.sum(np.apply_along_axis(x_y_3, 1, arr_col), axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.82312925, 0.625 , 0.50565867, 0.80246914, 0.55555556,\n",
" 0.61764706, 0.92630854, 0.8922449 , 0.85798817, 1. , 1. ])"
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"csumFc2"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.74084872, 0.4375 , 0.34673435, 0.7037037 , 0.33333333,\n",
" 0.45272746, 0.8906263 , 0.83836735, 0.78698225, 1. , 1. ])"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"csumFc3"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.0"
]
},
"execution_count": 105,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.clip(-.6, 0.0, 1.0)"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1.0"
]
},
"execution_count": 106,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.clip(1.6, 0.0, 1.0)"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def awc(s1, s2):\n",
" assert isinstance(s1, pd.Series)\n",
" assert isinstance(s2, pd.Series)\n",
" assert s1.size == s2.size\n",
" ct = pd.crosstab(s1, s2)\n",
" a,b,c,d,n = abcdn(s1, s2)\n",
" arr = np.array(ct, dtype=np.float64)\n",
" col_sum = arr.sum(axis=0)\n",
" row_sum = arr.sum(axis=1)\n",
" col_sum2 = col_sum[col_sum > 1].astype(np.float64)\n",
" row_sum2 = row_sum[row_sum > 1].astype(np.float64)\n",
" arr_col = arr[:,col_sum > 1]\n",
" arr_row = arr[:,row_sum > 1]\n",
" \n",
" csumFc2 = np.sum(np.apply_along_axis(lambda x: (x / col_sum2)**2.0, 1, arr_col), axis=0)\n",
" csumFc3 = np.sum(np.apply_along_axis(lambda x: (x / col_sum2)**3.0, 1, arr_col), axis=0)\n",
" rsumFc2 = np.sum(np.apply_along_axis(lambda x: (x / row_sum2)**2.0, 0, arr_row), axis=1)\n",
" rsumFc3 = np.sum(np.apply_along_axis(lambda x: (x / row_sum2)**3.0, 0, arr_row), axis=1)\n",
" print csumFc2\n",
" print csumFc3\n",
" print rsumFc2\n",
" print rsumFc3\n",
" rSumW1 = np.sum((row_sum2 * (row_sum2 - 1.0))**2.0 * \n",
" (((4.0 * row_sum2 * (row_sum2 - 1.0) * (row_sum2 - 2.0) * rsumFc3) + \n",
" (2.0 * row_sum2 * (row_sum2 - 1.0) * rsumFc2) - \n",
" (2.0 * row_sum2 * (row_sum2 - 1.0) * (2.0 * row_sum2 - 3.0) * (rsumFc2**2.0))) / \n",
" ((row_sum2 * (row_sum2 - 1.0))**2.0)))\n",
" rSumW2 = np.sum(row_sum * (row_sum - 1.0))\n",
" cSumW1 = np.sum((col_sum2 * (col_sum2 - 1.0))**2.0 * \n",
" (((4.0 * col_sum2 * (col_sum2 - 1.0) * (col_sum2 - 2.0) * csumFc3) + \n",
" (2.0 * col_sum2 * (col_sum2 - 1.0) * csumFc2) - \n",
" (2.0 * col_sum2 * (col_sum2 - 1.0) * (2.0 * col_sum2 - 3.0) * (csumFc2**2.0))) / \n",
" ((col_sum2 * (col_sum2 - 1.0))**2.0)))\n",
" cSumW2 = np.sum(col_sum * (col_sum - 1.0))\n",
" \n",
" var_w1 = 0.0 if rSumW2 <= 0 else (rSumW1 / (rSumW2**2.0))\n",
" var_w2 = 0.0 if cSumW2 <= 0 else (cSumW1 / (cSumW2**2.0))\n",
" w_ab, w_ba = wallace(a, b, c)\n",
" sid1, sid1_low, sid1_high, sid1_n = simpsons(s1)\n",
" sid2, sid2_low, sid2_high, sid2_n = simpsons(s2)\n",
" print sid1, sid2\n",
" wi_ab = 1.0 - sid1\n",
" wi_ba = 1.0 - sid2\n",
" \n",
" awc_ab = (w_ab - wi_ba) / (1.0 - wi_ba)\n",
" awc_ab_ci = 2.0 * (1.0 / (1.0 - wi_ba)) * np.sqrt(var_w1)\n",
" awc_ab_low = np.clip(awc_ab - awc_ab_ci, 0.0, 1.0)\n",
" awc_ab_high = np.clip(awc_ab + awc_ab_ci, 0.0, 1.0)\n",
" \n",
" awc_ba = (w_ba - wi_ab) / (1.0 - wi_ab)\n",
" awc_ba_ci = 2.0 * (1.0 / (1.0 - wi_ab)) * np.sqrt(var_w2)\n",
" awc_ba_low = np.clip(awc_ba - awc_ba_ci, 0.0, 1.0)\n",
" awc_ba_high = np.clip(awc_ba + awc_ba_ci, 0.0, 1.0)\n",
" \n",
" return {'ab': {'awc': [awc_ab, awc_ab_low, awc_ab_high, awc_ab_ci],\n",
" 'wallace': w_ab, },\n",
" 'ba': {'awc': [awc_ba, awc_ba_low, awc_ba_high, awc_ba_ci],\n",
" 'wallace': w_ba, }, }\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 134,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"c1 = df_test[df_test.columns[0]]\n",
"c2 = df_test[df_test.columns[1]]"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index([u'T type', u'emm type', u'PFGE Sma80', u'PFGE SFi68',\n",
" u'T type + emm type'],\n",
" dtype='object')"
]
},
"execution_count": 135,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_test.columns"
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 0.82312925 0.625 0.50565867 0.80246914 0.55555556 0.61764706\n",
" 0.92630854 0.8922449 0.85798817 1. 1. ]\n",
"[ 0.74084872 0.4375 0.34673435 0.7037037 0.33333333 0.45272746\n",
" 0.8906263 0.83836735 0.78698225 1. 1. ]\n",
"[ 0.905 0.66512401 0.19526627 0.44 0.85798817 1. 1.\n",
" 1. 0.33333333 1. 0.5 0.31944444]\n",
"[ 0.8575 0.51396567 0.04142012 0.232 0.78698225 1. 1.\n",
" 1. 0.11111111 1. 0.25 0.13541667]\n",
"0.721424501425 0.77452991453\n"
]
},
{
"data": {
"text/plain": [
"{'ab': {'awc': [0.60809963534315259,\n",
" 0.51367864215706394,\n",
" 0.70252062852924124,\n",
" 0.094420993186088681],\n",
" 'wallace': 0.69646144405808963},\n",
" 'ba': {'awc': [0.80663309792016802,\n",
" 0.73580497749026319,\n",
" 0.87746121835007285,\n",
" 0.07082812042990487],\n",
" 'wallace': 0.86050037907505683}}"
]
},
"execution_count": 136,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"awc(c1, c2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python2",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment