Last active
March 13, 2021 09:09
-
-
Save lukauskas/28c15d04875579291b6fa52b458f713a to your computer and use it in GitHub Desktop.
Gist for degraded performance of spearmanr in scipy 1.6.1, issue https://github.com/scipy/scipy/issues/6654
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"from scipy.stats import spearmanr\n", | |
"from sinfo import sinfo" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Version numbers are printed at the end.\n", | |
"\n", | |
"I did some digging: while I can reproduce the issue with my dataset (`data.tsv.gz`), I cannot reproduce it with random data." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = pd.read_csv('data.tsv.gz', sep='\\t', index_col=0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>x</th>\n", | |
" <th>y</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>-0.052921</td>\n", | |
" <td>0.203387</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>0.181125</td>\n", | |
" <td>-0.107936</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>-0.562084</td>\n", | |
" <td>-0.050814</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>-0.460667</td>\n", | |
" <td>-0.066390</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>0.081773</td>\n", | |
" <td>0.101136</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>552351</th>\n", | |
" <td>0.343175</td>\n", | |
" <td>0.028522</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>552352</th>\n", | |
" <td>0.257745</td>\n", | |
" <td>0.230120</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>552353</th>\n", | |
" <td>0.127060</td>\n", | |
" <td>0.063844</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>552354</th>\n", | |
" <td>-0.167225</td>\n", | |
" <td>0.027455</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>552355</th>\n", | |
" <td>-0.028771</td>\n", | |
" <td>-0.163098</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>552356 rows × 2 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" x y\n", | |
"0 -0.052921 0.203387\n", | |
"1 0.181125 -0.107936\n", | |
"2 -0.562084 -0.050814\n", | |
"3 -0.460667 -0.066390\n", | |
"4 0.081773 0.101136\n", | |
"... ... ...\n", | |
"552351 0.343175 0.028522\n", | |
"552352 0.257745 0.230120\n", | |
"552353 0.127060 0.063844\n", | |
"552354 -0.167225 0.027455\n", | |
"552355 -0.028771 -0.163098\n", | |
"\n", | |
"[552356 rows x 2 columns]" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"This does not seem to be a NaN-vs-None thing, as both columns have dtype float64 (not object):" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"x float64\n", | |
"y float64\n", | |
"dtype: object\n" | |
] | |
} | |
], | |
"source": [ | |
"print(df.dtypes)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"But there are NaNs:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"x 2488\n", | |
"y 1620\n", | |
"dtype: int64" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.isnull().sum()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Without further ado, on my laptop this takes around 30s:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 32.1 s, sys: 209 ms, total: 32.3 s\n", | |
"Wall time: 32.5 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"SpearmanrResult(correlation=-0.0022916182928414985, pvalue=0.08973233486988992)" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%time spearmanr(df['x'], df['y'], nan_policy=\"omit\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"This does not seem to be a pandas issue:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 32 s, sys: 178 ms, total: 32.2 s\n", | |
"Wall time: 32.3 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"SpearmanrResult(correlation=-0.0022916182928414985, pvalue=0.08973233486988992)" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%time spearmanr(np.asarray(df['x'].values), np.asarray(df['y'].values), nan_policy=\"omit\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"It also didn't seem to be an issue of the order of entries in the files (the dataset is already a shuffled version of my actual dataset). \n", | |
"\n", | |
"Dropping NaNs removes only a few rows and solves the issue" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>x</th>\n", | |
" <th>y</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>-0.052921</td>\n", | |
" <td>0.203387</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>0.181125</td>\n", | |
" <td>-0.107936</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>-0.562084</td>\n", | |
" <td>-0.050814</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>-0.460667</td>\n", | |
" <td>-0.066390</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>0.081773</td>\n", | |
" <td>0.101136</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>552351</th>\n", | |
" <td>0.343175</td>\n", | |
" <td>0.028522</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>552352</th>\n", | |
" <td>0.257745</td>\n", | |
" <td>0.230120</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>552353</th>\n", | |
" <td>0.127060</td>\n", | |
" <td>0.063844</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>552354</th>\n", | |
" <td>-0.167225</td>\n", | |
" <td>0.027455</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>552355</th>\n", | |
" <td>-0.028771</td>\n", | |
" <td>-0.163098</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>548256 rows × 2 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" x y\n", | |
"0 -0.052921 0.203387\n", | |
"1 0.181125 -0.107936\n", | |
"2 -0.562084 -0.050814\n", | |
"3 -0.460667 -0.066390\n", | |
"4 0.081773 0.101136\n", | |
"... ... ...\n", | |
"552351 0.343175 0.028522\n", | |
"552352 0.257745 0.230120\n", | |
"552353 0.127060 0.063844\n", | |
"552354 -0.167225 0.027455\n", | |
"552355 -0.028771 -0.163098\n", | |
"\n", | |
"[548256 rows x 2 columns]" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_no_na = df.dropna()\n", | |
"df_no_na" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Super fast now:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 146 ms, sys: 22.2 ms, total: 168 ms\n", | |
"Wall time: 167 ms\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"SpearmanrResult(correlation=-0.002291618292841498, pvalue=0.08973233486989)" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%time spearmanr(df_no_na['x'], df_no_na['y'], nan_policy=\"raise\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"🤷🏼♂️" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Version numbers etc:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"-----\n", | |
"numpy 1.20.1\n", | |
"pandas 1.2.3\n", | |
"scipy 1.6.1\n", | |
"sinfo 0.3.1\n", | |
"-----\n", | |
"IPython 7.21.0\n", | |
"jupyter_client 6.1.11\n", | |
"jupyter_core 4.7.1\n", | |
"notebook 6.2.0\n", | |
"-----\n", | |
"Python 3.9.2 (default, Feb 24 2021, 13:30:36) [Clang 12.0.0 (clang-1200.0.32.29)]\n", | |
"macOS-10.15.7-x86_64-i386-64bit\n", | |
"8 logical CPU cores, i386\n", | |
"-----\n", | |
"Session information updated at 2021-03-12 19:32\n" | |
] | |
} | |
], | |
"source": [ | |
"sinfo()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
data.tsv.gz
should be 10.6MB, checksum: