Skip to content

Instantly share code, notes, and snippets.

@alimanfoo
Created January 4, 2021 16:28
Show Gist options
  • Save alimanfoo/5e784e712f930af10794b93d119e7e52 to your computer and use it in GitHub Desktop.
Save alimanfoo/5e784e712f930af10794b93d119e7e52 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import intake"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"application/json": "gcs:\n args:\n path: https://malariagen.github.io/intake/gcs.yml\n description: ''\n driver: intake.catalog.local.YAMLFileCatalog\n metadata:\n version: 1\n",
"text/plain": [
"gcs:\n",
" args:\n",
" path: https://malariagen.github.io/intake/gcs.yml\n",
" description: ''\n",
" driver: intake.catalog.local.YAMLFileCatalog\n",
" metadata:\n",
" version: 1\n"
]
},
"metadata": {
"application/json": {
"root": "gcs"
}
},
"output_type": "display_data"
}
],
"source": [
"# Location of the MalariaGEN intake catalog (a YAML catalog file served over HTTPS).\n",
"CATALOG_URL = 'https://malariagen.github.io/intake/gcs.yml'\n",
"\n",
"# Open the catalog; the bare name on the last line displays its description.\n",
"cat = intake.open_catalog(CATALOG_URL)\n",
"cat"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/aliman/malariagen/binder/conda/envs/vector-ops-v2.5.1/lib/python3.7/site-packages/dask/dataframe/utils.py:14: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.\n",
" import pandas.util.testing as tm\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sample_id</th>\n",
" <th>ena_analysis</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AR0001-C</td>\n",
" <td>ERZ1695275</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AR0002-C</td>\n",
" <td>ERZ1695276</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AR0004-C</td>\n",
" <td>ERZ1695277</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AR0006-C</td>\n",
" <td>ERZ1695278</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AR0007-C</td>\n",
" <td>ERZ1695279</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3076</th>\n",
" <td>AD0494-C</td>\n",
" <td>ERZ1698351</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3077</th>\n",
" <td>AD0495-C</td>\n",
" <td>ERZ1698352</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3078</th>\n",
" <td>AD0496-C</td>\n",
" <td>ERZ1698353</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3079</th>\n",
" <td>AD0497-C</td>\n",
" <td>ERZ1698354</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3080</th>\n",
" <td>AD0498-C</td>\n",
" <td>ERZ1698355</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3081 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" sample_id ena_analysis\n",
"0 AR0001-C ERZ1695275\n",
"1 AR0002-C ERZ1695276\n",
"2 AR0004-C ERZ1695277\n",
"3 AR0006-C ERZ1695278\n",
"4 AR0007-C ERZ1695279\n",
"... ... ...\n",
"3076 AD0494-C ERZ1698351\n",
"3077 AD0495-C ERZ1698352\n",
"3078 AD0496-C ERZ1698353\n",
"3079 AD0497-C ERZ1698354\n",
"3080 AD0498-C ERZ1698355\n",
"\n",
"[3081 rows x 2 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the `ena_alignments` entry of the `ag3` sub-catalog into memory\n",
"# (a table with `sample_id` and `ena_analysis` columns).\n",
"ena_alignments = cat['ag3']['ena_alignments'].read()\n",
"ena_alignments"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sample_set</th>\n",
" <th>sample_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AG1000G-AO</td>\n",
" <td>81</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AG1000G-BF-A</td>\n",
" <td>181</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AG1000G-BF-B</td>\n",
" <td>102</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AG1000G-BF-C</td>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AG1000G-CD</td>\n",
" <td>76</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>AG1000G-CF</td>\n",
" <td>73</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>AG1000G-CI</td>\n",
" <td>80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>AG1000G-CM-A</td>\n",
" <td>303</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>AG1000G-CM-B</td>\n",
" <td>97</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>AG1000G-CM-C</td>\n",
" <td>44</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>AG1000G-FR</td>\n",
" <td>23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>AG1000G-GA-A</td>\n",
" <td>69</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>AG1000G-GH</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>AG1000G-GM-A</td>\n",
" <td>74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>AG1000G-GM-B</td>\n",
" <td>31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>AG1000G-GM-C</td>\n",
" <td>174</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>AG1000G-GN-A</td>\n",
" <td>45</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>AG1000G-GN-B</td>\n",
" <td>185</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>AG1000G-GQ</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>AG1000G-GW</td>\n",
" <td>101</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>AG1000G-KE</td>\n",
" <td>86</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>AG1000G-ML-A</td>\n",
" <td>60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>AG1000G-ML-B</td>\n",
" <td>71</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>AG1000G-MW</td>\n",
" <td>41</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>AG1000G-MZ</td>\n",
" <td>74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>AG1000G-TZ</td>\n",
" <td>300</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>AG1000G-UG</td>\n",
" <td>290</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>AG1000G-X</td>\n",
" <td>297</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sample_set sample_count\n",
"0 AG1000G-AO 81\n",
"1 AG1000G-BF-A 181\n",
"2 AG1000G-BF-B 102\n",
"3 AG1000G-BF-C 13\n",
"4 AG1000G-CD 76\n",
"5 AG1000G-CF 73\n",
"6 AG1000G-CI 80\n",
"7 AG1000G-CM-A 303\n",
"8 AG1000G-CM-B 97\n",
"9 AG1000G-CM-C 44\n",
"10 AG1000G-FR 23\n",
"11 AG1000G-GA-A 69\n",
"12 AG1000G-GH 100\n",
"13 AG1000G-GM-A 74\n",
"14 AG1000G-GM-B 31\n",
"15 AG1000G-GM-C 174\n",
"16 AG1000G-GN-A 45\n",
"17 AG1000G-GN-B 185\n",
"18 AG1000G-GQ 10\n",
"19 AG1000G-GW 101\n",
"20 AG1000G-KE 86\n",
"21 AG1000G-ML-A 60\n",
"22 AG1000G-ML-B 71\n",
"23 AG1000G-MW 41\n",
"24 AG1000G-MZ 74\n",
"25 AG1000G-TZ 300\n",
"26 AG1000G-UG 290\n",
"27 AG1000G-X 297"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the `sample_sets` entry of the `ag3` sub-catalog: one row per sample set\n",
"# together with its `sample_count`.\n",
"sample_sets = cat['ag3']['sample_sets'].read()\n",
"sample_sets"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sample_id</th>\n",
" <th>partner_sample_id</th>\n",
" <th>contributor</th>\n",
" <th>country</th>\n",
" <th>location</th>\n",
" <th>year</th>\n",
" <th>month</th>\n",
" <th>latitude</th>\n",
" <th>longitude</th>\n",
" <th>sex_call</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AR0047-C</td>\n",
" <td>LUA047</td>\n",
" <td>Joao Pinto</td>\n",
" <td>Angola</td>\n",
" <td>Luanda</td>\n",
" <td>2009</td>\n",
" <td>4</td>\n",
" <td>-8.884</td>\n",
" <td>13.302</td>\n",
" <td>F</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AR0049-C</td>\n",
" <td>LUA049</td>\n",
" <td>Joao Pinto</td>\n",
" <td>Angola</td>\n",
" <td>Luanda</td>\n",
" <td>2009</td>\n",
" <td>4</td>\n",
" <td>-8.884</td>\n",
" <td>13.302</td>\n",
" <td>F</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AR0051-C</td>\n",
" <td>LUA051</td>\n",
" <td>Joao Pinto</td>\n",
" <td>Angola</td>\n",
" <td>Luanda</td>\n",
" <td>2009</td>\n",
" <td>4</td>\n",
" <td>-8.884</td>\n",
" <td>13.302</td>\n",
" <td>F</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AR0061-C</td>\n",
" <td>LUA061</td>\n",
" <td>Joao Pinto</td>\n",
" <td>Angola</td>\n",
" <td>Luanda</td>\n",
" <td>2009</td>\n",
" <td>4</td>\n",
" <td>-8.884</td>\n",
" <td>13.302</td>\n",
" <td>F</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AR0078-C</td>\n",
" <td>LUA078</td>\n",
" <td>Joao Pinto</td>\n",
" <td>Angola</td>\n",
" <td>Luanda</td>\n",
" <td>2009</td>\n",
" <td>4</td>\n",
" <td>-8.884</td>\n",
" <td>13.302</td>\n",
" <td>F</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3076</th>\n",
" <td>AD0494-C</td>\n",
" <td>80-2-o-16</td>\n",
" <td>Martin Donnelly</td>\n",
" <td>Lab Cross</td>\n",
" <td>LSTM</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>53.409</td>\n",
" <td>-2.969</td>\n",
" <td>F</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3077</th>\n",
" <td>AD0495-C</td>\n",
" <td>80-2-o-17</td>\n",
" <td>Martin Donnelly</td>\n",
" <td>Lab Cross</td>\n",
" <td>LSTM</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>53.409</td>\n",
" <td>-2.969</td>\n",
" <td>M</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3078</th>\n",
" <td>AD0496-C</td>\n",
" <td>80-2-o-18</td>\n",
" <td>Martin Donnelly</td>\n",
" <td>Lab Cross</td>\n",
" <td>LSTM</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>53.409</td>\n",
" <td>-2.969</td>\n",
" <td>M</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3079</th>\n",
" <td>AD0497-C</td>\n",
" <td>80-2-o-19</td>\n",
" <td>Martin Donnelly</td>\n",
" <td>Lab Cross</td>\n",
" <td>LSTM</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>53.409</td>\n",
" <td>-2.969</td>\n",
" <td>F</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3080</th>\n",
" <td>AD0498-C</td>\n",
" <td>80-2-o-20</td>\n",
" <td>Martin Donnelly</td>\n",
" <td>Lab Cross</td>\n",
" <td>LSTM</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>53.409</td>\n",
" <td>-2.969</td>\n",
" <td>M</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3081 rows × 10 columns</p>\n",
"</div>"
],
"text/plain": [
" sample_id partner_sample_id contributor country location year \\\n",
"0 AR0047-C LUA047 Joao Pinto Angola Luanda 2009 \n",
"1 AR0049-C LUA049 Joao Pinto Angola Luanda 2009 \n",
"2 AR0051-C LUA051 Joao Pinto Angola Luanda 2009 \n",
"3 AR0061-C LUA061 Joao Pinto Angola Luanda 2009 \n",
"4 AR0078-C LUA078 Joao Pinto Angola Luanda 2009 \n",
"... ... ... ... ... ... ... \n",
"3076 AD0494-C 80-2-o-16 Martin Donnelly Lab Cross LSTM -1 \n",
"3077 AD0495-C 80-2-o-17 Martin Donnelly Lab Cross LSTM -1 \n",
"3078 AD0496-C 80-2-o-18 Martin Donnelly Lab Cross LSTM -1 \n",
"3079 AD0497-C 80-2-o-19 Martin Donnelly Lab Cross LSTM -1 \n",
"3080 AD0498-C 80-2-o-20 Martin Donnelly Lab Cross LSTM -1 \n",
"\n",
" month latitude longitude sex_call \n",
"0 4 -8.884 13.302 F \n",
"1 4 -8.884 13.302 F \n",
"2 4 -8.884 13.302 F \n",
"3 4 -8.884 13.302 F \n",
"4 4 -8.884 13.302 F \n",
"... ... ... ... ... \n",
"3076 -1 53.409 -2.969 F \n",
"3077 -1 53.409 -2.969 M \n",
"3078 -1 53.409 -2.969 M \n",
"3079 -1 53.409 -2.969 F \n",
"3080 -1 53.409 -2.969 M \n",
"\n",
"[3081 rows x 10 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the sample metadata of every sample set and stack the per-set frames\n",
"# into a single frame with a fresh 0..n-1 index.\n",
"df_samples = pd.concat(\n",
"    [\n",
"        cat.ag3.samples(sample_set=sample_set).read()\n",
"        for sample_set in sample_sets['sample_set']\n",
"    ],\n",
"    ignore_index=True,\n",
")\n",
"df_samples"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# True when every sample_id in the sample metadata also appears in the ENA alignments table.\n",
"df_samples['sample_id'].isin(ena_alignments['sample_id']).all()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Converse check: True when every sample_id in the ENA alignments table also appears in the sample metadata.\n",
"ena_alignments['sample_id'].isin(df_samples['sample_id']).all()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment