Created
January 4, 2021 16:28
-
-
Save alimanfoo/5e784e712f930af10794b93d119e7e52 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import intake" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/json": "gcs:\n args:\n path: https://malariagen.github.io/intake/gcs.yml\n description: ''\n driver: intake.catalog.local.YAMLFileCatalog\n metadata:\n version: 1\n", | |
"text/plain": [ | |
"gcs:\n", | |
" args:\n", | |
" path: https://malariagen.github.io/intake/gcs.yml\n", | |
" description: ''\n", | |
" driver: intake.catalog.local.YAMLFileCatalog\n", | |
" metadata:\n", | |
" version: 1\n" | |
] | |
}, | |
"metadata": { | |
"application/json": { | |
"root": "gcs" | |
} | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"cat = intake.open_catalog('https://malariagen.github.io/intake/gcs.yml')\n", | |
"cat" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/aliman/malariagen/binder/conda/envs/vector-ops-v2.5.1/lib/python3.7/site-packages/dask/dataframe/utils.py:14: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.\n", | |
" import pandas.util.testing as tm\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sample_id</th>\n", | |
" <th>ena_analysis</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>AR0001-C</td>\n", | |
" <td>ERZ1695275</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>AR0002-C</td>\n", | |
" <td>ERZ1695276</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>AR0004-C</td>\n", | |
" <td>ERZ1695277</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>AR0006-C</td>\n", | |
" <td>ERZ1695278</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>AR0007-C</td>\n", | |
" <td>ERZ1695279</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3076</th>\n", | |
" <td>AD0494-C</td>\n", | |
" <td>ERZ1698351</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3077</th>\n", | |
" <td>AD0495-C</td>\n", | |
" <td>ERZ1698352</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3078</th>\n", | |
" <td>AD0496-C</td>\n", | |
" <td>ERZ1698353</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3079</th>\n", | |
" <td>AD0497-C</td>\n", | |
" <td>ERZ1698354</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3080</th>\n", | |
" <td>AD0498-C</td>\n", | |
" <td>ERZ1698355</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>3081 rows × 2 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sample_id ena_analysis\n", | |
"0 AR0001-C ERZ1695275\n", | |
"1 AR0002-C ERZ1695276\n", | |
"2 AR0004-C ERZ1695277\n", | |
"3 AR0006-C ERZ1695278\n", | |
"4 AR0007-C ERZ1695279\n", | |
"... ... ...\n", | |
"3076 AD0494-C ERZ1698351\n", | |
"3077 AD0495-C ERZ1698352\n", | |
"3078 AD0496-C ERZ1698353\n", | |
"3079 AD0497-C ERZ1698354\n", | |
"3080 AD0498-C ERZ1698355\n", | |
"\n", | |
"[3081 rows x 2 columns]" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ena_alignments = cat.ag3.ena_alignments.read()\n", | |
"ena_alignments" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sample_set</th>\n", | |
" <th>sample_count</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>AG1000G-AO</td>\n", | |
" <td>81</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>AG1000G-BF-A</td>\n", | |
" <td>181</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>AG1000G-BF-B</td>\n", | |
" <td>102</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>AG1000G-BF-C</td>\n", | |
" <td>13</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>AG1000G-CD</td>\n", | |
" <td>76</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>AG1000G-CF</td>\n", | |
" <td>73</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>AG1000G-CI</td>\n", | |
" <td>80</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>AG1000G-CM-A</td>\n", | |
" <td>303</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>AG1000G-CM-B</td>\n", | |
" <td>97</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>AG1000G-CM-C</td>\n", | |
" <td>44</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>AG1000G-FR</td>\n", | |
" <td>23</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>AG1000G-GA-A</td>\n", | |
" <td>69</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>AG1000G-GH</td>\n", | |
" <td>100</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>AG1000G-GM-A</td>\n", | |
" <td>74</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>AG1000G-GM-B</td>\n", | |
" <td>31</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>AG1000G-GM-C</td>\n", | |
" <td>174</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>AG1000G-GN-A</td>\n", | |
" <td>45</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>AG1000G-GN-B</td>\n", | |
" <td>185</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>AG1000G-GQ</td>\n", | |
" <td>10</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>AG1000G-GW</td>\n", | |
" <td>101</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>20</th>\n", | |
" <td>AG1000G-KE</td>\n", | |
" <td>86</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>21</th>\n", | |
" <td>AG1000G-ML-A</td>\n", | |
" <td>60</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>22</th>\n", | |
" <td>AG1000G-ML-B</td>\n", | |
" <td>71</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>23</th>\n", | |
" <td>AG1000G-MW</td>\n", | |
" <td>41</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>24</th>\n", | |
" <td>AG1000G-MZ</td>\n", | |
" <td>74</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25</th>\n", | |
" <td>AG1000G-TZ</td>\n", | |
" <td>300</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>26</th>\n", | |
" <td>AG1000G-UG</td>\n", | |
" <td>290</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27</th>\n", | |
" <td>AG1000G-X</td>\n", | |
" <td>297</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sample_set sample_count\n", | |
"0 AG1000G-AO 81\n", | |
"1 AG1000G-BF-A 181\n", | |
"2 AG1000G-BF-B 102\n", | |
"3 AG1000G-BF-C 13\n", | |
"4 AG1000G-CD 76\n", | |
"5 AG1000G-CF 73\n", | |
"6 AG1000G-CI 80\n", | |
"7 AG1000G-CM-A 303\n", | |
"8 AG1000G-CM-B 97\n", | |
"9 AG1000G-CM-C 44\n", | |
"10 AG1000G-FR 23\n", | |
"11 AG1000G-GA-A 69\n", | |
"12 AG1000G-GH 100\n", | |
"13 AG1000G-GM-A 74\n", | |
"14 AG1000G-GM-B 31\n", | |
"15 AG1000G-GM-C 174\n", | |
"16 AG1000G-GN-A 45\n", | |
"17 AG1000G-GN-B 185\n", | |
"18 AG1000G-GQ 10\n", | |
"19 AG1000G-GW 101\n", | |
"20 AG1000G-KE 86\n", | |
"21 AG1000G-ML-A 60\n", | |
"22 AG1000G-ML-B 71\n", | |
"23 AG1000G-MW 41\n", | |
"24 AG1000G-MZ 74\n", | |
"25 AG1000G-TZ 300\n", | |
"26 AG1000G-UG 290\n", | |
"27 AG1000G-X 297" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sample_sets = cat.ag3.sample_sets.read()\n", | |
"sample_sets" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sample_id</th>\n", | |
" <th>partner_sample_id</th>\n", | |
" <th>contributor</th>\n", | |
" <th>country</th>\n", | |
" <th>location</th>\n", | |
" <th>year</th>\n", | |
" <th>month</th>\n", | |
" <th>latitude</th>\n", | |
" <th>longitude</th>\n", | |
" <th>sex_call</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>AR0047-C</td>\n", | |
" <td>LUA047</td>\n", | |
" <td>Joao Pinto</td>\n", | |
" <td>Angola</td>\n", | |
" <td>Luanda</td>\n", | |
" <td>2009</td>\n", | |
" <td>4</td>\n", | |
" <td>-8.884</td>\n", | |
" <td>13.302</td>\n", | |
" <td>F</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>AR0049-C</td>\n", | |
" <td>LUA049</td>\n", | |
" <td>Joao Pinto</td>\n", | |
" <td>Angola</td>\n", | |
" <td>Luanda</td>\n", | |
" <td>2009</td>\n", | |
" <td>4</td>\n", | |
" <td>-8.884</td>\n", | |
" <td>13.302</td>\n", | |
" <td>F</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>AR0051-C</td>\n", | |
" <td>LUA051</td>\n", | |
" <td>Joao Pinto</td>\n", | |
" <td>Angola</td>\n", | |
" <td>Luanda</td>\n", | |
" <td>2009</td>\n", | |
" <td>4</td>\n", | |
" <td>-8.884</td>\n", | |
" <td>13.302</td>\n", | |
" <td>F</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>AR0061-C</td>\n", | |
" <td>LUA061</td>\n", | |
" <td>Joao Pinto</td>\n", | |
" <td>Angola</td>\n", | |
" <td>Luanda</td>\n", | |
" <td>2009</td>\n", | |
" <td>4</td>\n", | |
" <td>-8.884</td>\n", | |
" <td>13.302</td>\n", | |
" <td>F</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>AR0078-C</td>\n", | |
" <td>LUA078</td>\n", | |
" <td>Joao Pinto</td>\n", | |
" <td>Angola</td>\n", | |
" <td>Luanda</td>\n", | |
" <td>2009</td>\n", | |
" <td>4</td>\n", | |
" <td>-8.884</td>\n", | |
" <td>13.302</td>\n", | |
" <td>F</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3076</th>\n", | |
" <td>AD0494-C</td>\n", | |
" <td>80-2-o-16</td>\n", | |
" <td>Martin Donnelly</td>\n", | |
" <td>Lab Cross</td>\n", | |
" <td>LSTM</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>53.409</td>\n", | |
" <td>-2.969</td>\n", | |
" <td>F</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3077</th>\n", | |
" <td>AD0495-C</td>\n", | |
" <td>80-2-o-17</td>\n", | |
" <td>Martin Donnelly</td>\n", | |
" <td>Lab Cross</td>\n", | |
" <td>LSTM</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>53.409</td>\n", | |
" <td>-2.969</td>\n", | |
" <td>M</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3078</th>\n", | |
" <td>AD0496-C</td>\n", | |
" <td>80-2-o-18</td>\n", | |
" <td>Martin Donnelly</td>\n", | |
" <td>Lab Cross</td>\n", | |
" <td>LSTM</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>53.409</td>\n", | |
" <td>-2.969</td>\n", | |
" <td>M</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3079</th>\n", | |
" <td>AD0497-C</td>\n", | |
" <td>80-2-o-19</td>\n", | |
" <td>Martin Donnelly</td>\n", | |
" <td>Lab Cross</td>\n", | |
" <td>LSTM</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>53.409</td>\n", | |
" <td>-2.969</td>\n", | |
" <td>F</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3080</th>\n", | |
" <td>AD0498-C</td>\n", | |
" <td>80-2-o-20</td>\n", | |
" <td>Martin Donnelly</td>\n", | |
" <td>Lab Cross</td>\n", | |
" <td>LSTM</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>53.409</td>\n", | |
" <td>-2.969</td>\n", | |
" <td>M</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>3081 rows × 10 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sample_id partner_sample_id contributor country location year \\\n", | |
"0 AR0047-C LUA047 Joao Pinto Angola Luanda 2009 \n", | |
"1 AR0049-C LUA049 Joao Pinto Angola Luanda 2009 \n", | |
"2 AR0051-C LUA051 Joao Pinto Angola Luanda 2009 \n", | |
"3 AR0061-C LUA061 Joao Pinto Angola Luanda 2009 \n", | |
"4 AR0078-C LUA078 Joao Pinto Angola Luanda 2009 \n", | |
"... ... ... ... ... ... ... \n", | |
"3076 AD0494-C 80-2-o-16 Martin Donnelly Lab Cross LSTM -1 \n", | |
"3077 AD0495-C 80-2-o-17 Martin Donnelly Lab Cross LSTM -1 \n", | |
"3078 AD0496-C 80-2-o-18 Martin Donnelly Lab Cross LSTM -1 \n", | |
"3079 AD0497-C 80-2-o-19 Martin Donnelly Lab Cross LSTM -1 \n", | |
"3080 AD0498-C 80-2-o-20 Martin Donnelly Lab Cross LSTM -1 \n", | |
"\n", | |
" month latitude longitude sex_call \n", | |
"0 4 -8.884 13.302 F \n", | |
"1 4 -8.884 13.302 F \n", | |
"2 4 -8.884 13.302 F \n", | |
"3 4 -8.884 13.302 F \n", | |
"4 4 -8.884 13.302 F \n", | |
"... ... ... ... ... \n", | |
"3076 -1 53.409 -2.969 F \n", | |
"3077 -1 53.409 -2.969 M \n", | |
"3078 -1 53.409 -2.969 M \n", | |
"3079 -1 53.409 -2.969 F \n", | |
"3080 -1 53.409 -2.969 M \n", | |
"\n", | |
"[3081 rows x 10 columns]" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_samples = pd.concat(\n", | |
" [cat.ag3.samples(sample_set=s).read() for s in sample_sets.sample_set]\n", | |
").reset_index(drop=True)\n", | |
"df_samples" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_samples.sample_id.isin(ena_alignments.sample_id).all()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ena_alignments.sample_id.isin(df_samples.sample_id).all()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment