Last active
September 20, 2023 06:14
-
-
Save acthp/f9828f57eb795404eddb85006375af8d to your computer and use it in GitHub Desktop.
pancan and toil notebook
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 187, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import xenaPython as xena\n", | |
"\n", | |
"GENES = ['FOXM1', 'TP53']\n", | |
"\n", | |
"def get_codes(host, dataset, fields, data):\n", | |
" \"get codes for enumerations\"\n", | |
" codes = xena.field_codes(host, dataset, fields)\n", | |
" codes_idx = dict([(x['name'], x['code'].split('\\t')) for x in codes if x['code'] is not None])\n", | |
" for i in range(len(fields)):\n", | |
" if fields[i] in codes_idx:\n", | |
" data[i] = [None if v == 'NaN' else codes_idx[fields[i]][int(v)] for v in data[i]]\n", | |
" return data\n", | |
"\n", | |
"def get_fields(host, dataset, samples, fields):\n", | |
" \"get field values\"\n", | |
" data = xena.dataset_fetch(host, dataset, samples, fields)\n", | |
" return data\n", | |
"\n", | |
"def get_fields_and_codes(host, dataset, samples, fields):\n", | |
" \"get fields and resolve codes\"\n", | |
" return get_codes( host, dataset, fields, get_fields( host, dataset, samples, fields))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 188, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#\n", | |
"# pancanAtlas cohort\n", | |
"#\n", | |
"\n", | |
"cohort = 'TCGA PanCanAtlas'\n", | |
"host = xena.PUBLIC_HUBS['pancanAtlasHub']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 189, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['TCGA-P4-A5E8-11',\n", | |
" 'TCGA-EE-A181-06',\n", | |
" 'TCGA-AA-3511-11',\n", | |
" 'TCGA-BR-8590-01',\n", | |
" 'TCGA-06-6390-01',\n", | |
" 'TCGA-26-5139-01',\n", | |
" 'TCGA-B0-4813-11',\n", | |
" 'TCGA-29-1763-01',\n", | |
" 'TCGA-D1-A17S-01',\n", | |
" 'TCGA-EJ-7797-11']" | |
] | |
}, | |
"execution_count": 189, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# get samples in cohort\n", | |
"samples = xena.cohort_samples(host, cohort, None)\n", | |
"samples[0: 10]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 190, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[dict_keys(['FOXM1', 'TP53']),\n", | |
" 'FOXM1',\n", | |
" [5.18, 11.31, 9.65, 9.22, 'NaN', 9.48, 'NaN', 10.27, 8.63, 5.13]]" | |
] | |
}, | |
"execution_count": 190, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# get expression for GENES\n", | |
"dataset = 'EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena'\n", | |
"expression = get_fields(host, dataset, samples, GENES) # list of lists.\n", | |
"expression_by_gene = dict(zip(GENES, expression)) # index by gene.\n", | |
"[expression_by_gene.keys(), GENES[0], expression_by_gene[GENES[0]][0:10]]\n", | |
"# note that missing data is returned as 'NaN'. One might want to remap this to None or NaN, depending on\n", | |
"# the later analysis tools." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 191, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[dict_keys(['samples', 'rows']),\n", | |
" 3726,\n", | |
" ['TCGA-56-8624-11',\n", | |
" 'TCGA-25-1329-01',\n", | |
" 'TCGA-HC-8260-11',\n", | |
" 'TCGA-AG-3727-01',\n", | |
" 'TCGA-DX-A23R-01']]" | |
] | |
}, | |
"execution_count": 191, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# get mutation for GENES\n", | |
"dataset = 'mc3.v0.2.8.PUBLIC.xena'\n", | |
"mutation_columns = xena.sparse_data(host, dataset, samples, GENES)\n", | |
"# Two keys are returned: 'rows', which is all the variants (in a column orientation), and \n", | |
"# 'samples', which is the list of all samples in the dataset. 'samples' is required in order to\n", | |
"# distinguish samples without this assay (not in the dataset) from samples found to have\n", | |
"# no mutations in these genes. A sampleID in ['samples'] that is not in ['rows']['sampleID']\n", | |
"# was found to have no mutations. A sample not in ['samples'] has no assessment: we can't\n", | |
"# say anything about its mutations.\n", | |
"samples_without_mutation_data = list(set(samples) - set(mutation_columns['samples']))\n", | |
"[mutation_columns.keys(), len(samples_without_mutation_data), samples_without_mutation_data[0: 5]]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 192, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[{'alt': 'T',\n", | |
" 'altGene': None,\n", | |
" 'amino-acid': '',\n", | |
" 'dna-vaf': 0.33,\n", | |
" 'effect': \"3'UTR\",\n", | |
" 'genes': ['FOXM1'],\n", | |
" 'position': {'chrom': 'chr12',\n", | |
" 'chromend': 2967148,\n", | |
" 'chromstart': 2967148,\n", | |
" 'strand': '0'},\n", | |
" 'ref': 'G',\n", | |
" 'rna-vaf': None,\n", | |
" 'sampleID': 'TCGA-EY-A1GI-01'},\n", | |
" {'alt': 'G',\n", | |
" 'altGene': None,\n", | |
" 'amino-acid': '',\n", | |
" 'dna-vaf': 0.46,\n", | |
" 'effect': \"3'UTR\",\n", | |
" 'genes': ['FOXM1'],\n", | |
" 'position': {'chrom': 'chr12',\n", | |
" 'chromend': 2967228,\n", | |
" 'chromstart': 2967228,\n", | |
" 'strand': '0'},\n", | |
" 'ref': 'T',\n", | |
" 'rna-vaf': None,\n", | |
" 'sampleID': 'TCGA-EY-A5W2-01'}]" | |
] | |
}, | |
"execution_count": 192, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# a row orientation takes more memory, and won't fit in a numpy array, but is easier\n", | |
"# to view.\n", | |
"rows = mutation_columns['rows']\n", | |
"keys = rows.keys()\n", | |
"mutations = [dict(zip(keys, [rows[k][i] for k in keys])) for i in range(len(rows['sampleID']))]\n", | |
"# You might want to groupby ['genes'][0] at this point, to build per-gene stats. ['genes'] is a\n", | |
"# list because in the general case a variant can hit mutiple genes. For this dataset, gene-level\n", | |
"# non-silent mutations, they do not.\n", | |
"mutations[0: 2] " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 193, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['KIRP', 'SKCM', 'COAD', 'STAD', 'GBM', 'GBM', 'KIRC', 'OV', 'UCEC', 'PRAD']" | |
] | |
}, | |
"execution_count": 193, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# get disease type and survival columns\n", | |
"dataset = 'Survival_SupplementalTable_S1_20171025_xena_sp'\n", | |
"fields = ['cancer type abbreviation', 'OS', 'OS.time']\n", | |
"values = get_fields_and_codes(host, dataset, samples, fields) # list of lists\n", | |
"phenotypes = dict(zip(fields, values)) # index by phenotype\n", | |
"phenotypes['cancer type abbreviation'][0:10]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 194, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'Additional - New Primary',\n", | |
" 'Additional Metastatic',\n", | |
" 'Metastatic',\n", | |
" None,\n", | |
" 'Primary Blood Derived Cancer - Peripheral Blood',\n", | |
" 'Primary Tumor',\n", | |
" 'Recurrent Tumor',\n", | |
" 'Solid Tissue Normal'}" | |
] | |
}, | |
"execution_count": 194, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# get sample type. TCGA includes a few \"normal\" tissue samples. These normals are of\n", | |
"# limited value because there are few of them, and they are not entirely normal, being\n", | |
"# taken from disease tissue, outside of the visible tumor. It's often best to omit them.\n", | |
"dataset = 'TCGA_phenotype_denseDataOnlyDownload.tsv'\n", | |
"fields = ['sample_type']\n", | |
"values = get_fields_and_codes(host, dataset, samples, fields)\n", | |
"set(values[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 195, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[1475,\n", | |
" ['TCGA-P4-A5E8-11',\n", | |
" 'TCGA-AA-3511-11',\n", | |
" 'TCGA-B0-4813-11',\n", | |
" 'TCGA-EJ-7797-11',\n", | |
" 'TCGA-CV-7406-11']]" | |
] | |
}, | |
"execution_count": 195, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"samples_to_omit = [samples[i] for i in range(len(samples)) if values[0][i] == 'Solid Tissue Normal']\n", | |
"[len(samples_to_omit), samples_to_omit[0: 5]]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 196, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"pancan_summary = {\n", | |
" 'samples': samples,\n", | |
" 'expression': expression_by_gene,\n", | |
" 'mutations': mutations,\n", | |
" 'samples_without_mutation_data': samples_without_mutation_data,\n", | |
" 'phenotypes': phenotypes,\n", | |
" 'samples_to_omit': samples_to_omit\n", | |
"}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 197, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#\n", | |
"# TCGA TARGET GTEx\n", | |
"#\n", | |
"cohort = 'TCGA TARGET GTEx'\n", | |
"host = xena.PUBLIC_HUBS['toilHub']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 198, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['TCGA-BR-8590-01',\n", | |
" 'TCGA-P4-A5E8-11',\n", | |
" 'TCGA-61-1727-01',\n", | |
" 'GTEX-QCQG-0326-SM-2I3ES',\n", | |
" 'TCGA-CN-5361-01',\n", | |
" 'GTEX-1399Q-2326-SM-5KM2X',\n", | |
" 'TCGA-D1-A17H-01',\n", | |
" 'TCGA-D1-A17S-01',\n", | |
" 'TCGA-EJ-7797-11',\n", | |
" 'GTEX-11DXY-0006-SM-5NQ8N']" | |
] | |
}, | |
"execution_count": 198, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# get samples in cohort\n", | |
"samples = xena.cohort_samples(host, cohort, None)\n", | |
"samples[0: 10]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 199, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[dict_keys(['FOXM1', 'TP53']),\n", | |
" 'FOXM1',\n", | |
" [10.5357,\n", | |
" 5.5018,\n", | |
" 'NaN',\n", | |
" 4.4925,\n", | |
" 12.0133,\n", | |
" 9.4554,\n", | |
" 'NaN',\n", | |
" 'NaN',\n", | |
" 7.0413,\n", | |
" 6.3072]]" | |
] | |
}, | |
"execution_count": 199, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"dataset = 'TcgaTargetGtex_gene_expected_count'\n", | |
"# This dataset is not in HUGO space, so we have to use a mapping dataset to connect\n", | |
"# gene names to probes. You can determine the namespace of dataset probes by inspecting\n", | |
"# probemap (ID/Gene Mapping) metadata for the dataset.\n", | |
"# https://xenabrowser.net/datapages/?host=https%3A%2F%2Ftoil.xenahubs.net&dataset=TcgaTargetGtex_gene_expected_count\n", | |
"# The dataset_gene_probe_avg query resolves probes for a gene. \n", | |
"expression = xena.dataset_gene_probe_avg(host, dataset, samples, GENES)\n", | |
"expression_by_gene = dict([(g['gene'], g['scores'][0]) for g in expression])\n", | |
"[expression_by_gene.keys(), GENES[0], expression_by_gene[GENES[0]][0:10]]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 200, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['TCGA', 'TCGA', None, 'GTEX', 'TCGA', 'GTEX', None, None, 'TCGA', 'GTEX']" | |
] | |
}, | |
"execution_count": 200, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"dataset = 'TcgaTargetGTEX_phenotype.txt'\n", | |
"fields = ['_study', '_sample_type']\n", | |
"# As in pancan, there are normal samples in tcga which should probably be removed. _sample_type will\n", | |
"# identify normals. _study will identify tcga vs. gtex vs. target.\n", | |
"values = get_fields_and_codes(host, dataset, samples, fields) # list of lists\n", | |
"phenotypes = dict(zip(fields, values)) # index by phenotype\n", | |
"phenotypes['_study'][0:10]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 201, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"toil_summary = {\n", | |
" 'samples': samples,\n", | |
" 'expression': expression_by_gene,\n", | |
" 'phenotypes': phenotypes\n", | |
"}" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I tried downloading datasets for about 50 samples, each with 60499 genes, and I got this error from Python:
File "", line 2, in
File "", line 1, in
File "/home/vappiah/.local/lib/python3.5/site-packages/xenaPython/xenaQuery.py", line 201, in post
response = urlopen(req)
File "/opt/apps/Python/Python-3.5/lib/python3.5/urllib/request.py", line 163, in urlopen
return opener.open(url, data, timeout)
File "/opt/apps/Python/Python-3.5/lib/python3.5/urllib/request.py", line 472, in open
response = meth(req, response)
File "/opt/apps/Python/Python-3.5/lib/python3.5/urllib/request.py", line 582, in http_response
'http', request, response, code, msg, hdrs)
File "/opt/apps/Python/Python-3.5/lib/python3.5/urllib/request.py", line 510, in error
return self._call_chain(*args)
File "/opt/apps/Python/Python-3.5/lib/python3.5/urllib/request.py", line 444, in _call_chain
result = func(*args)
File "/opt/apps/Python/Python-3.5/lib/python3.5/urllib/request.py", line 590, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 504: Gateway Time-out