Skip to content

Instantly share code, notes, and snippets.

@acthp
Last active September 20, 2023 06:14
Show Gist options
  • Save acthp/f9828f57eb795404eddb85006375af8d to your computer and use it in GitHub Desktop.
Save acthp/f9828f57eb795404eddb85006375af8d to your computer and use it in GitHub Desktop.
pancan and toil notebook
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 187,
"metadata": {},
"outputs": [],
"source": [
"import xenaPython as xena\n",
"\n",
"GENES = ['FOXM1', 'TP53']\n",
"\n",
"def get_codes(host, dataset, fields, data):\n",
" \"get codes for enumerations\"\n",
" codes = xena.field_codes(host, dataset, fields)\n",
" codes_idx = dict([(x['name'], x['code'].split('\\t')) for x in codes if x['code'] is not None])\n",
" for i in range(len(fields)):\n",
" if fields[i] in codes_idx:\n",
" data[i] = [None if v == 'NaN' else codes_idx[fields[i]][int(v)] for v in data[i]]\n",
" return data\n",
"\n",
"def get_fields(host, dataset, samples, fields):\n",
" \"get field values\"\n",
" data = xena.dataset_fetch(host, dataset, samples, fields)\n",
" return data\n",
"\n",
"def get_fields_and_codes(host, dataset, samples, fields):\n",
" \"get fields and resolve codes\"\n",
" return get_codes( host, dataset, fields, get_fields( host, dataset, samples, fields))"
]
},
{
"cell_type": "code",
"execution_count": 188,
"metadata": {},
"outputs": [],
"source": [
"#\n",
"# pancanAtlas cohort\n",
"#\n",
"\n",
"cohort = 'TCGA PanCanAtlas'\n",
"host = xena.PUBLIC_HUBS['pancanAtlasHub']"
]
},
{
"cell_type": "code",
"execution_count": 189,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['TCGA-P4-A5E8-11',\n",
" 'TCGA-EE-A181-06',\n",
" 'TCGA-AA-3511-11',\n",
" 'TCGA-BR-8590-01',\n",
" 'TCGA-06-6390-01',\n",
" 'TCGA-26-5139-01',\n",
" 'TCGA-B0-4813-11',\n",
" 'TCGA-29-1763-01',\n",
" 'TCGA-D1-A17S-01',\n",
" 'TCGA-EJ-7797-11']"
]
},
"execution_count": 189,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get samples in cohort\n",
"samples = xena.cohort_samples(host, cohort, None)\n",
"samples[0: 10]"
]
},
{
"cell_type": "code",
"execution_count": 190,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[dict_keys(['FOXM1', 'TP53']),\n",
" 'FOXM1',\n",
" [5.18, 11.31, 9.65, 9.22, 'NaN', 9.48, 'NaN', 10.27, 8.63, 5.13]]"
]
},
"execution_count": 190,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get expression for GENES\n",
"dataset = 'EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena'\n",
"expression = get_fields(host, dataset, samples, GENES) # list of lists.\n",
"expression_by_gene = dict(zip(GENES, expression)) # index by gene.\n",
"[expression_by_gene.keys(), GENES[0], expression_by_gene[GENES[0]][0:10]]\n",
"# note that missing data is returned as 'NaN'. One might want to remap this to None or NaN, depending on\n",
"# the later analysis tools."
]
},
{
"cell_type": "code",
"execution_count": 191,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[dict_keys(['samples', 'rows']),\n",
" 3726,\n",
" ['TCGA-56-8624-11',\n",
" 'TCGA-25-1329-01',\n",
" 'TCGA-HC-8260-11',\n",
" 'TCGA-AG-3727-01',\n",
" 'TCGA-DX-A23R-01']]"
]
},
"execution_count": 191,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get mutation for GENES\n",
"dataset = 'mc3.v0.2.8.PUBLIC.xena'\n",
"mutation_columns = xena.sparse_data(host, dataset, samples, GENES)\n",
"# Two keys are returned: 'rows', which is all the variants (in a column orientation), and \n",
"# 'samples', which is the list of all samples in the dataset. 'samples' is required in order to\n",
"# distinguish samples without this assay (not in the dataset) from samples found to have\n",
"# no mutations in these genes. A sampleID in ['samples'] that is not in ['rows']['sampleID']\n",
"# was found to have no mutations. A sample not in ['samples'] has no assessment: we can't\n",
"# say anything about its mutations.\n",
"samples_without_mutation_data = list(set(samples) - set(mutation_columns['samples']))\n",
"[mutation_columns.keys(), len(samples_without_mutation_data), samples_without_mutation_data[0: 5]]"
]
},
{
"cell_type": "code",
"execution_count": 192,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'alt': 'T',\n",
" 'altGene': None,\n",
" 'amino-acid': '',\n",
" 'dna-vaf': 0.33,\n",
" 'effect': \"3'UTR\",\n",
" 'genes': ['FOXM1'],\n",
" 'position': {'chrom': 'chr12',\n",
" 'chromend': 2967148,\n",
" 'chromstart': 2967148,\n",
" 'strand': '0'},\n",
" 'ref': 'G',\n",
" 'rna-vaf': None,\n",
" 'sampleID': 'TCGA-EY-A1GI-01'},\n",
" {'alt': 'G',\n",
" 'altGene': None,\n",
" 'amino-acid': '',\n",
" 'dna-vaf': 0.46,\n",
" 'effect': \"3'UTR\",\n",
" 'genes': ['FOXM1'],\n",
" 'position': {'chrom': 'chr12',\n",
" 'chromend': 2967228,\n",
" 'chromstart': 2967228,\n",
" 'strand': '0'},\n",
" 'ref': 'T',\n",
" 'rna-vaf': None,\n",
" 'sampleID': 'TCGA-EY-A5W2-01'}]"
]
},
"execution_count": 192,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# a row orientation takes more memory, and won't fit in a numpy array, but is easier\n",
"# to view.\n",
"rows = mutation_columns['rows']\n",
"keys = rows.keys()\n",
"mutations = [dict(zip(keys, [rows[k][i] for k in keys])) for i in range(len(rows['sampleID']))]\n",
"# You might want to groupby ['genes'][0] at this point, to build per-gene stats. ['genes'] is a\n",
"# list because in the general case a variant can hit mutiple genes. For this dataset, gene-level\n",
"# non-silent mutations, they do not.\n",
"mutations[0: 2] "
]
},
{
"cell_type": "code",
"execution_count": 193,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['KIRP', 'SKCM', 'COAD', 'STAD', 'GBM', 'GBM', 'KIRC', 'OV', 'UCEC', 'PRAD']"
]
},
"execution_count": 193,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get disease type and survival columns\n",
"dataset = 'Survival_SupplementalTable_S1_20171025_xena_sp'\n",
"fields = ['cancer type abbreviation', 'OS', 'OS.time']\n",
"values = get_fields_and_codes(host, dataset, samples, fields) # list of lists\n",
"phenotypes = dict(zip(fields, values)) # index by phenotype\n",
"phenotypes['cancer type abbreviation'][0:10]"
]
},
{
"cell_type": "code",
"execution_count": 194,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'Additional - New Primary',\n",
" 'Additional Metastatic',\n",
" 'Metastatic',\n",
" None,\n",
" 'Primary Blood Derived Cancer - Peripheral Blood',\n",
" 'Primary Tumor',\n",
" 'Recurrent Tumor',\n",
" 'Solid Tissue Normal'}"
]
},
"execution_count": 194,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get sample type. TCGA includes a few \"normal\" tissue samples. These normals are of\n",
"# limited value because there are few of them, and they are not entirely normal, being\n",
"# taken from disease tissue, outside of the visible tumor. It's often best to omit them.\n",
"dataset = 'TCGA_phenotype_denseDataOnlyDownload.tsv'\n",
"fields = ['sample_type']\n",
"values = get_fields_and_codes(host, dataset, samples, fields)\n",
"set(values[0])"
]
},
{
"cell_type": "code",
"execution_count": 195,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[1475,\n",
" ['TCGA-P4-A5E8-11',\n",
" 'TCGA-AA-3511-11',\n",
" 'TCGA-B0-4813-11',\n",
" 'TCGA-EJ-7797-11',\n",
" 'TCGA-CV-7406-11']]"
]
},
"execution_count": 195,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"samples_to_omit = [samples[i] for i in range(len(samples)) if values[0][i] == 'Solid Tissue Normal']\n",
"[len(samples_to_omit), samples_to_omit[0: 5]]"
]
},
{
"cell_type": "code",
"execution_count": 196,
"metadata": {},
"outputs": [],
"source": [
"pancan_summary = {\n",
" 'samples': samples,\n",
" 'expression': expression_by_gene,\n",
" 'mutations': mutations,\n",
" 'samples_without_mutation_data': samples_without_mutation_data,\n",
" 'phenotypes': phenotypes,\n",
" 'samples_to_omit': samples_to_omit\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 197,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#\n",
"# TCGA TARGET GTEx\n",
"#\n",
"cohort = 'TCGA TARGET GTEx'\n",
"host = xena.PUBLIC_HUBS['toilHub']"
]
},
{
"cell_type": "code",
"execution_count": 198,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['TCGA-BR-8590-01',\n",
" 'TCGA-P4-A5E8-11',\n",
" 'TCGA-61-1727-01',\n",
" 'GTEX-QCQG-0326-SM-2I3ES',\n",
" 'TCGA-CN-5361-01',\n",
" 'GTEX-1399Q-2326-SM-5KM2X',\n",
" 'TCGA-D1-A17H-01',\n",
" 'TCGA-D1-A17S-01',\n",
" 'TCGA-EJ-7797-11',\n",
" 'GTEX-11DXY-0006-SM-5NQ8N']"
]
},
"execution_count": 198,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get samples in cohort\n",
"samples = xena.cohort_samples(host, cohort, None)\n",
"samples[0: 10]"
]
},
{
"cell_type": "code",
"execution_count": 199,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[dict_keys(['FOXM1', 'TP53']),\n",
" 'FOXM1',\n",
" [10.5357,\n",
" 5.5018,\n",
" 'NaN',\n",
" 4.4925,\n",
" 12.0133,\n",
" 9.4554,\n",
" 'NaN',\n",
" 'NaN',\n",
" 7.0413,\n",
" 6.3072]]"
]
},
"execution_count": 199,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset = 'TcgaTargetGtex_gene_expected_count'\n",
"# This dataset is not in HUGO space, so we have to use a mapping dataset to connect\n",
"# gene names to probes. You can determine the namespace of dataset probes by inspecting\n",
"# probemap (ID/Gene Mapping) metadata for the dataset.\n",
"# https://xenabrowser.net/datapages/?host=https%3A%2F%2Ftoil.xenahubs.net&dataset=TcgaTargetGtex_gene_expected_count\n",
"# The dataset_gene_probe_avg query resolves probes for a gene. \n",
"expression = xena.dataset_gene_probe_avg(host, dataset, samples, GENES)\n",
"expression_by_gene = dict([(g['gene'], g['scores'][0]) for g in expression])\n",
"[expression_by_gene.keys(), GENES[0], expression_by_gene[GENES[0]][0:10]]"
]
},
{
"cell_type": "code",
"execution_count": 200,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['TCGA', 'TCGA', None, 'GTEX', 'TCGA', 'GTEX', None, None, 'TCGA', 'GTEX']"
]
},
"execution_count": 200,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset = 'TcgaTargetGTEX_phenotype.txt'\n",
"fields = ['_study', '_sample_type']\n",
"# As in pancan, there are normal samples in tcga which should probably be removed. _sample_type will\n",
"# identify normals. _study will identify tcga vs. gtex vs. target.\n",
"values = get_fields_and_codes(host, dataset, samples, fields) # list of lists\n",
"phenotypes = dict(zip(fields, values)) # index by phenotype\n",
"phenotypes['_study'][0:10]"
]
},
{
"cell_type": "code",
"execution_count": 201,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"toil_summary = {\n",
" 'samples': samples,\n",
" 'expression': expression_by_gene,\n",
" 'phenotypes': phenotypes\n",
"}"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@vappiah
Copy link

vappiah commented May 31, 2021

I tried downloading datasets (about 50 samples, each with 60499 genes) and got this error from Python:

File "", line 2, in
File "", line 1, in
File "/home/vappiah/.local/lib/python3.5/site-packages/xenaPython/xenaQuery.py", line 201, in post
response = urlopen(req)
File "/opt/apps/Python/Python-3.5/lib/python3.5/urllib/request.py", line 163, in urlopen
return opener.open(url, data, timeout)
File "/opt/apps/Python/Python-3.5/lib/python3.5/urllib/request.py", line 472, in open
response = meth(req, response)
File "/opt/apps/Python/Python-3.5/lib/python3.5/urllib/request.py", line 582, in http_response
'http', request, response, code, msg, hdrs)
File "/opt/apps/Python/Python-3.5/lib/python3.5/urllib/request.py", line 510, in error
return self._call_chain(*args)
File "/opt/apps/Python/Python-3.5/lib/python3.5/urllib/request.py", line 444, in _call_chain
result = func(*args)
File "/opt/apps/Python/Python-3.5/lib/python3.5/urllib/request.py", line 590, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 504: Gateway Time-out

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment