Created
May 17, 2017 19:23
-
-
Save smrgit/e259c49e35c11bd2106a69a3a85cfd93 to your computer and use it in GitHub Desktop.
use the wiki pathways table to "score" pathways using the correlation between the protein quantification and the mRNA-seq based expression estimates
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| WITH | |
| -- first we get the 77 samples that passed the QC tests | |
| qcSet AS ( | |
| SELECT | |
| TCGA_case_ID AS case_barcode | |
| FROM | |
| `isb-cgc.hg19_data_previews.TCGA_Breast_SuppTable01` | |
| WHERE | |
| QC_Status="pass" ), | |
| -- | |
| -- next we extract the sample_barcode, gene, and logRatio from the | |
| -- CPTAC BRCA proteome iTRAQ table, whlie at the same time doing a | |
| -- JOIN with the qcSet in order to filter out samples that did not | |
| -- pass the QC tests | |
| -- this query returns 720196 avgLogRatio values for 77 samples and | |
| -- 10599 genes -- on average 9353 genes are detected for each sample | |
| pData AS ( | |
| SELECT | |
| sample_barcode, | |
| gene, | |
| -- the AVG() function is used here for the 3 samples which | |
| -- were assayed twice; to make sure that we have just one | |
| -- value per sample per gene | |
| AVG(unshared_logRatio) AS avgLogRatio | |
| FROM | |
| `isb-cgc.hg19_data_previews.TCGA_Breast_BI_Proteom_CDAP_r2_itraq` a | |
| JOIN | |
| qcSet b | |
| ON | |
| SUBSTR(a.sample_barcode,1,12)=b.case_barcode | |
| GROUP BY | |
| sample_barcode, | |
| gene ), | |
| -- | |
| -- now we get the RPPA protein data, while filtering down to | |
| -- the samples in the CPTAC set above ... | |
| -- from this stage, we will wind up with 16551 rows, with data | |
| -- for 74 samples, and 227 gene/protein pairs (176 unique genes, | |
| -- and 214 proteins) -- some of the proteins are phospho-proteins | |
| -- and in some cases multiple genes may map to a single | |
| -- protein due to non-specific binding in the RPPA assay | |
| jRppa AS ( | |
| SELECT | |
| r.sample_barcode AS sample, | |
| r.gene_name AS gene, | |
| r.protein_name AS protein, | |
| r.protein_expression AS rppaExp, | |
| pData.avgLogRatio AS iTRAQ_logRatio | |
| FROM | |
| `isb-cgc.TCGA_hg19_data_v0.Protein_Expression` r | |
| JOIN | |
| pData | |
| ON | |
| SUBSTR(r.sample_barcode,1,15)=SUBSTR(pData.sample_barcode,1,15) | |
| AND r.gene_name=pData.gene ), | |
| -- | |
| -- and then we compute a Pearson correlation on the two | |
| -- expression values, and keep only those correlations | |
| -- based on observations in at least 20 samples | |
| -- (1/4 of the available data) | |
| cRppa AS ( | |
| SELECT | |
| CORR(iTRAQ_logRatio, | |
| rppaExp) AS corrByGene_RPPA, | |
| COUNT(*) AS n, | |
| gene, | |
| protein | |
| FROM | |
| jRppa | |
| GROUP BY | |
| gene, | |
| protein | |
| HAVING | |
| n>=20 ), | |
| -- | |
| -- now we're going to repeat these previous two steps, but using | |
| -- the UNC/RNAseq dataset instead of the RPPA data | |
| -- the RNAseq expression table has ~228M rows but we want less then | |
| -- 1M of those for this analysis | |
| -- we also filter out any samples/genes where the RNAseq read-count is 0 | |
| jmRNA AS ( | |
| SELECT | |
| g.sample_barcode AS sample, | |
| g.HGNC_gene_symbol AS gene, | |
| LOG10(normalized_count+1) AS RNAseq_logCount, | |
| pData.avgLogRatio AS iTRAQ_logRatio | |
| FROM | |
| `isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM` g | |
| JOIN | |
| pData | |
| ON | |
| SUBSTR(g.sample_barcode,1,15)=SUBSTR(pData.sample_barcode,1,15) | |
| AND g.HGNC_gene_symbol=pData.gene | |
| WHERE | |
| normalized_count>0), | |
| cmRNA AS ( | |
| SELECT | |
| CORR(iTRAQ_logRatio, | |
| RNAseq_logCount) AS corrByGene_mRNA, | |
| COUNT(*) AS n, | |
| gene | |
| FROM | |
| jmRNA | |
| GROUP BY | |
| gene | |
| HAVING | |
| n >=20 ), | |
| fullJ AS ( | |
| SELECT | |
| a.gene AS a_Gene, | |
| a.protein AS a_Protein, | |
| a.n AS n_RPPA, | |
| a.CorrByGene_RPPA, | |
| b.gene AS b_Gene, | |
| b.n AS n_mRNA, | |
| b.CorrByGene_mRNA | |
| FROM | |
| cRppa a | |
| FULL JOIN | |
| cmRNA b | |
| ON | |
| a.gene=b.gene ), | |
| wikiP AS ( | |
| SELECT | |
| pathway, | |
| Symbol | |
| FROM | |
| `isb-cgc.QotM.WikiPathways_20170425_Annotated` | |
| GROUP BY | |
| pathway, | |
| Symbol ), | |
| withP AS ( | |
| SELECT | |
| * | |
| FROM | |
| fullJ | |
| JOIN | |
| wikiP | |
| ON | |
| ( fullJ.a_gene=wikiP.Symbol | |
| OR fullJ.b_gene=wikiP.Symbol ) ) | |
| SELECT | |
| pathway, | |
| SUM(CorrByGene_mRNA) AS sumCorr, | |
| COUNT(*) AS n, | |
| (SUM(CorrByGene_mRNA)/COUNT(*)) AS score | |
| FROM | |
| withP | |
| WHERE | |
| (CorrByGene_mRNA >= 0.5) | |
| GROUP BY | |
| pathway | |
| HAVING | |
| n>=15 | |
| ORDER BY | |
| score DESC |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment