Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save smrgit/e259c49e35c11bd2106a69a3a85cfd93 to your computer and use it in GitHub Desktop.

Select an option

Save smrgit/e259c49e35c11bd2106a69a3a85cfd93 to your computer and use it in GitHub Desktop.
use the wiki pathways table to "score" pathways using the correlation between the protein quantification and the mRNA-seq based expression estimates
WITH
-- first we get the 77 samples that passed the QC tests
qcSet AS (
SELECT
TCGA_case_ID AS case_barcode
FROM
`isb-cgc.hg19_data_previews.TCGA_Breast_SuppTable01`
WHERE
QC_Status="pass" ),
--
-- next we extract the sample_barcode, gene, and logRatio from the
-- CPTAC BRCA proteome iTRAQ table, whlie at the same time doing a
-- JOIN with the qcSet in order to filter out samples that did not
-- pass the QC tests
-- this query returns 720196 avgLogRatio values for 77 samples and
-- 10599 genes -- on average 9353 genes are detected for each sample
pData AS (
SELECT
sample_barcode,
gene,
-- the AVG() function is used here for the 3 samples which
-- were assayed twice; to make sure that we have just one
-- value per sample per gene
AVG(unshared_logRatio) AS avgLogRatio
FROM
`isb-cgc.hg19_data_previews.TCGA_Breast_BI_Proteom_CDAP_r2_itraq` a
JOIN
qcSet b
ON
SUBSTR(a.sample_barcode,1,12)=b.case_barcode
GROUP BY
sample_barcode,
gene ),
--
-- now we get the RPPA protein data, while filtering down to
-- the samples in the CPTAC set above ...
-- from this stage, we will wind up with 16551 rows, with data
-- for 74 samples, and 227 gene/protein pairs (176 unique genes,
-- and 214 proteins) -- some of the proteins are phospho-proteins
-- and in some cases multiple genes may map to a single
-- protein due to non-specific binding in the RPPA assay
jRppa AS (
SELECT
r.sample_barcode AS sample,
r.gene_name AS gene,
r.protein_name AS protein,
r.protein_expression AS rppaExp,
pData.avgLogRatio AS iTRAQ_logRatio
FROM
`isb-cgc.TCGA_hg19_data_v0.Protein_Expression` r
JOIN
pData
ON
SUBSTR(r.sample_barcode,1,15)=SUBSTR(pData.sample_barcode,1,15)
AND r.gene_name=pData.gene ),
--
-- and then we compute a Pearson correlation on the two
-- expression values, and keep only those correlations
-- based on observations in at least 20 samples
-- (1/4 of the available data)
cRppa AS (
SELECT
CORR(iTRAQ_logRatio,
rppaExp) AS corrByGene_RPPA,
COUNT(*) AS n,
gene,
protein
FROM
jRppa
GROUP BY
gene,
protein
HAVING
n>=20 ),
--
-- now we're going to repeat these previous two steps, but using
-- the UNC/RNAseq dataset instead of the RPPA data
-- the RNAseq expression table has ~228M rows but we want less then
-- 1M of those for this analysis
-- we also filter out any samples/genes where the RNAseq read-count is 0
jmRNA AS (
SELECT
g.sample_barcode AS sample,
g.HGNC_gene_symbol AS gene,
LOG10(normalized_count+1) AS RNAseq_logCount,
pData.avgLogRatio AS iTRAQ_logRatio
FROM
`isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM` g
JOIN
pData
ON
SUBSTR(g.sample_barcode,1,15)=SUBSTR(pData.sample_barcode,1,15)
AND g.HGNC_gene_symbol=pData.gene
WHERE
normalized_count>0),
cmRNA AS (
SELECT
CORR(iTRAQ_logRatio,
RNAseq_logCount) AS corrByGene_mRNA,
COUNT(*) AS n,
gene
FROM
jmRNA
GROUP BY
gene
HAVING
n >=20 ),
fullJ AS (
SELECT
a.gene AS a_Gene,
a.protein AS a_Protein,
a.n AS n_RPPA,
a.CorrByGene_RPPA,
b.gene AS b_Gene,
b.n AS n_mRNA,
b.CorrByGene_mRNA
FROM
cRppa a
FULL JOIN
cmRNA b
ON
a.gene=b.gene ),
wikiP AS (
SELECT
pathway,
Symbol
FROM
`isb-cgc.QotM.WikiPathways_20170425_Annotated`
GROUP BY
pathway,
Symbol ),
withP AS (
SELECT
*
FROM
fullJ
JOIN
wikiP
ON
( fullJ.a_gene=wikiP.Symbol
OR fullJ.b_gene=wikiP.Symbol ) )
SELECT
pathway,
SUM(CorrByGene_mRNA) AS sumCorr,
COUNT(*) AS n,
(SUM(CorrByGene_mRNA)/COUNT(*)) AS score
FROM
withP
WHERE
(CorrByGene_mRNA >= 0.5)
GROUP BY
pathway
HAVING
n>=15
ORDER BY
score DESC
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment