Skip to content

Instantly share code, notes, and snippets.

@smrgit
Created May 16, 2017 22:58
Show Gist options
  • Select an option

  • Save smrgit/ee4b61fde179dad45e98c6a9caec7eea to your computer and use it in GitHub Desktop.

Select an option

Save smrgit/ee4b61fde179dad45e98c6a9caec7eea to your computer and use it in GitHub Desktop.
join correlations between CPTAC data with RPPA and mRNA-Seq data all in one go!
WITH
-- first we get the 77 samples that passed the QC tests
qcSet AS (
SELECT
TCGA_case_ID AS case_barcode
FROM
`isb-cgc.hg19_data_previews.TCGA_Breast_SuppTable01`
WHERE
QC_Status="pass" ),
--
-- next we extract the sample_barcode, gene, and logRatio from the
-- CPTAC BRCA proteome iTRAQ table, whlie at the same time doing a
-- JOIN with the qcSet in order to filter out samples that did not
-- pass the QC tests
-- this query returns 720196 avgLogRatio values for 77 samples and
-- 10599 genes -- on average 9353 genes are detected for each sample
pData AS (
SELECT
sample_barcode,
gene,
-- the AVG() function is used here for the 3 samples which
-- were assayed twice; to make sure that we have just one
-- value per sample per gene
AVG(unshared_logRatio) AS avgLogRatio
FROM
`isb-cgc.hg19_data_previews.TCGA_Breast_BI_Proteom_CDAP_r2_itraq` a
JOIN
qcSet b
ON
SUBSTR(a.sample_barcode,1,12)=b.case_barcode
GROUP BY
sample_barcode,
gene ),
--
-- now we get the RPPA protein data, while filtering down to
-- the samples in the CPTAC set above ...
-- from this stage, we will wind up with 16551 rows, with data
-- for 74 samples, and 227 gene/protein pairs (176 unique genes,
-- and 214 proteins) -- some of the proteins are phospho-proteins
-- and in some cases multiple genes may map to a single
-- protein due to non-specific binding in the RPPA assay
jRppa AS (
SELECT
r.sample_barcode AS sample,
r.gene_name AS gene,
r.protein_name AS protein,
r.protein_expression AS rppaExp,
pData.avgLogRatio AS iTRAQ_logRatio
FROM
`isb-cgc.TCGA_hg19_data_v0.Protein_Expression` r
JOIN
pData
ON
SUBSTR(r.sample_barcode,1,15)=SUBSTR(pData.sample_barcode,1,15)
AND r.gene_name=pData.gene ),
--
-- and then we compute a Pearson correlation on the two
-- expression values, and keep only those correlations
-- based on observations in at least 20 samples
-- (1/4 of the available data)
cRppa AS (
SELECT
CORR(iTRAQ_logRatio,
rppaExp) AS corrByGene_RPPA,
COUNT(*) AS n,
gene,
protein
FROM
jRppa
GROUP BY
gene,
protein
HAVING
n>=20 ),
--
-- now we're going to repeat these previous two steps, but using
-- the UNC/RNAseq dataset instead of the RPPA data
-- the RNAseq expression table has ~228M rows but we want less then
-- 1M of those for this analysis
-- we also filter out any samples/genes where the RNAseq read-count is 0
jmRNA AS (
SELECT
g.sample_barcode AS sample,
g.HGNC_gene_symbol AS gene,
LOG10(normalized_count+1) AS RNAseq_logCount,
pData.avgLogRatio AS iTRAQ_logRatio
FROM
`isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM` g
JOIN
pData
ON
SUBSTR(g.sample_barcode,1,15)=SUBSTR(pData.sample_barcode,1,15)
AND g.HGNC_gene_symbol=pData.gene
WHERE
normalized_count>0),
cmRNA AS (
SELECT
CORR(iTRAQ_logRatio,
RNAseq_logCount) AS corrByGene_mRNA,
COUNT(*) AS n,
gene
FROM
jmRNA
GROUP BY
gene
HAVING
n >=20 )
SELECT
a.gene,
a.protein,
a.n AS n_RPPA,
a.CorrByGene_RPPA,
b.n AS n_mRNA,
b.CorrByGene_mRNA,
((a.CorrByGene_RPPA + b.CorrByGene_mRNA)/2.) AS avgCorrByGene,
(b.CorrByGene_mRNA - a.CorrByGene_RPPA) AS deltaCorrByGene
FROM
cRppa a
JOIN
cmRNA b
ON
a.gene=b.gene
ORDER BY
((a.CorrByGene_RPPA + b.CorrByGene_mRNA)/2.) DESC
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment