smrgit · May 16, 2017 22:58
diff --git a/joinCorr_CPTAC_RPPA_mRNAseq.sql b/joinCorr_CPTAC_RPPA_mRNAseq.sql
 WITH
  -- first we get the 77 samples that passed the QC tests
  qcSet AS (
  SELECT
    TCGA_case_ID AS case_barcode
  FROM
    `isb-cgc.hg19_data_previews.TCGA_Breast_SuppTable01`
  WHERE
    QC_Status="pass" ),
  --
  -- next we extract the sample_barcode, gene, and logRatio from the
  -- CPTAC BRCA proteome iTRAQ table, whlie at the same time doing a
  -- JOIN with the qcSet in order to filter out samples that did not
  -- pass the QC tests
  -- this query returns 720196 avgLogRatio values for 77 samples and
  -- 10599 genes -- on average 9353 genes are detected for each sample
  pData AS (
  SELECT
    sample_barcode,
    gene,
    -- the AVG() function is used here for the 3 samples which
    -- were assayed twice; to make sure that we have just one
    -- value per sample per gene
    AVG(unshared_logRatio) AS avgLogRatio
  FROM
    `isb-cgc.hg19_data_previews.TCGA_Breast_BI_Proteom_CDAP_r2_itraq` a
  JOIN
    qcSet b
  ON
    SUBSTR(a.sample_barcode,1,12)=b.case_barcode
  GROUP BY
    sample_barcode,
    gene ),
  --
  -- now we get the RPPA protein data, while filtering down to
  -- the samples in the CPTAC set above ...
  -- from this stage, we will wind up with 16551 rows, with data
  -- for 74 samples, and 227 gene/protein pairs (176 unique genes,
    -- and 214 proteins) -- some of the proteins are phospho-proteins
  -- and in some cases multiple genes may map to a single
  -- protein due to non-specific binding in the RPPA assay
  jRppa AS (
  SELECT
    r.sample_barcode AS sample,
    r.gene_name AS gene,
    r.protein_name AS protein,
    r.protein_expression AS rppaExp,
    pData.avgLogRatio AS iTRAQ_logRatio
  FROM
    `isb-cgc.TCGA_hg19_data_v0.Protein_Expression` r
  JOIN
    pData
  ON
    SUBSTR(r.sample_barcode,1,15)=SUBSTR(pData.sample_barcode,1,15)
    AND r.gene_name=pData.gene ),
  --
  -- and then we compute a Pearson correlation on the two
  -- expression values, and keep only those correlations
  -- based on observations in at least 20 samples
  -- (1/4 of the available data)
  cRppa AS (
  SELECT
    CORR(iTRAQ_logRatio,
      rppaExp) AS corrByGene_RPPA,
    COUNT(*) AS n,
    gene,
    protein
  FROM
    jRppa
  GROUP BY
    gene,
    protein
  HAVING
    n>=20 ),
  --
  -- now we're going to repeat these previous two steps, but using
  -- the UNC/RNAseq dataset instead of the RPPA data
  -- the RNAseq expression table has ~228M rows but we want less then
  -- 1M of those for this analysis
  -- we also filter out any samples/genes where the RNAseq read-count is 0
  jmRNA AS (
  SELECT
    g.sample_barcode AS sample,
    g.HGNC_gene_symbol AS gene,
    LOG10(normalized_count+1) AS RNAseq_logCount,
    pData.avgLogRatio AS iTRAQ_logRatio
  FROM
    `isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM` g
  JOIN
    pData
  ON
    SUBSTR(g.sample_barcode,1,15)=SUBSTR(pData.sample_barcode,1,15)
    AND g.HGNC_gene_symbol=pData.gene
  WHERE
    normalized_count>0),
  cmRNA AS (
  SELECT
    CORR(iTRAQ_logRatio,
      RNAseq_logCount) AS corrByGene_mRNA,
    COUNT(*) AS n,
    gene
  FROM
    jmRNA
  GROUP BY
    gene
  HAVING
    n >=20 )
 SELECT
  a.gene,
  a.protein,
  a.n AS n_RPPA,
  a.CorrByGene_RPPA,
  b.n AS n_mRNA,
  b.CorrByGene_mRNA,
  ((a.CorrByGene_RPPA + b.CorrByGene_mRNA)/2.) AS avgCorrByGene,
  (b.CorrByGene_mRNA - a.CorrByGene_RPPA) AS deltaCorrByGene
 FROM
  cRppa a
 JOIN
  cmRNA b
 ON
  a.gene=b.gene
 ORDER BY
  ((a.CorrByGene_RPPA + b.CorrByGene_mRNA)/2.) DESC
	WITH
	-- first we get the 77 samples that passed the QC tests
	qcSet AS (
	SELECT
	TCGA_case_ID AS case_barcode
	FROM
	`isb-cgc.hg19_data_previews.TCGA_Breast_SuppTable01`
	WHERE
	QC_Status="pass" ),
	--
	-- next we extract the sample_barcode, gene, and logRatio from the
	-- CPTAC BRCA proteome iTRAQ table, whlie at the same time doing a
	-- JOIN with the qcSet in order to filter out samples that did not
	-- pass the QC tests
	-- this query returns 720196 avgLogRatio values for 77 samples and
	-- 10599 genes -- on average 9353 genes are detected for each sample
	pData AS (
	SELECT
	sample_barcode,
	gene,
	-- the AVG() function is used here for the 3 samples which
	-- were assayed twice; to make sure that we have just one
	-- value per sample per gene
	AVG(unshared_logRatio) AS avgLogRatio
	FROM
	`isb-cgc.hg19_data_previews.TCGA_Breast_BI_Proteom_CDAP_r2_itraq` a
	JOIN
	qcSet b
	ON
	SUBSTR(a.sample_barcode,1,12)=b.case_barcode
	GROUP BY
	sample_barcode,
	gene ),
	--
	-- now we get the RPPA protein data, while filtering down to
	-- the samples in the CPTAC set above ...
	-- from this stage, we will wind up with 16551 rows, with data
	-- for 74 samples, and 227 gene/protein pairs (176 unique genes,
	-- and 214 proteins) -- some of the proteins are phospho-proteins
	-- and in some cases multiple genes may map to a single
	-- protein due to non-specific binding in the RPPA assay
	jRppa AS (
	SELECT
	r.sample_barcode AS sample,
	r.gene_name AS gene,
	r.protein_name AS protein,
	r.protein_expression AS rppaExp,
	pData.avgLogRatio AS iTRAQ_logRatio
	FROM
	`isb-cgc.TCGA_hg19_data_v0.Protein_Expression` r
	JOIN
	pData
	ON
	SUBSTR(r.sample_barcode,1,15)=SUBSTR(pData.sample_barcode,1,15)
	AND r.gene_name=pData.gene ),
	--
	-- and then we compute a Pearson correlation on the two
	-- expression values, and keep only those correlations
	-- based on observations in at least 20 samples
	-- (1/4 of the available data)
	cRppa AS (
	SELECT
	CORR(iTRAQ_logRatio,
	rppaExp) AS corrByGene_RPPA,
	COUNT(*) AS n,
	gene,
	protein
	FROM
	jRppa
	GROUP BY
	gene,
	protein
	HAVING
	n>=20 ),
	--
	-- now we're going to repeat these previous two steps, but using
	-- the UNC/RNAseq dataset instead of the RPPA data
	-- the RNAseq expression table has ~228M rows but we want less then
	-- 1M of those for this analysis
	-- we also filter out any samples/genes where the RNAseq read-count is 0
	jmRNA AS (
	SELECT
	g.sample_barcode AS sample,
	g.HGNC_gene_symbol AS gene,
	LOG10(normalized_count+1) AS RNAseq_logCount,
	pData.avgLogRatio AS iTRAQ_logRatio
	FROM
	`isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM` g
	JOIN
	pData
	ON
	SUBSTR(g.sample_barcode,1,15)=SUBSTR(pData.sample_barcode,1,15)
	AND g.HGNC_gene_symbol=pData.gene
	WHERE
	normalized_count>0),
	cmRNA AS (
	SELECT
	CORR(iTRAQ_logRatio,
	RNAseq_logCount) AS corrByGene_mRNA,
	COUNT(*) AS n,
	gene
	FROM
	jmRNA
	GROUP BY
	gene
	HAVING
	n >=20 )
	SELECT
	a.gene,
	a.protein,
	a.n AS n_RPPA,
	a.CorrByGene_RPPA,
	b.n AS n_mRNA,
	b.CorrByGene_mRNA,
	((a.CorrByGene_RPPA + b.CorrByGene_mRNA)/2.) AS avgCorrByGene,
	(b.CorrByGene_mRNA - a.CorrByGene_RPPA) AS deltaCorrByGene
	FROM
	cRppa a
	JOIN
	cmRNA b
	ON
	a.gene=b.gene
	ORDER BY
	((a.CorrByGene_RPPA + b.CorrByGene_mRNA)/2.) DESC
No results found