smrgit · May 17, 2017 19:23
diff --git a/use_protein_mRNA_corr_to_score_pathways.sql b/use_protein_mRNA_corr_to_score_pathways.sql
 WITH
  -- first we get the 77 samples that passed the QC tests
  qcSet AS (
  SELECT
    TCGA_case_ID AS case_barcode
  FROM
    `isb-cgc.hg19_data_previews.TCGA_Breast_SuppTable01`
  WHERE
    QC_Status="pass" ),
  --
  -- next we extract the sample_barcode, gene, and logRatio from the
  -- CPTAC BRCA proteome iTRAQ table, whlie at the same time doing a
  -- JOIN with the qcSet in order to filter out samples that did not
  -- pass the QC tests
  -- this query returns 720196 avgLogRatio values for 77 samples and
  -- 10599 genes -- on average 9353 genes are detected for each sample
  pData AS (
  SELECT
    sample_barcode,
    gene,
    -- the AVG() function is used here for the 3 samples which
    -- were assayed twice; to make sure that we have just one
    -- value per sample per gene
    AVG(unshared_logRatio) AS avgLogRatio
  FROM
    `isb-cgc.hg19_data_previews.TCGA_Breast_BI_Proteom_CDAP_r2_itraq` a
  JOIN
    qcSet b
  ON
    SUBSTR(a.sample_barcode,1,12)=b.case_barcode
  GROUP BY
    sample_barcode,
    gene ),
  --
  -- now we get the RPPA protein data, while filtering down to
  -- the samples in the CPTAC set above ...
  -- from this stage, we will wind up with 16551 rows, with data
  -- for 74 samples, and 227 gene/protein pairs (176 unique genes,
    -- and 214 proteins) -- some of the proteins are phospho-proteins
  -- and in some cases multiple genes may map to a single
  -- protein due to non-specific binding in the RPPA assay
  jRppa AS (
  SELECT
    r.sample_barcode AS sample,
    r.gene_name AS gene,
    r.protein_name AS protein,
    r.protein_expression AS rppaExp,
    pData.avgLogRatio AS iTRAQ_logRatio
  FROM
    `isb-cgc.TCGA_hg19_data_v0.Protein_Expression` r
  JOIN
    pData
  ON
    SUBSTR(r.sample_barcode,1,15)=SUBSTR(pData.sample_barcode,1,15)
    AND r.gene_name=pData.gene ),
  --
  -- and then we compute a Pearson correlation on the two
  -- expression values, and keep only those correlations
  -- based on observations in at least 20 samples
  -- (1/4 of the available data)
  cRppa AS (
  SELECT
    CORR(iTRAQ_logRatio,
      rppaExp) AS corrByGene_RPPA,
    COUNT(*) AS n,
    gene,
    protein
  FROM
    jRppa
  GROUP BY
    gene,
    protein
  HAVING
    n>=20 ),
  --
  -- now we're going to repeat these previous two steps, but using
  -- the UNC/RNAseq dataset instead of the RPPA data
  -- the RNAseq expression table has ~228M rows but we want less then
  -- 1M of those for this analysis
  -- we also filter out any samples/genes where the RNAseq read-count is 0
  jmRNA AS (
  SELECT
    g.sample_barcode AS sample,
    g.HGNC_gene_symbol AS gene,
    LOG10(normalized_count+1) AS RNAseq_logCount,
    pData.avgLogRatio AS iTRAQ_logRatio
  FROM
    `isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM` g
  JOIN
    pData
  ON
    SUBSTR(g.sample_barcode,1,15)=SUBSTR(pData.sample_barcode,1,15)
    AND g.HGNC_gene_symbol=pData.gene
  WHERE
    normalized_count>0),
  cmRNA AS (
  SELECT
    CORR(iTRAQ_logRatio,
      RNAseq_logCount) AS corrByGene_mRNA,
    COUNT(*) AS n,
    gene
  FROM
    jmRNA
  GROUP BY
    gene
  HAVING
    n >=20 ),
  fullJ AS (
  SELECT
    a.gene AS a_Gene,
    a.protein AS a_Protein,
    a.n AS n_RPPA,
    a.CorrByGene_RPPA,
    b.gene AS b_Gene,
    b.n AS n_mRNA,
    b.CorrByGene_mRNA
  FROM
    cRppa a
  FULL JOIN
    cmRNA b
  ON
    a.gene=b.gene ),
  wikiP AS (
  SELECT
    pathway,
    Symbol
  FROM
    `isb-cgc.QotM.WikiPathways_20170425_Annotated`
  GROUP BY
    pathway,
    Symbol ),
  withP AS (
  SELECT
    *
  FROM
    fullJ
  JOIN
    wikiP
  ON
    ( fullJ.a_gene=wikiP.Symbol
      OR fullJ.b_gene=wikiP.Symbol ) )
 SELECT
  pathway,
  SUM(CorrByGene_mRNA) AS sumCorr,
  COUNT(*) AS n,
  (SUM(CorrByGene_mRNA)/COUNT(*)) AS score
 FROM
  withP
 WHERE
  (CorrByGene_mRNA >= 0.5)
 GROUP BY
  pathway
 HAVING
  n>=15
 ORDER BY
  score DESC
	WITH
	-- first we get the 77 samples that passed the QC tests
	qcSet AS (
	SELECT
	TCGA_case_ID AS case_barcode
	FROM
	`isb-cgc.hg19_data_previews.TCGA_Breast_SuppTable01`
	WHERE
	QC_Status="pass" ),
	--
	-- next we extract the sample_barcode, gene, and logRatio from the
	-- CPTAC BRCA proteome iTRAQ table, whlie at the same time doing a
	-- JOIN with the qcSet in order to filter out samples that did not
	-- pass the QC tests
	-- this query returns 720196 avgLogRatio values for 77 samples and
	-- 10599 genes -- on average 9353 genes are detected for each sample
	pData AS (
	SELECT
	sample_barcode,
	gene,
	-- the AVG() function is used here for the 3 samples which
	-- were assayed twice; to make sure that we have just one
	-- value per sample per gene
	AVG(unshared_logRatio) AS avgLogRatio
	FROM
	`isb-cgc.hg19_data_previews.TCGA_Breast_BI_Proteom_CDAP_r2_itraq` a
	JOIN
	qcSet b
	ON
	SUBSTR(a.sample_barcode,1,12)=b.case_barcode
	GROUP BY
	sample_barcode,
	gene ),
	--
	-- now we get the RPPA protein data, while filtering down to
	-- the samples in the CPTAC set above ...
	-- from this stage, we will wind up with 16551 rows, with data
	-- for 74 samples, and 227 gene/protein pairs (176 unique genes,
	-- and 214 proteins) -- some of the proteins are phospho-proteins
	-- and in some cases multiple genes may map to a single
	-- protein due to non-specific binding in the RPPA assay
	jRppa AS (
	SELECT
	r.sample_barcode AS sample,
	r.gene_name AS gene,
	r.protein_name AS protein,
	r.protein_expression AS rppaExp,
	pData.avgLogRatio AS iTRAQ_logRatio
	FROM
	`isb-cgc.TCGA_hg19_data_v0.Protein_Expression` r
	JOIN
	pData
	ON
	SUBSTR(r.sample_barcode,1,15)=SUBSTR(pData.sample_barcode,1,15)
	AND r.gene_name=pData.gene ),
	--
	-- and then we compute a Pearson correlation on the two
	-- expression values, and keep only those correlations
	-- based on observations in at least 20 samples
	-- (1/4 of the available data)
	cRppa AS (
	SELECT
	CORR(iTRAQ_logRatio,
	rppaExp) AS corrByGene_RPPA,
	COUNT(*) AS n,
	gene,
	protein
	FROM
	jRppa
	GROUP BY
	gene,
	protein
	HAVING
	n>=20 ),
	--
	-- now we're going to repeat these previous two steps, but using
	-- the UNC/RNAseq dataset instead of the RPPA data
	-- the RNAseq expression table has ~228M rows but we want less then
	-- 1M of those for this analysis
	-- we also filter out any samples/genes where the RNAseq read-count is 0
	jmRNA AS (
	SELECT
	g.sample_barcode AS sample,
	g.HGNC_gene_symbol AS gene,
	LOG10(normalized_count+1) AS RNAseq_logCount,
	pData.avgLogRatio AS iTRAQ_logRatio
	FROM
	`isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM` g
	JOIN
	pData
	ON
	SUBSTR(g.sample_barcode,1,15)=SUBSTR(pData.sample_barcode,1,15)
	AND g.HGNC_gene_symbol=pData.gene
	WHERE
	normalized_count>0),
	cmRNA AS (
	SELECT
	CORR(iTRAQ_logRatio,
	RNAseq_logCount) AS corrByGene_mRNA,
	COUNT(*) AS n,
	gene
	FROM
	jmRNA
	GROUP BY
	gene
	HAVING
	n >=20 ),
	fullJ AS (
	SELECT
	a.gene AS a_Gene,
	a.protein AS a_Protein,
	a.n AS n_RPPA,
	a.CorrByGene_RPPA,
	b.gene AS b_Gene,
	b.n AS n_mRNA,
	b.CorrByGene_mRNA
	FROM
	cRppa a
	FULL JOIN
	cmRNA b
	ON
	a.gene=b.gene ),
	wikiP AS (
	SELECT
	pathway,
	Symbol
	FROM
	`isb-cgc.QotM.WikiPathways_20170425_Annotated`
	GROUP BY
	pathway,
	Symbol ),
	withP AS (
	SELECT
	*
	FROM
	fullJ
	JOIN
	wikiP
	ON
	( fullJ.a_gene=wikiP.Symbol
	OR fullJ.b_gene=wikiP.Symbol ) )
	SELECT
	pathway,
	SUM(CorrByGene_mRNA) AS sumCorr,
	COUNT(*) AS n,
	(SUM(CorrByGene_mRNA)/COUNT(*)) AS score
	FROM
	withP
	WHERE
	(CorrByGene_mRNA >= 0.5)
	GROUP BY
	pathway
	HAVING
	n>=15
	ORDER BY
	score DESC
No results found