smrgit · March 22, 2017 16:38 · smrgit · Mar 22, 2017 · smrgit · Mar 22, 2017
diff --git a/BRCA_CPTAC_RNAseq_corr.sql b/BRCA_CPTAC_RNAseq_corr.sql
 WITH
  -- first we get the 77 samples that passed the QC tests
  qcSet AS (
  SELECT
    TCGA_case_ID AS case_barcode
  FROM
    `isb-cgc.hg19_data_previews.TCGA_Breast_SuppTable01`
  WHERE
    QC_Status="pass" ),
  --
  -- next we extract the sample_barcode, gene, and logRatio from the
  -- CPTAC BRCA proteome iTRAQ table, while at the same time doing a
  -- JOIN with the qcSet in order to filter out samples that did not
  -- pass the QC tets
  -- this query returns 720196 avgLogRatio values for 77 samples and
  -- 10599 genes -- on average 9353 genes are detected for each sample
  p AS (
  SELECT
    a.sample_barcode,
    gene,
    -- the AVG() function is used here for the 3 samples which
    -- were assayed twice; to make sure that we have just one
    -- value per sample per gene
    AVG(unshared_logRatio) AS avgLogRatio
  FROM
    `isb-cgc.hg19_data_previews.TCGA_Breast_BI_Proteom_CDAP_r2_itraq` a
  JOIN
    qcSet b
  ON
    SUBSTR(a.sample_barcode,1,12)=b.case_barcode
  GROUP BY
    sample_barcode,
    gene ),
  --
  -- now we get the expression data from the UNC/RNAseq dataset
  -- while filtering, again using a JOIN with the table created above
  -- the RNAseq
  -- expression table has ~228M rows but we want less than 1M
  -- of those for this analysis
  -- we also filter out any samples/genes where the RNAseq
  -- read-count is 0
  -- from this stage, we will wind up with 710493 rows, with data
  -- for 77 samples and 10419 genes
  j AS (
  SELECT
    g.sample_barcode AS sample,
    g.HGNC_gene_symbol AS gene,
    LOG10(normalized_count+1) AS RNAseq_logCount,
    p.avgLogRatio AS iTRAQ_logRatio
  FROM
    `isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM` g
  JOIN
    p
  ON
    SUBSTR(g.sample_barcode,1,15)=SUBSTR(p.sample_barcode,1,15)
    AND g.HGNC_gene_symbol=p.gene
  WHERE
    normalized_count>0),
  c AS (
  SELECT
    CORR(iTRAQ_logRatio,
      RNAseq_logCount) AS corrByGene,
    COUNT(*) AS n,
    gene
  FROM
    j
  GROUP BY
    gene
  HAVING
    n >=20 )
 SELECT
  *
 FROM
  c
 ORDER BY
  corrByGene DESC
	WITH
	-- first we get the 77 samples that passed the QC tests
	qcSet AS (
	SELECT
	TCGA_case_ID AS case_barcode
	FROM
	`isb-cgc.hg19_data_previews.TCGA_Breast_SuppTable01`
	WHERE
	QC_Status="pass" ),
	--
	-- next we extract the sample_barcode, gene, and logRatio from the
	-- CPTAC BRCA proteome iTRAQ table, while at the same time doing a
	-- JOIN with the qcSet in order to filter out samples that did not
	-- pass the QC tets
	-- this query returns 720196 avgLogRatio values for 77 samples and
	-- 10599 genes -- on average 9353 genes are detected for each sample
	p AS (
	SELECT
	a.sample_barcode,
	gene,
	-- the AVG() function is used here for the 3 samples which
	-- were assayed twice; to make sure that we have just one
	-- value per sample per gene
	AVG(unshared_logRatio) AS avgLogRatio
	FROM
	`isb-cgc.hg19_data_previews.TCGA_Breast_BI_Proteom_CDAP_r2_itraq` a
	JOIN
	qcSet b
	ON
	SUBSTR(a.sample_barcode,1,12)=b.case_barcode
	GROUP BY
	sample_barcode,
	gene ),
	--
	-- now we get the expression data from the UNC/RNAseq dataset
	-- while filtering, again using a JOIN with the table created above
	-- the RNAseq
	-- expression table has ~228M rows but we want less than 1M
	-- of those for this analysis
	-- we also filter out any samples/genes where the RNAseq
	-- read-count is 0
	-- from this stage, we will wind up with 710493 rows, with data
	-- for 77 samples and 10419 genes
	j AS (
	SELECT
	g.sample_barcode AS sample,
	g.HGNC_gene_symbol AS gene,
	LOG10(normalized_count+1) AS RNAseq_logCount,
	p.avgLogRatio AS iTRAQ_logRatio
	FROM
	`isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM` g
	JOIN
	p
	ON
	SUBSTR(g.sample_barcode,1,15)=SUBSTR(p.sample_barcode,1,15)
	AND g.HGNC_gene_symbol=p.gene
	WHERE
	normalized_count>0),
	c AS (
	SELECT
	CORR(iTRAQ_logRatio,
	RNAseq_logCount) AS corrByGene,
	COUNT(*) AS n,
	gene
	FROM
	j
	GROUP BY
	gene
	HAVING
	n >=20 )
	SELECT
	*
	FROM
	c
	ORDER BY
	corrByGene DESC
No results found