smrgit · April 16, 2017 22:55 · smrgit · Apr 16, 2017
diff --git a/high_var_mirna.sql b/high_var_mirna.sql
 WITH
  -- first we just extract all of the rows corresponding to mature miR's
  d1 AS (
  SELECT
    sample_barcode,
    aliquot_barcode,
    chromosome,
    start_pos,
    end_pos,
    strand,
    mirna_id,
    mirna_accession,
    reads_per_million_miRNA_mapped AS rpm FROM
    -- query either the "hg19" or the "hg38" isoform (aka isomir)
    -- expression table here
    -- further down, use the v20 reference table for the hg19 data,
    -- or the v21 reference table for the hg38 table
    `isb-cgc.TCGA_hg38_data_v0.miRNAseq_Isoform_Expression`
  WHERE
    mirna_transcript="mature" ),
  -- next, we want to collapse any samples that have multiple aliquots
  -- (there are not very many)
  d2 AS (
  SELECT
    sample_barcode,
    chromosome,
    start_pos,
    end_pos,
    strand,
    mirna_id,
    mirna_accession,
    MIN(rpm) AS minRPM,
    MAX(rpm) AS maxRPM,
    AVG(rpm) AS avgRPM
  FROM
    d1
  GROUP BY
    sample_barcode,
    chromosome,
    start_pos,
    end_pos,
    strand,
    mirna_id,
    mirna_accession ),
  -- and then we sum the RPM over all isoforms grouping by mirna_id, mirna_accession, and mirna_transcript
  -- for samples with multiple aliquots, we take the max RPM value over the aliquots
  d3 AS (
  SELECT
    sample_barcode,
    mirna_accession,
    SUM(maxRPM) AS sumRPM
  FROM
    d2
  GROUP BY
    sample_barcode,
    mirna_accession ),
  -- and next we compute the standard deviation ...
  -- and require that the miR is observed in at least 1000 samples
  d4 AS (
  SELECT
    mirna_accession,
    STDDEV(LOG(sumRPM+1)) AS sigmaRPM,
    COUNT(*) AS n
  FROM
    d3
  GROUP BY
    mirna_accession
  HAVING
    n >=1000 )
  -- finally, join in the mature miRNA symbol (based on the MIMAT accession),
  -- and sort on sigmaRPM (descending)
 SELECT
  a.mirna_accession,
  b.product_symbol,
  a.sigmaRPM,
  a.n
 FROM
  d4 a LEFT JOIN
  -- use the v20 reference for the hg19 data,
  -- or the v21 reference for the hg38 data
  `isb-cgc.genome_reference.miRBase_v21` b
 ON
  a.mirna_accession=b.product_accession
 WHERE
  sigmaRPM > 1.4
 GROUP BY
  mirna_accession,
  product_symbol,
  sigmaRPM,
  n
 ORDER BY
  sigmaRPM DESC,
  n DESC
	WITH
	-- first we just extract all of the rows corresponding to mature miR's
	d1 AS (
	SELECT
	sample_barcode,
	aliquot_barcode,
	chromosome,
	start_pos,
	end_pos,
	strand,
	mirna_id,
	mirna_accession,
	reads_per_million_miRNA_mapped AS rpm FROM
	-- query either the "hg19" or the "hg38" isoform (aka isomir)
	-- expression table here
	-- further down, use the v20 reference table for the hg19 data,
	-- or the v21 reference table for the hg38 table
	`isb-cgc.TCGA_hg38_data_v0.miRNAseq_Isoform_Expression`
	WHERE
	mirna_transcript="mature" ),
	-- next, we want to collapse any samples that have multiple aliquots
	-- (there are not very many)
	d2 AS (
	SELECT
	sample_barcode,
	chromosome,
	start_pos,
	end_pos,
	strand,
	mirna_id,
	mirna_accession,
	MIN(rpm) AS minRPM,
	MAX(rpm) AS maxRPM,
	AVG(rpm) AS avgRPM
	FROM
	d1
	GROUP BY
	sample_barcode,
	chromosome,
	start_pos,
	end_pos,
	strand,
	mirna_id,
	mirna_accession ),
	-- and then we sum the RPM over all isoforms grouping by mirna_id, mirna_accession, and mirna_transcript
	-- for samples with multiple aliquots, we take the max RPM value over the aliquots
	d3 AS (
	SELECT
	sample_barcode,
	mirna_accession,
	SUM(maxRPM) AS sumRPM
	FROM
	d2
	GROUP BY
	sample_barcode,
	mirna_accession ),
	-- and next we compute the standard deviation ...
	-- and require that the miR is observed in at least 1000 samples
	d4 AS (
	SELECT
	mirna_accession,
	STDDEV(LOG(sumRPM+1)) AS sigmaRPM,
	COUNT(*) AS n
	FROM
	d3
	GROUP BY
	mirna_accession
	HAVING
	n >=1000 )
	-- finally, join in the mature miRNA symbol (based on the MIMAT accession),
	-- and sort on sigmaRPM (descending)
	SELECT
	a.mirna_accession,
	b.product_symbol,
	a.sigmaRPM,
	a.n
	FROM
	d4 a LEFT JOIN
	-- use the v20 reference for the hg19 data,
	-- or the v21 reference for the hg38 data
	`isb-cgc.genome_reference.miRBase_v21` b
	ON
	a.mirna_accession=b.product_accession
	WHERE
	sigmaRPM > 1.4
	GROUP BY
	mirna_accession,
	product_symbol,
	sigmaRPM,
	n
	ORDER BY
	sigmaRPM DESC,
	n DESC
No results found