Skip to content

Instantly share code, notes, and snippets.

@smrgit
Last active April 14, 2017 17:44
Show Gist options
  • Select an option

  • Save smrgit/b578430edec4e8a2d3fbfee607d656ab to your computer and use it in GitHub Desktop.

Select an option

Save smrgit/b578430edec4e8a2d3fbfee607d656ab to your computer and use it in GitHub Desktop.
GQ GDC hg19 vs hg38 miRNA correlation analysis -- at the stemloop expression level
WITH
hg38_d1 AS (
-- we start with a table at the aliquot level, in case there are multiple aliquots
-- for a single sample;
SELECT
sample_barcode,
aliquot_barcode,
mirna_id,
reads_per_million_miRNA_mapped AS RPM
FROM
`isb-cgc.TCGA_hg38_data_v0.miRNAseq_Expression` ),
hg38_d2 AS (
-- next we do a MAX() over the RPM values from the aliquots and just
-- keep the sample_barcode; and we're also going to do a LOG()
-- AND we are going to toss OUT (sample,mirna) pairs with ZERO reads
SELECT
sample_barcode,
mirna_id,
LOG(MAX(RPM)) AS logRPM
FROM
hg38_d1
WHERE
RPM>0
GROUP BY
sample_barcode,
mirna_id ),
hg19_d1 AS (
-- now we're going to do the same with the hg19 data ...
SELECT
sample_barcode,
aliquot_barcode,
mirna_id,
reads_per_million_miRNA_mapped AS RPM
FROM
`isb-cgc.TCGA_hg19_data_v0.miRNAseq_Expression` ),
hg19_d2 AS (
SELECT
sample_barcode,
mirna_id,
LOG(MAX(RPM)) AS logRPM
FROM
hg19_d1
WHERE
RPM>0
GROUP BY
sample_barcode,
mirna_id ),
j1 AS (
SELECT
a.logRPM AS hg19_logRPM,
b.logRPM AS hg38_logRPM,
b.sample_barcode AS sample_barcode,
b.mirna_id AS mirna_id
FROM
hg19_d2 a
JOIN
hg38_d2 b
ON
a.sample_barcode=b.sample_barcode
AND a.mirna_id=b.mirna_id ),
ct AS (
SELECT
mirna_id,
CORR(hg19_logRPM,
hg38_logRPM) AS corr,
COUNT(*) AS n
FROM
j1
GROUP BY
mirna_id
HAVING
n >= 1000 )
SELECT
*
FROM
ct
ORDER BY
corr DESC
@smrgit
Copy link
Copy Markdown
Author

smrgit commented Apr 14, 2017

updated to use new ISB-CGC tables

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment