Last active
April 16, 2017 22:55
-
-
Save smrgit/9f810d185c65d8f7c7eea049721b6c56 to your computer and use it in GitHub Desktop.
find most variable miRNAs (by MIMAT accession)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| WITH | |
| -- first we just extract all of the rows corresponding to mature miR's | |
| d1 AS ( | |
| SELECT | |
| sample_barcode, | |
| aliquot_barcode, | |
| chromosome, | |
| start_pos, | |
| end_pos, | |
| strand, | |
| mirna_id, | |
| mirna_accession, | |
| reads_per_million_miRNA_mapped AS rpm FROM | |
| -- query either the "hg19" or the "hg38" isoform (aka isomir) | |
| -- expression table here | |
| -- further down, use the v20 reference table for the hg19 data, | |
| -- or the v21 reference table for the hg38 table | |
| `isb-cgc.TCGA_hg38_data_v0.miRNAseq_Isoform_Expression` | |
| WHERE | |
| mirna_transcript="mature" ), | |
| -- next, we want to collapse any samples that have multiple aliquots | |
| -- (there are not very many) | |
| d2 AS ( | |
| SELECT | |
| sample_barcode, | |
| chromosome, | |
| start_pos, | |
| end_pos, | |
| strand, | |
| mirna_id, | |
| mirna_accession, | |
| MIN(rpm) AS minRPM, | |
| MAX(rpm) AS maxRPM, | |
| AVG(rpm) AS avgRPM | |
| FROM | |
| d1 | |
| GROUP BY | |
| sample_barcode, | |
| chromosome, | |
| start_pos, | |
| end_pos, | |
| strand, | |
| mirna_id, | |
| mirna_accession ), | |
| -- and then we sum the RPM over all isoforms grouping by mirna_id, mirna_accession, and mirna_transcript | |
| -- for samples with multiple aliquots, we take the max RPM value over the aliquots | |
| d3 AS ( | |
| SELECT | |
| sample_barcode, | |
| mirna_accession, | |
| SUM(maxRPM) AS sumRPM | |
| FROM | |
| d2 | |
| GROUP BY | |
| sample_barcode, | |
| mirna_accession ), | |
| -- and next we compute the standard deviation ... | |
| -- and require that the miR is observed in at least 1000 samples | |
| d4 AS ( | |
| SELECT | |
| mirna_accession, | |
| STDDEV(LOG(sumRPM+1)) AS sigmaRPM, | |
| COUNT(*) AS n | |
| FROM | |
| d3 | |
| GROUP BY | |
| mirna_accession | |
| HAVING | |
| n >=1000 ) | |
| -- finally, join in the mature miRNA symbol (based on the MIMAT accession), | |
| -- and sort on sigmaRPM (descending) | |
| SELECT | |
| a.mirna_accession, | |
| b.product_symbol, | |
| a.sigmaRPM, | |
| a.n | |
| FROM | |
| d4 a LEFT JOIN | |
| -- use the v20 reference for the hg19 data, | |
| -- or the v21 reference for the hg38 data | |
| `isb-cgc.genome_reference.miRBase_v21` b | |
| ON | |
| a.mirna_accession=b.product_accession | |
| WHERE | |
| sigmaRPM > 1.4 | |
| GROUP BY | |
| mirna_accession, | |
| product_symbol, | |
| sigmaRPM, | |
| n | |
| ORDER BY | |
| sigmaRPM DESC, | |
| n DESC |
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
modified to work explicitly with "mature" miRNA products