Skip to content

Instantly share code, notes, and snippets.

@smrgit
smrgit / high_var_mirna.sql
Last active April 16, 2017 22:55
find most variable miRNAs (by MIMAT accession)
WITH
-- first we just extract all of the rows corresponding to mature miR's
d1 AS (
SELECT
sample_barcode,
aliquot_barcode,
chromosome,
start_pos,
end_pos,
strand,
@smrgit
smrgit / GDC_data_in_CloudStorage.sql
Last active September 15, 2017 17:05
This query joins two ISB-CGC GDC_metadata tables to get a summary of the GDC data that is available in ISB-CGC buckets in Google Cloud Storage.
WITH
t1 AS (
SELECT
dbName,
file_id AS file_gdc_id,
access,
cases__project__program__name AS program_name,
cases__project__project_id AS project_short_name,
experimental_strategy,
data_category,
@smrgit
smrgit / GDC_data_not_in_CloudStorage.sql
Last active September 15, 2017 17:16
Take a look at the types of open-access TXT and TSV data files that exist at the GDC but are *not* available in Google Cloud Storage.
WITH
t1 AS (
SELECT
dbName,
file_id AS file_gdc_id,
access,
cases__project__program__name AS program_name,
cases__project__project_id AS project_short_name,
experimental_strategy,
data_category,
@smrgit
smrgit / case_images_join.sql
Created April 28, 2017 00:56
Find radiology and diagnostic images for selected types of TCGA cases
WITH
cases AS (
SELECT
case_barcode
FROM
`isb-cgc.TCGA_bioclin_v0.Clinical`
WHERE
project_short_name="TCGA-LUSC"
OR project_short_name="TCGA-LUAD" ),
radImg AS (
@smrgit
smrgit / joinCorr_CPTAC_RPPA_mRNAseq.sql
Created May 16, 2017 22:58
join correlations between CPTAC data with RPPA and mRNA-Seq data all in one go!
WITH
-- first we get the 77 samples that passed the QC tests
qcSet AS (
SELECT
TCGA_case_ID AS case_barcode
FROM
`isb-cgc.hg19_data_previews.TCGA_Breast_SuppTable01`
WHERE
QC_Status="pass" ),
--
@smrgit
smrgit / use_protein_mRNA_corr_to_score_pathways.sql
Created May 17, 2017 19:23
use the wiki pathways table to "score" pathways using the correlation between the protein quantification and the mRNA-seq based expression estimates
WITH
-- first we get the 77 samples that passed the QC tests
qcSet AS (
SELECT
TCGA_case_ID AS case_barcode
FROM
`isb-cgc.hg19_data_previews.TCGA_Breast_SuppTable01`
WHERE
QC_Status="pass" ),
--
@smrgit
smrgit / hg19vs38_copy_number_mirna.sql
Created July 22, 2017 02:50
compare hg19 and hg38 copy-number segments at miRNA locations
WITH
hg38t1 AS (
SELECT
sample_barcode,
mirna_id AS mirna_name,
LOG(reads_per_million_miRNA_mapped+1,2) AS logRPM
FROM
`isb-cgc.TCGA_hg38_data_v0.miRNAseq_Expression`
WHERE
reads_per_million_miRNA_mapped > 4 ),
@smrgit
smrgit / GDC_rel8_metadata_query01.sql
Last active October 26, 2017 02:23
Identify GDC data available in ISB-CGC GCS buckets
WITH
t1 AS (
SELECT
DISTINCT file_gdc_id
FROM
`isb-cgc.GDC_metadata.rel8_GDCfileID_to_GCSurl` ),
j1 AS (
SELECT
a.dbName,
a.file_id,
@smrgit
smrgit / egress_01.sql
Created August 16, 2017 17:30
egress charges query
SELECT
dateString,
dayOfYear,
week,
year,
projectID,
productCat,
resourceType,
SUM(cost) AS totCost,
currency,
@smrgit
smrgit / find_AML_BAM_files.sql
Created September 13, 2017 20:08
GDC metadata exploration example
WITH
--
-- this initial table collects the case barcodes for all TCGA LAML and
-- TARGET AML cases from the two "Clinical" tables
-- Result: this sub-query returns a table with 1193 rows
t1 AS (
SELECT
program_name,
case_barcode
FROM