Skip to content

Instantly share code, notes, and snippets.

@seandavi
Created February 13, 2018 04:03
Show Gist options
  • Save seandavi/63494d4081f9f7828985b3ac934f9533 to your computer and use it in GitHub Desktop.
Save seandavi/63494d4081f9f7828985b3ac934f9533 to your computer and use it in GitHub Desktop.
Preparing VCF file metadata for TARGET Osteosarcoma
library(dplyr)
library(RMySQL)
con = dbConnect(MySQL(),user='USERNAME',password='PASSWORD',host='solexadb.XXXXXXXXXX.us-east-1.rds.amazonaws.com',port=3306,dbname='solexa')
res = dbGetQuery(con,
"select study_id,source.name as source_name, sample.name as sample_name, fcb.type as software,
fcb.dateStamp as basecalldate, fcb.softwareVersion as version, study_id,
sr.date as run_date, sr.sequencer as sequencer, sm.model, ssr.library_id as library_id,
sr.ID as run_id, sample.ID as sample_id, nt.value as sample_source,
sample.sample_type, part.value as partitioning_method, ssr.ID as read_group_id
from solexa_sample as sample
join solexa_source_sample as sss on sss.sample_id=sample.ID
join solexa_source as source on sss.source_id=source.ID
join solexa_sample_library as sslib on sslib.sample_id=sample.ID
join solexa_studyresult as ssr on ssr.library_id=sslib.library_id
join solexa_lane as slane on ssr.lane_id=slane.ID
join solexa_run as sr on sr.ID = slane.run_id
join solexa_variable_sample as svs on svs.sample_id=sample.ID
join solexa_variable as sv on svs.variable_id=sv.ID
join flowCellBasecall as fcb on fcb.id = ssr.`basecall_id`
join solexa_variable_library as svl on ssr.library_id=svl.library_id
join solexa_machine sm on sr.sequencer=sm.name
left join (select sv.ID as variable_id,value
from solexa_variable as sv where name='Genome Partition') as part
on part.variable_id=svl.variable_id
left join (select sv.ID as variable_id,value
from solexa_variable as sv where name='Normal/Tumor') as nt
on nt.variable_id=svs.variable_id
where study_id in (16,25,94,152,128,179,124,166)
and ssr.include='yes'
")
res[[3]] = gsub('_','-',res[[3]])
res[[3]] = gsub('(0[0-9])$','\\1D',res[[3]])
colnames(res)[3] = 'samplename'
res = res %>% filter(!is.na(sample_source)) %>% unique()
captureTrans = function(partition_method) {
retmodel = rep('',length(partition_method))
retmodel[grepl("TARGETOsteo020416_Valid_BAC_KlenowBait",partition_method)] = "LCC"
retmodel[grepl('SeqCap_141001_HG19_TargetOsteo1807_EZ',partition_method)] = 'Roche'
retmodel[partition_method %in% c('Illumina_Nextera_Rapid_Capture_Exome',
'Illumina_TruSeq_Exome_Enrichment_Kit',
'Agilent_SureSelect_Human_All_Exon_50Mb_Kit')] = 'Exome'
retmodel
}
res = res[res$sample_type != 'mRNA',]
res$capture_strategy = captureTrans(res$partitioning_method)
vcf_manifest = readr::read_delim('~/Downloads/vcf_manifest_md5.txt',delim="/",col_names=FALSE)
discover_validation_map = data.frame(directory = sub('TargetOsteo','',vcf_manifest[[5]]),
capture_strategy = sub('Discovery|Validation','',sub('TargetOsteo','',vcf_manifest[[5]])),
filename = vcf_manifest[[8]],
source_name = substr(vcf_manifest[[8]],1,6),
sample_source = ifelse(grepl('Tumor',vcf_manifest[[8]]),'Tumor','Normal'),
stringsAsFactors = FALSE)
tmpmap = discover_validation_map[discover_validation_map$capture_strategy=='',]
tmpmap$capture_strategy = 'Exome'
discover_validation_map = rbind(discover_validation_map,
tmpmap)
res2 = res[,-1] %>% left_join(discover_validation_map)
readr::write_tsv(res2,'target_vcf_metadata.tsv',col_names=TRUE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment