Created
February 13, 2018 04:03
-
-
Save seandavi/63494d4081f9f7828985b3ac934f9533 to your computer and use it in GitHub Desktop.
Preparing VCF file metadata for TARGET Osteosarcoma
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
library(RMySQL) | |
con = dbConnect(MySQL(),user='USERNAME',password='PASSWORD',host='solexadb.XXXXXXXXXX.us-east-1.rds.amazonaws.com',port=3306,dbname='solexa') | |
res = dbGetQuery(con, | |
"select study_id,source.name as source_name, sample.name as sample_name, fcb.type as software, | |
fcb.dateStamp as basecalldate, fcb.softwareVersion as version, study_id, | |
sr.date as run_date, sr.sequencer as sequencer, sm.model, ssr.library_id as library_id, | |
sr.ID as run_id, sample.ID as sample_id, nt.value as sample_source, | |
sample.sample_type, part.value as partitioning_method, ssr.ID as read_group_id | |
from solexa_sample as sample | |
join solexa_source_sample as sss on sss.sample_id=sample.ID | |
join solexa_source as source on sss.source_id=source.ID | |
join solexa_sample_library as sslib on sslib.sample_id=sample.ID | |
join solexa_studyresult as ssr on ssr.library_id=sslib.library_id | |
join solexa_lane as slane on ssr.lane_id=slane.ID | |
join solexa_run as sr on sr.ID = slane.run_id | |
join solexa_variable_sample as svs on svs.sample_id=sample.ID | |
join solexa_variable as sv on svs.variable_id=sv.ID | |
join flowCellBasecall as fcb on fcb.id = ssr.`basecall_id` | |
join solexa_variable_library as svl on ssr.library_id=svl.library_id | |
join solexa_machine sm on sr.sequencer=sm.name | |
left join (select sv.ID as variable_id,value | |
from solexa_variable as sv where name='Genome Partition') as part | |
on part.variable_id=svl.variable_id | |
left join (select sv.ID as variable_id,value | |
from solexa_variable as sv where name='Normal/Tumor') as nt | |
on nt.variable_id=svs.variable_id | |
where study_id in (16,25,94,152,128,179,124,166) | |
and ssr.include='yes' | |
") | |
res[[3]] = gsub('_','-',res[[3]]) | |
res[[3]] = gsub('(0[0-9])$','\\1D',res[[3]]) | |
colnames(res)[3] = 'samplename' | |
res = res %>% filter(!is.na(sample_source)) %>% unique() | |
captureTrans = function(partition_method) { | |
retmodel = rep('',length(partition_method)) | |
retmodel[grepl("TARGETOsteo020416_Valid_BAC_KlenowBait",partition_method)] = "LCC" | |
retmodel[grepl('SeqCap_141001_HG19_TargetOsteo1807_EZ',partition_method)] = 'Roche' | |
retmodel[partition_method %in% c('Illumina_Nextera_Rapid_Capture_Exome', | |
'Illumina_TruSeq_Exome_Enrichment_Kit', | |
'Agilent_SureSelect_Human_All_Exon_50Mb_Kit')] = 'Exome' | |
retmodel | |
} | |
res = res[res$sample_type != 'mRNA',] | |
res$capture_strategy = captureTrans(res$partitioning_method) | |
vcf_manifest = readr::read_delim('~/Downloads/vcf_manifest_md5.txt',delim="/",col_names=FALSE) | |
discover_validation_map = data.frame(directory = sub('TargetOsteo','',vcf_manifest[[5]]), | |
capture_strategy = sub('Discovery|Validation','',sub('TargetOsteo','',vcf_manifest[[5]])), | |
filename = vcf_manifest[[8]], | |
source_name = substr(vcf_manifest[[8]],1,6), | |
sample_source = ifelse(grepl('Tumor',vcf_manifest[[8]]),'Tumor','Normal'), | |
stringsAsFactors = FALSE) | |
tmpmap = discover_validation_map[discover_validation_map$capture_strategy=='',] | |
tmpmap$capture_strategy = 'Exome' | |
discover_validation_map = rbind(discover_validation_map, | |
tmpmap) | |
res2 = res[,-1] %>% left_join(discover_validation_map) | |
readr::write_tsv(res2,'target_vcf_metadata.tsv',col_names=TRUE) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment