seandavi · February 13, 2018 04:03
diff --git a/prepTargetVCFMetadata.R b/prepTargetVCFMetadata.R
 library(dplyr)


 library(RMySQL)
 con = dbConnect(MySQL(),user='USERNAME',password='PASSWORD',host='solexadb.XXXXXXXXXX.us-east-1.rds.amazonaws.com',port=3306,dbname='solexa')
 res = dbGetQuery(con,
                 "select study_id,source.name as source_name, sample.name as sample_name, fcb.type as software,
                 fcb.dateStamp as basecalldate, fcb.softwareVersion as version, study_id,
                 sr.date as run_date, sr.sequencer as sequencer, sm.model, ssr.library_id as library_id,
                 sr.ID as run_id, sample.ID as sample_id, nt.value as sample_source,
                 sample.sample_type, part.value as partitioning_method, ssr.ID as read_group_id
                 from solexa_sample as sample
                 join solexa_source_sample as sss on sss.sample_id=sample.ID
                 join solexa_source as source on sss.source_id=source.ID
                 join solexa_sample_library as sslib on sslib.sample_id=sample.ID
                 join solexa_studyresult as ssr on ssr.library_id=sslib.library_id
                 join solexa_lane as slane on ssr.lane_id=slane.ID
                 join solexa_run as sr on sr.ID = slane.run_id
                 join solexa_variable_sample as svs on svs.sample_id=sample.ID
                 join solexa_variable as sv on svs.variable_id=sv.ID
                 join flowCellBasecall as fcb on fcb.id = ssr.`basecall_id`
                 join solexa_variable_library as svl on ssr.library_id=svl.library_id
                 join solexa_machine sm on sr.sequencer=sm.name
                 left join (select sv.ID as variable_id,value
                 from solexa_variable as sv where name='Genome Partition') as part
                 on part.variable_id=svl.variable_id
                 left join (select sv.ID as variable_id,value
                 from solexa_variable as sv where name='Normal/Tumor') as nt
                 on nt.variable_id=svs.variable_id
                 where study_id in (16,25,94,152,128,179,124,166)
                 and ssr.include='yes'
                 ")
 res[[3]] = gsub('_','-',res[[3]])
 res[[3]] = gsub('(0[0-9])$','\\1D',res[[3]])
 colnames(res)[3] = 'samplename'
 res = res %>% filter(!is.na(sample_source)) %>% unique()

 captureTrans = function(partition_method) {
  retmodel = rep('',length(partition_method))
  retmodel[grepl("TARGETOsteo020416_Valid_BAC_KlenowBait",partition_method)] = "LCC"
  retmodel[grepl('SeqCap_141001_HG19_TargetOsteo1807_EZ',partition_method)] = 'Roche'
  retmodel[partition_method %in% c('Illumina_Nextera_Rapid_Capture_Exome',
                                   'Illumina_TruSeq_Exome_Enrichment_Kit',
                                    'Agilent_SureSelect_Human_All_Exon_50Mb_Kit')] = 'Exome'
  retmodel
 }



 res = res[res$sample_type != 'mRNA',]
 res$capture_strategy = captureTrans(res$partitioning_method)

 vcf_manifest = readr::read_delim('~/Downloads/vcf_manifest_md5.txt',delim="/",col_names=FALSE)
 discover_validation_map = data.frame(directory = sub('TargetOsteo','',vcf_manifest[[5]]),
                                     capture_strategy = sub('Discovery|Validation','',sub('TargetOsteo','',vcf_manifest[[5]])),
                                     filename = vcf_manifest[[8]],
                                     source_name = substr(vcf_manifest[[8]],1,6),
                                     sample_source = ifelse(grepl('Tumor',vcf_manifest[[8]]),'Tumor','Normal'),
                                     stringsAsFactors = FALSE)
 tmpmap = discover_validation_map[discover_validation_map$capture_strategy=='',]
 tmpmap$capture_strategy = 'Exome'
 discover_validation_map = rbind(discover_validation_map,
                                tmpmap)
                                
 res2 = res[,-1] %>% left_join(discover_validation_map)

 readr::write_tsv(res2,'target_vcf_metadata.tsv',col_names=TRUE)
	library(dplyr)


	library(RMySQL)
	con = dbConnect(MySQL(),user='USERNAME',password='PASSWORD',host='solexadb.XXXXXXXXXX.us-east-1.rds.amazonaws.com',port=3306,dbname='solexa')
	res = dbGetQuery(con,
	"select study_id,source.name as source_name, sample.name as sample_name, fcb.type as software,
	fcb.dateStamp as basecalldate, fcb.softwareVersion as version, study_id,
	sr.date as run_date, sr.sequencer as sequencer, sm.model, ssr.library_id as library_id,
	sr.ID as run_id, sample.ID as sample_id, nt.value as sample_source,
	sample.sample_type, part.value as partitioning_method, ssr.ID as read_group_id
	from solexa_sample as sample
	join solexa_source_sample as sss on sss.sample_id=sample.ID
	join solexa_source as source on sss.source_id=source.ID
	join solexa_sample_library as sslib on sslib.sample_id=sample.ID
	join solexa_studyresult as ssr on ssr.library_id=sslib.library_id
	join solexa_lane as slane on ssr.lane_id=slane.ID
	join solexa_run as sr on sr.ID = slane.run_id
	join solexa_variable_sample as svs on svs.sample_id=sample.ID
	join solexa_variable as sv on svs.variable_id=sv.ID
	join flowCellBasecall as fcb on fcb.id = ssr.`basecall_id`
	join solexa_variable_library as svl on ssr.library_id=svl.library_id
	join solexa_machine sm on sr.sequencer=sm.name
	left join (select sv.ID as variable_id,value
	from solexa_variable as sv where name='Genome Partition') as part
	on part.variable_id=svl.variable_id
	left join (select sv.ID as variable_id,value
	from solexa_variable as sv where name='Normal/Tumor') as nt
	on nt.variable_id=svs.variable_id
	where study_id in (16,25,94,152,128,179,124,166)
	and ssr.include='yes'
	")
	res[[3]] = gsub('_','-',res[[3]])
	res[[3]] = gsub('(0[0-9])$','\\1D',res[[3]])
	colnames(res)[3] = 'samplename'
	res = res %>% filter(!is.na(sample_source)) %>% unique()

	captureTrans = function(partition_method) {
	retmodel = rep('',length(partition_method))
	retmodel[grepl("TARGETOsteo020416_Valid_BAC_KlenowBait",partition_method)] = "LCC"
	retmodel[grepl('SeqCap_141001_HG19_TargetOsteo1807_EZ',partition_method)] = 'Roche'
	retmodel[partition_method %in% c('Illumina_Nextera_Rapid_Capture_Exome',
	'Illumina_TruSeq_Exome_Enrichment_Kit',
	'Agilent_SureSelect_Human_All_Exon_50Mb_Kit')] = 'Exome'
	retmodel
	}



	res = res[res$sample_type != 'mRNA',]
	res$capture_strategy = captureTrans(res$partitioning_method)

	vcf_manifest = readr::read_delim('~/Downloads/vcf_manifest_md5.txt',delim="/",col_names=FALSE)
	discover_validation_map = data.frame(directory = sub('TargetOsteo','',vcf_manifest[[5]]),
	capture_strategy = sub('Discovery\|Validation','',sub('TargetOsteo','',vcf_manifest[[5]])),
	filename = vcf_manifest[[8]],
	source_name = substr(vcf_manifest[[8]],1,6),
	sample_source = ifelse(grepl('Tumor',vcf_manifest[[8]]),'Tumor','Normal'),
	stringsAsFactors = FALSE)
	tmpmap = discover_validation_map[discover_validation_map$capture_strategy=='',]
	tmpmap$capture_strategy = 'Exome'
	discover_validation_map = rbind(discover_validation_map,
	tmpmap)

	res2 = res[,-1] %>% left_join(discover_validation_map)

	readr::write_tsv(res2,'target_vcf_metadata.tsv',col_names=TRUE)