Created
September 10, 2015 10:06
-
-
Save cth/a6b6a846c6fe0cf0f2d5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
#Joseph Glessner | |
=head | |
Inputs: | |
PennCNV .log -> LRR_SD, GCWF | |
PennCNV .rawcnv -> CountCNV | |
Plink --missing .imiss / Illumina LIMs Project Detail Report .csv / Illumina Genome Studio Samples Table / General format SampleID CallRate -> CallRate | |
Eigenstrat smartPCA .pca.evec / Plink .mds -> PCA | |
Plink --genome .genome -> PI_HAT relatedness | |
make sure to include extremevalues_2.2.tar.gz http://cran.r-project.org/web/packages/extremevalues/index.html | |
For Illumina 550k data and related Illumina chip platforms, the key data quality metric thresholds we have observed are: call rate > 98%, SD LRR < 0.3, |GCWF| < 0.05, and count CNV< 100. | |
For Affymetrix 6.0 data, these measures include: call rate > 96%, SD LRR < 0.35, |GCWF| < 0.02, and count CNV < 80. | |
In addition, observations of quality metric modes from individual labs and sample sources are advisable to determine appropriate QC thresholds. | |
The distribution of these metric measures are constantly reviewed to include only those who fall within a linear mode of the quality metric outside exponential modes for any given genotyping platform. | |
Run once with no thresholds specified and review automatically determined thresholds based on outliers in your dataset. | |
Then you may specify some adjusted threshold for a given quality metric based on your review of the plots. | |
=cut | |
if(@ARGV<1) | |
{ | |
print "USAGE: | |
perl ParseCNV_QC.pl [arguments] | |
PLEASE RUN \"export R_LIBS=./\" first. | |
Arguments: | |
--log PennCNV .log | |
--rawcnv PennCNV .rawcnv | |
--callrate Plink --missing .imiss OR Illumina LIMs Project Detail Report .csv OR Illumina Genome Studio Samples Table OR General format SampleID CallRate | |
--popstrat Eigenstrat smartPCA .pca.evec OR Plink .mds | |
--related Plink --genome .genome | |
--out An output file prefix to label this run | |
Run once with no thresholds specified and review automatically determined thresholds based on outliers in your dataset. | |
Then you may specify some adjusted threshold for a given quality metric based on your review of the plots. | |
--qccallrate <float> Call Rate threshold for inclusion of the sample | |
--qclrrsd <float> LRR_SD threshold for inclusion of the sample | |
--qcgcwf <float> GCWF threshold for inclusion of the sample | |
--qcnumcnv <int> number of CNV calls for inclusion of the sample | |
--qcrelated <float> sample relatedness for inclusion of the sample | |
--popstrat is just for plotting, no exclusion is currently supported based on popstrat. | |
A tab delimited QC Summary is provided: <myLog>_QC_Vals_Header.log | |
"; | |
=head | |
=== Options under development === | |
--numsnp <int> minimum number of SNPs in CNV calls | |
--length <int> minimum length of CNV calls (suffix of k or m is okay) | |
--confidence <float> minimum confidence score of CNV calls | |
Other available PennCNV log quality metrics I do not use typically: LRR_mean (low/high), BAF_mean(low/high), BAF_SD(high), BAF_DRIFT(high), WF(low/high) | |
--qcbafdrift <float> BAF_DRIFT threshold for inclusion of the sample | |
--qclrrmean <float> LRR_mean threshold for inclusion of the sample | |
--qcbafmean <float> BAF_mean threshold for inclusion of the sample | |
--qcbafsd <float> BAF_SD threshold for inclusion of the sample | |
--qcbafdrift <float> BAF_DRIFT threshold for inclusion of the sample | |
--qcwf <float> WF threshold for inclusion of the sample | |
=cut | |
exit; | |
} | |
use Getopt::Long; | |
GetOptions('log=s'=>\$log,'rawcnv=s'=>\$rawcnv,'callrate=s'=>\$callrate,'popstrat=s'=>\$popstrat,'related=s'=>\$related,'out=s'=>\$out,'qccallrate=s'=>\$qccallrate,'qclrrsd=s'=>\$qclrrsd,'qcgcwf=s'=>\$qcgcwf,'qcnumcnv=s'=>\$qcnumcnv,'qcrelated=s'=>\$qcrelated); | |
##perl ParseCNV_QC.pl --log PennCNV_Omni1.log --rawcnv file.rawcnv --callrate PCGC_Omni25-8v1_BED_Miss.imiss --popstrat file.pca.evec --related PCGC_Omni25-8v1_BED_GenomeAll.genome | |
if(!($log)){$log=1;print "WARNING:--log is strongly recommended to include since it contains crucial quality metric LRR_SD\n";} ## Allow missing arguments | |
if(!($rawcnv)){$rawcnv=1;} | |
if(!($callrate)){$callrate=1;print "WARNING:--callrate is strongly recommended to include since it contains crucial quality metric CallRate. BAF_SD may be best alternative\n";} | |
if(!($popstrat)){$popstrat=1;} | |
if(!($related)){$related=1;} | |
if(!($qccallrate)){$qccallrate=-999;} | |
if(!($qclrrsd)){$qclrrsd=-999;} | |
if(!($qcgcwf)){$qcgcwf=-999;} | |
if(!($qcnumcnv)){$qcnumcnv=-999;} | |
if(!($qcrelated)){$qcrelated=.2;} | |
$c="grep summary $log | grep -v Xmean > $log"."_QC.log";print"$c\n";`$c`; #PennCNV_Omni1.log | |
$c="head -1 $log"."_QC.log | sed 's/=[^ ]* / /g' | sed 's/=.*//' | sed 's/.*for \\(.*\\):/ChipID/g' > $log" . "_Header";print"$c\n";`$c`; | |
$c="sed 's/ [^ ]*=/ /g' $log"."_QC.log | sed 's/.*for \\(.*\\):/\\1/g' > $log"."_QC_Vals.log";print"$c\n";`$c`; | |
$c="cat $log"."_Header $log"."_QC_Vals.log > $log"."_QC_Vals_Header.log";print"$c\n";`$c`; | |
$c="echo CountCNV ChipID> $rawcnv"."_CountCNVChipID.txt";print"$c\n";`$c`; | |
$c="awk '{print \$5}' $rawcnv | uniq --count | sed -e 's/^[ \\t]*//' >> $rawcnv"."_CountCNVChipID.txt";print"$c\n";`$c`; # file.rawcnv ../GeneBased/PQN_2+_8Cols.rawcnv_CLEANQC.rawcnv_CLEANQC.rawcnv_keep_PCGC_CaseIDs.txt | |
$c="perl PerlModules/GetCallRate.pl $callrate";print"$c\n";$output=`$c`;#### PCGC_Omni25-8v1_BED_Miss.imiss | |
print"$output\n"; | |
###perl GetCallRate.pl PCGC_PDR_7-19-12.csv | |
###perl GetCallRate.pl PCGC_Omni1_SamplesTable.txt | |
###file.pca.evec file.mds | |
$c="head -1 $popstrat";print"$c\n";$output=`$c`; | |
# #eigvals: 199.492 39.213 9.287 7.900 3.519 3.015 2.579 2.347 2.335 2.324 | |
# FID IID SOL C1 C2 C3 C4 | |
#die("horribly"); | |
if($output=~/eigvals/) | |
{ | |
print "MATCH PCA\n"; | |
$c="cp $popstrat $popstrat"."_Standard.txt";print"$c\n";$output=`$c`; | |
$PCAOrMDS="PCA"; | |
} | |
elsif($output=~/FID/) | |
{ | |
print "MATCH MDS\n"; | |
$c="awk '{print \$2\"\\t\"\$4\"\\t\"\$5\"\\t\"\$6\"\\t\"\$7}' $popstrat > $popstrat"."_Standard.txt";print"$c\n";$output=`$c`; | |
$PCAOrMDS="MDS"; | |
} | |
else | |
{ | |
print "Unrecognized population stratification file format $popstrat\n"; | |
} | |
###PCGC_Omni25-8v1_BED_GenomeAll.genome | |
###$qcrelated was .2 | |
$c="awk '{if(\$10>$qcrelated){print \$2\"\\t\"\$4\"\\t\"\$10}}' $related > $related"."_3Col.genome";print"$c\n";$output=`$c`; ###Added condition to lessen computational burden of displaying so many points | |
##Count IDs to determine minimal exclusions | |
$c="awk '{print \$2\"\\t\"\$4\"\\t\"\$10}' $related | awk '{if(\$3>$qcrelated){print \$1; print \$2}}' | sort | uniq --count| sed -e 's/^[ \\t]*//' | sort -n -r> $related"."_Gt2PihatCountIDs.txt";print"$c\n";$output=`$c`; | |
unless( open( RELATEDFILE, $related."_3Col.genome")) | |
{ | |
print "File not found!\n"; | |
exit; | |
} | |
unless( open( COUNTIDFILE, $related."_Gt2PihatCountIDs.txt")) #### TO DO Start by removing IDs failing other QC metrics first | |
{ | |
print "File not found!\n"; | |
exit; | |
} | |
open(RELATEDQCREMOVEIDFILE,">$out"."QC_RemoveIDs.txt"); | |
print RELATEDQCREMOVEIDFILE "ChipIDRemove Value Metric\n"; | |
$PairsRemainingGt2Pihat=1;###Just initialize | |
#print"PairsResolvedByThisID:PairsRemainingGt2Pihat "; | |
for($i=0;$PairsRemainingGt2Pihat>0;$i++) | |
{ | |
if($i>0) | |
{ | |
$c="mv RemainingGt2Pihat.txt Gt2Pihat.txt";`$c`; | |
unless( open( RELATEDFILE, "Gt2Pihat.txt")) | |
{ | |
print "File not found!\n"; | |
exit; | |
} | |
} | |
$HighestCountIDLine=<COUNTIDFILE>; ### Next highest count ID taken each iteration of for loop | |
@Vals=split(/\s+/,$HighestCountIDLine); | |
$IDCount=$Vals[0]; | |
$HighestCountID=$Vals[1]; | |
open(RESFILE,">RemainingGt2Pihat.txt"); | |
$PairsRemainingGt2Pihat=0; | |
$PairsResolvedByThisID=0; | |
while($line = <RELATEDFILE>) | |
{ | |
chomp($line); | |
$line =~ s/[\r\n]+$//g; | |
@Vals=split(/\s+/,$line); | |
if($Vals[0] eq $HighestCountID || $Vals[1] eq $HighestCountID) | |
{ | |
$PairsResolvedByThisID++; | |
} | |
else | |
{ | |
print RESFILE "$line\n"; | |
$PairsRemainingGt2Pihat++; | |
} | |
} | |
if($PairsResolvedByThisID>0 && $HighestCountID ne "IID1" && $HighestCountID ne "IID2") | |
{print RELATEDQCREMOVEIDFILE "$HighestCountID $IDCount"."PiHat>$qcrelated PiHat>$qcrelated"."HighestCountIDIterativelyRemoved\n";}#print"$PairsResolvedByThisID:$PairsRemainingGt2Pihat "} | |
close RELATEDFILE; | |
close RESFILE; | |
} | |
close RELATEDQCREMOVEIDFILE; | |
close COUNTIDFILE; | |
print"\n"; | |
#$c="awk '{print \$1;print\$2}' $related"."_3Col.genome | sort |uniq --count >$related"."_AllIDCounts";print"$c\n";$output=`$c`; | |
#$c="R CMD INSTALL gWidgets_0.0-54.tar.gz -l ./";@output=`$c`; | |
#print "@output\n"; | |
#$c="R CMD INSTALL digest_0.6.8.tar.gz -l ./";@output=`$c`; | |
#print "@output\n"; | |
#$c="R CMD INSTALL gWidgetstcltk_0.0-55.tar.gz -l ./";@output=`$c`; | |
#print "@output\n"; | |
$c="R CMD INSTALL extremevalues_2.2.tar.gz -l ./";@output=`$c`; | |
print "@output\n"; | |
#print "PLEASE RUN \"export R_LIBS=./\" if you get \"Error: cannot execute system command R CMD BATCH .AllRes.R\"\n"; | |
#print "If still error, check .AllRes.Rout for specific error and submit to [email protected] if not resolvable.\n"; | |
$c="export R_LIBS=./";`$c`; ##Somehow does not work | |
open (R, ">$out.AllRes.R"); | |
print R <<END; | |
pdf(file=paste("$out","QC_Plot.pdf" ,sep=""), width=3,height=3, pointsize=6) | |
if("$log"!=1){a=read.table(paste("$log","_QC_Vals_Header.log",sep=""),header=TRUE, comment.char="")} | |
#one=read.table(paste("BafLrr/$out","AllRes.txt",sep=""),sep="\t") | |
if("$rawcnv"!=1){b=read.table(paste("$rawcnv","_CountCNVChipID.txt",sep=""),header=TRUE, comment.char="")} | |
if("$callrate"!=1){d=read.table(paste("$callrate","_CallRates.txt",sep=""),header=TRUE)} | |
if("$popstrat"!=1){e=read.table(paste("$popstrat","_Standard.txt",sep=""))} | |
if("$related"!=1){g=read.table(paste("$related","_3Col.genome",sep=""),header=TRUE)} | |
#install.packages("extremevalues_2.2.tgz",repos=FALSE) | |
#install.packages("extremevalues", dep=TRUE, lib=NULL) | |
par(mfrow=c(3,2),las=1,mar=c(4,4.2,1.5,1.3)) | |
library() | |
library(extremevalues) | |
if("$callrate"!=1){ | |
out <- getOutliers(sort(d\$CallRate),distribution="weibull") | |
x <- array(dim=c(1,length(d\$CallRate))) | |
x[out\$iLeft]="blue" | |
if(max(out\$iLeft)==-Inf) { out\$iLeft = 0 } | |
if(min(out\$iRight)==Inf) { out\$iRight = sum(!is.na(a\$CallRate)) } | |
if("$qccallrate"!=-999) { | |
out\$limit[1] <- as.numeric("$qccallrate") | |
out\$iLeft = c(1:(sum(d\$CallRate < as.numeric("$qccallrate")))) | |
x <- array(dim=c(1,length(d\$CallRate))) | |
x[out\$iLeft]="blue" | |
out\$nOut[1]=sum(d\$CallRate < as.numeric("$qccallrate")) | |
} | |
x[max(out\$iLeft)+1:length(d\$CallRate)]="black" | |
#plot(sort(d$CallRate),ylim=c(.9,1),col=x) | |
plot(sort(d\$CallRate),ylim=c(.85,1),col=x,main=paste("CallRate >",round(out\$limit[1],4),"Fail=", out\$nOut[1]))#######out\$iLeft)) | |
report<-head(d[sort.list(d[,2]), ],max(out\$iLeft)) | |
report[1:length(report\$CallRate),3]=paste("CallRate>",round(out\$limit[1],4),sep="") | |
###write.table("ChipIDRemove Value Metric", file = "QC_RemoveIDs.txt", append = TRUE,quote=FALSE,row.names=FALSE, col.names =FALSE) ### append false for the first one to start new file if rerun (Now PI_HAT related above > clears) | |
write.table(report, file = paste("$out","QC_RemoveIDs.txt",sep=""), append = TRUE,quote=FALSE,row.names=FALSE, col.names =FALSE) | |
} | |
if("$log"!=1){ | |
out <- getOutliers(sort(a\$LRR_SD),distribution="lognormal") | |
x <- array(dim=c(1,length(a\$LRR_SD))) | |
x[out\$iRight]="blue" | |
if(max(out\$iLeft)==-Inf) { out\$iLeft = 0 } | |
if(min(out\$iRight)==Inf) { out\$iRight = sum(!is.na(a\$LRR_SD)) } | |
if("$qclrrsd"!=-999) { | |
out\$limit[2] <- as.numeric("$qclrrsd") | |
out\$iRight = c(((length(a\$LRR_SD)+1)-(sum(a\$LRR_SD > as.numeric("$qclrrsd")))):(length(a\$LRR_SD))) | |
x <- array(dim=c(1,length(a\$LRR_SD))) | |
x[out\$iRight]="blue" | |
out\$nOut[2]=sum(a\$LRR_SD > as.numeric("$qclrrsd")) | |
} | |
x[1: min(out\$iRight)-1]="black" | |
#plot(sort(a\$LRR_SD), col=x) | |
plot(sort(a\$LRR_SD), col=x, main=paste("LRR_SD <",round(out\$limit[2],4),"Fail=", out\$nOut[2])) ###sort(a\$LRR_SD)[min(out\$iRight)] instead of round(out\$limit[2],4) | |
q_Lrrsd<-tail(a[order(a\$LRR_SD), ],max(out\$iRight)-min(out\$iRight)+1) | |
report<-q_Lrrsd[,c("ChipID","LRR_SD")] | |
report[1:length(report\$ChipID),3]= paste("LRR_SD<", round(out\$limit[2],4) ,sep="") | |
write.table(report, file = paste("$out","QC_RemoveIDs.txt",sep=""), append = TRUE,quote=FALSE,row.names=FALSE, col.names =FALSE) | |
if(sum(!is.na(a\$GCWF))>2) | |
{ | |
out <- getOutliers(sort(a\$GCWF),method="II") | |
x <- array(dim=c(1,length(a\$GCWF))) | |
x[out\$iRight]="blue" | |
x[out\$iLeft]="blue" | |
if(max(out\$iLeft)==-Inf) { out\$iLeft = 0 } | |
if(min(out\$iRight)==Inf) { out\$iRight = sum(!is.na(a\$GCWF)) } | |
if("$qcgcwf"!=-999) { | |
out\$limit[2] <- as.numeric("$qcgcwf") | |
out\$iRight = c(((length(a\$GCWF)+1)-(sum(a\$GCWF > as.numeric("$qcgcwf")))):(length(a\$GCWF))) | |
x <- array(dim=c(1,length(a\$GCWF))) | |
x[out\$iRight]="blue" | |
out\$nOut[2]=sum(a\$GCWF > as.numeric("$qcgcwf")) | |
out\$limit[1] <- as.numeric("-$qcgcwf") | |
out\$iLeft = c(1:(sum(a\$GCWF < as.numeric("-$qcgcwf")))) | |
###x <- array(dim=c(1,length(a\$GCWF))) | |
x[out\$iLeft]="blue" | |
out\$nOut[1]=sum(a\$GCWF < as.numeric("-$qcgcwf")) | |
} | |
x[(max(out\$iLeft)+1):(min(out\$iRight)-1)]="black" | |
#plot(sort(a\$GCWF), col=x,ylim=c(-.01,.01)) | |
plot(sort(a\$GCWF), col=x, main=paste(round(out\$limit[1],4),"< GCWF <",round(out\$limit[2],4),"Fail=", out\$nOut[1]+ out\$nOut[2])) # out\$limit was not correct #sort(a\$GCWF)[max(out\$iLeft)] AND sort(a\$GCWF)[min(out\$iRight)] prints closest to specified threshold | |
q_Gcwf_Low <-head(a[order(a\$GCWF), ],max(out\$iLeft)) | |
q_Gcwf_High<-tail(a[order(a\$GCWF), ], max(out\$iRight)-min(out\$iRight)+1) | |
report<-q_Gcwf_Low[,c("ChipID","GCWF")] | |
report[1:length(report\$ChipID),3]= paste(round(out\$limit[1],4),"<GCWF<",round(out\$limit[2],4),sep="") | |
write.table(report, file = paste("$out","QC_RemoveIDs.txt",sep=""), append = TRUE,quote=FALSE,row.names=FALSE, col.names =FALSE) | |
report<-q_Gcwf_High[,c("ChipID","GCWF")] | |
report[1:length(report\$ChipID),3]= paste(round(out\$limit[1],4),"<GCWF<",round(out\$limit[2],4),sep="") | |
write.table(report, file = paste("$out","QC_RemoveIDs.txt",sep=""), append = TRUE,quote=FALSE,row.names=FALSE, col.names =FALSE) | |
} | |
} | |
if("$rawcnv"!=1){ | |
out <- getOutliers(sort(b\$CountCNV),method="II") | |
x <- array(dim=c(1,length(b\$CountCNV))) | |
x[out\$iRight]="blue" | |
if(min(out\$iRight)==Inf) { out\$iRight = sum(!is.na(b\$CountCNV)) } | |
if("$qcnumcnv"!=-999) { | |
out\$limit[2] <- as.numeric("$qcnumcnv") | |
out\$iRight = c(((length(b\$CountCNV)+1)-(sum(b\$CountCNV > as.numeric("$qcnumcnv")))):(length(b\$CountCNV))) | |
x <- array(dim=c(1,length(b\$CountCNV))) | |
x[out\$iRight]="blue" | |
out\$nOut[2]=sum(b\$CountCNV > as.numeric("$qcnumcnv")) | |
x[1: min(out\$iRight)-1]="black" | |
#plot(sort(b\$CountCNV), col=x) | |
plot(sort(b\$CountCNV), col=x, main=paste("CountCNV <",round(out\$limit[2],4) ,"Fail=", out\$nOut[2])) ### was sort(b\$CountCNV)[min(out\$iRight)] | |
q_CountCNV <-tail(b[order(b\$CountCNV), ], max(out\$iRight)-min(out\$iRight)+1) | |
report<-q_CountCNV [,c("ChipID","CountCNV")] | |
report[1:length(report\$ChipID),3]= paste("CountCNV<", round(out\$limit[2],4) ,sep="") ### was sort(b\$CountCNV)[min(out\$iRight)] | |
write.table(report, file = paste("$out","QC_RemoveIDs.txt",sep=""), append = TRUE,quote=FALSE,row.names=FALSE, col.names =FALSE) | |
} | |
else{ | |
x[1: min(out\$iRight)-1]="black" | |
#plot(sort(b\$CountCNV), col=x) | |
plot(sort(b\$CountCNV), col=x, main=paste("CountCNV <",sort(b\$CountCNV)[min(out\$iRight)] ,"Fail=", out\$nOut[2])) | |
q_CountCNV <-tail(b[order(b\$CountCNV), ], max(out\$iRight)-min(out\$iRight)+1) | |
report<-q_CountCNV [,c("ChipID","CountCNV")] | |
report[1:length(report\$ChipID),3]= paste("CountCNV<", sort(b\$CountCNV)[min(out\$iRight)] ,sep="") | |
write.table(report, file = paste("$out","QC_RemoveIDs.txt",sep=""), append = TRUE,quote=FALSE,row.names=FALSE, col.names =FALSE) | |
} | |
} | |
if("$popstrat"!=1){ | |
mydata <- array(dim=c(length(e\$V2),2)) | |
mydata[1:length(e\$V2),1]=e\$V2 | |
mydata[1:length(e\$V2),2]=e\$V3 | |
fit <- kmeans(mydata, 3) | |
library(cluster) | |
clusplot(mydata, fit\$cluster, color=TRUE,main="$PCAOrMDS Clusters")##, shade=TRUE, labels=2, lines=0 http://www.statmethods.net/advstats/cluster.html | |
} | |
if("$related"!=1){ | |
out <- getOutliers(sort(g\$PI_HAT),method="II") | |
x <- array(dim=c(1,length(g\$PI_HAT))) | |
out\$iRight=1:length(g\$PI_HAT) ###Just <0.2 keep condition $qcrelated | |
x[out\$iRight]="blue" | |
###x[1: min(out\$iRight)-1]="black" ###Just <0.2 keep condition $qcrelated | |
#plot(sort(g\$PI_HAT), col=x) | |
out\$nOut[2]=length(g\$PI_HAT)###Just <0.2 keep condition $qcrelated | |
plot(sort(g\$PI_HAT), col=x, main=paste("PI_HAT <", sort(g\$PI_HAT)[min(out\$iRight)],"Fail=", out\$nOut[2])) | |
q_PI_HAT <-tail(g[order(g\$PI_HAT), ], max(out\$iRight)-min(out\$iRight)+1) | |
report<-tail(q_PI_HAT [,c("IID1","PI_HAT")], max(out\$iRight)-min(out\$iRight)+1) ##"IID2", Just exclude first ID , better would be exclude highest count occurrence IDs, then remove resolved pairs with that ID, then exclude first ID when count ID is all one | |
report[1:length(report\$IID1),3]=paste("PI_HAT<", sort(g\$PI_HAT)[min(out\$iRight)],sep="") | |
######write.table(report, file = "QC_RemoveIDs.txt", append = TRUE,quote=FALSE,row.names=FALSE, col.names =FALSE) | |
######remove high count IDs iteratively above to minimize number of samples removed | |
} | |
AllQC<-read.table(paste("$out","QC_RemoveIDs.txt",sep="") ,header=TRUE,comment.char="") | |
#par(xpd=NA) | |
##text(0,-.08,paste(length(table(AllQC\$ChipIDRemove)),"unique fail any QC"),adj=0,cex=2)##Difficult because plot axis ranges unknown and applies to all plots, not just PI_HAT | |
mtext(paste(length(table(AllQC\$ChipIDRemove)),"unique fail any QC"), 1, padj=2.4)###put text below | |
#par(xpd=FALSE) | |
dev.off() | |
pdf(file=paste("$out","QC_Plot_2.pdf",sep=""), width=3,height=3, pointsize=6) | |
#other available PennCNV log quality metrics LRR_mean (low/high), BAF_mean(low/high), BAF_SD(high), BAF_DRIFT(high), WF(low/high) | |
par(mfrow=c(3,2),las=1,mar=c(4,4.2,1.5,1.3)) | |
if("$log"!=1){ | |
out <- getOutliers(sort(a\$LRR_mean),method="II") | |
x <- array(dim=c(1,length(a\$LRR_mean))) | |
x[out\$iRight]="blue" | |
x[out\$iLeft]="blue" | |
if(max(out\$iLeft)==-Inf) { out\$iLeft = 0 } | |
if(min(out\$iRight)==Inf) { out\$iRight = sum(!is.na(a\$LRR_mean)) } | |
x[(max(out\$iLeft)+1):(min(out\$iRight)-1)]="black" | |
#plot(sort(a\$LRR_mean), col=x,ylim=c(-.01,.01)) | |
plot(sort(a\$LRR_mean), col=x, main=paste(sort(a\$LRR_mean)[max(out\$iLeft)+1],"<= LRR_mean <=",sort(a\$LRR_mean)[min(out\$iRight)-1],"Fail=", out\$nOut[1]+ out\$nOut[2])) # out\$limit was not correct | |
q_LRR_mean_Low <-head(a[order(a\$LRR_mean), ],max(out\$iLeft)) | |
q_LRR_mean_High<-tail(a[order(a\$LRR_mean), ], max(out\$iRight)-min(out\$iRight)+1) | |
report<-q_LRR_mean_Low[,c("ChipID","LRR_mean")] | |
report[1:length(report\$ChipID),3]= paste(sort(a\$LRR_mean)[max(out\$iLeft)+1],"<=LRR_mean<=",sort(a\$LRR_mean)[min(out\$iRight)-1],sep="") | |
write.table("ChipIDRemove Value Metric", file = paste("$out","QC_RemoveIDs_LRRmean_BAFmean_BAFSD_BAFDRIFT_WF.txt",sep="") , append = FALSE,quote=FALSE,row.names=FALSE, col.names =FALSE) ### append false for the first one to start new file if rerun | |
write.table(report, file = paste("$out","QC_RemoveIDs_LRRmean_BAFmean_BAFSD_BAFDRIFT_WF.txt",sep=""), append = TRUE,quote=FALSE,row.names=FALSE, col.names =FALSE) | |
report<-q_LRR_mean_High[,c("ChipID","LRR_mean")] | |
report[1:length(report\$ChipID),3]= paste(sort(a\$LRR_mean)[max(out\$iLeft)+1],"<=LRR_mean<=",sort(a\$LRR_mean)[min(out\$iRight)-1],sep="") | |
write.table(report, file = paste("$out","QC_RemoveIDs_LRRmean_BAFmean_BAFSD_BAFDRIFT_WF.txt",sep=""), append = TRUE,quote=FALSE,row.names=FALSE, col.names =FALSE) | |
#BAF_mean(low/high) | |
out <- getOutliers(sort(a\$BAF_mean),method="II") | |
x <- array(dim=c(1,length(a\$BAF_mean))) | |
x[out\$iRight]="blue" | |
x[out\$iLeft]="blue" | |
if(max(out\$iLeft)==-Inf) { out\$iLeft = 0 } | |
if(min(out\$iRight)==Inf) { out\$iRight = sum(!is.na(a\$BAF_mean)) } | |
x[(max(out\$iLeft)+1):(min(out\$iRight)-1)]="black" | |
#plot(sort(a\$BAF_mean), col=x,ylim=c(-.01,.01)) | |
plot(sort(a\$BAF_mean), col=x, main=paste(sort(a\$BAF_mean)[max(out\$iLeft)+1],"<= BAF_mean <=",sort(a\$BAF_mean)[min(out\$iRight)-1],"Fail=", out\$nOut[1]+ out\$nOut[2])) # out\$limit was not correct | |
q_BAF_mean_Low <-head(a[order(a\$BAF_mean), ],max(out\$iLeft)) | |
q_BAF_mean_High<-tail(a[order(a\$BAF_mean), ], max(out\$iRight)-min(out\$iRight)+1) | |
report<-q_BAF_mean_Low[,c("ChipID","BAF_mean")] | |
report[1:length(report\$ChipID),3]= paste(sort(a\$BAF_mean)[max(out\$iLeft)+1],"<=BAF_mean<=",sort(a\$BAF_mean)[min(out\$iRight)-1],sep="") | |
write.table(report, file = paste("$out","QC_RemoveIDs_LRRmean_BAFmean_BAFSD_BAFDRIFT_WF.txt",sep=""), append = TRUE,quote=FALSE,row.names=FALSE, col.names =FALSE) | |
report<-q_BAF_mean_High[,c("ChipID","BAF_mean")] | |
report[1:length(report\$ChipID),3]= paste(sort(a\$BAF_mean)[max(out\$iLeft)+1],"<=BAF_mean<=",sort(a\$BAF_mean)[min(out\$iRight)-1],sep="") | |
write.table(report, file = paste("$out","QC_RemoveIDs_LRRmean_BAFmean_BAFSD_BAFDRIFT_WF.txt",sep=""), append = TRUE,quote=FALSE,row.names=FALSE, col.names =FALSE) | |
#BAF_SD(high) | |
out <- getOutliers(sort(a\$BAF_SD)) | |
x <- array(dim=c(1,length(a\$BAF_SD))) | |
x[out\$iRight]="blue" | |
if(min(out\$iRight)==Inf) { out\$iRight = sum(!is.na(a\$BAF_SD)) } | |
x[1: min(out\$iRight)-1]="black" | |
#plot(sort(a\$BAF_SD), col=x) | |
plot(sort(a\$BAF_SD), col=x, main=paste("BAF_SD <", sort(a\$BAF_SD)[min(out\$iRight)],"Fail=", out\$nOut[2])) | |
q_Lrrsd<-tail(a[order(a\$BAF_SD), ],max(out\$iRight)-min(out\$iRight)+1) | |
report<-q_Lrrsd[,c("ChipID","BAF_SD")] | |
report[1:length(report\$ChipID),3]= paste("BAF_SD<", sort(a\$BAF_SD)[min(out\$iRight)] ,sep="") | |
write.table(report, file = paste("$out","QC_RemoveIDs_LRRmean_BAFmean_BAFSD_BAFDRIFT_WF.txt",sep=""), append = TRUE,quote=FALSE,row.names=FALSE, col.names =FALSE) | |
#BAF_DRIFT(high) | |
out <- getOutliers(sort(a\$BAF_DRIFT)) | |
x <- array(dim=c(1,length(a\$BAF_DRIFT))) | |
x[out\$iRight]="blue" | |
if(min(out\$iRight)==Inf) { out\$iRight = sum(!is.na(a\$BAF_DRIFT)) } | |
x[1: min(out\$iRight)-1]="black" | |
#plot(sort(a\$BAF_DRIFT), col=x) | |
plot(sort(a\$BAF_DRIFT), col=x, main=paste("BAF_DRIFT <", sort(a\$BAF_DRIFT)[min(out\$iRight)],"Fail=", out\$nOut[2])) | |
q_Lrrsd<-tail(a[order(a\$BAF_DRIFT), ],max(out\$iRight)-min(out\$iRight)+1) | |
report<-q_Lrrsd[,c("ChipID","BAF_DRIFT")] | |
report[1:length(report\$ChipID),3]= paste("BAF_DRIFT<", sort(a\$BAF_DRIFT)[min(out\$iRight)] ,sep="") | |
write.table(report, file = paste("$out","QC_RemoveIDs_LRRmean_BAFmean_BAFSD_BAFDRIFT_WF.txt",sep=""), append = TRUE,quote=FALSE,row.names=FALSE, col.names =FALSE) | |
#WF(low/high) | |
out <- getOutliers(sort(a\$WF),method="II") | |
x <- array(dim=c(1,length(a\$WF))) | |
x[out\$iRight]="blue" | |
x[out\$iLeft]="blue" | |
if(max(out\$iLeft)==-Inf) { out\$iLeft = 0 } | |
if(min(out\$iRight)==Inf) { out\$iRight = sum(!is.na(a\$WF)) } | |
x[(max(out\$iLeft)+1):(min(out\$iRight)-1)]="black" | |
#plot(sort(a\$WF), col=x,ylim=c(-.01,.01)) | |
plot(sort(a\$WF), col=x, main=paste(sort(a\$WF)[max(out\$iLeft)+1],"<= WF <=",sort(a\$WF)[min(out\$iRight)-1],"Fail=", out\$nOut[1]+ out\$nOut[2])) # out\$limit was not correct | |
q_WF_Low <-head(a[order(a\$WF), ],max(out\$iLeft)) | |
q_WF_High<-tail(a[order(a\$WF), ], max(out\$iRight)-min(out\$iRight)+1) | |
report<-q_WF_Low[,c("ChipID","WF")] | |
report[1:length(report\$ChipID),3]= paste(sort(a\$WF)[max(out\$iLeft)+1],"<=WF<=",sort(a\$WF)[min(out\$iRight)-1],sep="") | |
write.table(report, file = paste("$out","QC_RemoveIDs_LRRmean_BAFmean_BAFSD_BAFDRIFT_WF.txt",sep=""), append = TRUE,quote=FALSE,row.names=FALSE, col.names =FALSE) | |
report<-q_WF_High[,c("ChipID","WF")] | |
report[1:length(report\$ChipID),3]= paste(sort(a\$WF)[max(out\$iLeft)+1],"<=WF<=",sort(a\$WF)[min(out\$iRight)-1],sep="") | |
write.table(report, file = paste("$out","QC_RemoveIDs_LRRmean_BAFmean_BAFSD_BAFDRIFT_WF.txt",sep=""), append = TRUE,quote=FALSE,row.names=FALSE, col.names =FALSE) | |
} | |
END | |
print R "dev.off()\n"; | |
close (R); | |
system ("R CMD BATCH $out.AllRes.R"); | |
if ($? == 0) { | |
#print " - command executed successfully!\n"; | |
$c="head $out"."QC_RemoveIDs.txt";print"=======The Remove IDs Report first 10 lines=======\n";$output=`$c`;print"$output\n"; | |
} | |
else { | |
print "Error: cannot execute system command R CMD BATCH $out.AllRes.R PLEASE TRY RUNNING \"export R_LIBS=./\" \n"; | |
print "If still error, check $out.AllRes.Rout (below) for specific error and if not resolveable submit to parsecnv\@gmail.com.\n"; | |
$c="tail $out.AllRes.Rout";print"=======The R Error log last 10 lines=======\n";$output=`$c`;print"$output\n"; | |
} | |
#unlink ("$out.AllRes.R","$out.AllRes.Rout", ".RData"); | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment