Last active
August 29, 2015 14:02
-
-
Save chapmanb/9248ab8409a473249a07 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
################################################################### | |
# FindSomatic.pl | |
# | |
# Tool for detecting somatic mutations in normal/tumor pair | |
# using microassembly | |
# | |
# Author: Giuseppe Narzisi | |
# Date: December 11, 2013 | |
# | |
################################################################### | |
use warnings; | |
use strict; | |
use POSIX; | |
use FindBin qw($Bin); | |
use lib $Bin; # add $Bin directory to @INC | |
use Usage; | |
use Utils qw(:DEFAULT $findVariants $findDenovos $findSomatic $exportTool $bamtools); | |
use HashesIO; | |
use Parallel::ForkManager; | |
use List::Util qw[min max]; | |
#use Math::Random qw(:all); | |
#use Sys::CPU; | |
#use IO::Uncompress::Gunzip qw(gunzip $GunzipError); | |
use File::Spec; | |
use Getopt::Long; | |
use MLDBM::Sync; # this gets the default, SDBM_File | |
use MLDBM qw(DB_File Storable); # use Storable for serializing | |
use MLDBM qw(MLDBM::Sync::SDBM_File); # use extended SDBM_File, handles values > 1024 bytes | |
use Fcntl qw(:DEFAULT); # import symbols O_CREAT & O_RDWR for use with DBMs | |
$|=1; | |
# required arguments | |
my $BAMNORMAL; | |
my $BAMTUMOR; | |
my $BEDFILE; | |
my $REF; | |
my $defaults = getDefaults(); | |
# optional arguments | |
my $kmer = $defaults->{kmer}; | |
my $cov_threshold = $defaults->{cov_threshold}; | |
my $tip_cov_threshold = $defaults->{tip_cov_threshold}; | |
my $radius = $defaults->{radius}; | |
my $windowSize = $defaults->{windowSize}; | |
my $delta = $defaults->{delta}; | |
my $map_qual = $defaults->{map_qual}; | |
my $maxmismatch = $defaults->{maxmismatch}; | |
my $min_cov = $defaults->{min_cov}; | |
my $covratio = $defaults->{covratio}; | |
my $outratio = $defaults->{outratio}; | |
my $WORK = $defaults->{WORK}; | |
my $MAX_PROCESSES = $defaults->{MAX_PROCESSES}; | |
my $selected = $defaults->{selected}; | |
my $dfs_limit = $defaults->{pathlimit}; | |
my $outformat = $defaults->{format}; | |
my $intarget; | |
my $help = 0; | |
my $VERBOSE = 0; | |
my %candidates; | |
my %exons; | |
my %locations; | |
my %normalCov; | |
my %tumorCov; | |
my %normalL2K; | |
my %tumorL2K; | |
my %normalSVs; | |
my %tumorSVs; | |
my %somaticSVs; | |
my %commonSVs; | |
my @members = qw/normal tumor/; | |
my $argcnt = scalar(@ARGV); | |
my $start_time = time; | |
##################################################### | |
# | |
# Command line options processing | |
# | |
GetOptions( | |
'help!' => \$help, | |
'verbose!' => \$VERBOSE, | |
# required parameters | |
'normal=s' => \$BAMNORMAL, | |
'tumor=s' => \$BAMTUMOR, | |
'bed=s' => \$BEDFILE, | |
'ref=s' => \$REF, | |
# optional paramters | |
'kmer=i' => \$kmer, | |
'covthr=i' => \$cov_threshold, | |
'lowcov=i' => \$tip_cov_threshold, | |
'mincov=i' => \$min_cov, | |
'covratio=f' => \$covratio, | |
'radius=i' => \$radius, | |
'window=i' => \$windowSize, | |
'step=i' => \$delta, | |
'mapscore=i' => \$map_qual, | |
'mismatches=i' => \$maxmismatch, | |
'dir=s' => \$WORK, | |
'numprocs=i' => \$MAX_PROCESSES, | |
'coords=s' => \$selected, | |
'format=s' => \$outformat, | |
# ouptut parameters | |
'intarget!' => \$intarget, | |
'outratio=f' => \$outratio, | |
) or usageDenovo("FindDenovo.pl"); | |
##################################################### | |
# | |
# Command line options processing | |
# | |
sub init() | |
{ | |
usageSomatic("FindSomatic.pl") if ($argcnt < 1); | |
usageSomatic("FindSomatic.pl") if( $help ); | |
if( (!defined $BAMNORMAL) || (!defined $BAMTUMOR) || (!defined $BEDFILE) || (!defined $REF) ) { | |
print STDERR "Required parameter missing!\n"; | |
usageSomatic("FindSomatic.pl"); | |
} | |
mkdir $WORK if ! -r $WORK; | |
if($MAX_PROCESSES == 1) { $MAX_PROCESSES = 0; } | |
} | |
##################################################### | |
sub printParams { | |
my $file = $_[0]; | |
print STDERR "-- Print parameters to $file\n"; | |
open PFILE, "> $file" or die "Can't open $file ($!)\n"; | |
print PFILE "Parameters: \n"; | |
print PFILE "-- K-mer size (bp): $kmer\n"; | |
print PFILE "-- coverage threshold: $cov_threshold\n"; | |
print PFILE "-- low coverage threshold: $tip_cov_threshold\n"; | |
print PFILE "-- size (bp) of the left and right extensions (radius): $radius\n"; | |
print PFILE "-- window-size size (bp): $windowSize\n"; | |
print PFILE "-- step size (bp): $delta\n"; | |
print PFILE "-- minimum mapping-quality: $map_qual\n"; | |
print PFILE "-- minimum coverage for somatic mutation: $min_cov\n"; | |
print PFILE "-- minimum coverage ratio for sequencing error: $covratio\n"; | |
print PFILE "-- minimum coverage ratio for exporting mutation: $outratio\n"; | |
print PFILE "-- max number of mismatches for near-perfect repeats: $maxmismatch\n"; | |
print PFILE "-- output directory: $WORK\n"; | |
print PFILE "-- max number of parallel jobs: $MAX_PROCESSES\n"; | |
print PFILE "-- BAM file normal: $BAMNORMAL\n"; | |
print PFILE "-- BAM file tumor: $BAMTUMOR\n"; | |
print PFILE "-- bedfile: $BEDFILE\n"; | |
print PFILE "-- reference file: $REF\n"; | |
print PFILE "-- file of selected locations: $selected\n"; | |
print PFILE "-- output format for variants: $outformat\n"; | |
if($intarget) { print PFILE "-- output variants in target? yes\n"; } | |
else { print PFILE "-- output variants in target? no\n"; } | |
close PFILE; | |
} | |
## link input files | |
##################################################### | |
sub linkFiles { | |
print STDERR "-- Linking input bamfile\n"; | |
my $bamfile = $_[0]; | |
my $dirname = $_[1]; | |
my $flag = 0; | |
print STDERR "$bamfile\n"; | |
my $fdir = "$WORK/$dirname"; | |
# get absolute path | |
my $bam_abs_path = File::Spec->rel2abs($bamfile); | |
if (! -r $fdir) { | |
$flag = 1; | |
## link the input files | |
mkdir $fdir; | |
if (-e "$bam_abs_path") { | |
symlink "$bam_abs_path", "$fdir/bamfile.bam"; | |
} | |
else { | |
print STDERR "Can't find BAM file: $bamfile\n"; | |
next; | |
} | |
if (-e "$bam_abs_path.bai") { | |
symlink "$bam_abs_path.bai", "$fdir/bamfile.bam.bai"; | |
} | |
else { | |
print STDERR "Indexing BAM file...\n"; | |
runCmd("index bamfile", "$bamtools index -in $bam_abs_path"); | |
symlink "$bam_abs_path.bai", "$fdir/bamfile.bam.bai"; | |
#print STDERR "Can't find index BAM file: $bamfile.bai\n"; | |
next; | |
} | |
} | |
return $flag; | |
} | |
## load mutations and coverage info | |
##################################################### | |
sub loadHashes { | |
print STDERR "-- Loading mutations and coverage info\n"; | |
# clear hashes | |
for my $k (keys %normalSVs) { delete $normalSVs{$k}; } | |
for my $k (keys %tumorSVs) { delete $tumorSVs{$k}; } | |
for my $k (keys %normalCov) { delete $normalCov{$k}; } | |
for my $k (keys %tumorCov) { delete $tumorCov{$k}; } | |
# load databases of mutations | |
print STDERR "load variants for normal...\n"; | |
loadDB("$WORK/normal/variants.db", \%normalSVs, \%exons, 0); | |
print STDERR "load variants for tumor...\n"; | |
loadDB("$WORK/tumor/variants.db", \%tumorSVs, \%exons, 0); | |
print STDERR "load coverage for normal...\n"; | |
loadCov("$WORK/normal/refcov.txt", \%normalCov); | |
print STDERR "load coverage for tumor...\n"; | |
loadCov("$WORK/tumor/refcov.txt", \%tumorCov); | |
print STDERR "load loc2key for for normal...\n"; | |
loadLoc2Key("$WORK/normal/loc2keys.txt.gz", \%normalL2K); | |
print STDERR "load loc2key for for tumor...\n"; | |
loadLoc2Key("$WORK/tumor/loc2keys.txt.gz", \%tumorL2K); | |
#remove reference coverage files: | |
print STDERR "remove coverage file for normal...\n"; | |
runCmd("remove coverage file", "rm $WORK/normal/refcov.txt.gz"); | |
print STDERR "remove coverage file for tumor...\n"; | |
runCmd("remove coverage file", "rm $WORK/tumor/refcov.txt.gz"); | |
} | |
## call variants on each family memeber | |
##################################################### | |
sub callSVs { | |
my $bamfile = $_[0]; | |
my $dir = $_[1]; | |
print STDERR "-- Detect mutations on $dir\n"; | |
my $bam_abs_path = File::Spec->rel2abs($bamfile); | |
my $command = "$findVariants ". | |
"--bam $bam_abs_path ". | |
"--bed $BEDFILE ". | |
"--ref $REF ". | |
"--kmer $kmer ". | |
"--covthr $cov_threshold ". | |
"--lowcov $tip_cov_threshold ". | |
"--covratio $covratio ". | |
"--radius $radius ". | |
"--window $windowSize ". | |
"--step $delta ". | |
"--mapscore $map_qual ". | |
"--mismatches $maxmismatch ". | |
"--dir $WORK/$dir ". | |
"--sample ALL ". | |
"--numprocs $MAX_PROCESSES ". | |
"--coords $selected ". | |
"--format $outformat ". | |
"--cov2file"; | |
if($intarget) { $command .= " --intarget"; } | |
if($VERBOSE) { $command .= " --verbose"; } | |
print STDERR "Command: $command\n" if($VERBOSE); | |
runCmd("findVariants", "$command"); | |
} | |
## Compute best state for the location of the mutation | |
##################################################### | |
sub computeBestState { | |
my $vars; | |
my $covthr = 0; # min cov threshold to remove sequencing errors | |
foreach my $ID (@members) { | |
if($ID eq "normal") { $vars = \%normalSVs; } | |
if($ID eq "tumor") { $vars = \%tumorSVs; } | |
foreach my $currKey (keys %$vars) { | |
#print STDERR "$currKey\n"; | |
next if($currKey eq "numPaths"); | |
my $var = $vars->{$currKey}; | |
my $chr = $var->{chr}; | |
my $pos = $var->{pos}; | |
my $loc = "$chr:$pos"; | |
my $refState; | |
#my $mutState; | |
my $altState; | |
my $refCov; | |
my $mutCov; | |
my $altCov; | |
foreach my $role (@members) { | |
#print STDERR "$role: "; | |
my $varh; | |
my $refhash; | |
my $l2k; | |
if($role eq "normal") { $varh = \%normalSVs; $refhash = \%normalCov; $l2k = \%normalL2K; } | |
if($role eq "tumor") { $varh = \%tumorSVs; $refhash = \%tumorCov; $l2k = \%tumorL2K; } | |
$refState->{$role} = 0; $altState->{$role} = 0; | |
$refCov->{$role} = 0; $mutCov->{$role} = 0; $altCov->{$role} = 0; | |
my $sum = 0; | |
my $cnt = 0; | |
if ( exists($refhash->{$loc}) ) { # if there is reference coverage at location | |
## compute reference coverage at locus | |
for (my $t = $pos; $t<($pos+$kmer+1); $t++) { # compute avg coverage over window of size K | |
my $loc2 = "$chr:$t"; | |
if ( exists $refhash->{$loc2} && | |
!( exists($l2k->{$loc2}) && ( (scalar @{$l2k->{$loc2}->{keys}})!=0) ) ) { | |
$sum += $refhash->{$loc2}; | |
$cnt++; | |
} | |
} | |
} | |
my $RCov = $sum; | |
if($cnt>0) { $RCov = floor($sum/$cnt); } | |
if($RCov > $covthr) { $refState->{$role} = 1; $refCov->{$role} = $RCov; } | |
else { $refState->{$role} = 0; $refCov->{$role} = 0; } | |
## compute coverage of mut at locus | |
my $ACov = 0; | |
if( exists($varh->{$currKey}) ) { | |
my $altMut = $varh->{$currKey}; | |
$ACov = $altMut->{mincov}; | |
} | |
if($ACov > $covthr) { $altState->{$role} = 1; $mutCov->{$role} = $ACov; } | |
else { $altState->{$role} = 0; $mutCov->{$role} = 0; } | |
## compute alternative allele coverage | |
my $OCov = 0; | |
if( exists($l2k->{$loc}) ) { | |
foreach my $altKey (@{$l2k->{$loc}->{keys}}) { | |
if( exists($varh->{$altKey}) && ($altKey ne $currKey) ) { | |
my $altMut = $varh->{$altKey}; | |
$OCov += $altMut->{mincov}; | |
} | |
} | |
} | |
if($OCov > $covthr) { $altCov->{$role} = $OCov; } | |
else { $altCov->{$role} = 0; } | |
} | |
# 2 2/0 1 (ref: N,T / qry: N,T) | |
my @RS=(); my @AS=(); | |
my @RC=(); my @AC=(); my @OC=(); | |
foreach my $role (@members) { # the order or members must be N,T | |
if( ($refState->{$role} == 1) && ($altState->{$role} == 1) ) { | |
push @RS , 1; | |
push @AS , 1; | |
} | |
elsif( ($refState->{$role} == 1) && ($altState->{$role} == 0) ) { | |
push @RS , 2; | |
push @AS , 0; | |
} | |
elsif( ($refState->{$role} == 0) && ($altState->{$role} == 1) ) { | |
push @RS , 0; | |
push @AS , 2; | |
} | |
elsif( ($refState->{$role} == 0) && ($altState->{$role} == 0) ) { | |
push @RS , 0; | |
push @AS , 0; | |
} | |
push @RC , $refCov->{$role}; | |
push @AC , $mutCov->{$role}; | |
push @OC , $altCov->{$role}; | |
} | |
my $stateStr = "$RS[0]$RS[1]/$AS[0]$AS[1]"; | |
my $covStr = "$RC[0] $RC[1]/$AC[0] $AC[1]/$OC[0] $OC[1]"; | |
$var->{bestState} = $stateStr; | |
$var->{covState} = $covStr; | |
# compute hom/het state and chi2score | |
parseCovState($var, $ID); | |
$vars->{$currKey} = $var; | |
} | |
} | |
} | |
## parse covState and compute hom/het state and chi2score | |
########################################## | |
sub parseCovState { | |
my $sv = $_[0]; | |
my $role = $_[1]; | |
my $BS = $sv->{covState}; | |
my ($REF,$ALT,$OTH) = split("/",$BS); | |
my @R = split(" ", $REF); # reference | |
my @A = split(" ", $ALT); # alternative | |
my @O = split(" ", $OTH); # other | |
my $id = 0; | |
if($role eq "normal") { $id = 0; } | |
if($role eq "tumor") { $id = 1; } | |
my $alleleCnt = 0; | |
if($R[$id] > 0) { $alleleCnt++; } | |
if($A[$id] > 0) { $alleleCnt++; } | |
if($O[$id] > 0) { $alleleCnt++; } | |
my $totCov = $R[$id] + $A[$id] + $O[$id]; | |
$sv->{altcov} = $totCov - $sv->{mincov}; | |
if($alleleCnt > 1) { $sv->{zygosity} = "het"; } | |
else { $sv->{zygosity} = "hom"; } | |
my $covRatio = 0; | |
my $chi2Score = "na"; | |
if ($totCov > 0) { | |
$covRatio = sprintf('%.2f', ($sv->{mincov} / $totCov) ); | |
$sv->{covratio} = $covRatio; | |
#Chi-squared Test for allele coverage | |
my $o1 = $sv->{mincov}; # observed 1 | |
my $o2 = $sv->{altcov}; # observed 2 | |
my $e1 = $totCov/2; # expected 1 | |
my $e2 = $totCov/2; # expected 2 | |
my $term1 = (($o1-$e1)*($o1-$e1))/$e1; | |
my $term2 = (($o2-$e2)*($o2-$e2))/$e2; | |
$chi2Score = sprintf('%.2f', $term1 + $term2); | |
} | |
$sv->{chi2score} = $chi2Score; | |
} | |
## parse bestState and update mutation info | |
########################################## | |
sub parseBestState { | |
my $sv = $_[0]; | |
my $BS = $sv->{bestState}; | |
$sv->{somatic} = "no"; | |
$sv->{inheritance} = "no"; | |
if( ($BS eq "12/10") || ($BS eq "02/20") ) { # only in normal | |
$sv->{inheritance} = "no"; | |
} | |
else { | |
my ($REF,$ALT) = split("/",$BS); | |
my @R = split("", $REF); # reference | |
my @A = split("", $ALT); # alternative | |
my $n=0; my $t=1; | |
#if ( ($A[$n]>0) && ($A[$t]>0) ) { # both in normal and tumor | |
if ( $A[$n] == $A[$t] ) { # genotypes for ALT identical | |
$sv->{inheritance} = "normal"; | |
} | |
#elsif ( ($A[$n]==0) && ($A[$t]>0) ) { # only in tumor | |
elsif ( $A[$n] != $A[$t] ) { # genotypes for ALT different | |
$sv->{inheritance} = "no"; | |
$sv->{somatic} = "yes"; | |
} | |
else { # unknown state | |
$sv->{inheritance} = "no"; | |
} | |
#if($A[$t]>0) { $sv->{id} = "tumor"; } | |
} | |
} | |
## find denovo events | |
##################################################### | |
sub findSomaticMut { | |
print STDERR "-- Finding somatic mutations\n"; | |
my $family = $_[0]; | |
if (-e "$WORK/somatic.db.dir") { runCmd("remove database files", "rm $WORK/somatic.db.*"); } | |
if (-e "$WORK/common.db.dir") { runCmd("remove database files", "rm $WORK/common.db.*"); } | |
my $somatic_dbm_obj = tie %somaticSVs, 'MLDBM::Sync', "$WORK/somatic.db", O_CREAT|O_RDWR, 0640; | |
my $common_dbm_obj = tie %commonSVs, 'MLDBM::Sync', "$WORK/common.db", O_CREAT|O_RDWR, 0640; | |
$somatic_dbm_obj->Lock; | |
$common_dbm_obj->Lock; | |
foreach my $k (keys %tumorSVs) { | |
my $mut = $tumorSVs{$k}; | |
# parse bestState and update info | |
parseBestState($mut); | |
my $chr = $mut->{chr}; | |
my $pos = $mut->{pos}; | |
my $key = "$chr:$pos"; | |
# skip mutation if normal not sampled | |
next if (!exists $normalCov{$key}); | |
next if ($normalCov{$key} < $min_cov); # min cov requirement | |
# compute inheritance | |
if($mut->{somatic} eq "yes") { | |
$somaticSVs{$k} = $mut; | |
} | |
else { # non somatic | |
if ($mut->{inheritance} eq "normal") { | |
$commonSVs{$k} = $mut; | |
} | |
} | |
$tumorSVs{$k} = $mut; | |
} | |
$somatic_dbm_obj->UnLock; | |
$common_dbm_obj->UnLock; | |
} | |
## export family mutations to file | |
##################################################### | |
sub exportSVs { | |
print STDERR "-- Exporting SVs to file\n"; | |
my $max_cov = 10000000; | |
# export somatic | |
#runCmd("export all", "$exportTool --db $WORK/somatic.db --bed $BEDFILE --format annovar --type all --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/somatic.${min_cov}x.all.txt"); | |
#runCmd("export snp", "$exportTool --db $WORK/somatic.db --bed $BEDFILE --format annovar --type snp --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/somatic.${min_cov}x.snp.txt"); | |
#runCmd("export indels", "$exportTool --db $WORK/somatic.db --bed $BEDFILE --format annovar --type indel --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/somatic.${min_cov}x.indel.txt"); | |
my $command_somatic = "$exportTool ". | |
"--db $WORK/somatic.db ". | |
"--bed $BEDFILE ". | |
"--format $outformat ". | |
"--type indel ". | |
"--mincov $min_cov ". | |
"--maxcov $max_cov ". | |
"--covratio $outratio"; | |
if($intarget) { $command_somatic .= " --intarget"; } | |
if ($outformat eq "annovar") { $command_somatic .= " > $WORK/somatic.${min_cov}x.indel.annovar"; } | |
elsif ($outformat eq "vcf") { $command_somatic .= " > $WORK/somatic.${min_cov}x.indel.vcf"; } | |
print STDERR "Command: $command_somatic\n" if($VERBOSE); | |
runCmd("export somatic", $command_somatic); | |
# export common variants | |
#runCmd("export all", "$exportTool --db $WORK/common.db --bed $BEDFILE --format annovar --type all --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/common.${min_cov}x.all.txt"); | |
#runCmd("export snp", "$exportTool --db $WORK/common.db --bed $BEDFILE --format annovar --type snp --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/common.${min_cov}x.snp.txt"); | |
#runCmd("export indels", "$exportTool --db $WORK/common.db --bed $BEDFILE --format annovar --type indel --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/common.${min_cov}x.indel.txt"); | |
my $command_comm = "$exportTool ". | |
"--db $WORK/common.db ". | |
"--bed $BEDFILE ". | |
"--format $outformat ". | |
"--type indel ". | |
"--mincov $min_cov ". | |
"--maxcov $max_cov ". | |
"--covratio $outratio"; | |
if($intarget) { $command_comm .= " --intarget"; } | |
if ($outformat eq "annovar") { $command_comm .= " > $WORK/common.${min_cov}x.indel.annovar"; } | |
elsif ($outformat eq "vcf") { $command_comm .= " > $WORK/common.${min_cov}x.indel.vcf"; } | |
print STDERR "Command: $command_comm\n" if($VERBOSE); | |
runCmd("export common", $command_comm); | |
} | |
## process single family | |
##################################################### | |
sub processBAM { | |
my $bamfile = $_[0]; | |
my $outdir = $_[1]; | |
#my $toprocess = linkFiles("$bamfile", "$outdir"); # link input files | |
#if($toprocess == 1) { | |
callSVs($bamfile, $outdir); # call mutations on each family | |
#} | |
#else { print STDERR "$outdir already processed!\n"; } | |
} | |
## do the job | |
########################################## | |
init(); | |
printParams("$WORK/parameters.txt"); # print parameters | |
processBAM($BAMNORMAL, "normal"); # process normal | |
processBAM($BAMTUMOR, "tumor"); # process tumor | |
loadHashes(); # load mutations and coverage info | |
computeBestState(); # compute bestState for each mutation | |
findSomaticMut(); # detect somatic mutations in tumor | |
exportSVs(); # export mutations to file | |
########################################## | |
my $time_taken = time - $start_time; | |
#if($VERBOSE) { | |
elapsedTime($time_taken, "FindSomatic"); | |
#} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
################################################################### | |
# FindVariants.pl | |
# | |
# Tool for detecting variants in one single exome | |
# using microassembly | |
# | |
# Author: Giuseppe Narzisi | |
# Date: December 11, 2013 | |
# | |
################################################################### | |
use warnings; | |
use strict; | |
use POSIX; | |
use FindBin qw($Bin); | |
use lib $Bin; # add $Bin directory to @INC | |
use Usage; | |
use Utils; | |
use SequenceIO; | |
use HashesIO; | |
use Parallel::ForkManager; | |
use List::Util qw[min max]; | |
#use MathRandom::Random qw(:all); | |
use Digest::MD5 qw(md5_hex); | |
#use IO::Uncompress::Gunzip qw(gunzip $GunzipError); | |
use Getopt::Long; | |
use File::Spec; | |
use MLDBM::Sync; # this gets the default, SDBM_File | |
use MLDBM qw(DB_File Storable); # use Storable for serializing | |
use MLDBM qw(MLDBM::Sync::SDBM_File); # use extended SDBM_File, handles values > 1024 bytes | |
use Fcntl qw(:DEFAULT); # import symbols O_CREAT & O_RDWR for use with DBMs | |
$|=1; | |
# required arguments | |
my $bamfile; | |
my $bedfile; | |
my $REF; | |
my $defaults = getDefaults(); | |
# optional arguments | |
my $kmer = $defaults->{kmer}; | |
my $cov_threshold = $defaults->{cov_threshold}; | |
my $tip_cov_threshold = $defaults->{tip_cov_threshold}; | |
my $radius = $defaults->{radius}; | |
my $windowSize = $defaults->{windowSize}; | |
my $delta = $defaults->{delta}; | |
my $map_qual = $defaults->{map_qual}; | |
my $maxmismatch = $defaults->{maxmismatch}; | |
my $min_cov = $defaults->{min_cov}; | |
my $covratio = $defaults->{covratio}; | |
my $outratio = $defaults->{outratio}; | |
my $WORK = $defaults->{WORK}; | |
my $MAX_PROCESSES = $defaults->{MAX_PROCESSES}; | |
my $sample = $defaults->{sample}; | |
my $selected = $defaults->{selected}; | |
my $outformat = $defaults->{format}; | |
my $intarget; | |
#microassembler parameters: | |
my $dfs_limit = $defaults->{pathlimit}; | |
my $max_tip_len = 25; | |
my $maxKmer = 90; | |
my $help = 0; | |
my $VERBOSE = 0; | |
my $COV2FILE = 0; | |
# programs via absolute path | |
#my $samtools = "samtools"; | |
my $microassembler = "$Bin/Microassembler/Microassembler"; | |
my $exportTool = "$Bin/ExportVariants.pl"; | |
my $bamtools = "$Bin/bamtools-2.3.0/bin/bamtools"; | |
my $rgfile = "readgroups.txt"; | |
#my %pathways; | |
#my %exonshash; | |
my %variants; | |
my %exons; | |
my %locations; | |
my %readgroups; | |
my %refcov; | |
my %loc2keys; | |
###my %genome; | |
my $variants_dbm_obj; | |
my $argcnt = scalar(@ARGV); | |
my $start_time = time; | |
##################################################### | |
# | |
# Command line options processing | |
# | |
GetOptions( | |
'help!' => \$help, | |
'verbose!' => \$VERBOSE, | |
'cov2file' => \$COV2FILE, | |
# required parameters | |
'bam=s' => \$bamfile, | |
'bed=s' => \$bedfile, | |
'ref=s' => \$REF, | |
# optional paramters | |
'kmer=i' => \$kmer, | |
'covthr=i' => \$cov_threshold, | |
'lowcov=i' => \$tip_cov_threshold, | |
'mincov=i' => \$min_cov, | |
'covratio=f' => \$covratio, | |
'radius=i' => \$radius, | |
'window=i' => \$windowSize, | |
'step=i' => \$delta, | |
'mapscore=i' => \$map_qual, | |
'mismatches=i' => \$maxmismatch, | |
'pathlimit=i' => \$dfs_limit, | |
'dir=s' => \$WORK, | |
'numprocs=i' => \$MAX_PROCESSES, | |
'sample=s' => \$sample, | |
'coords=s' => \$selected, | |
'format=s' => \$outformat, | |
# ouptut parameters | |
'intarget!' => \$intarget, | |
'outratio=f' => \$outratio, | |
) or usageSingle("FindVariants.pl"); | |
##################################################### | |
# | |
# Initilization | |
# | |
sub init() | |
{ | |
usageSingle("FindVariants.pl") if ($argcnt < 1); | |
usageSingle("FindVariants.pl") if( $help ); | |
if( (!defined $bamfile) || (!defined $bedfile) || (!defined $REF) ) { | |
print STDERR "Required parameter missing!\n"; | |
usageSingle("FindVariants.pl"); | |
} | |
mkdir $WORK if ! -r $WORK; | |
if($MAX_PROCESSES == 1) { $MAX_PROCESSES = 0; } | |
} | |
##################################################### | |
sub printParams { | |
my $file = $_[0]; | |
print STDERR "-- Print parameters to $file\n"; | |
open PFILE, "> $file" or die "Can't open $file ($!)\n"; | |
print PFILE "Parameters: \n"; | |
print PFILE "-- k-mer size (bp): $kmer\n"; | |
print PFILE "-- coverage threshold: $cov_threshold\n"; | |
print PFILE "-- low coverage threshold: $tip_cov_threshold\n"; | |
print PFILE "-- size (bp) of the left and right extensions (radius): $radius\n"; | |
print PFILE "-- window-size size (bp): $windowSize\n"; | |
print PFILE "-- step size (bp): $delta\n"; | |
print PFILE "-- minimum mapping quality: $map_qual\n"; | |
print PFILE "-- minimum coverage for mutation: $min_cov\n"; | |
print PFILE "-- minimum coverage ratio for sequencing errors: $covratio\n"; | |
print PFILE "-- minimum coverage ratio for exporting mutation: $outratio\n"; | |
print PFILE "-- max number of mismatches for near-perfect repeats: $maxmismatch\n"; | |
print PFILE "-- limit number of sequence paths: $dfs_limit\n"; | |
print PFILE "-- output directory: $WORK\n"; | |
print PFILE "-- max number of parallel jobs: $MAX_PROCESSES\n"; | |
print PFILE "-- bam file: $bamfile\n"; | |
print PFILE "-- bed file: $bedfile\n"; | |
print PFILE "-- reference file: $REF\n"; | |
print PFILE "-- sample: $sample\n"; | |
print PFILE "-- output format for variants: $outformat\n"; | |
print PFILE "-- file of selected coordinates: $selected\n"; | |
if($intarget) { print PFILE "-- output variants in target? yes\n"; } | |
else { print PFILE "-- output variants in target? no\n"; } | |
close PFILE; | |
} | |
## build directory structure and run assembly | |
##################################################### | |
sub run { | |
my $fdir = "$WORK"; | |
if (! -r $fdir) { mkdir $fdir; } | |
printParams("$fdir/parameters.txt"); | |
if($selected ne "null") { | |
loadCoordinates("$selected", \%exons, $VERBOSE); | |
} | |
else { | |
loadExonsBed("$bedfile", \%exons, $radius, $VERBOSE); | |
} | |
#load genome from fasta file | |
###loadGenomeFasta($REF, \%genome); | |
if (-e "$fdir/variants.db.dir") { runCmd("remove database files", "rm $fdir/variants.db.*"); } | |
$variants_dbm_obj = tie %variants, 'MLDBM::Sync', "$fdir/variants.db", O_CREAT|O_RDWR, 0640; | |
#$variants_dbm_obj->SyncCacheSize('1000M'); # 1000 Megabyte max memory used | |
#print "$bamfile\n"; | |
# get absolute path | |
my $bam_abs_path = File::Spec->rel2abs($bamfile); | |
## link the input files | |
if (-e $bam_abs_path) { | |
symlink "$bam_abs_path", "$fdir/bamfile.bam"; | |
} | |
else { | |
print STDERR "Can't find bamfile: $bamfile\n"; | |
next; | |
} | |
if (-e "$bam_abs_path.bai") { | |
symlink "$bam_abs_path.bai", "$fdir/bamfile.bam.bai"; | |
} | |
else { | |
print STDERR "Indexing BAM file...\n"; | |
runCmd("index bamfile", "$bamtools index -in $bam_abs_path"); | |
symlink "$bam_abs_path.bai", "$fdir/bamfile.bam.bai"; | |
} | |
parseHeader($WORK, \%readgroups); | |
saveReadGroup("$sample", $WORK, \%readgroups, $rgfile); | |
print STDERR "Assembly Exons\n"; | |
assemblyExons(\%variants, $variants_dbm_obj, \%refcov, \%loc2keys); | |
if ($COV2FILE) { printRefCov("$WORK/refcov.txt", \%refcov); } | |
printLoc2key("$WORK/loc2keys.txt", \%loc2keys); | |
# export variants to file | |
exportSVs(); | |
} | |
## export family mutations to file | |
##################################################### | |
sub exportSVs { | |
#$exportvars --db $DIR/variants.db --bed $BEDFILE --format annovar --type all --mincov $mincov --maxcov $maxcov > $DIR/variants.txt | |
print STDERR "-- Exporting SVs to file\n"; | |
my $max_cov = 10000000; | |
# export denovos | |
#runCmd("export denovos", "$exportTool --db $WORK/variants.db --bed $bedfile --format annovar --type all --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/variants.${min_cov}x.all.txt"); | |
#runCmd("export denovos", "$exportTool --db $WORK/variants.db --bed $bedfile --format annovar --type snp --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/variants.${min_cov}x.snp.txt"); | |
#runCmd("export denovos", "$exportTool --db $WORK/variants.db --bed $bedfile --format annovar --type indel --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/variants.${min_cov}x.indel.txt"); | |
my $command = "$exportTool ". | |
"--db $WORK/variants.db ". | |
"--bed $bedfile ". | |
"--format $outformat ". | |
"--type indel ". | |
"--mincov $min_cov ". | |
"--maxcov $max_cov ". | |
"--covratio $outratio"; | |
if($intarget) { $command .= " --intarget"; } | |
if ($outformat eq "annovar") { $command .= " > $WORK/variants.${min_cov}x.indel.annovar"; } | |
elsif ($outformat eq "vcf") { $command .= " > $WORK/variants.${min_cov}x.indel.vcf"; } | |
print STDERR "Command: $command\n" if($VERBOSE); | |
runCmd("ExportVariants", $command); | |
} | |
##################################################### | |
## Write the list of read groups per family member on file | |
## Extract the sequence around indel from reference | |
## Create input files for microassembler | |
## Run microassembler | |
sub assemblyExons { | |
my $hash = $_[0]; | |
my $dbm_obj = $_[1]; | |
my $ref_hash = $_[2]; | |
my $loc2keys_hash = $_[3]; | |
my $count_exons = 0; | |
my $count_regions = 0; | |
my @regions; | |
my @region_files; | |
my $PREFIX = "$WORK"; | |
#foreach my $exonkey (sort {$a <=> $b} keys %exons) { # for each chromosome | |
foreach my $exonkey (keys %exons) { # for each chromosome | |
$count_exons = 0; | |
#foreach my $exon (sort {$a->{start} <=> $b->{start}} @{$exons{$exonkey}}) { # for each exon | |
foreach my $exon (@{$exons{$exonkey}}) { # for each exon | |
$count_exons++; | |
#last if($count_exons > 5); | |
#my %pathways; | |
#my %exonshash; | |
# Forks and returns the pid for the child: | |
#my $pid = $pm->start and next; | |
my $start = $exon->{start}; | |
my $end = $exon->{end}; | |
my $chr = $exon->{chr}; | |
#print STDERR "$chr:$start-$end\n"; | |
## extend region left and right by radius | |
my $left = $start-$radius; | |
if ($left < 0) { $left = $start; } | |
my $right = $end+$radius; | |
#if ($right > $end) { $right = $end; } | |
#print STDERR "$chr:$left-$right\n"; | |
my $exonSize = $right-$left; | |
# generate list of regions to examine | |
my $l = $left; | |
my $u = $l+$windowSize; | |
if($u >= $right) { | |
$u = $right; | |
my $region; | |
$region->{chr} = $chr; | |
$region->{left} = $l; | |
$region->{right} = $u; | |
push @regions, $region; | |
#print "$region->{chr}:$region->{left}-$region->{right}\n"; | |
$count_regions++; | |
} | |
else { | |
while($u < $right) { | |
my $region; | |
$region->{chr} = $chr; | |
$region->{left} = $l; | |
$region->{right} = $u; | |
$region->{exonstart} = $left; | |
$region->{exonend} = $right; | |
push @regions, $region; | |
$count_regions++; | |
$l += $delta; | |
$u += $delta; | |
if($u > $right) { | |
$u = $right; | |
$l = $u-$windowSize; | |
my $region; | |
$region->{chr} = $chr; | |
$region->{left} = $l; | |
$region->{right} = $u; | |
$region->{exonstart} = $left; | |
$region->{exonend} = $right; | |
push @regions, $region; | |
#print "$region->{chr}:$region->{left}-$region->{right}\n"; | |
$count_regions++; | |
} | |
} | |
} | |
} | |
} | |
#update statistics in the database | |
if(exists $hash->{stats}) { | |
my $sts = $hash->{stats}; | |
$sts->{num_exceptions} = 0; | |
$sts->{num_dfs_limit} = 0; | |
$sts->{num_partial_align} = 0; | |
$sts->{num_with_cycles} = 0; | |
$sts->{num_ok} = 0; | |
$hash->{stats} = $sts; | |
} | |
#schedule jobs | |
schedule_jobs(\@regions, $hash, $dbm_obj, $ref_hash, $loc2keys_hash, "regions"); | |
my $stats = $hash->{stats}; | |
#if ($VERBOSE) { | |
if(exists $hash->{stats}) { | |
my $sts = $hash->{stats}; | |
print STDERR "\nDfs limit: $sts->{num_dfs_limit}\n"; | |
print STDERR "Partial alignments: $sts->{num_partial_align}\n"; | |
print STDERR "Cycles: $sts->{num_with_cycles}\n"; | |
print STDERR "OK assemblies: $sts->{num_ok}\n"; | |
print STDERR "Exceptions: $sts->{num_exceptions}\n"; | |
print STDERR "Repeats: $sts->{num_repeats}\n\n"; | |
} | |
#} | |
} | |
# partition the region to assemble into equal size | |
# random sets and run the parallel jobs | |
##################################################### | |
sub schedule_jobs { | |
my $array = $_[0]; | |
my $hash = $_[1]; | |
my $dbm_obj = $_[2]; | |
my $ref_hash = $_[3]; | |
my $loc2keys_hash = $_[4]; | |
my $fprefix = $_[5]; | |
my $PREFIX = "$WORK"; | |
# random permutation of the regions to assemble | |
#my @regions = random_permutation(@$array); | |
my @regions = @$array; | |
my $arraySize = $#regions + 1; | |
print STDERR "start assembly of $arraySize regions.\n"; | |
# partition regions | |
my $step = $arraySize; | |
if ($MAX_PROCESSES>0) { | |
$step = ceil($arraySize/$MAX_PROCESSES); | |
} | |
print STDERR "stepSize: $step\n"; | |
for(my $i = 0; $i < $arraySize; $i+=$step) { | |
my $s = $i; | |
my $e = min( ($arraySize - 1), ($i + $step-1) ); | |
my $ref_file = "$fprefix-$s-$e.fa"; | |
#push @region_files, $ref_file; | |
open FILE, "> $PREFIX/$ref_file" or die "Can't open $PREFIX/$ref_file ($!)\n"; | |
foreach my $reg (@regions[$s..$e]) { # for each region | |
my $chr = $reg->{chr}; | |
my $left = $reg->{left}; | |
my $right = $reg->{right}; | |
my $exonstart = $reg->{exonstart}; | |
my $exonend = $reg->{exonend}; | |
#my $chrom = $chr; | |
#if ($chr =~ /^chr/) { $chrom = substr($chr,3); } | |
print FILE ">$chr:$left-$right\n"; | |
### die "Undefined sequence ($chr)\n" if (!exists($genome{$chr})); | |
my ($header, $seq) = split(/\n/, `samtools faidx $REF $chr:$left-$right`, 2); | |
$seq =~ s/[\n\r\s]+//g; | |
###my $seq = substr($genome{$chr}->{seq}, $left-1, $right-$left+1); | |
#my $seq = substr($genome{$chr}->{seq},$exonstart, $exonend-$exonstart+1); | |
for(my $i=0; $i<length($seq); $i+=80) { | |
my $str = substr($seq,$i,80); | |
print FILE "$str\n"; | |
} | |
} | |
close FILE; | |
} | |
# free genome memory before parallelization | |
###for my $k (keys %genome) { delete $genome{$k}; } | |
my $pm = new Parallel::ForkManager($MAX_PROCESSES); | |
my $count = 0; | |
for(my $i = 0; $i < $arraySize; $i+=$step) { | |
$count++; | |
my $pid = $pm->start and next; | |
my $s = $i; | |
my $e = min( ($arraySize - 1), ($i + $step-1) ); | |
my $ref_file = "$fprefix-$s-$e.fa"; | |
print STDERR "$count. [$s..$e]\n"; | |
assemblyRegion($count, $ref_file, \%$hash, $dbm_obj); | |
$pm->finish; # Terminates the child process | |
} | |
$pm->wait_all_children; # wait for all parallel jobs to complete | |
# load reference coverage info from files | |
loadRefCovMerge($ref_hash, $count, $WORK); | |
loadLoc2KeyMerge($loc2keys_hash, $ref_hash, $count, $kmer, $WORK); | |
# compute zygosity for all mutations | |
computeHomHetState($hash, $ref_hash, $loc2keys_hash); | |
} | |
## Compute homozygous vs heterozygous state and | |
## flags possible sequencing errors | |
##################################################### | |
sub computeHomHetState { | |
print STDERR "Computing hom/het state...\n"; | |
my $vars = $_[0]; | |
my $refh = $_[1]; | |
my $l2k = $_[2]; | |
my $hom_cnt = 0; | |
my $het_cnt = 0; | |
# improve performance: tie once to database and do all read/wrote operations. | |
$variants_dbm_obj->Lock; | |
foreach my $currKey (keys %$vars) { | |
next if($currKey eq "stats"); # skip stats info | |
my $currMut = $vars->{$currKey}; | |
my $chr = $currMut->{chr}; | |
my $pos = $currMut->{pos}; | |
my $loc = "$chr:$pos"; | |
my $alleleCnt = 0; | |
my $Rcov = 0; | |
my $Acov = 0; | |
my $Ocov = 0; | |
my $totCov = 0; | |
# reference coverage | |
my $sum = 0; | |
my $cnt = 0; | |
if ( (exists $refh->{$loc}) ) { # if cov at location | |
## compute reference coverage at locus | |
for (my $t = $pos; $t<($pos+$kmer+1); $t++) { # compute avg coverage over window of size K | |
my $loc2 = "$chr:$t"; | |
if ( (exists $refh->{$loc2}) && | |
!( (exists $l2k->{$loc2}) && ( (scalar @{$l2k->{$loc2}->{keys}})!=0 ) ) ) { | |
$sum += $refh->{$loc2}; | |
$cnt++; | |
} | |
} | |
} | |
$Rcov=$sum; | |
if($cnt>0) { $Rcov = floor($sum/$cnt); } | |
# alternative allele coverage | |
$Acov = $currMut->{mincov}; | |
# other coverage | |
$Ocov = 0; | |
if(exists($l2k->{$loc})) { | |
foreach my $altKey (@{$l2k->{$loc}->{keys}}) { | |
if( exists($vars->{$altKey}) && ($altKey ne $currKey) ) { | |
my $altMut = $vars->{$altKey}; | |
$Ocov += $altMut->{mincov}; | |
} | |
} | |
} | |
$totCov = $Rcov + $Acov + $Ocov; | |
$currMut->{altcov} = $totCov - $currMut->{mincov}; | |
if($Rcov > 0) { $alleleCnt++; } | |
if($Acov > 0) { $alleleCnt++; } | |
if($Ocov > 0) { $alleleCnt++; } | |
if($alleleCnt > 1) { | |
$currMut->{zygosity} = "het"; | |
$het_cnt++; | |
} | |
else { # alleCnt <= 1 | |
$currMut->{zygosity} = "hom"; | |
$hom_cnt++; | |
} | |
# update mutation in db | |
$vars->{$currKey} = $currMut; | |
} | |
$variants_dbm_obj->UnLock; | |
print STDERR "Num. Homozygous: $hom_cnt\n"; | |
print STDERR "Num. Heterozygous: $het_cnt\n"; | |
} | |
# assembly a specific region | |
##################################################### | |
sub assemblyRegion { | |
my $num = $_[0]; | |
my $refs_file = $_[1]; | |
my $hash = $_[2]; | |
my $dbm_obj = $_[3]; | |
#my %pathways; | |
#my %exonshash; | |
my $PREFIX = "$WORK"; | |
my $log_dir = "$WORK/logs"; | |
if (! -r $log_dir) { mkdir $log_dir; } | |
my $outfile = "$log_dir/$refs_file.log"; | |
my $command = "$microassembler -R -I -C "; | |
if($VERBOSE) { $command .= "-v "; } | |
$command .= "-F $dfs_limit ". | |
"-k $kmer ". | |
"-K $maxKmer ". | |
"-M $maxmismatch ". | |
"-l $max_tip_len ". | |
"-c $cov_threshold ". | |
"-x $covratio ". | |
"-d $tip_cov_threshold ". | |
"-b $map_qual ". | |
"-p $PREFIX ". | |
"-m bamfile.bam ". | |
"-g $rgfile ". | |
" -r $refs_file ". | |
">& $outfile"; | |
## run microassembler on each fasta file | |
runCmd("microassembly", "$command"); | |
runCmd("gzip", "gzip -f $outfile"); | |
parseLogFile("$outfile", $num, \%$hash, $dbm_obj); | |
# remove reference file | |
runCmd("delete file", "rm $PREFIX/$refs_file"); | |
} | |
## parse the log file generated by microassembler and populate data structures containing the output statistics | |
##################################################### | |
sub parseLogFile { | |
my $logFile = $_[0]; | |
my $num = $_[1]; | |
my $hash = $_[2]; | |
my $dbm_obj = $_[3]; | |
#my $PATHS = new IO::Uncompress::Gunzip "$logFile.gz" or die "IO::Uncompress::Gunzip failed: $GunzipError\n"; | |
open PATHS, "gunzip -c $logFile.gz|" or die "Can't open $logFile.gz ($!)\n!"; | |
my $curc; | |
my $curs; | |
my $cure; | |
my @pathcov; | |
my @pathdiff; | |
my %tmp_hash; | |
my %tmp_ref_cov; | |
my %tmp_loc2keys; | |
my $stats; | |
$stats->{num_dfs_limit} = 0; | |
$stats->{num_partial_align} = 0; | |
$stats->{num_with_cycles} = 0; | |
$stats->{num_ok} = 0; | |
$stats->{num_exceptions} = 0; | |
$stats->{num_repeats} = 0; | |
my $exon; | |
my @pathways; | |
while (<PATHS>) | |
{ | |
if (/== Processing\s+\d+:\s+([\w\.]+):(\d+)-(\d+)/) { | |
$curc = $1; # chr | |
$curs = $2; # start | |
$cure = $3; # end | |
} | |
elsif (/cov: (\S+)/) { | |
my $cov = $1; | |
$exon->{start} = $curs; | |
$exon->{end} = $cure; | |
$exon->{cov} = $cov; | |
$exon->{status} = "-"; | |
my $exonkey = sprintf("%s:%d:%d", $curc, $curs, $cure); | |
#$exonshash->{$exonkey} = $exon; | |
} | |
elsif (/ref trim5: (\d+) trim3: (\d+) uncovered: (\d+) ref_dist: (\d+)/) { | |
$exon->{trim5} = $1; | |
$exon->{trim3} = $2; | |
$exon->{uncovered} = $3; | |
$exon->{ref_dist} = $4; | |
} | |
elsif (/allcycles: (\d+)/) { | |
$exon->{cycles} = $1; | |
if ($exon->{cycles} >= 1) { $stats->{num_with_cycles} += 1; } | |
} | |
elsif (/exception/) { | |
$exon->{status} = "excp"; | |
$stats->{num_exceptions} += 1; | |
} | |
elsif (/Found repeat in/ || /Found cycle in/) { | |
$exon->{status} = "repeat"; | |
$stats->{num_repeats} += 1; | |
} | |
elsif (/WARNING: DFS_LIMIT/) { | |
$exon->{status} = "dfs"; | |
$stats->{num_dfs_limit} += 1; | |
} | |
elsif(/^>sp/) { | |
$exon->{status} = "partial"; | |
$stats->{num_partial_align} += 1; | |
} | |
elsif(/^c':/) { | |
chomp; | |
my $covstr = (split(/:/, $_))[1]; | |
@pathcov = split(/ /, $covstr); | |
} | |
elsif(/^d':/) { | |
chomp; | |
my $diffstr = (split(/:/, $_))[1]; | |
@pathdiff = split(//,$diffstr); | |
} | |
elsif (/^>p_([\w\.]+):(\d+)-(\d+)_(\d+)/) { | |
$exon->{status} = "ok"; | |
my $chr = $1; | |
my $path; | |
$path->{chr} = $chr; | |
$path->{rstart} = $2; | |
$path->{rend} = $3; | |
$path->{trim5} = $exon->{trim5}; | |
$path->{trim3} = $exon->{trim3}; | |
$path->{pcnt} = $4; | |
@{$path->{cov}} = @pathcov; | |
@{$path->{diff}} = @pathdiff; | |
chomp; | |
# >p_9:96439830-96440030_1 cycle: 0 match: 200 snp: 0 ins: 0 del: 1 96439931:T|-|25.7 | |
my ($tag, $cc, $cycle, $mm, $match, $ss, $snp, $ii, $ins, $dd, $del, $info) = split(/\s/, $_, 12); | |
$path->{cycle} = $cycle; | |
$path->{match} = $match; | |
$path->{snp} = $snp; | |
$path->{ins} = $ins; | |
$path->{del} = $del; | |
$path->{info} = $info; | |
#my $pathkey = sprintf("%s:%d:%d", $chr, $2, $3); | |
if($path->{cycle} == 0) { ## skip paths with cycles | |
push @pathways, $path; | |
} | |
} | |
elsif (/FINISHED/) { # end of current region | |
if((scalar @pathways) > 0) { | |
if( ($exon->{status} eq "ok") && ($exon->{cycles} == 0) ) { # skip regions hard to assemble and paths with at least one cycle | |
# at this point the exon status is ok | |
$stats->{num_ok} += 1; | |
extractVariantsFromPath(\@pathways, $exon, \%tmp_hash, \%tmp_ref_cov, $stats, \%tmp_loc2keys); | |
} | |
undef (@pathways); # free up the memory resources occupied by the array | |
} | |
} | |
} | |
#copy mutations to database | |
$dbm_obj->Lock; | |
foreach my $key (keys %tmp_hash) { | |
my $mut = $tmp_hash{$key}; | |
if( !(exists $hash->{$key}) ) { | |
$hash->{$key} = $mut; | |
} | |
else { # denovo already in the table | |
# update the coverage to the highest value found so far | |
my $mut_old = $hash->{$key}; | |
if ($mut_old->{avgcov} < $mut->{avgcov}) { $mut_old->{avgcov} = $mut->{avgcov}; } # max avg coverage | |
if ($mut_old->{mincov} < $mut->{mincov}) { $mut_old->{mincov} = $mut->{mincov}; } # max min coverage | |
$hash->{$key} = $mut_old; | |
} | |
} | |
#update statistics in the database | |
if( !(exists $hash->{stats}) ) { | |
$hash->{stats} = $stats; | |
} | |
else { | |
my $old_stats = $hash->{stats}; | |
$old_stats->{num_repeats} += $stats->{num_repeats}; | |
$old_stats->{num_exceptions} += $stats->{num_exceptions}; | |
$old_stats->{num_dfs_limit} += $stats->{num_dfs_limit}; | |
$old_stats->{num_partial_align} += $stats->{num_partial_align}; | |
$old_stats->{num_with_cycles} += $stats->{num_with_cycles}; | |
$old_stats->{num_ok} += $stats->{num_ok}; | |
$hash->{stats} = $old_stats; | |
} | |
$dbm_obj->UnLock; | |
# write reference coverage to file | |
my $outfile = "$WORK/refcov.$num.txt"; | |
open FILE, "> $outfile" or die "Can't open $outfile ($!)\n"; | |
foreach my $key (keys %tmp_ref_cov) { | |
print FILE "$key\t$tmp_ref_cov{$key}\n"; | |
} | |
close FILE; | |
# write mutations by positions to file | |
$outfile = "$WORK/loc2keys.$num.txt"; | |
open FILE, "> $outfile" or die "Can't open $outfile ($!)\n"; | |
foreach my $loc (keys %tmp_loc2keys) { | |
print FILE "$loc"; | |
foreach my $key (@{$tmp_loc2keys{$loc}}) { print FILE "\t$key"; } | |
print FILE "\n"; | |
} | |
close FILE; | |
close PATHS; | |
} | |
## extract mutations from path | |
########################################## | |
sub extractVariantsFromPath { | |
my $pathways = $_[0]; | |
my $exon = $_[1]; | |
my $hash = $_[2]; | |
my $ref_cov_hash = $_[3]; | |
my $stats = $_[4]; | |
my $mutbypos_hash = $_[5]; | |
my $outflag = 0; | |
my $exoncov = $exon->{cov}; | |
my $exonsta = $exon->{status}; | |
my $exoncycle = $exon->{cycles}; | |
$exoncycle = 0 if !defined $exoncycle; | |
foreach my $path (@{$pathways}) { ## for each path in the region | |
updateRefCoverage($path, $ref_cov_hash); # updated coverage info for matching reference | |
my $chr = $path->{chr}; | |
#if ($chr =~ /^chr/) { $chr = substr($chr,3); } | |
my $info = $path->{info}; | |
$info = "" if !defined $info; | |
#print "---$info---\n"; | |
my @var = split /\s+/, $info; | |
foreach my $v (@var) { ## for each variant | |
my ($pos, $ref, $qry, $avgcov, $mincov, $prevbp) = split /[:|]/, $v; | |
my $t = ""; | |
my $l = 0; | |
if ( ($ref !~ /-+/) && ($qry =~ /-+/) ) {$t = "del"; $l = length($qry); } # deletion | |
elsif( ($ref =~ /-+/) && ($qry !~ /-+/) ) {$t = "ins"; $l = length($qry); } # insertion | |
elsif( ($ref !~ /-+/) && ($qry !~ /-+/) ) {$t = "snp"; $l = length($qry); } # SNP | |
$outflag = 1; | |
my $mut; | |
$mut->{chr} = $chr; | |
$mut->{pos} = $pos; | |
$mut->{type} = $t; | |
$mut->{len} = $l; | |
$mut->{ref} = $ref; | |
$mut->{seq} = $qry; | |
$mut->{prevbp} = $prevbp; | |
$mut->{avgcov} = $avgcov; | |
$mut->{mincov} = $mincov; | |
$mut->{status} = $exonsta; | |
$mut->{zygosity} = "na"; | |
$mut->{altcov} = 0; | |
$mut->{inheritance} = "na"; | |
my $ref_encoded = md5_hex($ref); | |
my $qry_encoded = md5_hex($qry); | |
my $key = sprintf("%s:%d:%s:%d:%s:%s", $chr, $pos, $t, $l, $ref_encoded, $qry_encoded); | |
my $loc = sprintf("%s:%d", $chr, $pos); | |
#print "$key\n"; | |
## add mutation to the hash table | |
if( !(exists $hash->{$key}) ) { | |
$hash->{$key} = $mut; | |
push @{$mutbypos_hash->{$loc}}, $key; | |
} | |
else { # already in the table | |
# update the coverage to the highest value found so far | |
my $mut_old = $hash->{$key}; | |
if ($mut_old->{avgcov} < $mut->{avgcov}) { | |
$mut_old->{avgcov} = $mut->{avgcov}; # max avg coverage | |
} | |
if ($mut_old->{mincov} < $mut->{mincov}) { | |
$mut_old->{mincov} = $mut->{mincov}; # max min coverage | |
} | |
$hash->{$key} = $mut_old; | |
} | |
} | |
} | |
#return 1 if at least one event is detected | |
return $outflag; | |
} | |
## update covearge info for bases matching the reference | |
########################################## | |
sub updateRefCoverage { | |
my $p = $_[0]; | |
my $ref_hash = $_[1]; | |
my @cov = @{$p->{cov}}; | |
my @diff = @{$p->{diff}}; | |
my $chr = $p->{chr}; | |
my $start = $p->{rstart}; | |
my $trim5 = $p->{trim5}; | |
#my $end = $p->{rend}; | |
#my $L = $end-$start+1; # length of the region | |
my $len = scalar @diff; | |
my $r = $start+$trim5; # reference index | |
my $j = 0; # used to scan the coverage array | |
for (my $i = 0; $i<$len; $i++) { | |
my $key = "$chr:$r"; | |
if ($diff[$i] eq " ") { # if matching the reference | |
#if( ($i>0) && ($diff[$i-1] eq " ") ) { # test needed to correctly handle homozygous ins | |
if( ($i>0) && ($diff[$i-1] ne "^") ) { # test needed to correctly handle homozygous ins | |
if ( (exists $ref_hash->{$key}) ) { # already in the table | |
my $cov = $ref_hash->{$key}; | |
if ($cov < $cov[$j]) { # keep max coverage | |
$cov = $cov[$j]; | |
} | |
$ref_hash->{$key} = $cov; | |
} | |
else { # not in the table | |
$ref_hash->{$key} = $cov[$j]; | |
} | |
} | |
$j++; $r++; | |
} | |
elsif ($diff[$i] eq "x") { $j++; $r++; } # snp | |
elsif ($diff[$i] eq "^") { $j++; } # ins | |
elsif ($diff[$i] eq "v") { $r++; } # del | |
} | |
} | |
## do the job | |
########################################## | |
init(); | |
run(); | |
########################################## | |
my $time_taken = time - $start_time; | |
#if($VERBOSE) { | |
elapsedTime($time_taken, "FindVariants"); | |
#} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/FindSomatic.pl b/FindSomatic.pl | |
old mode 100755 | |
new mode 100644 | |
index 7a42a1e..6e90b4e | |
--- a/FindSomatic.pl | |
+++ b/FindSomatic.pl | |
@@ -508,10 +508,12 @@ sub parseBestState { | |
my @R = split("", $REF); # reference | |
my @A = split("", $ALT); # alternative | |
my $n=0; my $t=1; | |
- if ( ($A[$n]>0) && ($A[$t]>0) ) { # both in normal and tumor | |
+ #if ( ($A[$n]>0) && ($A[$t]>0) ) { # both in normal and tumor | |
+ if ( $A[$n] == $A[$t] ) { # genotypes for ALT identical | |
$sv->{inheritance} = "normal"; | |
} | |
- elsif ( ($A[$n]==0) && ($A[$t]>0) ) { # only in tumor | |
+ #elsif ( ($A[$n]==0) && ($A[$t]>0) ) { # only in tumor | |
+ elsif ( $A[$n] != $A[$t] ) { # genotypes for ALT different | |
$sv->{inheritance} = "no"; | |
$sv->{somatic} = "yes"; | |
} | |
diff --git a/FindVariants.pl b/FindVariants.pl | |
index 5f0dceb..712095b 100755 | |
--- a/FindVariants.pl | |
+++ b/FindVariants.pl | |
@@ -218,7 +218,7 @@ sub run { | |
} | |
# decide if loading genome from fasta or use samtools faidx | |
- if (-e "$bedfile") { $USEFAIDX = 0; } | |
+ if (-e "$bedfile") { $USEFAIDX = 1; } | |
elsif ($bedfile =~ m/(\w+):(\d+)-(\d+)/) { | |
#parse region | |
my ($chr,$interval) = split(':',$bedfile); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment