chapmanb · August 29, 2015 14:02
diff --git a/FindSomatic.pl b/FindSomatic.pl
 #!/usr/bin/perl

 ###################################################################
 # FindSomatic.pl
 #
 # Tool for detecting somatic mutations in normal/tumor pair 
 # using microassembly
 # 
 #  Author: Giuseppe Narzisi 
 #    Date: December 11, 2013
 #
 ###################################################################

 use warnings;
 use strict;
 use POSIX;
 use FindBin qw($Bin);
 use lib $Bin; # add $Bin directory to @INC
 use Usage;
 use Utils qw(:DEFAULT $findVariants $findDenovos $findSomatic $exportTool $bamtools);
 use HashesIO;
 use Parallel::ForkManager;
 use List::Util qw[min max];
 #use Math::Random qw(:all);
 #use Sys::CPU;
 #use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
 use File::Spec;
 use Getopt::Long;

 use MLDBM::Sync;                       # this gets the default, SDBM_File
 use MLDBM qw(DB_File Storable);        # use Storable for serializing
 use MLDBM qw(MLDBM::Sync::SDBM_File);  # use extended SDBM_File, handles values > 1024 bytes
 use Fcntl qw(:DEFAULT);                # import symbols O_CREAT & O_RDWR for use with DBMs

 $|=1;

 # required arguments
 my $BAMNORMAL;
 my $BAMTUMOR;
 my $BEDFILE;
 my $REF;

 my $defaults = getDefaults();  

 # optional arguments
 my $kmer = $defaults->{kmer};
 my $cov_threshold = $defaults->{cov_threshold};
 my $tip_cov_threshold = $defaults->{tip_cov_threshold};
 my $radius = $defaults->{radius};
 my $windowSize = $defaults->{windowSize};
 my $delta = $defaults->{delta};
 my $map_qual = $defaults->{map_qual};
 my $maxmismatch = $defaults->{maxmismatch};
 my $min_cov = $defaults->{min_cov};
 my $covratio = $defaults->{covratio};
 my $outratio = $defaults->{outratio};
 my $WORK  = $defaults->{WORK};
 my $MAX_PROCESSES = $defaults->{MAX_PROCESSES};
 my $selected = $defaults->{selected};
 my $dfs_limit = $defaults->{pathlimit};
 my $outformat = $defaults->{format};
 my $intarget;

 my $help = 0;
 my $VERBOSE = 0;

 my %candidates;
 my %exons;
 my %locations;

 my %normalCov;
 my %tumorCov;

 my %normalL2K;
 my %tumorL2K;

 my %normalSVs;
 my %tumorSVs;
 my %somaticSVs;
 my %commonSVs;

 my @members = qw/normal tumor/;

 my $argcnt = scalar(@ARGV);
 my $start_time = time;

 #####################################################
 #
 # Command line options processing
 #
 GetOptions(
 	'help!'    => \$help,
 	'verbose!' => \$VERBOSE,
    
 	# required parameters
    'normal=s' => \$BAMNORMAL,
    'tumor=s'  => \$BAMTUMOR,
    'bed=s'    => \$BEDFILE,
    'ref=s'    => \$REF,

 	# optional paramters
    'kmer=i'       => \$kmer,
    'covthr=i'     => \$cov_threshold,
    'lowcov=i'     => \$tip_cov_threshold,
    'mincov=i'     => \$min_cov,
    'covratio=f'   => \$covratio,
    'radius=i'     => \$radius,
    'window=i'     => \$windowSize,
    'step=i'       => \$delta,
    'mapscore=i'   => \$map_qual,
    'mismatches=i' => \$maxmismatch,
    'dir=s'        => \$WORK,
    'numprocs=i'   => \$MAX_PROCESSES,
    'coords=s'     => \$selected,
    'format=s'     => \$outformat,

 	# ouptut parameters
 	'intarget!'    => \$intarget,
 	'outratio=f'   => \$outratio,

 ) or usageDenovo("FindDenovo.pl");

 #####################################################
 #
 # Command line options processing
 #
 sub init()
 {
 	usageSomatic("FindSomatic.pl") if ($argcnt < 1);
 	usageSomatic("FindSomatic.pl") if( $help );
 	
 	if( (!defined $BAMNORMAL) || (!defined $BAMTUMOR) || (!defined $BEDFILE) || (!defined $REF) ) { 
 		print STDERR "Required parameter missing!\n";
 		usageSomatic("FindSomatic.pl"); 
 	}
 	
 	mkdir $WORK if ! -r $WORK;
 	
 	if($MAX_PROCESSES == 1) { $MAX_PROCESSES = 0; }
 }

 #####################################################

 sub printParams { 

 	my $file = $_[0];
 	
 	print STDERR "-- Print parameters to $file\n";
 	
 	open PFILE, "> $file" or die "Can't open $file ($!)\n";
 	
 	print PFILE "Parameters: \n";
 	print PFILE "-- K-mer size (bp): $kmer\n";
 	print PFILE "-- coverage threshold: $cov_threshold\n";
 	print PFILE "-- low coverage threshold: $tip_cov_threshold\n";
 	print PFILE "-- size (bp) of the left and right extensions (radius): $radius\n";
 	print PFILE "-- window-size size (bp): $windowSize\n";
 	print PFILE "-- step size (bp): $delta\n";
 	print PFILE "-- minimum mapping-quality: $map_qual\n";
 	print PFILE "-- minimum coverage for somatic mutation: $min_cov\n";
 	print PFILE "-- minimum coverage ratio for sequencing error: $covratio\n";
 	print PFILE "-- minimum coverage ratio for exporting mutation: $outratio\n";
 	print PFILE "-- max number of mismatches for near-perfect repeats: $maxmismatch\n";
 	print PFILE "-- output directory: $WORK\n";
 	print PFILE "-- max number of parallel jobs: $MAX_PROCESSES\n";
 	print PFILE "-- BAM file normal: $BAMNORMAL\n";
 	print PFILE "-- BAM file tumor: $BAMTUMOR\n";
 	print PFILE "-- bedfile: $BEDFILE\n";
 	print PFILE "-- reference file: $REF\n";
 	print PFILE "-- file of selected locations: $selected\n";
 	print PFILE "-- output format for variants: $outformat\n";
 	if($intarget) { print PFILE "-- output variants in target? yes\n"; }
 	else { print PFILE "-- output variants in target? no\n"; }
 	
 	close PFILE;
 }

 ## link input files
 #####################################################

 sub linkFiles {
 	
 	print STDERR "-- Linking input bamfile\n";
 	
 	my $bamfile = $_[0]; 
 	my $dirname = $_[1];
 	
 	my $flag = 0;
 	
 	print STDERR "$bamfile\n";
 	my $fdir = "$WORK/$dirname";
 	
 	# get absolute path
 	my $bam_abs_path = File::Spec->rel2abs($bamfile);
 	
 	if (! -r $fdir) {
 		$flag = 1;
 		## link the input files
 		mkdir $fdir;
 	
 		if (-e "$bam_abs_path") {
 			symlink "$bam_abs_path", "$fdir/bamfile.bam";
 		}
 		else {
 			print STDERR "Can't find BAM file: $bamfile\n";
 			next;
 		}
 		
 		if (-e "$bam_abs_path.bai") {
 			symlink "$bam_abs_path.bai", "$fdir/bamfile.bam.bai";
 		}
 		else {
 			print STDERR "Indexing BAM file...\n";
 			runCmd("index bamfile", "$bamtools index -in $bam_abs_path");
 			symlink "$bam_abs_path.bai", "$fdir/bamfile.bam.bai";
 			#print STDERR "Can't find index BAM file: $bamfile.bai\n";
 			next;
 		}
 	}
 	
 	return $flag;
 }

 ## load mutations and coverage info
 #####################################################

 sub loadHashes {
 	
 	print STDERR "-- Loading mutations and coverage info\n";
 		
 	# clear hashes 
 	for my $k (keys %normalSVs) { delete $normalSVs{$k}; }
 	for my $k (keys %tumorSVs) { delete $tumorSVs{$k}; }
 	for my $k (keys %normalCov) { delete $normalCov{$k}; }
 	for my $k (keys %tumorCov) { delete $tumorCov{$k}; }
 	
 	# load databases of mutations
 	print STDERR "load variants for normal...\n";
 	loadDB("$WORK/normal/variants.db", \%normalSVs, \%exons, 0);
 	print STDERR "load variants for tumor...\n";
 	loadDB("$WORK/tumor/variants.db", \%tumorSVs, \%exons, 0);
 	
 	print STDERR "load coverage for normal...\n";
 	loadCov("$WORK/normal/refcov.txt", \%normalCov);
 	print STDERR "load coverage for tumor...\n";
 	loadCov("$WORK/tumor/refcov.txt", \%tumorCov);
 	
 	print STDERR "load loc2key for for normal...\n";
 	loadLoc2Key("$WORK/normal/loc2keys.txt.gz", \%normalL2K);
 	print STDERR "load loc2key for for tumor...\n";
 	loadLoc2Key("$WORK/tumor/loc2keys.txt.gz", \%tumorL2K);
 	
 	#remove reference coverage files:
 	print STDERR "remove coverage file for normal...\n";
 	runCmd("remove coverage file", "rm $WORK/normal/refcov.txt.gz");
 	print STDERR "remove coverage file for tumor...\n";
 	runCmd("remove coverage file", "rm $WORK/tumor/refcov.txt.gz");
 }

 ## call variants on each family memeber
 #####################################################

 sub callSVs {
 	
 	my $bamfile = $_[0]; 
 	my $dir = $_[1]; 
 	
 	print STDERR "-- Detect mutations on $dir\n";

 	my $bam_abs_path = File::Spec->rel2abs($bamfile);

 	my $command = "$findVariants ".
 		"--bam $bam_abs_path ".
 		"--bed $BEDFILE ".
 		"--ref $REF ".
 		"--kmer $kmer ". 
 		"--covthr $cov_threshold ". 
 		"--lowcov $tip_cov_threshold ".
 		"--covratio $covratio ".
 		"--radius $radius ". 
 		"--window $windowSize ". 
 		"--step $delta ".
 		"--mapscore $map_qual ".
 		"--mismatches $maxmismatch ".
 		"--dir $WORK/$dir ".
 		"--sample ALL ".
 		"--numprocs $MAX_PROCESSES ".
 		"--coords $selected ".
 		"--format $outformat ".
 		"--cov2file";
 	if($intarget) { $command .= " --intarget"; }	
 	if($VERBOSE) { $command .= " --verbose"; }

 	print STDERR "Command: $command\n" if($VERBOSE);
 	runCmd("findVariants", "$command");
 }

 ## Compute best state for the location of the mutation
 #####################################################
 sub computeBestState {
 	
 	my $vars;	
 	my $covthr = 0; # min cov threshold to remove sequencing errors
 	
 	foreach my $ID (@members) {
 		
 		if($ID eq "normal") { $vars = \%normalSVs; }
 		if($ID eq "tumor") { $vars = \%tumorSVs; }
 		
 		foreach my $currKey (keys %$vars) {
 			#print STDERR "$currKey\n";
 			
 			next if($currKey eq "numPaths");
 	
 			my $var = $vars->{$currKey};
 			my $chr = $var->{chr};
 			my $pos = $var->{pos}; 
 			my $loc = "$chr:$pos";
 	
 			my $refState;
 			#my $mutState;
 			my $altState;
 			
 			my $refCov;
 			my $mutCov;
 			my $altCov;
 						
 			foreach my $role (@members) {
 				#print STDERR "$role: ";
 				
 				my $varh;	
 				my $refhash;
 				my $l2k;
 				
 				if($role eq "normal") { $varh = \%normalSVs; $refhash = \%normalCov; $l2k = \%normalL2K; }
 				if($role eq "tumor") { $varh = \%tumorSVs; $refhash = \%tumorCov; $l2k = \%tumorL2K; }
 								
 				$refState->{$role} = 0; $altState->{$role} = 0;
 				$refCov->{$role} = 0; $mutCov->{$role} = 0; $altCov->{$role} = 0;
 				
 				my $sum = 0;
 				my $cnt = 0;
 				if ( exists($refhash->{$loc}) ) { # if there is reference coverage at location
 					## compute reference coverage at locus
 					for (my $t = $pos; $t<($pos+$kmer+1); $t++) { # compute avg coverage over window of size K
 						my $loc2 = "$chr:$t";
 						if ( exists $refhash->{$loc2} && 
 							!( exists($l2k->{$loc2}) && ( (scalar @{$l2k->{$loc2}->{keys}})!=0) ) ) {
 							$sum += $refhash->{$loc2};
 							$cnt++;
 						}					
 					}
 				}
 				my $RCov = $sum;
 				if($cnt>0) { $RCov = floor($sum/$cnt); }
 				if($RCov > $covthr) { $refState->{$role} = 1; $refCov->{$role} = $RCov; }
 				else { $refState->{$role} = 0; $refCov->{$role} = 0; }
 		
 				## compute coverage of mut at locus
 				my $ACov = 0;
 				if( exists($varh->{$currKey}) ) {
 					my $altMut = $varh->{$currKey};
 					$ACov = $altMut->{mincov};
 				}
 				if($ACov > $covthr) { $altState->{$role} = 1; $mutCov->{$role} = $ACov; }
 				else { $altState->{$role} = 0; $mutCov->{$role} = 0; }
 				
 				## compute alternative allele coverage
 				my $OCov = 0;
 				if( exists($l2k->{$loc}) ) {
 					foreach my $altKey (@{$l2k->{$loc}->{keys}}) {
 						if( exists($varh->{$altKey}) && ($altKey ne $currKey) ) {
 							my $altMut = $varh->{$altKey};
 							$OCov += $altMut->{mincov};
 						}
 					}
 				}
 				if($OCov > $covthr) { $altCov->{$role} = $OCov; }
 				else { $altCov->{$role} = 0; }
 			}
 			
 			# 2 2/0 1 (ref: N,T / qry: N,T)
 			my @RS=(); my @AS=();
 			my @RC=(); my @AC=(); my @OC=();
 			foreach my $role (@members) { # the order or members must be N,T
 				if( ($refState->{$role} == 1) && ($altState->{$role} == 1) ) {
 					push @RS , 1;
 					push @AS , 1;
 				}
 				elsif( ($refState->{$role} == 1) && ($altState->{$role} == 0) ) {
 					push @RS , 2;
 					push @AS , 0;
 				}
 				elsif( ($refState->{$role} == 0) && ($altState->{$role} == 1) ) {
 					push @RS , 0;
 					push @AS , 2;
 				}
 				elsif( ($refState->{$role} == 0) && ($altState->{$role} == 0) ) {
 					push @RS , 0;
 					push @AS , 0;
 				}
 				
 				push @RC , $refCov->{$role};
 				push @AC , $mutCov->{$role};
 				push @OC , $altCov->{$role};
 			}
 			
 			my $stateStr = "$RS[0]$RS[1]/$AS[0]$AS[1]";
 			my $covStr   = "$RC[0] $RC[1]/$AC[0] $AC[1]/$OC[0] $OC[1]";
 			
 			$var->{bestState} = $stateStr;
 			$var->{covState} = $covStr;
 			
 			# compute hom/het state and chi2score
 			parseCovState($var, $ID);
 			
 			$vars->{$currKey} = $var;
 		}
 	}
 }

 ## parse covState and compute hom/het state and chi2score
 ##########################################

 sub parseCovState {

 	my $sv = $_[0];
 	my $role = $_[1];
 	my $BS = $sv->{covState};
 	
 	my ($REF,$ALT,$OTH) = split("/",$BS);
 	my @R = split(" ", $REF); # reference
 	my @A = split(" ", $ALT); # alternative
 	my @O = split(" ", $OTH); # other

 	my $id = 0;	
 	if($role eq "normal") { $id = 0; }
 	if($role eq "tumor") { $id = 1; }	
 	
 	my $alleleCnt = 0;	
 	if($R[$id] > 0) { $alleleCnt++; }
 	if($A[$id] > 0) { $alleleCnt++; }
 	if($O[$id] > 0) { $alleleCnt++; }
 	
 	my $totCov = $R[$id] + $A[$id] + $O[$id];
 	$sv->{altcov} = $totCov - $sv->{mincov};

 	if($alleleCnt > 1) { $sv->{zygosity} = "het"; }
 	else { $sv->{zygosity} = "hom"; }
 	
 	my $covRatio = 0;
 	my $chi2Score = "na";
 	if ($totCov > 0) { 
 		$covRatio = sprintf('%.2f', ($sv->{mincov} / $totCov) );
 		$sv->{covratio} = $covRatio; 

 		#Chi-squared Test for allele coverage
 		my $o1 = $sv->{mincov}; # observed 1
 		my $o2 = $sv->{altcov}; # observed 2
 		my $e1 = $totCov/2; # expected 1
 		my $e2 = $totCov/2; # expected 2

 		my $term1 = (($o1-$e1)*($o1-$e1))/$e1;
 		my $term2 = (($o2-$e2)*($o2-$e2))/$e2;

 		$chi2Score = sprintf('%.2f', $term1 + $term2);		
 	}
 	$sv->{chi2score} = $chi2Score;	
 }

 ## parse bestState and update mutation info
 ##########################################
 sub parseBestState {

 	my $sv = $_[0];
 	my $BS = $sv->{bestState};
 	
 	$sv->{somatic} = "no";
 	$sv->{inheritance} = "no";
 	
 	if( ($BS eq "12/10") || ($BS eq "02/20") ) { # only in normal
 		$sv->{inheritance} = "no";
 	}			
 	else {
 		my ($REF,$ALT) = split("/",$BS);
 		my @R = split("", $REF); # reference
 		my @A = split("", $ALT); # alternative
 		my $n=0; my $t=1;
 		#if ( ($A[$n]>0) && ($A[$t]>0) ) { # both in normal and tumor
 		if ( $A[$n] == $A[$t] ) { # genotypes for ALT identical 
 			$sv->{inheritance} = "normal";
 		}
 		#elsif ( ($A[$n]==0) && ($A[$t]>0) ) { # only in tumor
 		elsif ( $A[$n] != $A[$t] ) { # genotypes for ALT different 
 			$sv->{inheritance} = "no";
 			$sv->{somatic} = "yes";
 		}
 		else { # unknown state
 			$sv->{inheritance} = "no"; 
 		}
 		#if($A[$t]>0) {  $sv->{id} = "tumor"; }
 	}	
 }


 ## find denovo events
 #####################################################

 sub findSomaticMut {
 	
 	print STDERR "-- Finding somatic mutations\n";
 	
 	my $family = $_[0];
 	
 	if (-e "$WORK/somatic.db.dir") { runCmd("remove database files", "rm $WORK/somatic.db.*"); }
 	if (-e "$WORK/common.db.dir") { runCmd("remove database files", "rm $WORK/common.db.*"); }
 	
 	my $somatic_dbm_obj = tie %somaticSVs, 'MLDBM::Sync', "$WORK/somatic.db", O_CREAT|O_RDWR, 0640; 
 	my $common_dbm_obj = tie %commonSVs, 'MLDBM::Sync', "$WORK/common.db", O_CREAT|O_RDWR, 0640; 
 	
 	$somatic_dbm_obj->Lock;
 	$common_dbm_obj->Lock;
 	foreach my $k (keys %tumorSVs) {
 		
 		my $mut = $tumorSVs{$k};
 		
 		# parse bestState and update info
 		parseBestState($mut);
 		
 		my $chr = $mut->{chr};
 		my $pos = $mut->{pos};
 		my $key = "$chr:$pos";
 		
 		# skip mutation if normal not sampled
 		next if (!exists $normalCov{$key});
 		next if ($normalCov{$key} < $min_cov); # min cov requirement
 				
 		# compute inheritance
 		if($mut->{somatic} eq "yes") {
 			$somaticSVs{$k} = $mut;
 		}
 		else { # non somatic
 			if ($mut->{inheritance} eq "normal") {
 				$commonSVs{$k} = $mut;
 			}
 		}
 		$tumorSVs{$k} = $mut;
 	}

 	$somatic_dbm_obj->UnLock;
 	$common_dbm_obj->UnLock;
 }


 ## export family mutations to file
 #####################################################

 sub exportSVs {
 	
 	print STDERR "-- Exporting SVs to file\n";
 	
 	my $max_cov = 10000000;
 	
 	# export somatic 
 	#runCmd("export all", "$exportTool --db $WORK/somatic.db --bed $BEDFILE --format annovar --type all --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/somatic.${min_cov}x.all.txt");
 	#runCmd("export snp", "$exportTool --db $WORK/somatic.db --bed $BEDFILE --format annovar --type snp --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/somatic.${min_cov}x.snp.txt");
 	#runCmd("export indels", "$exportTool --db $WORK/somatic.db --bed $BEDFILE --format annovar --type indel --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/somatic.${min_cov}x.indel.txt");
 	
 	my $command_somatic = "$exportTool ".
 		"--db $WORK/somatic.db ".
 		"--bed $BEDFILE ".
 		"--format $outformat ".
 		"--type indel ". 
 		"--mincov $min_cov ". 
 		"--maxcov $max_cov ".
 		"--covratio $outratio";
 	if($intarget) { $command_somatic .= " --intarget"; }
 	if ($outformat eq "annovar") { $command_somatic .= " > $WORK/somatic.${min_cov}x.indel.annovar"; }
 	elsif ($outformat eq "vcf") { $command_somatic .= " > $WORK/somatic.${min_cov}x.indel.vcf"; }
 	
 	print STDERR "Command: $command_somatic\n" if($VERBOSE);
 	runCmd("export somatic", $command_somatic);
 	
 	# export common variants
 	#runCmd("export all", "$exportTool --db $WORK/common.db --bed $BEDFILE --format annovar --type all --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/common.${min_cov}x.all.txt");
 	#runCmd("export snp", "$exportTool --db $WORK/common.db --bed $BEDFILE --format annovar --type snp --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/common.${min_cov}x.snp.txt");
 	#runCmd("export indels", "$exportTool --db $WORK/common.db --bed $BEDFILE --format annovar --type indel --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/common.${min_cov}x.indel.txt");
 	
 	my $command_comm = "$exportTool ".
 		"--db $WORK/common.db ".
 		"--bed $BEDFILE ".
 		"--format $outformat ".
 		"--type indel ". 
 		"--mincov $min_cov ". 
 		"--maxcov $max_cov ".
 		"--covratio $outratio";
 	if($intarget) { $command_comm .= " --intarget"; }
 	if ($outformat eq "annovar") { $command_comm .= " > $WORK/common.${min_cov}x.indel.annovar"; }
 	elsif ($outformat eq "vcf") { $command_comm .= " > $WORK/common.${min_cov}x.indel.vcf"; }
 	
 	print STDERR "Command: $command_comm\n" if($VERBOSE);
 	runCmd("export common", $command_comm);
 }

 ## process single family
 #####################################################

 sub processBAM {

 	my $bamfile = $_[0];
 	my $outdir = $_[1];
 	
 	#my $toprocess = linkFiles("$bamfile", "$outdir"); # link input files
 	#if($toprocess == 1) {
 		callSVs($bamfile, $outdir); # call mutations on each family
 	#}
 	#else { print STDERR "$outdir already processed!\n"; }
 }

 ## do the job
 ##########################################

 init();
 printParams("$WORK/parameters.txt"); # print parameters
 processBAM($BAMNORMAL, "normal"); # process normal
 processBAM($BAMTUMOR, "tumor"); # process tumor
 loadHashes(); # load mutations and coverage info
 computeBestState(); # compute bestState for each mutation
 findSomaticMut(); # detect somatic mutations in tumor
 exportSVs(); # export mutations to file

 ##########################################


 my $time_taken = time - $start_time;

 #if($VERBOSE) {
 	elapsedTime($time_taken, "FindSomatic");
 #}
diff --git a/FindVariants.pl b/FindVariants.pl
 #!/usr/bin/perl

 ###################################################################
 # FindVariants.pl
 #
 # Tool for detecting variants in one single exome 
 # using microassembly
 # 
 #  Author: Giuseppe Narzisi 
 #    Date: December 11, 2013
 #
 ###################################################################

 use warnings;
 use strict;
 use POSIX;
 use FindBin qw($Bin);
 use lib $Bin; # add $Bin directory to @INC
 use Usage;
 use Utils;
 use SequenceIO;
 use HashesIO;
 use Parallel::ForkManager;
 use List::Util qw[min max];
 #use MathRandom::Random qw(:all);
 use Digest::MD5 qw(md5_hex);
 #use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
 use Getopt::Long;
 use File::Spec;

 use MLDBM::Sync;						# this gets the default, SDBM_File
 use MLDBM qw(DB_File Storable);			# use Storable for serializing
 use MLDBM qw(MLDBM::Sync::SDBM_File);	# use extended SDBM_File, handles values > 1024 bytes
 use Fcntl qw(:DEFAULT);					# import symbols O_CREAT & O_RDWR for use with DBMs

 $|=1;

 # required arguments
 my $bamfile;
 my $bedfile;
 my $REF;

 my $defaults = getDefaults();  

 # optional arguments
 my $kmer = $defaults->{kmer};
 my $cov_threshold = $defaults->{cov_threshold};
 my $tip_cov_threshold = $defaults->{tip_cov_threshold};
 my $radius = $defaults->{radius};
 my $windowSize = $defaults->{windowSize};
 my $delta = $defaults->{delta};
 my $map_qual = $defaults->{map_qual};
 my $maxmismatch = $defaults->{maxmismatch};
 my $min_cov = $defaults->{min_cov};
 my $covratio = $defaults->{covratio};
 my $outratio = $defaults->{outratio};
 my $WORK  = $defaults->{WORK};
 my $MAX_PROCESSES = $defaults->{MAX_PROCESSES};
 my $sample = $defaults->{sample};
 my $selected = $defaults->{selected};
 my $outformat = $defaults->{format};
 my $intarget;

 #microassembler parameters:
 my $dfs_limit = $defaults->{pathlimit};
 my $max_tip_len = 25;
 my $maxKmer = 90;

 my $help = 0;
 my $VERBOSE = 0;
 my $COV2FILE = 0;

 # programs via absolute path
 #my $samtools = "samtools";
 my $microassembler = "$Bin/Microassembler/Microassembler";
 my $exportTool = "$Bin/ExportVariants.pl";
 my $bamtools = "$Bin/bamtools-2.3.0/bin/bamtools";

 my $rgfile = "readgroups.txt";

 #my %pathways;
 #my %exonshash;
 my %variants;
 my %exons;
 my %locations;
 my %readgroups;
 my %refcov;
 my %loc2keys;
 ###my %genome;
 my $variants_dbm_obj;

 my $argcnt = scalar(@ARGV);
 my $start_time = time;

 #####################################################
 #
 # Command line options processing
 #
 GetOptions(
 	'help!'    => \$help,
 	'verbose!' => \$VERBOSE,
 	'cov2file' => \$COV2FILE,
    
 	# required parameters
    'bam=s' => \$bamfile,
    'bed=s' => \$bedfile,
    'ref=s' => \$REF,

 	# optional paramters
    'kmer=i'       => \$kmer,
    'covthr=i'     => \$cov_threshold,
    'lowcov=i'     => \$tip_cov_threshold,
    'mincov=i'     => \$min_cov,
    'covratio=f'   => \$covratio,
    'radius=i'     => \$radius,
    'window=i'     => \$windowSize,
    'step=i'       => \$delta,
    'mapscore=i'   => \$map_qual,
    'mismatches=i' => \$maxmismatch,
    'pathlimit=i'  => \$dfs_limit,
    'dir=s'        => \$WORK,
    'numprocs=i'   => \$MAX_PROCESSES,
    'sample=s'     => \$sample,
    'coords=s'     => \$selected,
    'format=s'     => \$outformat,

 	# ouptut parameters
 	'intarget!'    => \$intarget,
 	'outratio=f'   => \$outratio,
    

 ) or usageSingle("FindVariants.pl");

 #####################################################
 #
 # Initilization 
 #
 sub init()
 {
 	usageSingle("FindVariants.pl") if ($argcnt < 1);
 	usageSingle("FindVariants.pl") if( $help );
 	
 	if( (!defined $bamfile) || (!defined $bedfile) || (!defined $REF) ) { 
 		print STDERR "Required parameter missing!\n";
 		usageSingle("FindVariants.pl");
 	}
 	
 	mkdir $WORK if ! -r $WORK;
 	
 	if($MAX_PROCESSES == 1) { $MAX_PROCESSES = 0; }
 }

 #####################################################

 sub printParams { 

 	my $file = $_[0]; 
 	
 	print STDERR "-- Print parameters to $file\n";
 	
 	open PFILE, "> $file" or die "Can't open $file ($!)\n";
 	
 	print PFILE "Parameters: \n";
 	print PFILE "-- k-mer size (bp): $kmer\n";
 	print PFILE "-- coverage threshold: $cov_threshold\n";
 	print PFILE "-- low coverage threshold: $tip_cov_threshold\n";
 	print PFILE "-- size (bp) of the left and right extensions (radius): $radius\n";
 	print PFILE "-- window-size size (bp): $windowSize\n";
 	print PFILE "-- step size (bp): $delta\n";
 	print PFILE "-- minimum mapping quality: $map_qual\n";
 	print PFILE "-- minimum coverage for mutation: $min_cov\n";
 	print PFILE "-- minimum coverage ratio for sequencing errors: $covratio\n";
 	print PFILE "-- minimum coverage ratio for exporting mutation: $outratio\n";
 	print PFILE "-- max number of mismatches for near-perfect repeats: $maxmismatch\n";
 	print PFILE "-- limit number of sequence paths: $dfs_limit\n";
 	print PFILE "-- output directory: $WORK\n";
 	print PFILE "-- max number of parallel jobs: $MAX_PROCESSES\n";
 	print PFILE "-- bam file: $bamfile\n";
 	print PFILE "-- bed file: $bedfile\n";
 	print PFILE "-- reference file: $REF\n";
 	print PFILE "-- sample: $sample\n";
 	print PFILE "-- output format for variants: $outformat\n";
 	print PFILE "-- file of selected coordinates: $selected\n";
 	if($intarget) { print PFILE "-- output variants in target? yes\n"; }
 	else { print PFILE "-- output variants in target? no\n"; }
 	
 	close PFILE;
 }

 ## build directory structure and run assembly
 #####################################################

 sub run {

 	my $fdir = "$WORK";
 	
 	if (! -r $fdir) { mkdir $fdir; }

 	printParams("$fdir/parameters.txt");
 		
 	if($selected ne "null") {
 		loadCoordinates("$selected", \%exons, $VERBOSE);
 	}
 	else {
 		loadExonsBed("$bedfile", \%exons, $radius, $VERBOSE);
 	}
 	
 	#load genome from fasta file
 	###loadGenomeFasta($REF, \%genome);
 	
 	if (-e "$fdir/variants.db.dir") { runCmd("remove database files", "rm $fdir/variants.db.*"); }	
 	$variants_dbm_obj = tie %variants, 'MLDBM::Sync', "$fdir/variants.db", O_CREAT|O_RDWR, 0640;	
 	#$variants_dbm_obj->SyncCacheSize('1000M');    # 1000 Megabyte max memory used
 	
 	#print "$bamfile\n";
 	
 	# get absolute path
 	my $bam_abs_path = File::Spec->rel2abs($bamfile);

 	## link the input files	
    if (-e $bam_abs_path) { 
 		symlink "$bam_abs_path", "$fdir/bamfile.bam"; 
 	}
    else { 
 		print STDERR "Can't find bamfile: $bamfile\n";
 		next;
    }

    if (-e "$bam_abs_path.bai") {
 		symlink "$bam_abs_path.bai", "$fdir/bamfile.bam.bai";
    }
    else {
 		print STDERR "Indexing BAM file...\n";
 		runCmd("index bamfile", "$bamtools index -in $bam_abs_path");
 		symlink "$bam_abs_path.bai", "$fdir/bamfile.bam.bai";
    }

 	parseHeader($WORK, \%readgroups);
 	saveReadGroup("$sample", $WORK, \%readgroups, $rgfile);
 	
 	print STDERR "Assembly Exons\n";
 	assemblyExons(\%variants, $variants_dbm_obj, \%refcov, \%loc2keys);

 	if ($COV2FILE) { printRefCov("$WORK/refcov.txt", \%refcov); }
 	printLoc2key("$WORK/loc2keys.txt", \%loc2keys);
 	
 	# export variants to file
 	exportSVs();
 }

 ## export family mutations to file
 #####################################################

 sub exportSVs {

 	#$exportvars --db $DIR/variants.db --bed $BEDFILE --format annovar --type all --mincov $mincov --maxcov $maxcov > $DIR/variants.txt	
 	print STDERR "-- Exporting SVs to file\n";
 	
 	my $max_cov = 10000000;
 	
 	# export denovos 
 	#runCmd("export denovos", "$exportTool --db $WORK/variants.db --bed $bedfile --format annovar --type all --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/variants.${min_cov}x.all.txt");
 	#runCmd("export denovos", "$exportTool --db $WORK/variants.db --bed $bedfile --format annovar --type snp --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/variants.${min_cov}x.snp.txt");
 	#runCmd("export denovos", "$exportTool --db $WORK/variants.db --bed $bedfile --format annovar --type indel --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/variants.${min_cov}x.indel.txt");
 	
 	my $command = "$exportTool ".
 		"--db $WORK/variants.db ".
 		"--bed $bedfile ".
 		"--format $outformat ".
 		"--type indel ". 
 		"--mincov $min_cov ". 
 		"--maxcov $max_cov ".
 		"--covratio $outratio";
 	if($intarget) { $command .= " --intarget"; }
 	if ($outformat eq "annovar") { $command .= " > $WORK/variants.${min_cov}x.indel.annovar"; }
 	elsif ($outformat eq "vcf") { $command .= " > $WORK/variants.${min_cov}x.indel.vcf"; }
 	
 	print STDERR "Command: $command\n" if($VERBOSE);
 	runCmd("ExportVariants", $command);
 }

 #####################################################


 ## Write the list of read groups per family member on file 
 ## Extract the sequence around indel from reference
 ## Create input files for microassembler
 ## Run microassembler

 sub assemblyExons {
 	
 	my $hash = $_[0];
 	my $dbm_obj = $_[1];
 	my $ref_hash = $_[2];
 	my $loc2keys_hash = $_[3];
 	
 	my $count_exons = 0;
 	my $count_regions = 0;
 	my @regions;
 	my @region_files;
 	
 	my $PREFIX = "$WORK";
 						
 	#foreach my $exonkey (sort {$a <=> $b} keys %exons) { # for each chromosome
 	foreach my $exonkey (keys %exons) { # for each chromosome

 		$count_exons = 0;
 		#foreach my $exon (sort {$a->{start} <=> $b->{start}} @{$exons{$exonkey}}) { # for each exon
 		foreach my $exon (@{$exons{$exonkey}}) { # for each exon
 			
 			$count_exons++;
 			#last if($count_exons > 5);
 		
 			#my %pathways;
 			#my %exonshash;
 			
 			# Forks and returns the pid for the child:
 			#my $pid = $pm->start and next;
 	
 			my $start = $exon->{start};
 			my $end = $exon->{end};
 			my $chr = $exon->{chr};
 			#print STDERR "$chr:$start-$end\n";
 			
 			## extend region left and right by radius
 			my $left = $start-$radius;
 			if ($left < 0) { $left = $start; }
 			my $right = $end+$radius;
 			#if ($right > $end) { $right = $end; }
 			#print STDERR "$chr:$left-$right\n";
 			
 			my $exonSize = $right-$left;
 							
 			# generate list of regions to examine			
 			my $l = $left;
 			my $u = $l+$windowSize;
 			if($u >= $right) { 
 				$u = $right;
 				my $region;	
 				$region->{chr} = $chr;				
 				$region->{left} = $l;
 				$region->{right} = $u;
 				push @regions, $region;
 				#print "$region->{chr}:$region->{left}-$region->{right}\n";
 				$count_regions++;
 			}
 			else {
 				while($u < $right) {
 				
 					my $region;
 					$region->{chr} = $chr;		
 					$region->{left} = $l;
 					$region->{right} = $u;
 					$region->{exonstart} = $left;
 					$region->{exonend} = $right;					
 					push @regions, $region;
 					$count_regions++;
 			
 					$l += $delta;
 					$u += $delta;
 					if($u > $right) { 
 						$u = $right; 
 						$l = $u-$windowSize; 
 						
 						my $region;
 						$region->{chr} = $chr;
 						$region->{left} = $l;
 						$region->{right} = $u;
 						$region->{exonstart} = $left;
 						$region->{exonend} = $right;
 						
 						push @regions, $region;
 						#print "$region->{chr}:$region->{left}-$region->{right}\n";
 						$count_regions++;
 					}
 				}
 			}
 		}
 	}
 	
 	#update statistics in the database
 	if(exists $hash->{stats})  { 
 		my $sts = $hash->{stats}; 
 		$sts->{num_exceptions} = 0;
 		$sts->{num_dfs_limit} = 0;
 		$sts->{num_partial_align} = 0;
 		$sts->{num_with_cycles} = 0;
 		$sts->{num_ok} = 0;
 		$hash->{stats} = $sts;
 	}
 	
 	#schedule jobs
 	schedule_jobs(\@regions, $hash, $dbm_obj, $ref_hash, $loc2keys_hash, "regions");

 	my $stats = $hash->{stats}; 

 	#if ($VERBOSE) {
 	if(exists $hash->{stats})  {
 		my $sts = $hash->{stats};
 		print STDERR "\nDfs limit: $sts->{num_dfs_limit}\n";
 		print STDERR "Partial alignments: $sts->{num_partial_align}\n";
 		print STDERR "Cycles: $sts->{num_with_cycles}\n";
 		print STDERR "OK assemblies: $sts->{num_ok}\n";
 		print STDERR "Exceptions: $sts->{num_exceptions}\n";
 		print STDERR "Repeats: $sts->{num_repeats}\n\n";
 	}
 	#}
 }

 # partition the region to assemble into equal size 
 # random sets and run the parallel jobs
 #####################################################

 sub schedule_jobs {
 	
 	my $array = $_[0];
 	my $hash = $_[1];
 	my $dbm_obj = $_[2];
 	my $ref_hash = $_[3];
 	my $loc2keys_hash = $_[4];
 	my $fprefix = $_[5];
 	
 	my $PREFIX = "$WORK";
 	
 	# random permutation of the regions to assemble
 	#my @regions = random_permutation(@$array);
 	my @regions = @$array;
 	my $arraySize = $#regions + 1;

 	print STDERR "start assembly of $arraySize regions.\n"; 

 	# partition regions
 	my $step = $arraySize;
 	if ($MAX_PROCESSES>0) {
 		$step = ceil($arraySize/$MAX_PROCESSES);
 	}	
 	print STDERR "stepSize: $step\n";

 	for(my $i = 0; $i < $arraySize; $i+=$step) {
 	
 		my $s = $i;
 		my $e = min( ($arraySize - 1), ($i + $step-1) );
 		my $ref_file = "$fprefix-$s-$e.fa";
 		#push @region_files, $ref_file;
 			
 		open FILE, "> $PREFIX/$ref_file" or die "Can't open $PREFIX/$ref_file ($!)\n";
 		foreach my $reg (@regions[$s..$e]) { # for each region
 			my $chr = $reg->{chr};
 			my $left = $reg->{left};
 			my $right = $reg->{right};
 			my $exonstart = $reg->{exonstart};
 			my $exonend = $reg->{exonend};
 			
 			#my $chrom = $chr;
 			#if ($chr =~ /^chr/) { $chrom = substr($chr,3); }

 			print FILE ">$chr:$left-$right\n";
 			### die "Undefined sequence ($chr)\n" if (!exists($genome{$chr}));
 			my ($header, $seq) = split(/\n/, `samtools faidx $REF $chr:$left-$right`, 2);
 			$seq =~ s/[\n\r\s]+//g; 
 			###my $seq = substr($genome{$chr}->{seq}, $left-1, $right-$left+1);
 			#my $seq = substr($genome{$chr}->{seq},$exonstart, $exonend-$exonstart+1);
 			for(my $i=0; $i<length($seq); $i+=80) {
 				my $str = substr($seq,$i,80);
 				print FILE "$str\n";
 			}
 		}
 		close FILE;
 	}
 	
 	# free genome memory before parallelization
 	###for my $k (keys %genome) { delete $genome{$k}; }
 	
 	my $pm = new Parallel::ForkManager($MAX_PROCESSES);
 	my $count = 0;
 	for(my $i = 0; $i < $arraySize; $i+=$step) {
 		$count++;
 	
 		my $pid = $pm->start and next;

 		my $s = $i;
 		my $e = min( ($arraySize - 1), ($i + $step-1) );
 		my $ref_file = "$fprefix-$s-$e.fa";
 	
 		print STDERR "$count. [$s..$e]\n";
 		
 		assemblyRegion($count, $ref_file, \%$hash, $dbm_obj);
 		$pm->finish; # Terminates the child process
 	}
 	$pm->wait_all_children; # wait for all parallel jobs to complete

 	# load reference coverage info from files
 	loadRefCovMerge($ref_hash, $count, $WORK);
 	loadLoc2KeyMerge($loc2keys_hash, $ref_hash, $count, $kmer, $WORK);
 	
 	# compute zygosity for all mutations
 	computeHomHetState($hash, $ref_hash, $loc2keys_hash);
 }

 ## Compute homozygous vs heterozygous state and  
 ## flags possible sequencing errors
 #####################################################

 sub computeHomHetState {
 	
 	print STDERR "Computing hom/het state...\n";
 	
 	my $vars = $_[0];
 	my $refh = $_[1];
 	my $l2k  = $_[2];
 	
 	my $hom_cnt = 0;
 	my $het_cnt = 0;
 	
 	# improve performance: tie once to database and do all read/wrote operations.
 	$variants_dbm_obj->Lock;

 	foreach my $currKey (keys %$vars) {
 		
 		next if($currKey eq "stats"); # skip stats info
 				
 		my $currMut = $vars->{$currKey};
 		my $chr = $currMut->{chr};
 		my $pos = $currMut->{pos};
 		
 		my $loc = "$chr:$pos";
 		
 		my $alleleCnt = 0;
 		my $Rcov = 0;
 		my $Acov = 0;
 		my $Ocov = 0;
 		my $totCov = 0;
 		
 		# reference coverage
 		my $sum = 0;
 		my $cnt = 0;
 		if ( (exists $refh->{$loc}) ) { # if cov at location					
 			## compute reference coverage at locus
 			for (my $t = $pos; $t<($pos+$kmer+1); $t++) { # compute avg coverage over window of size K
 				my $loc2 = "$chr:$t";
 				if ( (exists $refh->{$loc2})  && 
 					!( (exists $l2k->{$loc2}) && ( (scalar @{$l2k->{$loc2}->{keys}})!=0 ) ) ) {
 					$sum += $refh->{$loc2};
 					$cnt++;
 				}					
 			}
 		}
 		$Rcov=$sum;
 		if($cnt>0) { $Rcov = floor($sum/$cnt); }
 		
 		# alternative allele coverage
 		$Acov = $currMut->{mincov};
 		
 		# other coverage
 		$Ocov = 0;
 		if(exists($l2k->{$loc})) {
 			foreach my $altKey (@{$l2k->{$loc}->{keys}}) {
 				if( exists($vars->{$altKey}) && ($altKey ne $currKey) ) {
 					my $altMut = $vars->{$altKey};
 					$Ocov += $altMut->{mincov};
 				}
 			}
 		}

 		$totCov = $Rcov + $Acov + $Ocov;		
 		$currMut->{altcov} = $totCov - $currMut->{mincov};
 		
 		if($Rcov > 0) { $alleleCnt++; }
 		if($Acov > 0) { $alleleCnt++; }
 		if($Ocov > 0) { $alleleCnt++; }
 		
 		if($alleleCnt > 1) {
 			$currMut->{zygosity} = "het";
 			$het_cnt++;
 		}
 		else { # alleCnt <= 1
 			$currMut->{zygosity} = "hom";
 			$hom_cnt++;
 		}
 		
 		# update mutation in db
 		$vars->{$currKey} = $currMut;
 	}
 	
 	$variants_dbm_obj->UnLock;

 	print STDERR "Num. Homozygous: $hom_cnt\n";
 	print STDERR "Num. Heterozygous: $het_cnt\n";
 }

 # assembly a specific region
 #####################################################

 sub assemblyRegion {

 	my $num = $_[0];	
 	my $refs_file = $_[1];
 	my $hash = $_[2];
 	my $dbm_obj = $_[3];
 	
 	#my %pathways;
 	#my %exonshash;
 	
 	my $PREFIX = "$WORK";
 	my $log_dir = "$WORK/logs";
 	if (! -r $log_dir) { mkdir $log_dir; }
 	
 	my $outfile = "$log_dir/$refs_file.log";
 	
 	my $command = "$microassembler -R -I -C ";
 	if($VERBOSE) { $command .= "-v "; }
 	$command .= "-F $dfs_limit ".
 		"-k $kmer ".
 		"-K $maxKmer ". 
 		"-M $maxmismatch ".
 		"-l $max_tip_len ". 
 		"-c $cov_threshold ".
 		"-x $covratio ".
 		"-d $tip_cov_threshold ". 
 		"-b $map_qual ". 
 		"-p $PREFIX ".
 		"-m bamfile.bam ".
 		"-g $rgfile ".
 		" -r $refs_file ".
 		">& $outfile";
 		
 	## run microassembler on each fasta file 
 	runCmd("microassembly", "$command"); 
 	runCmd("gzip", "gzip -f $outfile"); 
 	
 	parseLogFile("$outfile", $num, \%$hash, $dbm_obj);
 			
 	# remove reference file
 	runCmd("delete file", "rm $PREFIX/$refs_file");
 }

 ## parse the log file generated by microassembler and populate data structures containing the output statistics
 #####################################################

 sub parseLogFile {

 	my $logFile = $_[0];
 	my $num = $_[1];
 	my $hash = $_[2];
 	my $dbm_obj = $_[3];

 	#my $PATHS = new IO::Uncompress::Gunzip "$logFile.gz" or die "IO::Uncompress::Gunzip failed: $GunzipError\n";
 	open PATHS, "gunzip -c $logFile.gz|" or die "Can't open $logFile.gz ($!)\n!";

 	my $curc;
 	my $curs;
 	my $cure;

 	my @pathcov;
 	my @pathdiff;

 	my %tmp_hash;
 	my %tmp_ref_cov;
 	my %tmp_loc2keys;
 	
 	my $stats;
 	$stats->{num_dfs_limit} = 0;
 	$stats->{num_partial_align} = 0;
 	$stats->{num_with_cycles} = 0;
 	$stats->{num_ok} = 0;
 	$stats->{num_exceptions} = 0;
 	$stats->{num_repeats} = 0;

 	my $exon;
 	my @pathways;

 	while (<PATHS>)
 	{
 		if (/== Processing\s+\d+:\s+([\w\.]+):(\d+)-(\d+)/) {
 			
 			$curc = $1; # chr
 			$curs = $2; # start
 			$cure = $3; # end
 		}
 		elsif (/cov: (\S+)/) {
 			my $cov = $1;

 			$exon->{start}  = $curs;
 			$exon->{end}    = $cure;
 			$exon->{cov}    = $cov;
 			$exon->{status} = "-";

 			my $exonkey = sprintf("%s:%d:%d", $curc, $curs, $cure);

 			#$exonshash->{$exonkey} = $exon;
 		}
 		elsif (/ref trim5: (\d+) trim3: (\d+) uncovered: (\d+) ref_dist: (\d+)/) {
 			$exon->{trim5} = $1;
 			$exon->{trim3} = $2;
 			$exon->{uncovered} = $3;
 			$exon->{ref_dist} = $4;
 		}
 		elsif (/allcycles: (\d+)/) {
 			$exon->{cycles} = $1;
 			if ($exon->{cycles} >= 1) { $stats->{num_with_cycles} += 1; }			
 		}
 		elsif (/exception/) {
 			$exon->{status} = "excp";
 			$stats->{num_exceptions} += 1;
 		}
 		elsif (/Found repeat in/ || /Found cycle in/) {
 			$exon->{status} = "repeat";
 			$stats->{num_repeats} += 1;
 		}
 		elsif (/WARNING: DFS_LIMIT/) {
 			$exon->{status} = "dfs";
 			$stats->{num_dfs_limit} += 1;
 		}
 		elsif(/^>sp/) {
 			$exon->{status} = "partial";
 			$stats->{num_partial_align} += 1;
 		}
 		elsif(/^c':/) {
 			chomp;
 			my $covstr = (split(/:/, $_))[1];
 			@pathcov = split(/ /, $covstr);
 		}
 		elsif(/^d':/) {
 			chomp;
 			my $diffstr = (split(/:/, $_))[1];
 			@pathdiff = split(//,$diffstr);
 		}
 		elsif (/^>p_([\w\.]+):(\d+)-(\d+)_(\d+)/) {
 			$exon->{status} = "ok";
 			my $chr = $1;

 			my $path;
 			$path->{chr} = $chr;
 			$path->{rstart} = $2;
 			$path->{rend}   = $3;
 			$path->{trim5}  = $exon->{trim5};
 			$path->{trim3}  = $exon->{trim3};	
 			$path->{pcnt}   = $4;
 			@{$path->{cov}} = @pathcov;
 			@{$path->{diff}} = @pathdiff;
 			chomp;

 			# >p_9:96439830-96440030_1 cycle: 0 match: 200 snp: 0 ins: 0 del: 1 96439931:T|-|25.7

 			my ($tag, $cc, $cycle, $mm, $match, $ss, $snp, $ii, $ins, $dd, $del, $info) = split(/\s/, $_, 12);

 			$path->{cycle} = $cycle;
 			$path->{match} = $match;
 			$path->{snp} = $snp;
 			$path->{ins} = $ins;
 			$path->{del} = $del;
 			$path->{info} = $info;

 			#my $pathkey = sprintf("%s:%d:%d", $chr, $2, $3);
 			if($path->{cycle} == 0) { ## skip paths with cycles
 				push @pathways, $path;
 			}
 		}
 		elsif (/FINISHED/) { # end of current region
 			if((scalar @pathways) > 0) { 
 				if( ($exon->{status} eq "ok") && ($exon->{cycles} == 0) ) { # skip regions hard to assemble and paths with at least one cycle
 					# at this point the exon status is ok
 					$stats->{num_ok} += 1;
 					extractVariantsFromPath(\@pathways, $exon, \%tmp_hash, \%tmp_ref_cov, $stats, \%tmp_loc2keys);
 				}
 				undef (@pathways); # free up the memory resources occupied by the array
 			}
 		}
 	}
 	
 	#copy mutations to database
 	$dbm_obj->Lock;
 	foreach my $key (keys %tmp_hash) {
 		my $mut = $tmp_hash{$key};
 		
 		if( !(exists $hash->{$key}) ) {
 			$hash->{$key} = $mut;
 		}
 		else { # denovo already in the table
 			# update the coverage to the highest value found so far
 			my $mut_old = $hash->{$key};
 			if ($mut_old->{avgcov} < $mut->{avgcov}) { $mut_old->{avgcov} = $mut->{avgcov}; } # max avg coverage
 			if ($mut_old->{mincov} < $mut->{mincov}) { $mut_old->{mincov} = $mut->{mincov}; } # max min coverage
 			$hash->{$key} = $mut_old;
 		}
 	}
 	
 	#update statistics in the database
 	if( !(exists $hash->{stats}) ) { 
 		$hash->{stats} = $stats; 
 	}
 	else { 
 		my $old_stats = $hash->{stats}; 
 		$old_stats->{num_repeats} += $stats->{num_repeats};
 		$old_stats->{num_exceptions} += $stats->{num_exceptions};
 		$old_stats->{num_dfs_limit} += $stats->{num_dfs_limit};
 		$old_stats->{num_partial_align} += $stats->{num_partial_align};
 		$old_stats->{num_with_cycles} += $stats->{num_with_cycles};
 		$old_stats->{num_ok} += $stats->{num_ok};
 		$hash->{stats} = $old_stats;
 	}
 	$dbm_obj->UnLock;
 	
 	# write reference coverage to file	
 	my $outfile = "$WORK/refcov.$num.txt";
 	open FILE, "> $outfile" or die "Can't open $outfile ($!)\n";
 	foreach my $key (keys %tmp_ref_cov) {
 		print FILE "$key\t$tmp_ref_cov{$key}\n";
 	}
 	close FILE;
 	
 	# write mutations by positions to file	
 	$outfile = "$WORK/loc2keys.$num.txt";
 	open FILE, "> $outfile" or die "Can't open $outfile ($!)\n";
 	foreach my $loc (keys %tmp_loc2keys) {
 		print FILE "$loc";		
 		foreach my $key (@{$tmp_loc2keys{$loc}}) { print FILE "\t$key"; }
 		print FILE "\n";
 	}
 	close FILE;

 	close PATHS;
 }

 ## extract mutations from path
 ##########################################

 sub extractVariantsFromPath {
 	
    my $pathways = $_[0];
    my $exon = $_[1];
 	my $hash = $_[2];
 	my $ref_cov_hash = $_[3];
 	my $stats = $_[4];
 	my $mutbypos_hash = $_[5];

 	my $outflag = 0;
 	
 	my $exoncov = $exon->{cov};
 	my $exonsta = $exon->{status};
 	my $exoncycle = $exon->{cycles};
 	$exoncycle = 0 if !defined $exoncycle;

 	foreach my $path (@{$pathways}) { ## for each path in the region
 		
 		updateRefCoverage($path, $ref_cov_hash); # updated coverage info for matching reference

 		my $chr = $path->{chr};	
 		#if ($chr =~ /^chr/) { $chr = substr($chr,3); }
 		my $info = $path->{info};
 		$info = "" if !defined $info;
 		#print "---$info---\n";
 		my @var = split /\s+/, $info;
 					
 		foreach my $v (@var) { ## for each variant
 			my ($pos, $ref, $qry, $avgcov, $mincov, $prevbp) = split /[:|]/, $v;
 			
 			my $t = "";
 			my $l = 0;
 			if   ( ($ref !~ /-+/) && ($qry =~ /-+/) ) {$t = "del"; $l = length($qry); } # deletion
 			elsif( ($ref =~ /-+/) && ($qry !~ /-+/) ) {$t = "ins"; $l = length($qry); } # insertion
 			elsif( ($ref !~ /-+/) && ($qry !~ /-+/) ) {$t = "snp"; $l = length($qry); } # SNP

 			$outflag = 1;
 					
 			my $mut;
 			$mut->{chr}  = $chr;							
 			$mut->{pos}  = $pos;
 			$mut->{type} = $t;
 			$mut->{len}  = $l;
 			$mut->{ref}  = $ref;
 			$mut->{seq}  = $qry;
 			$mut->{prevbp} = $prevbp;
 			$mut->{avgcov} = $avgcov;
 			$mut->{mincov} = $mincov;
 			$mut->{status} = $exonsta;
 			$mut->{zygosity} = "na";
 			$mut->{altcov} = 0;
 			$mut->{inheritance} = "na";
 		
 			my $ref_encoded = md5_hex($ref);
 			my $qry_encoded = md5_hex($qry);
 					
 			my $key = sprintf("%s:%d:%s:%d:%s:%s", $chr, $pos, $t, $l, $ref_encoded, $qry_encoded);
 			my $loc = sprintf("%s:%d", $chr, $pos);
 			#print "$key\n";

 			## add mutation to the hash table
 			if( !(exists $hash->{$key}) ) {
 				$hash->{$key} = $mut;
 				push @{$mutbypos_hash->{$loc}}, $key;
 			}
 			else { # already in the table
 				# update the coverage to the highest value found so far
 				my $mut_old = $hash->{$key};
 				if ($mut_old->{avgcov} < $mut->{avgcov}) { 
 					$mut_old->{avgcov} = $mut->{avgcov}; # max avg coverage
 				} 
 				if ($mut_old->{mincov} < $mut->{mincov}) { 
 					$mut_old->{mincov} = $mut->{mincov}; # max min coverage
 				} 
 				$hash->{$key} = $mut_old;
 			}
 		}
 	}
 	#return 1 if at least one event is detected
 	return $outflag;
 }

 ## update covearge info for bases matching the reference
 ##########################################

 sub updateRefCoverage {
 	
 	my $p = $_[0];
 	my $ref_hash = $_[1];
 	
 	my @cov = @{$p->{cov}};
 	my @diff = @{$p->{diff}};
  	my $chr = $p->{chr};
  	my $start = $p->{rstart};
  	my $trim5 = $p->{trim5};
  	#my $end = $p->{rend};
 	#my $L = $end-$start+1; # length of the region
 	my $len = scalar @diff;
 		
 	my $r = $start+$trim5; # reference index
 	my $j = 0; # used to scan the coverage array
 	for (my $i = 0; $i<$len; $i++) {

 		my $key = "$chr:$r";

 		if ($diff[$i] eq " ") { # if matching the reference
 			#if( ($i>0) && ($diff[$i-1] eq " ") ) { # test needed to correctly handle homozygous ins
 			if( ($i>0) && ($diff[$i-1] ne "^") ) { # test needed to correctly handle homozygous ins
 				if ( (exists $ref_hash->{$key}) ) { # already in the table
 					my $cov = $ref_hash->{$key};
 					if ($cov < $cov[$j]) { # keep max coverage
 						$cov = $cov[$j]; 
 					}
 					$ref_hash->{$key} = $cov;
 				}
 				else { # not in the table
 					$ref_hash->{$key} = $cov[$j];
 				}
 			}
 			$j++; $r++;
 		}
 		elsif ($diff[$i] eq "x") { $j++; $r++; } # snp 
 		elsif ($diff[$i] eq "^") { $j++; } # ins 
 		elsif ($diff[$i] eq "v") { $r++; } # del
 	}	
 }

 ## do the job
 ##########################################

 init();
 run();

 ##########################################

 my $time_taken = time - $start_time;

 #if($VERBOSE) {
 	elapsedTime($time_taken, "FindVariants");
 #}
diff --git a/samtools_index.patch b/samtools_index.patch
 diff --git a/FindSomatic.pl b/FindSomatic.pl
 old mode 100755
 new mode 100644
 index 7a42a1e..6e90b4e
 --- a/FindSomatic.pl
 +++ b/FindSomatic.pl
 @@ -508,10 +508,12 @@ sub parseBestState {
 		my @R = split("", $REF); # reference
 		my @A = split("", $ALT); # alternative
 		my $n=0; my $t=1;
 -		if ( ($A[$n]>0) && ($A[$t]>0) ) { # both in normal and tumor
 +		#if ( ($A[$n]>0) && ($A[$t]>0) ) { # both in normal and tumor
 +		if ( $A[$n] == $A[$t] ) { # genotypes for ALT identical 
 			$sv->{inheritance} = "normal";
 		}
 -		elsif ( ($A[$n]==0) && ($A[$t]>0) ) { # only in tumor
 +		#elsif ( ($A[$n]==0) && ($A[$t]>0) ) { # only in tumor
 +		elsif ( $A[$n] != $A[$t] ) { # genotypes for ALT different 
 			$sv->{inheritance} = "no";
 			$sv->{somatic} = "yes";
 		}
 diff --git a/FindVariants.pl b/FindVariants.pl
 index 5f0dceb..712095b 100755
 --- a/FindVariants.pl
 +++ b/FindVariants.pl
 @@ -218,7 +218,7 @@ sub run {
 	}
 	
 	# decide if loading genome from fasta or use samtools faidx
 -	if (-e "$bedfile") { $USEFAIDX = 0; }
 +	if (-e "$bedfile") { $USEFAIDX = 1; }
 	elsif ($bedfile =~ m/(\w+):(\d+)-(\d+)/) { 
 		#parse region
 		my ($chr,$interval) = split(':',$bedfile);
	#!/usr/bin/perl

	###################################################################
	# FindSomatic.pl
	#
	# Tool for detecting somatic mutations in normal/tumor pair
	# using microassembly
	#
	# Author: Giuseppe Narzisi
	# Date: December 11, 2013
	#
	###################################################################

	use warnings;
	use strict;
	use POSIX;
	use FindBin qw($Bin);
	use lib $Bin; # add $Bin directory to @INC
	use Usage;
	use Utils qw(:DEFAULT $findVariants $findDenovos $findSomatic $exportTool $bamtools);
	use HashesIO;
	use Parallel::ForkManager;
	use List::Util qw[min max];
	#use Math::Random qw(:all);
	#use Sys::CPU;
	#use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
	use File::Spec;
	use Getopt::Long;

	use MLDBM::Sync; # this gets the default, SDBM_File
	use MLDBM qw(DB_File Storable); # use Storable for serializing
	use MLDBM qw(MLDBM::Sync::SDBM_File); # use extended SDBM_File, handles values > 1024 bytes
	use Fcntl qw(:DEFAULT); # import symbols O_CREAT & O_RDWR for use with DBMs

	$\|=1;

	# required arguments
	my $BAMNORMAL;
	my $BAMTUMOR;
	my $BEDFILE;
	my $REF;

	my $defaults = getDefaults();

	# optional arguments
	my $kmer = $defaults->{kmer};
	my $cov_threshold = $defaults->{cov_threshold};
	my $tip_cov_threshold = $defaults->{tip_cov_threshold};
	my $radius = $defaults->{radius};
	my $windowSize = $defaults->{windowSize};
	my $delta = $defaults->{delta};
	my $map_qual = $defaults->{map_qual};
	my $maxmismatch = $defaults->{maxmismatch};
	my $min_cov = $defaults->{min_cov};
	my $covratio = $defaults->{covratio};
	my $outratio = $defaults->{outratio};
	my $WORK = $defaults->{WORK};
	my $MAX_PROCESSES = $defaults->{MAX_PROCESSES};
	my $selected = $defaults->{selected};
	my $dfs_limit = $defaults->{pathlimit};
	my $outformat = $defaults->{format};
	my $intarget;

	my $help = 0;
	my $VERBOSE = 0;

	my %candidates;
	my %exons;
	my %locations;

	my %normalCov;
	my %tumorCov;

	my %normalL2K;
	my %tumorL2K;

	my %normalSVs;
	my %tumorSVs;
	my %somaticSVs;
	my %commonSVs;

	my @members = qw/normal tumor/;

	my $argcnt = scalar(@ARGV);
	my $start_time = time;

	#####################################################
	#
	# Command line options processing
	#
	GetOptions(
	'help!' => \$help,
	'verbose!' => \$VERBOSE,

	# required parameters
	'normal=s' => \$BAMNORMAL,
	'tumor=s' => \$BAMTUMOR,
	'bed=s' => \$BEDFILE,
	'ref=s' => \$REF,

	# optional paramters
	'kmer=i' => \$kmer,
	'covthr=i' => \$cov_threshold,
	'lowcov=i' => \$tip_cov_threshold,
	'mincov=i' => \$min_cov,
	'covratio=f' => \$covratio,
	'radius=i' => \$radius,
	'window=i' => \$windowSize,
	'step=i' => \$delta,
	'mapscore=i' => \$map_qual,
	'mismatches=i' => \$maxmismatch,
	'dir=s' => \$WORK,
	'numprocs=i' => \$MAX_PROCESSES,
	'coords=s' => \$selected,
	'format=s' => \$outformat,

	# ouptut parameters
	'intarget!' => \$intarget,
	'outratio=f' => \$outratio,

	) or usageDenovo("FindDenovo.pl");

	#####################################################
	#
	# Command line options processing
	#
	sub init()
	{
	usageSomatic("FindSomatic.pl") if ($argcnt < 1);
	usageSomatic("FindSomatic.pl") if( $help );

	if( (!defined $BAMNORMAL) \|\| (!defined $BAMTUMOR) \|\| (!defined $BEDFILE) \|\| (!defined $REF) ) {
	print STDERR "Required parameter missing!\n";
	usageSomatic("FindSomatic.pl");
	}

	mkdir $WORK if ! -r $WORK;

	if($MAX_PROCESSES == 1) { $MAX_PROCESSES = 0; }
	}

	#####################################################

	sub printParams {

	my $file = $_[0];

	print STDERR "-- Print parameters to $file\n";

	open PFILE, "> $file" or die "Can't open $file ($!)\n";

	print PFILE "Parameters: \n";
	print PFILE "-- K-mer size (bp): $kmer\n";
	print PFILE "-- coverage threshold: $cov_threshold\n";
	print PFILE "-- low coverage threshold: $tip_cov_threshold\n";
	print PFILE "-- size (bp) of the left and right extensions (radius): $radius\n";
	print PFILE "-- window-size size (bp): $windowSize\n";
	print PFILE "-- step size (bp): $delta\n";
	print PFILE "-- minimum mapping-quality: $map_qual\n";
	print PFILE "-- minimum coverage for somatic mutation: $min_cov\n";
	print PFILE "-- minimum coverage ratio for sequencing error: $covratio\n";
	print PFILE "-- minimum coverage ratio for exporting mutation: $outratio\n";
	print PFILE "-- max number of mismatches for near-perfect repeats: $maxmismatch\n";
	print PFILE "-- output directory: $WORK\n";
	print PFILE "-- max number of parallel jobs: $MAX_PROCESSES\n";
	print PFILE "-- BAM file normal: $BAMNORMAL\n";
	print PFILE "-- BAM file tumor: $BAMTUMOR\n";
	print PFILE "-- bedfile: $BEDFILE\n";
	print PFILE "-- reference file: $REF\n";
	print PFILE "-- file of selected locations: $selected\n";
	print PFILE "-- output format for variants: $outformat\n";
	if($intarget) { print PFILE "-- output variants in target? yes\n"; }
	else { print PFILE "-- output variants in target? no\n"; }

	close PFILE;
	}

	## link input files
	#####################################################

	sub linkFiles {

	print STDERR "-- Linking input bamfile\n";

	my $bamfile = $_[0];
	my $dirname = $_[1];

	my $flag = 0;

	print STDERR "$bamfile\n";
	my $fdir = "$WORK/$dirname";

	# get absolute path
	my $bam_abs_path = File::Spec->rel2abs($bamfile);

	if (! -r $fdir) {
	$flag = 1;
	## link the input files
	mkdir $fdir;

	if (-e "$bam_abs_path") {
	symlink "$bam_abs_path", "$fdir/bamfile.bam";
	}
	else {
	print STDERR "Can't find BAM file: $bamfile\n";
	next;
	}

	if (-e "$bam_abs_path.bai") {
	symlink "$bam_abs_path.bai", "$fdir/bamfile.bam.bai";
	}
	else {
	print STDERR "Indexing BAM file...\n";
	runCmd("index bamfile", "$bamtools index -in $bam_abs_path");
	symlink "$bam_abs_path.bai", "$fdir/bamfile.bam.bai";
	#print STDERR "Can't find index BAM file: $bamfile.bai\n";
	next;
	}
	}

	return $flag;
	}

	## load mutations and coverage info
	#####################################################

	sub loadHashes {

	print STDERR "-- Loading mutations and coverage info\n";

	# clear hashes
	for my $k (keys %normalSVs) { delete $normalSVs{$k}; }
	for my $k (keys %tumorSVs) { delete $tumorSVs{$k}; }
	for my $k (keys %normalCov) { delete $normalCov{$k}; }
	for my $k (keys %tumorCov) { delete $tumorCov{$k}; }

	# load databases of mutations
	print STDERR "load variants for normal...\n";
	loadDB("$WORK/normal/variants.db", \%normalSVs, \%exons, 0);
	print STDERR "load variants for tumor...\n";
	loadDB("$WORK/tumor/variants.db", \%tumorSVs, \%exons, 0);

	print STDERR "load coverage for normal...\n";
	loadCov("$WORK/normal/refcov.txt", \%normalCov);
	print STDERR "load coverage for tumor...\n";
	loadCov("$WORK/tumor/refcov.txt", \%tumorCov);

	print STDERR "load loc2key for for normal...\n";
	loadLoc2Key("$WORK/normal/loc2keys.txt.gz", \%normalL2K);
	print STDERR "load loc2key for for tumor...\n";
	loadLoc2Key("$WORK/tumor/loc2keys.txt.gz", \%tumorL2K);

	#remove reference coverage files:
	print STDERR "remove coverage file for normal...\n";
	runCmd("remove coverage file", "rm $WORK/normal/refcov.txt.gz");
	print STDERR "remove coverage file for tumor...\n";
	runCmd("remove coverage file", "rm $WORK/tumor/refcov.txt.gz");
	}

	## call variants on each family memeber
	#####################################################

	sub callSVs {

	my $bamfile = $_[0];
	my $dir = $_[1];

	print STDERR "-- Detect mutations on $dir\n";

	my $bam_abs_path = File::Spec->rel2abs($bamfile);

	my $command = "$findVariants ".
	"--bam $bam_abs_path ".
	"--bed $BEDFILE ".
	"--ref $REF ".
	"--kmer $kmer ".
	"--covthr $cov_threshold ".
	"--lowcov $tip_cov_threshold ".
	"--covratio $covratio ".
	"--radius $radius ".
	"--window $windowSize ".
	"--step $delta ".
	"--mapscore $map_qual ".
	"--mismatches $maxmismatch ".
	"--dir $WORK/$dir ".
	"--sample ALL ".
	"--numprocs $MAX_PROCESSES ".
	"--coords $selected ".
	"--format $outformat ".
	"--cov2file";
	if($intarget) { $command .= " --intarget"; }
	if($VERBOSE) { $command .= " --verbose"; }

	print STDERR "Command: $command\n" if($VERBOSE);
	runCmd("findVariants", "$command");
	}

	## Compute best state for the location of the mutation
	#####################################################
	sub computeBestState {

	my $vars;
	my $covthr = 0; # min cov threshold to remove sequencing errors

	foreach my $ID (@members) {

	if($ID eq "normal") { $vars = \%normalSVs; }
	if($ID eq "tumor") { $vars = \%tumorSVs; }

	foreach my $currKey (keys %$vars) {
	#print STDERR "$currKey\n";

	next if($currKey eq "numPaths");

	my $var = $vars->{$currKey};
	my $chr = $var->{chr};
	my $pos = $var->{pos};
	my $loc = "$chr:$pos";

	my $refState;
	#my $mutState;
	my $altState;

	my $refCov;
	my $mutCov;
	my $altCov;

	foreach my $role (@members) {
	#print STDERR "$role: ";

	my $varh;
	my $refhash;
	my $l2k;

	if($role eq "normal") { $varh = \%normalSVs; $refhash = \%normalCov; $l2k = \%normalL2K; }
	if($role eq "tumor") { $varh = \%tumorSVs; $refhash = \%tumorCov; $l2k = \%tumorL2K; }

	$refState->{$role} = 0; $altState->{$role} = 0;
	$refCov->{$role} = 0; $mutCov->{$role} = 0; $altCov->{$role} = 0;

	my $sum = 0;
	my $cnt = 0;
	if ( exists($refhash->{$loc}) ) { # if there is reference coverage at location
	## compute reference coverage at locus
	for (my $t = $pos; $t<($pos+$kmer+1); $t++) { # compute avg coverage over window of size K
	my $loc2 = "$chr:$t";
	if ( exists $refhash->{$loc2} &&
	!( exists($l2k->{$loc2}) && ( (scalar @{$l2k->{$loc2}->{keys}})!=0) ) ) {
	$sum += $refhash->{$loc2};
	$cnt++;
	}
	}
	}
	my $RCov = $sum;
	if($cnt>0) { $RCov = floor($sum/$cnt); }
	if($RCov > $covthr) { $refState->{$role} = 1; $refCov->{$role} = $RCov; }
	else { $refState->{$role} = 0; $refCov->{$role} = 0; }

	## compute coverage of mut at locus
	my $ACov = 0;
	if( exists($varh->{$currKey}) ) {
	my $altMut = $varh->{$currKey};
	$ACov = $altMut->{mincov};
	}
	if($ACov > $covthr) { $altState->{$role} = 1; $mutCov->{$role} = $ACov; }
	else { $altState->{$role} = 0; $mutCov->{$role} = 0; }

	## compute alternative allele coverage
	my $OCov = 0;
	if( exists($l2k->{$loc}) ) {
	foreach my $altKey (@{$l2k->{$loc}->{keys}}) {
	if( exists($varh->{$altKey}) && ($altKey ne $currKey) ) {
	my $altMut = $varh->{$altKey};
	$OCov += $altMut->{mincov};
	}
	}
	}
	if($OCov > $covthr) { $altCov->{$role} = $OCov; }
	else { $altCov->{$role} = 0; }
	}

	# 2 2/0 1 (ref: N,T / qry: N,T)
	my @RS=(); my @AS=();
	my @RC=(); my @AC=(); my @OC=();
	foreach my $role (@members) { # the order or members must be N,T
	if( ($refState->{$role} == 1) && ($altState->{$role} == 1) ) {
	push @RS , 1;
	push @AS , 1;
	}
	elsif( ($refState->{$role} == 1) && ($altState->{$role} == 0) ) {
	push @RS , 2;
	push @AS , 0;
	}
	elsif( ($refState->{$role} == 0) && ($altState->{$role} == 1) ) {
	push @RS , 0;
	push @AS , 2;
	}
	elsif( ($refState->{$role} == 0) && ($altState->{$role} == 0) ) {
	push @RS , 0;
	push @AS , 0;
	}

	push @RC , $refCov->{$role};
	push @AC , $mutCov->{$role};
	push @OC , $altCov->{$role};
	}

	my $stateStr = "$RS[0]$RS[1]/$AS[0]$AS[1]";
	my $covStr = "$RC[0] $RC[1]/$AC[0] $AC[1]/$OC[0] $OC[1]";

	$var->{bestState} = $stateStr;
	$var->{covState} = $covStr;

	# compute hom/het state and chi2score
	parseCovState($var, $ID);

	$vars->{$currKey} = $var;
	}
	}
	}

	## parse covState and compute hom/het state and chi2score
	##########################################

	sub parseCovState {

	my $sv = $_[0];
	my $role = $_[1];
	my $BS = $sv->{covState};

	my ($REF,$ALT,$OTH) = split("/",$BS);
	my @R = split(" ", $REF); # reference
	my @A = split(" ", $ALT); # alternative
	my @O = split(" ", $OTH); # other

	my $id = 0;
	if($role eq "normal") { $id = 0; }
	if($role eq "tumor") { $id = 1; }

	my $alleleCnt = 0;
	if($R[$id] > 0) { $alleleCnt++; }
	if($A[$id] > 0) { $alleleCnt++; }
	if($O[$id] > 0) { $alleleCnt++; }

	my $totCov = $R[$id] + $A[$id] + $O[$id];
	$sv->{altcov} = $totCov - $sv->{mincov};

	if($alleleCnt > 1) { $sv->{zygosity} = "het"; }
	else { $sv->{zygosity} = "hom"; }

	my $covRatio = 0;
	my $chi2Score = "na";
	if ($totCov > 0) {
	$covRatio = sprintf('%.2f', ($sv->{mincov} / $totCov) );
	$sv->{covratio} = $covRatio;

	#Chi-squared Test for allele coverage
	my $o1 = $sv->{mincov}; # observed 1
	my $o2 = $sv->{altcov}; # observed 2
	my $e1 = $totCov/2; # expected 1
	my $e2 = $totCov/2; # expected 2

	my $term1 = (($o1-$e1)*($o1-$e1))/$e1;
	my $term2 = (($o2-$e2)*($o2-$e2))/$e2;

	$chi2Score = sprintf('%.2f', $term1 + $term2);
	}
	$sv->{chi2score} = $chi2Score;
	}

	## parse bestState and update mutation info
	##########################################
	sub parseBestState {

	my $sv = $_[0];
	my $BS = $sv->{bestState};

	$sv->{somatic} = "no";
	$sv->{inheritance} = "no";

	if( ($BS eq "12/10") \|\| ($BS eq "02/20") ) { # only in normal
	$sv->{inheritance} = "no";
	}
	else {
	my ($REF,$ALT) = split("/",$BS);
	my @R = split("", $REF); # reference
	my @A = split("", $ALT); # alternative
	my $n=0; my $t=1;
	#if ( ($A[$n]>0) && ($A[$t]>0) ) { # both in normal and tumor
	if ( $A[$n] == $A[$t] ) { # genotypes for ALT identical
	$sv->{inheritance} = "normal";
	}
	#elsif ( ($A[$n]==0) && ($A[$t]>0) ) { # only in tumor
	elsif ( $A[$n] != $A[$t] ) { # genotypes for ALT different
	$sv->{inheritance} = "no";
	$sv->{somatic} = "yes";
	}
	else { # unknown state
	$sv->{inheritance} = "no";
	}
	#if($A[$t]>0) { $sv->{id} = "tumor"; }
	}
	}


	## find denovo events
	#####################################################

	sub findSomaticMut {

	print STDERR "-- Finding somatic mutations\n";

	my $family = $_[0];

	if (-e "$WORK/somatic.db.dir") { runCmd("remove database files", "rm $WORK/somatic.db.*"); }
	if (-e "$WORK/common.db.dir") { runCmd("remove database files", "rm $WORK/common.db.*"); }

	my $somatic_dbm_obj = tie %somaticSVs, 'MLDBM::Sync', "$WORK/somatic.db", O_CREAT\|O_RDWR, 0640;
	my $common_dbm_obj = tie %commonSVs, 'MLDBM::Sync', "$WORK/common.db", O_CREAT\|O_RDWR, 0640;

	$somatic_dbm_obj->Lock;
	$common_dbm_obj->Lock;
	foreach my $k (keys %tumorSVs) {

	my $mut = $tumorSVs{$k};

	# parse bestState and update info
	parseBestState($mut);

	my $chr = $mut->{chr};
	my $pos = $mut->{pos};
	my $key = "$chr:$pos";

	# skip mutation if normal not sampled
	next if (!exists $normalCov{$key});
	next if ($normalCov{$key} < $min_cov); # min cov requirement

	# compute inheritance
	if($mut->{somatic} eq "yes") {
	$somaticSVs{$k} = $mut;
	}
	else { # non somatic
	if ($mut->{inheritance} eq "normal") {
	$commonSVs{$k} = $mut;
	}
	}
	$tumorSVs{$k} = $mut;
	}

	$somatic_dbm_obj->UnLock;
	$common_dbm_obj->UnLock;
	}


	## export family mutations to file
	#####################################################

	sub exportSVs {

	print STDERR "-- Exporting SVs to file\n";

	my $max_cov = 10000000;

	# export somatic
	#runCmd("export all", "$exportTool --db $WORK/somatic.db --bed $BEDFILE --format annovar --type all --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/somatic.${min_cov}x.all.txt");
	#runCmd("export snp", "$exportTool --db $WORK/somatic.db --bed $BEDFILE --format annovar --type snp --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/somatic.${min_cov}x.snp.txt");
	#runCmd("export indels", "$exportTool --db $WORK/somatic.db --bed $BEDFILE --format annovar --type indel --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/somatic.${min_cov}x.indel.txt");

	my $command_somatic = "$exportTool ".
	"--db $WORK/somatic.db ".
	"--bed $BEDFILE ".
	"--format $outformat ".
	"--type indel ".
	"--mincov $min_cov ".
	"--maxcov $max_cov ".
	"--covratio $outratio";
	if($intarget) { $command_somatic .= " --intarget"; }
	if ($outformat eq "annovar") { $command_somatic .= " > $WORK/somatic.${min_cov}x.indel.annovar"; }
	elsif ($outformat eq "vcf") { $command_somatic .= " > $WORK/somatic.${min_cov}x.indel.vcf"; }

	print STDERR "Command: $command_somatic\n" if($VERBOSE);
	runCmd("export somatic", $command_somatic);

	# export common variants
	#runCmd("export all", "$exportTool --db $WORK/common.db --bed $BEDFILE --format annovar --type all --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/common.${min_cov}x.all.txt");
	#runCmd("export snp", "$exportTool --db $WORK/common.db --bed $BEDFILE --format annovar --type snp --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/common.${min_cov}x.snp.txt");
	#runCmd("export indels", "$exportTool --db $WORK/common.db --bed $BEDFILE --format annovar --type indel --mincov $min_cov --maxcov $max_cov --covratio $outratio > $WORK/common.${min_cov}x.indel.txt");

	my $command_comm = "$exportTool ".
	"--db $WORK/common.db ".
	"--bed $BEDFILE ".
	"--format $outformat ".
	"--type indel ".
	"--mincov $min_cov ".
	"--maxcov $max_cov ".
	"--covratio $outratio";
	if($intarget) { $command_comm .= " --intarget"; }
	if ($outformat eq "annovar") { $command_comm .= " > $WORK/common.${min_cov}x.indel.annovar"; }
	elsif ($outformat eq "vcf") { $command_comm .= " > $WORK/common.${min_cov}x.indel.vcf"; }

	print STDERR "Command: $command_comm\n" if($VERBOSE);
	runCmd("export common", $command_comm);
	}

	## process single family
	#####################################################

	sub processBAM {

	my $bamfile = $_[0];
	my $outdir = $_[1];

	#my $toprocess = linkFiles("$bamfile", "$outdir"); # link input files
	#if($toprocess == 1) {
	callSVs($bamfile, $outdir); # call mutations on each family
	#}
	#else { print STDERR "$outdir already processed!\n"; }
	}

	## do the job
	##########################################

	init();
	printParams("$WORK/parameters.txt"); # print parameters
	processBAM($BAMNORMAL, "normal"); # process normal
	processBAM($BAMTUMOR, "tumor"); # process tumor
	loadHashes(); # load mutations and coverage info
	computeBestState(); # compute bestState for each mutation
	findSomaticMut(); # detect somatic mutations in tumor
	exportSVs(); # export mutations to file

	##########################################


	my $time_taken = time - $start_time;

	#if($VERBOSE) {
	elapsedTime($time_taken, "FindSomatic");
	#}
	diff --git a/FindSomatic.pl b/FindSomatic.pl
	old mode 100755
	new mode 100644
	index 7a42a1e..6e90b4e
	--- a/FindSomatic.pl
	+++ b/FindSomatic.pl
	@@ -508,10 +508,12 @@ sub parseBestState {
	my @R = split("", $REF); # reference
	my @A = split("", $ALT); # alternative
	my $n=0; my $t=1;
	- if ( ($A[$n]>0) && ($A[$t]>0) ) { # both in normal and tumor
	+ #if ( ($A[$n]>0) && ($A[$t]>0) ) { # both in normal and tumor
	+ if ( $A[$n] == $A[$t] ) { # genotypes for ALT identical
	$sv->{inheritance} = "normal";
	}
	- elsif ( ($A[$n]==0) && ($A[$t]>0) ) { # only in tumor
	+ #elsif ( ($A[$n]==0) && ($A[$t]>0) ) { # only in tumor
	+ elsif ( $A[$n] != $A[$t] ) { # genotypes for ALT different
	$sv->{inheritance} = "no";
	$sv->{somatic} = "yes";
	}
	diff --git a/FindVariants.pl b/FindVariants.pl
	index 5f0dceb..712095b 100755
	--- a/FindVariants.pl
	+++ b/FindVariants.pl
	@@ -218,7 +218,7 @@ sub run {
	}

	# decide if loading genome from fasta or use samtools faidx
	- if (-e "$bedfile") { $USEFAIDX = 0; }
	+ if (-e "$bedfile") { $USEFAIDX = 1; }
	elsif ($bedfile =~ m/(\w+):(\d+)-(\d+)/) {
	#parse region
	my ($chr,$interval) = split(':',$bedfile);