Created
March 9, 2017 13:19
-
-
Save mmterpstra/09e5e5eea83c83ecab185795d46c7730 to your computer and use it in GitHub Desktop.
Finds repeating basepair units in DNA use: FindPolyNucSitesRef.pl <[ATCGN]+> <REPEATLENGTH> <FASTA> | bedtools merge -i - > result.bed
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use warnings; | |
use strict; | |
use Data::Dumper; | |
use Scalar::Util qw/ looks_like_number/ ; | |
my $use = <<"END1"; | |
use | |
$0 NUC LEN FASTA>BED | |
Traverses the FASTA file and returns a BED file with the locations of the NUCleotide sequences spanning a length greather then or equal to LEN. | |
END1 | |
my ($nuc, $len, $fastaFile) = @ARGV or die $use; | |
my $fasta = OpenFasta("<",$fastaFile) or die "Cannot Read '$fastaFile':".$use; | |
#open(my $fahandle,"<",$fastaFile)or die "Cannot Read '$fastaFile':".$use; | |
#warn $fahandle; | |
#$fasta = _SetFastaHandle ($fasta,\$fahandle); | |
$fasta = _SetFastaBuflen($fasta,$len*2); | |
#die Dumper \$fasta; | |
my $cnt = 0; | |
while(my $fasta = ReadFastaBuffered($fasta)){ | |
#warn Dumper \$fasta; | |
FindNucs($fasta, $nuc, $len); | |
# if($cnt == 0); | |
# die Dumper \$fasta if($cnt > 3); | |
$cnt++; | |
} | |
sub FindNucs{ | |
my ($fasta, $nuc, $len) = @_; | |
my $offset = 0; | |
#find match | |
while((my $i = index(_GetFastaSeq($fasta),$nuc x $len, $offset))>-1){ | |
#expand match | |
my $matchSeq = $nuc x $len; | |
while(substr(_GetFastaSeq($fasta),$i+length($matchSeq),length($nuc)) eq $nuc){ | |
#warn join("\t",(length($matchSeq),$matchSeq))."\n"; | |
$matchSeq = $matchSeq.$nuc; | |
} | |
# \/ -1 because bed format \/ -1 because maybe it is a offset thing... | |
print join("\t", (_GetFastaHeader($fasta), _GetFastaOffset($fasta) + $i-1, _GetFastaOffset($fasta) + $i + length($matchSeq) -1))."\n"; | |
$offset = $i + length($matchSeq); | |
} | |
} | |
sub OpenFasta { | |
my ($mode, $file)= @_; | |
my $fasta; | |
open($fasta->[5],"<",$fastaFile) or die "Cannot Read '$file':".$use; | |
return $fasta; | |
} | |
sub ReadFasta { | |
my $faHandle = ${shift(@_)}; | |
return if(eof($faHandle)); | |
my $seqHeader = <$faHandle>; | |
chomp $seqHeader; | |
$seqHeader = substr($seqHeader,1); | |
my $seq = <$faHandle>; | |
chomp $seq; | |
my $fasta; | |
$fasta->[1]=$seq; | |
$fasta->[0]=$seqHeader; | |
#warn $fastq->[0]; | |
return $fasta; | |
} | |
sub ReadFastaBuffered { | |
my $fasta = shift(@_); | |
$fasta=_ReadFastaBuffer($fasta) or return undef(); | |
$fasta=_UpdateFastaMetafields($fasta) or return undef(); | |
return $fasta; | |
} | |
sub _SetFastaHeader{ | |
my $fasta = shift @_; | |
my $header = shift @_; | |
$fasta->[0]=$header; | |
return $fasta; | |
} | |
sub _GetFastaHeader{ | |
my $fasta = shift @_; | |
return $fasta->[0]; | |
} | |
sub _SetFastaBuflen{ | |
my $fasta = shift @_; | |
my $len = shift @_; | |
$fasta->[2]=$len; | |
return $fasta; | |
} | |
sub _GetFastaBuflen{ | |
my $fasta = shift @_; | |
return $fasta->[2]; | |
} | |
sub _SetFastaHandle{ | |
my $fasta = shift @_; | |
my $handle =${${ shift @_ }} ; | |
warn $handle; | |
$fasta->[5]=$handle; | |
return $fasta; | |
} | |
sub _GetFastaHandle{ | |
my $fasta = shift @_; | |
return $fasta->[5]; | |
} | |
sub _ReadFastaBuffer { | |
my $fasta = shift(@_); | |
if(defined($fasta->[4])){ | |
my $h = ${$fasta->[5]}; | |
$fasta->[3]=$fasta->[4]; | |
$fasta->[4]=undef(); | |
if(not(defined($fasta->[4]=<$h>))){ | |
return $fasta; | |
} | |
chomp $fasta->[4]; | |
}elsif(not(eof($fasta->[5]))){ | |
my $h = $fasta->[5]; | |
defined($fasta->[3]=<$h>) or die "strange error here"; | |
chomp $fasta->[3]; | |
defined($fasta->[4]=<$h>) or die "strange error here, possibly end of file."; | |
chomp $fasta->[4]; | |
}else{ | |
return undef; | |
} | |
return $fasta; | |
} | |
sub _SetFastaSeq{ | |
my $fasta = shift @_; | |
my $header = shift @_; | |
$fasta->[1]=$header; | |
return $fasta; | |
} | |
sub _GetFastaSeq{ | |
my $fasta = shift @_; | |
return $fasta->[1]; | |
} | |
sub _SetFastaOffset{ | |
my $fasta = shift @_; | |
my $header = shift @_; | |
$fasta->[6]=$header; | |
return $fasta; | |
} | |
sub _GetFastaOffset{ | |
my $fasta = shift @_; | |
return $fasta->[6]; | |
} | |
sub _UpdateFastaMetafields { | |
my $fasta = shift @_; | |
my $line = $fasta->[3]; | |
chomp $line; | |
if($line =~ m/^>(\S+)/){ | |
$fasta = _SetFastaHeader($fasta,$1);#assign header | |
$fasta = _ReadFastaBuffer($fasta) or return undef; | |
$fasta = _SetFastaSeq($fasta,""); | |
my $offset = 1; | |
$fasta = _SetFastaOffset($fasta,$offset); | |
$fasta = _UpdateFastaMetafields($fasta); | |
#$fasta = _SetFastaSeq($fasta,""); | |
#$fasta = _SetFastaOffset($fasta,1); | |
}elsif($line =~ m/^[ATCGNWUSMKRYBDHVN]*$/){ | |
my $seq; | |
my $offset; | |
if(defined($seq =_GetFastaSeq($fasta))){ | |
$fasta =_SetFastaSeq($fasta,substr($seq,-1*_GetFastaBuflen($fasta)).$line); | |
if(length($seq)-_GetFastaBuflen($fasta) > 0){ | |
$offset = _GetFastaOffset($fasta) + length($seq)-1*_GetFastaBuflen($fasta); | |
}else{ | |
$offset = _GetFastaOffset($fasta); | |
} | |
$fasta = _SetFastaOffset($fasta,$offset); | |
}else{ | |
#first line init | |
$seq=""; | |
$offset = 1; | |
$fasta =_SetFastaSeq($fasta,$line); | |
} | |
$fasta = _SetFastaOffset($fasta,$offset); | |
}else{ | |
die "Format exception:".Dumper($fasta) | |
} | |
return $fasta; | |
} | |
sub _UnReadFastaBuffer { | |
my $fasta = shift @_; | |
$fasta->[4] = $fasta->[3]; | |
return $fasta; | |
} | |
sub _Notes{ | |
my $fasta; | |
$fasta->[0]="HEADER"; | |
$fasta->[1]="SEQ"; | |
$fasta->[2]="Seqlen"; | |
$fasta->[3]="lcur"; | |
$fasta->[4]="lnext"; | |
$fasta->[5]="handle"; | |
$fasta->[6]="offset"; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment