Skip to content

Instantly share code, notes, and snippets.

@mmterpstra
Created March 9, 2017 13:19
Show Gist options
  • Save mmterpstra/09e5e5eea83c83ecab185795d46c7730 to your computer and use it in GitHub Desktop.
Save mmterpstra/09e5e5eea83c83ecab185795d46c7730 to your computer and use it in GitHub Desktop.
Finds repeating basepair units in DNA use: FindPolyNucSitesRef.pl <[ATCGN]+> <REPEATLENGTH> <FASTA> | bedtools merge -i - > result.bed
#!/usr/bin/perl
use warnings;
use strict;
use Data::Dumper;
use Scalar::Util qw/ looks_like_number/ ;
my $use = <<"END1";
use
$0 NUC LEN FASTA>BED
Traverses the FASTA file and returns a BED file with the locations of the NUCleotide sequences spanning a length greather then or equal to LEN.
END1
my ($nuc, $len, $fastaFile) = @ARGV or die $use;
my $fasta = OpenFasta("<",$fastaFile) or die "Cannot Read '$fastaFile':".$use;
#open(my $fahandle,"<",$fastaFile)or die "Cannot Read '$fastaFile':".$use;
#warn $fahandle;
#$fasta = _SetFastaHandle ($fasta,\$fahandle);
$fasta = _SetFastaBuflen($fasta,$len*2);
#die Dumper \$fasta;
my $cnt = 0;
while(my $fasta = ReadFastaBuffered($fasta)){
#warn Dumper \$fasta;
FindNucs($fasta, $nuc, $len);
# if($cnt == 0);
# die Dumper \$fasta if($cnt > 3);
$cnt++;
}
sub FindNucs{
my ($fasta, $nuc, $len) = @_;
my $offset = 0;
#find match
while((my $i = index(_GetFastaSeq($fasta),$nuc x $len, $offset))>-1){
#expand match
my $matchSeq = $nuc x $len;
while(substr(_GetFastaSeq($fasta),$i+length($matchSeq),length($nuc)) eq $nuc){
#warn join("\t",(length($matchSeq),$matchSeq))."\n";
$matchSeq = $matchSeq.$nuc;
}
# \/ -1 because bed format \/ -1 because maybe it is a offset thing...
print join("\t", (_GetFastaHeader($fasta), _GetFastaOffset($fasta) + $i-1, _GetFastaOffset($fasta) + $i + length($matchSeq) -1))."\n";
$offset = $i + length($matchSeq);
}
}
sub OpenFasta {
my ($mode, $file)= @_;
my $fasta;
open($fasta->[5],"<",$fastaFile) or die "Cannot Read '$file':".$use;
return $fasta;
}
sub ReadFasta {
my $faHandle = ${shift(@_)};
return if(eof($faHandle));
my $seqHeader = <$faHandle>;
chomp $seqHeader;
$seqHeader = substr($seqHeader,1);
my $seq = <$faHandle>;
chomp $seq;
my $fasta;
$fasta->[1]=$seq;
$fasta->[0]=$seqHeader;
#warn $fastq->[0];
return $fasta;
}
sub ReadFastaBuffered {
my $fasta = shift(@_);
$fasta=_ReadFastaBuffer($fasta) or return undef();
$fasta=_UpdateFastaMetafields($fasta) or return undef();
return $fasta;
}
sub _SetFastaHeader{
my $fasta = shift @_;
my $header = shift @_;
$fasta->[0]=$header;
return $fasta;
}
sub _GetFastaHeader{
my $fasta = shift @_;
return $fasta->[0];
}
sub _SetFastaBuflen{
my $fasta = shift @_;
my $len = shift @_;
$fasta->[2]=$len;
return $fasta;
}
sub _GetFastaBuflen{
my $fasta = shift @_;
return $fasta->[2];
}
sub _SetFastaHandle{
my $fasta = shift @_;
my $handle =${${ shift @_ }} ;
warn $handle;
$fasta->[5]=$handle;
return $fasta;
}
sub _GetFastaHandle{
my $fasta = shift @_;
return $fasta->[5];
}
sub _ReadFastaBuffer {
my $fasta = shift(@_);
if(defined($fasta->[4])){
my $h = ${$fasta->[5]};
$fasta->[3]=$fasta->[4];
$fasta->[4]=undef();
if(not(defined($fasta->[4]=<$h>))){
return $fasta;
}
chomp $fasta->[4];
}elsif(not(eof($fasta->[5]))){
my $h = $fasta->[5];
defined($fasta->[3]=<$h>) or die "strange error here";
chomp $fasta->[3];
defined($fasta->[4]=<$h>) or die "strange error here, possibly end of file.";
chomp $fasta->[4];
}else{
return undef;
}
return $fasta;
}
sub _SetFastaSeq{
my $fasta = shift @_;
my $header = shift @_;
$fasta->[1]=$header;
return $fasta;
}
sub _GetFastaSeq{
my $fasta = shift @_;
return $fasta->[1];
}
sub _SetFastaOffset{
my $fasta = shift @_;
my $header = shift @_;
$fasta->[6]=$header;
return $fasta;
}
sub _GetFastaOffset{
my $fasta = shift @_;
return $fasta->[6];
}
sub _UpdateFastaMetafields {
my $fasta = shift @_;
my $line = $fasta->[3];
chomp $line;
if($line =~ m/^>(\S+)/){
$fasta = _SetFastaHeader($fasta,$1);#assign header
$fasta = _ReadFastaBuffer($fasta) or return undef;
$fasta = _SetFastaSeq($fasta,"");
my $offset = 1;
$fasta = _SetFastaOffset($fasta,$offset);
$fasta = _UpdateFastaMetafields($fasta);
#$fasta = _SetFastaSeq($fasta,"");
#$fasta = _SetFastaOffset($fasta,1);
}elsif($line =~ m/^[ATCGNWUSMKRYBDHVN]*$/){
my $seq;
my $offset;
if(defined($seq =_GetFastaSeq($fasta))){
$fasta =_SetFastaSeq($fasta,substr($seq,-1*_GetFastaBuflen($fasta)).$line);
if(length($seq)-_GetFastaBuflen($fasta) > 0){
$offset = _GetFastaOffset($fasta) + length($seq)-1*_GetFastaBuflen($fasta);
}else{
$offset = _GetFastaOffset($fasta);
}
$fasta = _SetFastaOffset($fasta,$offset);
}else{
#first line init
$seq="";
$offset = 1;
$fasta =_SetFastaSeq($fasta,$line);
}
$fasta = _SetFastaOffset($fasta,$offset);
}else{
die "Format exception:".Dumper($fasta)
}
return $fasta;
}
sub _UnReadFastaBuffer {
my $fasta = shift @_;
$fasta->[4] = $fasta->[3];
return $fasta;
}
sub _Notes{
my $fasta;
$fasta->[0]="HEADER";
$fasta->[1]="SEQ";
$fasta->[2]="Seqlen";
$fasta->[3]="lcur";
$fasta->[4]="lnext";
$fasta->[5]="handle";
$fasta->[6]="offset";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment