nylander · February 13, 2017 09:04
diff --git a/extract_masked.pl b/extract_masked.pl
 #!/usr/bin/env perl

 use strict;
 use warnings;

 =pod

 =head1

         FILE: extract_masked.pl

        USAGE: ./extract_masked.pl fasta_file.fas

  DESCRIPTION: Question on biosupport.se (https://biosupport.se/p/327/), 11/22/2013:
               "Does anyone know of a quick way to extract the positions for a particular mask from a fasta file?
               So for example if I wanted to extract the position for all missing sites for chr 1 and this is
               coded as "." in my fasta file - how can I generate a bed file with the list of these positions?
               Input:

               chr1
               AAAAA.NNNNCCCCTTTT..A

               output:
               <chr> <start_pos (index="" start="" 0)=""> <end_pos>
               chr1: 5 5
               chr1: 18 19
               Thank you in advance."

        NOTES: Mostly untested, and with plenty of room for improvments. Caveat emptor!

       AUTHOR: Johan Nylander (JN), [email protected]

      COMPANY: BILS/NRM

      VERSION: 1.0

      CREATED: 11/22/2013

     REVISION: ---

 =cut

 ## Specify search symbol 
 my $char = '.'; # Character to search for

 ## Print first line in output to standard out
 print STDOUT '<chr> <start_pos (index="" start="" 0)=""> <end_pos>', "\n";

 ## Parse FASTA format manually from standard in. Assuming that sequences might be on separate lines.
 ## Also assuming that the fasta header given as example above ("chr1") actually should be (">chr1")!
 local $/ = '>';
 while(<>) {
    chomp;
    next if(/^\s*$/);
    my ($header, @record) = split /\n/;
    my $seq = '';
    foreach my $line (@record) {
        $seq .= $line;
    }
    #print STDERR ">$header\n$seq\n";

    ## Get positions of $char in sequence
    my @positions = ();
    my $offset = 0;
    my $position = index($seq, $char, $offset);
    while ($position != -1) {
        push(@positions, $position);
        $offset = $position + 1;
        $position = index($seq, $char, $offset);
    }

    ## Print only start and stop of continuous occurrences of $char.
    ## Mind you - plenty of room for improvment in this section!
    if ((scalar(@positions) == length($seq)) or (scalar(@positions) == 0)) {
        ## Skip records with no chars, or records with all chars...
    }
    else {
        my $x = -1;
        my $first = '';
        my $didfirst = 0;
        foreach my $val (@positions) {
            $x++;
            my $y = $x + 1;
            if ( ($positions[$y] - $val) == 1 ) {
                if ($first) {
                    print STDOUT "\n";
                }
                else {
                    print STDOUT "$header $val ";
                    $didfirst = 1;
                }
            }
            else {
                if ($didfirst) {
                    print STDOUT "$val\n";
                    $didfirst = 0;
                }
                else {
                    print STDOUT "$header: $val $val\n";
                    $first = '';
                }
            }
        }
    }
 }
 __END__
	#!/usr/bin/env perl

	use strict;
	use warnings;

	=pod

	=head1

	FILE: extract_masked.pl

	USAGE: ./extract_masked.pl fasta_file.fas

	DESCRIPTION: Question on biosupport.se (https://biosupport.se/p/327/), 11/22/2013:
	"Does anyone know of a quick way to extract the positions for a particular mask from a fasta file?
	So for example if I wanted to extract the position for all missing sites for chr 1 and this is
	coded as "." in my fasta file - how can I generate a bed file with the list of these positions?
	Input:

	chr1
	AAAAA.NNNNCCCCTTTT..A

	output:
	<chr> <start_pos (index="" start="" 0)=""> <end_pos>
	chr1: 5 5
	chr1: 18 19
	Thank you in advance."

	NOTES: Mostly untested, and with plenty of room for improvments. Caveat emptor!

	AUTHOR: Johan Nylander (JN), [email protected]

	COMPANY: BILS/NRM

	VERSION: 1.0

	CREATED: 11/22/2013

	REVISION: ---

	=cut

	## Specify search symbol
	my $char = '.'; # Character to search for

	## Print first line in output to standard out
	print STDOUT '<chr> <start_pos (index="" start="" 0)=""> <end_pos>', "\n";

	## Parse FASTA format manually from standard in. Assuming that sequences might be on separate lines.
	## Also assuming that the fasta header given as example above ("chr1") actually should be (">chr1")!
	local $/ = '>';
	while(<>) {
	chomp;
	next if(/^\s*$/);
	my ($header, @record) = split /\n/;
	my $seq = '';
	foreach my $line (@record) {
	$seq .= $line;
	}
	#print STDERR ">$header\n$seq\n";

	## Get positions of $char in sequence
	my @positions = ();
	my $offset = 0;
	my $position = index($seq, $char, $offset);
	while ($position != -1) {
	push(@positions, $position);
	$offset = $position + 1;
	$position = index($seq, $char, $offset);
	}

	## Print only start and stop of continuous occurrences of $char.
	## Mind you - plenty of room for improvment in this section!
	if ((scalar(@positions) == length($seq)) or (scalar(@positions) == 0)) {
	## Skip records with no chars, or records with all chars...
	}
	else {
	my $x = -1;
	my $first = '';
	my $didfirst = 0;
	foreach my $val (@positions) {
	$x++;
	my $y = $x + 1;
	if ( ($positions[$y] - $val) == 1 ) {
	if ($first) {
	print STDOUT "\n";
	}
	else {
	print STDOUT "$header $val ";
	$didfirst = 1;
	}
	}
	else {
	if ($didfirst) {
	print STDOUT "$val\n";
	$didfirst = 0;
	}
	else {
	print STDOUT "$header: $val $val\n";
	$first = '';
	}
	}
	}
	}
	}
	__END__