Created
February 13, 2017 09:04
-
-
Save nylander/a814398690d56295f082461d44469be0 to your computer and use it in GitHub Desktop.
Extract masked positions in fasta file. Question on [biosupport.se](https://biosupport.se/p/327/)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use strict; | |
use warnings; | |
=pod | |
=head1 | |
FILE: extract_masked.pl | |
USAGE: ./extract_masked.pl fasta_file.fas | |
DESCRIPTION: Question on biosupport.se (https://biosupport.se/p/327/), 11/22/2013: | |
"Does anyone know of a quick way to extract the positions for a particular mask from a fasta file? | |
So for example if I wanted to extract the position for all missing sites for chr 1 and this is | |
coded as "." in my fasta file - how can I generate a bed file with the list of these positions? | |
Input: | |
chr1 | |
AAAAA.NNNNCCCCTTTT..A | |
output: | |
<chr> <start_pos (index="" start="" 0)=""> <end_pos> | |
chr1: 5 5 | |
chr1: 18 19 | |
Thank you in advance." | |
NOTES: Mostly untested, and with plenty of room for improvments. Caveat emptor! | |
AUTHOR: Johan Nylander (JN), [email protected] | |
COMPANY: BILS/NRM | |
VERSION: 1.0 | |
CREATED: 11/22/2013 | |
REVISION: --- | |
=cut | |
## Specify search symbol | |
my $char = '.'; # Character to search for | |
## Print first line in output to standard out | |
print STDOUT '<chr> <start_pos (index="" start="" 0)=""> <end_pos>', "\n"; | |
## Parse FASTA format manually from standard in. Assuming that sequences might be on separate lines. | |
## Also assuming that the fasta header given as example above ("chr1") actually should be (">chr1")! | |
local $/ = '>'; | |
while(<>) { | |
chomp; | |
next if(/^\s*$/); | |
my ($header, @record) = split /\n/; | |
my $seq = ''; | |
foreach my $line (@record) { | |
$seq .= $line; | |
} | |
#print STDERR ">$header\n$seq\n"; | |
## Get positions of $char in sequence | |
my @positions = (); | |
my $offset = 0; | |
my $position = index($seq, $char, $offset); | |
while ($position != -1) { | |
push(@positions, $position); | |
$offset = $position + 1; | |
$position = index($seq, $char, $offset); | |
} | |
## Print only start and stop of continuous occurrences of $char. | |
## Mind you - plenty of room for improvment in this section! | |
if ((scalar(@positions) == length($seq)) or (scalar(@positions) == 0)) { | |
## Skip records with no chars, or records with all chars... | |
} | |
else { | |
my $x = -1; | |
my $first = ''; | |
my $didfirst = 0; | |
foreach my $val (@positions) { | |
$x++; | |
my $y = $x + 1; | |
if ( ($positions[$y] - $val) == 1 ) { | |
if ($first) { | |
print STDOUT "\n"; | |
} | |
else { | |
print STDOUT "$header $val "; | |
$didfirst = 1; | |
} | |
} | |
else { | |
if ($didfirst) { | |
print STDOUT "$val\n"; | |
$didfirst = 0; | |
} | |
else { | |
print STDOUT "$header: $val $val\n"; | |
$first = ''; | |
} | |
} | |
} | |
} | |
} | |
__END__ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment