Skip to content

Instantly share code, notes, and snippets.

@pgonzale60
Created September 21, 2023 10:29
Show Gist options
  • Save pgonzale60/930e1bdda2159066235f1e84b6827a0b to your computer and use it in GitHub Desktop.
Save pgonzale60/930e1bdda2159066235f1e84b6827a0b to your computer and use it in GitHub Desktop.
extract positions of softclipped telomeres from BAM
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;
use File::Basename;
use Data::Dumper;
# globals
my $EXE = basename($0);
my $VERSION = "0.1.1";
my $AUTHOR = 'Torsten Seemann (@torstenseemann) and Pablo Manuel Gonzalez de la Rosa';
my $HOMEPAGE = "undefined";
# SAM file TSV columns
use constant {
SAM_RID => 0,
SAM_FLAG => 1,
SAM_RNAME => 2,
SAM_POS => 3,
SAM_MAPQ => 4,
SAM_CIGAR => 5,
SAM_TLEN => 8,
SAM_SEQ => 9,
};
#----------------------------------------------------------------------
# command line parameters
my $telomere = "TTAGGC";
my $debug = 0;
my $minTeloN = 1;
#----------------------------------------------------------------------
sub usage {
my($exitcode) = @_;
$exitcode=0 if !defined($exitcode) or $exitcode eq 'help';
my $fh = $exitcode ? \*STDERR : \*STDOUT;
print $fh
"SYNOPSIS\n Get coordinates of soft clipped telomeric sequences \n",
"OUTPUT\n TSV printed to STDOUT. Columns: 1) reference seq ID, 2) position of softclipped telomere,\n",
" 3) R or L indicating if telomere occurs at right or left of alignment and 4) MAPQ\n",
"AUTHOR\n $AUTHOR\n",
"USAGE\n",
" % minimap2 ref.fa telomeric_reads.fa | softclipped_telo_coords.pl > clippedTeloPos.tsv\n",
"OPTIONS\n",
" --help This help\n",
" --version Print version and exit\n",
" --telomere STR Telomeric repeat for species (default is nematodes' $telomere)\n",
" --debug Print verbose debug info to stderr\n",
"HOMEPAGE\n $HOMEPAGE\n",
"";
exit($exitcode);
}
#----------------------------------------------------------------------
# getopts
GetOptions(
"help" => \&usage,
"version" => \&version,
"telomere=s" => \$telomere,
"debug" => \$debug,
) or usage(1);
!@ARGV and -t STDIN and err("Please provide or pipe a SAM file to $EXE");
#----------------------------------------------------------------------
# main
msg("$EXE $VERSION by $AUTHOR");
my $total=0;
my $leftend=0;
my $rightend=0;
my $addedlen=0;
# at most this number of bases can lack telomeric repeat
# intended for dealing with read errors
my $telolength = length($telomere);
my $telosearchspace = 3 * $telolength;
my $revcomptelomere= reverse $telomere;
$revcomptelomere =~ tr/ACGTacgt/TGCAtgca/;
# read SAM one line ar a time
while (my $line = <ARGV>) {
# skip SAM header
if ($line =~ m/^@/) {
next;
}
$total++;
my @sam = split m/\t/, $line;
# ensure there is softclipped alignment before heavyweight parsing
my $isSoftClipped = ($sam[SAM_CIGAR] =~ /\dS/);
my $isHardClipped = ($sam[SAM_CIGAR] =~ /\dH/);
my $isPrimaryAlignment = !($sam[SAM_FLAG] & 2048 or $sam[SAM_FLAG] & 256);
if ($isSoftClipped and not $isHardClipped and $isPrimaryAlignment ) {
my $forwardStrand = !($sam[SAM_FLAG] & 16);
my $start = $sam[SAM_POS];
my $readlen = length($sam[SAM_SEQ]);
my $end = $start + $readlen - 1;
my $contigname = $sam[SAM_RNAME];
my ($SL, undef, $SR)
= ($sam[SAM_CIGAR] =~ m/ ^ (?:(\d+)S)? (.*?) (?:(\d+)S)? $/x);
$SL ||= 0; $SR ||= 0;
## Check telomere on the right side of read
if ( $SR > $telosearchspace ) {
my $potentialSequence = substr($sam[SAM_SEQ], $readlen - $SR - ($telolength), $readlen + 1);
my $endswithtelomere = () = $potentialSequence =~ /$telomere/gi;
# location from https://stackoverflow.com/questions/1849329/is-there-a-perl-shortcut-to-count-the-number-of-matches-in-a-string
# telomere match is correct, but when converting back to the coordinates, need to account for the fact
# that the substring starts before the softclip coordinate. That is, the $location could be negative
# relative to the softclip position.
my $location = index($potentialSequence, $telomere) - ($telolength);
if ($endswithtelomere > $minTeloN){
# adjust end of alignment by indels and deletion length
# this is obtained by parsing the CIGAR string
# code adapted from https://github.com/holmeso/adamaperl/blob/56fee71f0c431e3b98ef5f3fc1a2a7213e755e14/lib/QCMG/SamTools/Bam/Alignment.pm#L569
my $adjust = 0;
my @cigar = $sam[SAM_CIGAR] =~ /(\d+)(\w)/g;
while (@cigar) {
my ($len,$op) = splice(@cigar,0,2);
$adjust += $len if $op eq 'I';
$adjust -= $len if $op eq 'D';
}
$adjust += $SL + $SR;
my $adjustedend = $start + $readlen - $adjust;
print join("\t", $sam[SAM_RID], $sam[SAM_RNAME], $adjustedend, "R", $sam[SAM_MAPQ], $location, $endswithtelomere), "\n";
$rightend++;
}
}
if ( $SL > $telosearchspace ){
## Check telomere on the left side of read
my $potentialSequence = substr($sam[SAM_SEQ], 0, $SL + 1 + ($telolength));
# my $startswithtelomere = ($potentialSequence =~ /^\w{0,\Q$telosearchspace\E}\Q$revcomptelomere/);
my $startswithtelomere = () = $potentialSequence =~ /$revcomptelomere/gi;
my $location = rindex($potentialSequence, $revcomptelomere ) + $telolength - $SL;
if($startswithtelomere > $minTeloN){
print join("\t", $sam[SAM_RID], $sam[SAM_RNAME], $start, "L", $sam[SAM_MAPQ], $location, $startswithtelomere), "\n";
$leftend++;
}
}
}
}
# stats
msg("Total SAM records $total");
msg("Left end softclipped telomeres $leftend");
msg("Right end softclipped telomeres $rightend");
msg("Done.");
#----------------------------------------------------------------------
sub version {
print "$EXE $VERSION\n";
exit(0);
}
#----------------------------------------------------------------------
sub msg {
print STDERR "[$EXE] @_\n";
}
#----------------------------------------------------------------------
sub err {
msg("ERROR:", @_);
exit(1);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment