Last active
April 9, 2021 10:03
-
-
Save pmenzel/2c9668478305af6a6f704f22f8c1c93f to your computer and use it in GitHub Desktop.
Modify SAM CIGAR string to soft-clip the last n bases
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use strict;
use warnings;

# Soft-clip at least the last $n_soft read bases of a SAM CIGAR string.
#
# Parameters:
#   $cigar  - CIGAR string built from <length><op> fields with op in
#             MIDNSHP=X, or "*" (no CIGAR available)
#   $n_soft - number of read bases that must be soft-clipped at the end of
#             the CIGAR
#
# Returns the adjusted CIGAR string:
#   * unchanged when $n_soft <= 0, when $cigar is "*", or when the CIGAR
#     (ignoring trailing hard clips) already ends in >= $n_soft soft-clipped
#     bases;
#   * otherwise the fields are consumed from the end until $n_soft
#     read bases are covered, and replaced by a single S field. Hard clips at
#     either end are preserved. If the read is shorter than $n_soft, all of
#     its bases are soft-clipped.
#
# Dies when $cigar is not a well-formed CIGAR string.
sub soft_clip_cigar_end {
    my ($cigar, $n_soft) = @_;

    return $cigar if $cigar eq '*' or $n_soft <= 0;

    # Anchor at both ends so unsupported characters are reported instead of
    # silently truncating the front of the CIGAR.
    $cigar =~ m{\A(?:[0-9]+[MIDNSHP=X])+\z}
        or die "Malformed CIGAR string '$cigar'\n";

    # Split into [length, op] pairs.
    my @ops;
    while ($cigar =~ m{([0-9]+)([MIDNSHP=X])}g) {
        push @ops, [ $1, $2 ];
    }

    # Trailing hard clips are set aside and re-appended after the soft clip.
    my $tail_hard = '';
    while (@ops and $ops[-1][1] eq 'H') {
        my $field = pop @ops;
        $tail_hard = $field->[0] . 'H' . $tail_hard;
    }

    # Enough bases are already soft-clipped at the end: nothing to do.
    if (@ops and $ops[-1][1] eq 'S' and $ops[-1][0] >= $n_soft) {
        return $cigar;
    }

    # Walk backwards over the fields, summing the read bases they cover.
    my $clipped = 0;
    while (@ops) {
        my ($len, $op) = @{ $ops[-1] };

        # A leading hard clip: the whole read has been consumed.
        last if $op eq 'H';

        # Deletions, reference skips and padding cover no read bases; next
        # to the new soft clip they carry no information, so drop them.
        if ($op eq 'D' or $op eq 'N' or $op eq 'P') {
            pop @ops;
            next;
        }

        # Existing soft clips are absorbed whole so the result never
        # contains two adjacent S fields.
        if ($op eq 'S') {
            $clipped += $len;
            pop @ops;
            next;
        }

        last if $clipped >= $n_soft;

        # M, I, = or X: take as many bases as are still needed.
        my $needed = $n_soft - $clipped;
        if ($len > $needed) {
            $ops[-1][0] = $len - $needed;
            $clipped += $needed;
            last;
        }
        $clipped += $len;
        pop @ops;
    }

    my $result = join '', map { $_->[0] . $_->[1] } @ops;
    $result .= $clipped . 'S' if $clipped > 0;    # never emit a "0S" field
    $result .= $tail_hard;

    # Nothing left at all (e.g. the input held only deletions).
    return length($result) ? $result : '*';
}

# Example: clip the last 71 read bases of "30M40D50M".
my $cigar     = "30M40D50M";
my $n_soft    = 71;
my $out_cigar = soft_clip_cigar_end($cigar, $n_soft);

print "$out_cigar\n";
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment