Skip to content

Instantly share code, notes, and snippets.

@lh3
Last active May 1, 2017 16:53
Show Gist options
  • Save lh3/5022436 to your computer and use it in GitHub Desktop.
Save lh3/5022436 to your computer and use it in GitHub Desktop.
Convert GEM alignment format to SAM. Mate positions are not computed in this version.
#!/usr/bin/env perl
use strict;
use warnings;
die("Usage: gem2sam.pl <gem.map>\n") if (@ARGV == 0 && -t STDIN);
while (<>) {
chomp;
my @t = split("\t");
my $o = ($t[2] =~ /^\d+/ && length($t[1]) != length($t[2]))? 0 : 1; # for FASTQ, $o=1; "o" means offset
my @seq = split(" ", $t[1]);
my @qual = $o? split(" ", $t[2]) : @seq == 2? ('*', '*') : ('*');
my (@sam, @sam_NM);
my $is_pe = @seq == 2? 1 : 0;
$t[3+$o] =~ s/ [^:]+//g;
if ($t[3+$o] eq '-') { # unmapped single-end
push(@sam, [4, '*', 0, 0, '*']);
$sam_NM[0] = 0;
} else {
my @hits = split(',', $t[3+$o]);
my ($s1, $s2, $i);
my ($X0, $X1) = (0, 0);
# parse the FIRST hit of each end
my @x = split('::', $hits[0]);
$s1 = &parse_hit($x[0]);
$s2 = &parse_hit($x[1]) if $is_pe;
# compute X0 and X1
@x = split(':', $t[2+$o]);
for ($i = 0; $i < @x; ++$i) {
last if ($x[$i] ne '0');
}
$X0 = $1 + (defined($2)? $2 : 0) if ($i < @x && $x[$i] =~ /^(\d+)(\+\d+)?/);
if ($i == $#x) {
$X1 = -1;
} elsif ($x[$i+1] =~ /^(\d+)(\+\d+)?/) {
$X1 = $1 + (defined($2)? $2 : 0);
}
$sam_NM[0] = $s1->[3];
$sam_NM[1] = $s2->[3] if $is_pe;
# compute mapping quality
my $mapq;
if ($X0 > 1) {
$mapq = 0;
} elsif ($X1 < 0) {
$mapq = 25;
} elsif ($X1 == 0) {
$mapq = 37;
} else {
$mapq = int(23 - 4.343 * log($X1+1) + .499);
$mapq = $mapq > 0? $mapq : 0;
}
# populate the SAM information
if ($is_pe) {
push(@sam, [0x43|$s1->[0], $s1->[1], $s1->[2], $mapq, $s1->[4]]);
push(@sam, [0x83|$s2->[0], $s2->[1], $s2->[2], $mapq, $s2->[4]]);
} else {
push(@sam, [$s1->[0], $s1->[1], $s1->[2], $mapq, $s1->[4]]);
}
}
for (my $i = 0; $i < @sam; ++$i) {
print(join("\t", $t[0], @{$sam[$i]}, "*\t0\t0", $seq[$i], $qual[$i], "NM:i:$sam_NM[$i]"), "\n");
}
}
sub parse_hit { # such as: 1:-:170882712:1>1-1>1-11T50G33
my $x = shift;
my ($ref, $strand, $pos, $y) = split(':', $x);
my $flag = $strand eq '+'? 0 : 16;
my @cigar;
my ($cigar, $NM) = ('', 0);
while ($y =~ /(\d+)|([A-Za-z])|(>(\d+)([+-]))/g) {
my ($len, $op);
if (defined $1) {
($len, $op) = ($1, 'M');
} elsif (defined $2) {
($len, $op) = (1, 'M');
++$NM;
} elsif (defined $3) {
($len, $op) = ($4, $5 eq '+'? 'D' : 'I');
$NM += $4;
}
if (@cigar && $cigar[$#cigar][1] eq $op) {
$cigar[$#cigar][0] += $len;
} else {
push(@cigar, [$len, $op]);
}
}
$cigar .= $_->[0] . $_->[1] for (@cigar);
# print("$x\t$cigar\t$NM\n");
return [$flag, $ref, $pos, $NM, $cigar];
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment