Created
May 31, 2019 16:36
-
-
Save jkbonfield/bb600c57e710f60cbfd79407230c4a4c to your computer and use it in GitHub Desktop.
Adds methylation to sequences in SAM.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
# Uses figures from https://www.biorxiv.org/content/early/2016/03/15/043794 | |
# to add random C methylation to a SAM file, storing in a variety of ways. | |
use strict; | |
srand(0); | |
my $method = shift(@ARGV); | |
my $double_stranded = 0; | |
# From mouse, based on C content. | |
# C 492782672(+h+f+x below) | |
# x 40875807 7.653% 1 in 13. | |
# m 460257 0.8617% 1 in 1161 | |
# h 20538 0.003845% 1 in 26007 | |
# f 2790 0.0005223% 1 in 191449 | |
# C: m h f c (complement 1 2 3 4) | |
my $pm = 3; | |
my $ph = 0.055; | |
my $pf = 0.0014; | |
my $pc = 0.000335; | |
# G: o (complement O, NB that's my made up symbol) | |
my $po = 0.05; | |
# 10x freq for a harder test | |
my $scale=10; | |
#my $scale=1; | |
#my $scale=0.1; | |
$pm *= $scale; | |
$ph *= $scale; | |
$pf *= $scale; | |
$pc *= $scale; | |
$po *= $scale; | |
# Turn into cumulative frequencies | |
$pf += $pc; | |
$ph += $pf; | |
$pm += $ph; | |
# Map base mod symbol into the original unmodified base type | |
my %fundamental = ( | |
"A" => "A", "C" => "C", "G" => "G", "T" => "T", "N" => "N", | |
"m" => "C", "h" => "C", "f" => "C", "c" => "C", "O" => "C", | |
"1" => "G", "2" => "G", "3" => "G", "4" => "G", "o" => "G", | |
); | |
# Largest type size based for B array | |
sub asize { | |
my $max = [sort {$b <=> $a} @_]->[0]; | |
if ($max < 256) { | |
return "C"; | |
} elsif ($max < 65536) { | |
return "S"; | |
} else { | |
return "I"; | |
} | |
} | |
# Add methylation to a sequence base with a random chance | |
sub modify_base { | |
$_ = shift(@_); | |
my $strand = shift(@_); | |
my $seq; | |
my $r = rand()*100; | |
# Replace C with potential m,h,f,c | |
if ($_ eq "C" && ($strand & 1)) { | |
if ($r > $pm) { | |
$seq = "C"; | |
} elsif ($r > $ph) { | |
$seq = "m"; | |
} elsif ($r > $pf) { | |
$seq = "h"; | |
} elsif ($r > $pc) { | |
$seq = "f"; | |
} else { | |
$seq = "c"; | |
} | |
} elsif ($_ eq "C" && ($strand & 2)) { | |
if ($r > $po) { | |
$seq = "C"; | |
} else { | |
$seq = "O"; | |
} | |
# Replace G with potential 1,2,3,4 (C mod to other strand) | |
} elsif ($_ eq "G" && ($strand & 1)) { | |
if ($r > $po) { | |
$seq = "G"; | |
} else { | |
$seq = "o"; | |
} | |
} elsif ($_ eq "G" && ($strand & 2)) { | |
if ($r > $pm) { | |
$seq = "G"; | |
} elsif ($r > $ph) { | |
$seq = "1"; | |
} elsif ($r > $pf) { | |
$seq = "2"; | |
} elsif ($r > $pc) { | |
$seq = "3"; | |
} else { | |
$seq = "4"; | |
} | |
} else { | |
$seq = $_; | |
} | |
return $seq; | |
} | |
$"="\t"; | |
while (<>) { | |
if (/^@/) { | |
print; next; | |
} | |
chomp(); | |
my @F = split("\t", $_); | |
# Produce modified sequence for top and bottom strands | |
my $top = ""; | |
my $bot = ""; | |
foreach (split("", $F[9])) { | |
$top .= modify_base($_, 1); | |
$bot .= modify_base($_, 2); | |
} | |
# FIXME: implement $double_stranded here. | |
# For now we assume mods on either top OR bottom only. | |
# | |
# To cope with double stranded we'd need to expand our alphabet as | |
# some mods could rarely occur on both strands at the same time. | |
my $mod; | |
if ($F[1] & 16) { | |
$mod = $bot; | |
} else { | |
$mod = $top; | |
} | |
# Just modify the sequence field (SAM/CRAM only) | |
if ($method eq 1) { | |
$F[9] = $mod; | |
} | |
# Unmodified seqs are as-is | |
$mod =~ tr/ACGTN/...../; # mask out real bases | |
if ($mod =~ /^\.*$/) { | |
print "@F\n"; | |
next; | |
} | |
# Meth delta as MM tag, dot style | |
if ($method eq 2) { | |
push(@F, "MM:Z:$mod"); | |
} | |
# Meth coord as MM tag, MD style | |
if ($method eq 3) { | |
$mod =~ s/(\.+)/length($1)/ge; | |
push(@F, "MM:Z:$mod") | |
} | |
# Context dependent fields; A C G T N | |
# Near optimal when methylation is common. | |
# | |
# Primary cause of encoding loss is the fields are variable length, whereas | |
# we actually know the length from the sequence itself (how many Cs, how | |
# many Gs, etc). | |
if ($method eq 4) { | |
my %seq; | |
my $p = 0; | |
foreach (split("", $F[9])) { | |
my $m = substr($mod, $p, 1); | |
$p++; | |
my $f = $m; | |
$f = $_ if ($m eq "."); | |
$seq{$fundamental{$f}} .= $m; | |
} | |
push(@F, "mC:Z:".$seq{C}) unless ($seq{C} =~ /^\.*$/); | |
push(@F, "mG:Z:".$seq{G}) unless ($seq{G} =~ /^\.*$/);; | |
} | |
# Like 4, but swap 1234 for mhfc as we use the codes to indicate strand | |
# in conjunction with "G". As "m" is a C code, it's clear "G" means other strand. | |
if ($method eq 22) { | |
my %seq; | |
my $p = 0; | |
foreach (split("", $F[9])) { | |
my $m = substr($mod, $p, 1); | |
$p++; | |
my $f = $m; | |
$f = $_ if ($m eq "."); | |
$seq{$fundamental{$f}} .= $m; | |
} | |
$seq{C} =~ tr/O/o/; | |
push(@F, "mC:Z:".$seq{C}) unless ($seq{C} =~ /^\.*$/); | |
$seq{G} =~ tr/1234/mhfc/; | |
push(@F, "mG:Z:".$seq{G}) unless ($seq{G} =~ /^\.*$/);; | |
} | |
# As 4 but with CIGAR style | |
if ($method eq 7) { | |
my %seq; | |
my $p = 0; | |
foreach (split("", $F[9])) { | |
my $m = substr($mod, $p, 1); | |
$p++; | |
my $f = $m; | |
$f = $_ if ($m eq "."); | |
$seq{$fundamental{$f}} .= $m; | |
} | |
unless ($seq{C} =~ /^\.*$/) { | |
$seq{C} =~ tr/O/o/; | |
$seq{C} =~ s/(\.+)/length($1)/ge; | |
push(@F, "mC:Z:".$seq{C}); | |
} | |
unless ($seq{G} =~ /^\.*$/) { | |
$seq{G} =~ tr/1234/mhfc/; # Strand disambuates it | |
$seq{G} =~ s/(\.+)/length($1)/ge; | |
push(@F, "mG:Z:".$seq{G}); | |
} | |
} | |
# As 2 but mod is fundamental-aware and overlapping | |
# Eg C->m = a, C->h = b | |
# G->1 = a, G->2 = b | |
if ($method eq 5) { | |
$mod =~ tr/mhfcO/abcde/; | |
$mod =~ tr/1234o/abcde/; | |
push(@F, "MM:Z:$mod"); | |
} | |
# As 5 but CIGAR | |
if ($method eq 8) { | |
$mod =~ tr/mhfcO/abcde/; | |
$mod =~ tr/1234o/abcde/; | |
$mod =~ s/(\.+)/length($1)/ge; | |
push(@F, "MM:Z:$mod"); | |
} | |
# As 4: Dotty-seq x4 but with strand too | |
if ($method eq 16) { | |
my %seq; | |
my $p = 0; | |
foreach (split("", $F[9])) { | |
my $m = substr($mod, $p, 1); | |
$p++; | |
my $f = $m; | |
$f = $_ if ($m eq "."); | |
$m =~ s/([mhfco])/+$1/g; | |
$m =~ s/([1234O])/-$1/g; | |
$m =~ tr/1234O/mhfco/; | |
$seq{$fundamental{$f}} .= $m; | |
} | |
push(@F, "mC:Z:".$seq{C}) unless ($seq{C} =~ /^\.*$/); | |
push(@F, "mG:Z:".$seq{G}) unless ($seq{G} =~ /^\.*$/);; | |
} | |
# As 7, but also +/- strand | |
if ($method eq 9) { | |
my %seq; | |
my $p = 0; | |
foreach (split("", $F[9])) { | |
my $m = substr($mod, $p, 1); | |
$p++; | |
my $f = $m; | |
$f = $_ if ($m eq "."); | |
$m =~ s/([mhfco])/+$1/g; | |
$m =~ s/([1234O])/-$1/g; | |
$m =~ tr/1234O/mhfco/; | |
$seq{$fundamental{$f}} .= $m; | |
} | |
unless ($seq{C} =~ /^\.*$/) { | |
$seq{C} =~ s/(\.+)/length($1)/ge; | |
push(@F, "mC:Z:".$seq{C}); | |
} | |
unless ($seq{G} =~ /^\.*$/) { | |
$seq{G} =~ tr/1234/pqrs/; | |
$seq{G} =~ s/(\.+)/length($1)/ge; | |
push(@F, "mG:Z:".$seq{G}); | |
} | |
} | |
# As 16, but strand is remembered until toggled | |
if ($method eq 21) { | |
my %seq; | |
my $p = 0; | |
foreach (split("", $F[9])) { | |
my $m = substr($mod, $p, 1); | |
$p++; | |
my $f = $m; | |
$f = $_ if ($m eq "."); | |
$m =~ s/([mhfco])/+$1/g; | |
$m =~ s/([1234O])/-$1/g; | |
$m =~ tr/1234O/mhfco/; | |
$seq{$fundamental{$f}} .= $m; | |
} | |
unless ($seq{C} =~ /^\.*$/) { | |
while ($seq{C}=~s/(\+[^-+]*)\+/$1/) {} | |
while ($seq{C}=~s/(-[^-+]*)-/$1/) {} | |
$seq{C} =~ s/(\.+)/length($1)/ge; | |
push(@F, "mC:Z:".$seq{C}); | |
} | |
unless ($seq{G} =~ /^\.*$/) { | |
while ($seq{G}=~s/(\+[^-+]*)\+/$1/) {} | |
while ($seq{G}=~s/(-[^-+]*)-/$1/) {} | |
$seq{G} =~ tr/1234O/mhfco/; | |
$seq{G} =~ s/(\.+)/length($1)/ge; | |
push(@F, "mG:Z:".$seq{G}); | |
} | |
} | |
# As 23. One tag for base-type + codes, one for pos delta. | |
if ($method eq 23) { | |
my $codes = ""; | |
my $delta = ""; | |
my %pos; | |
my $p = 0; # pos in mod string | |
my %last; | |
my %fp; # fundamental pos | |
foreach (split("", $F[9])) { | |
my $m = substr($mod, $p++, 1); | |
# Find fundamental base | |
my $f = $m; | |
$f = $_ if ($m eq "."); | |
$f = $fundamental{$f}; | |
# Switch to original strand | |
$m =~ tr/1234O/mhfco/; | |
my $fm = $f.$m; | |
$fp{$f} = 0 if !exists($fp{$f}); | |
$fp{$f}++; | |
next if ($m eq "."); | |
# Add to delta | |
$last{$fm} = 0 if !exists($last{$fm}); | |
push(@{$pos{$fm}}, $fp{$f} - $last{$fm}); | |
$last{$fm}=$fp{$f}; | |
} | |
my @delta = (); | |
foreach (keys(%pos)) { | |
$codes .= $_ . scalar(@{$pos{$_}}); | |
push(@delta, @{$pos{$_}}); | |
} | |
$delta .= join(",",@delta); | |
push(@F, "mC:Z:$codes"); | |
push(@F, "mP:Z:$delta"); | |
} | |
# AS 23, but B array | |
if ($method eq 24) { | |
my $codes = ""; | |
my $delta = ""; | |
my %pos; | |
my $p = 0; # pos in mod string | |
my %last; | |
my %fp; # fundamental pos | |
foreach (split("", $F[9])) { | |
my $m = substr($mod, $p++, 1); | |
# Find fundamental base | |
my $f = $m; | |
$f = $_ if ($m eq "."); | |
$f = $fundamental{$f}; | |
# Switch to original strand | |
$m =~ tr/1234O/mhfco/; | |
my $fm = $f.$m; | |
$fp{$f} = 0 if !exists($fp{$f}); | |
$fp{$f}++; | |
next if ($m eq "."); | |
# Add to delta | |
$last{$fm} = 0 if !exists($last{$fm}); | |
push(@{$pos{$fm}}, $fp{$f} - $last{$fm}); | |
$last{$fm}=$fp{$f}; | |
} | |
my @delta = (); | |
foreach (keys(%pos)) { | |
$codes .= $_ . scalar(@{$pos{$_}}); | |
push(@delta, @{$pos{$_}}); | |
} | |
$delta .= join(",",@delta); | |
push(@F, "mC:Z:$codes"); | |
push(@F, "mP:B:".asize(@delta).",$delta"); | |
} | |
# Like 23, but one inline tag toggling code and delta. | |
if ($method eq 25) { | |
my $codes = ""; | |
my %pos; | |
my $p = 0; # pos in mod string | |
my %last; | |
my %fp; # fundamental pos | |
foreach (split("", $F[9])) { | |
my $m = substr($mod, $p++, 1); | |
# Find fundamental base | |
my $f = $m; | |
$f = $_ if ($m eq "."); | |
$f = $fundamental{$f}; | |
# Switch to original strand | |
$m =~ s/([mhfco])/+$1/; | |
$m =~ s/([1234O])/-$1/; | |
$m =~ tr/1234O/mhfco/; | |
my $fm = $f.$m; | |
#my $fm = "$f,$m"; | |
$fp{$f} = 0 if !exists($fp{$f}); | |
$fp{$f}++; | |
next if ($m eq "."); | |
# Add to delta | |
$last{$fm} = 0 if !exists($last{$fm}); | |
push(@{$pos{$fm}}, $fp{$f} - $last{$fm}); | |
$last{$fm}=$fp{$f}; | |
} | |
foreach (keys(%pos)) { | |
$codes .= "," if ($codes ne ""); | |
$codes .= $_ . "," . join(",", @{$pos{$_}}); | |
} | |
push(@F, "mC:Z:$codes"); | |
} | |
# As 25, but with single B array. | |
# -ve numbers are codes. | |
my %code2id = ( | |
"m" => 0, | |
"h" => 1, | |
"f" => 2, | |
"c" => 3, | |
"o" => 4 | |
); | |
my %base2id = ( | |
"A" => 0, | |
"C" => 1, | |
"G" => 2, | |
"T" => 3, | |
"N" => 4, | |
); | |
my %strand2id = ( | |
"+" => 0, | |
"-" => 1, | |
); | |
if ($method eq 26) { | |
my @codes = (); | |
my %pos; | |
my $p = 0; # pos in mod string | |
my %last; | |
my %fp; # fundamental pos | |
foreach (split("", $F[9])) { | |
my $m = substr($mod, $p++, 1); | |
# Find fundamental base | |
my $f = $m; | |
$f = $_ if ($m eq "."); | |
$f = $fundamental{$f}; | |
# Switch to original strand | |
$m =~ s/([mhfco])/+$1/; | |
$m =~ s/([1234O])/-$1/; | |
$m =~ tr/1234O/mhfco/; | |
my $fm = $f.$m; | |
$fp{$f} = 0 if !exists($fp{$f}); | |
$fp{$f}++; | |
next if ($m eq "."); | |
# Add to delta | |
$last{$fm} = 0 if !exists($last{$fm}); | |
push(@{$pos{$fm}}, $fp{$f} - $last{$fm}); | |
$last{$fm}=$fp{$f}; | |
} | |
foreach (keys(%pos)) { | |
push(@codes, $base2id{substr($_, 0, 1)}); | |
push(@codes, $strand2id{substr($_, 1, 1)}); | |
push(@codes, $code2id{substr($_, 2, 1)}); | |
push(@codes, scalar(@{$pos{$_}})); | |
push(@codes, @{$pos{$_}}); | |
} | |
push(@F, "mC:B:".asize(@codes).",".join(",", @codes)); | |
} | |
print "@F\n"; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
use strict; | |
my %pos; | |
sub edit { | |
my $MM = shift(@_); | |
my $base = shift(@_); | |
my $mod = shift(@_); | |
my $strand = shift(@_); | |
if ($strand eq "-") { | |
$mod =~ tr/mhfco/1234O/; | |
} | |
foreach (@_) { | |
substr($MM, $pos{$base}[$_], 1) = $mod; | |
} | |
return $MM; | |
} | |
# Turns methylation method 25 to modified seq notation (method 1). | |
while (<>) { | |
next if (/^@/); | |
my @F = split("\t", $_); | |
my $seq = $F[9]; | |
# Find location of Nth occurrence of each seq base type | |
my $p = 0; | |
while ($seq =~ /(A)/g) { $pos{A}[++$p] = $-[0]; } | |
$p = 0; | |
while ($seq =~ /(C)/g) { $pos{C}[++$p] = $-[0]; } | |
$p = 0; | |
while ($seq =~ /(G)/g) { $pos{G}[++$p] = $-[0]; } | |
$p = 0; | |
while ($seq =~ /(T)/g) { $pos{T}[++$p] = $-[0]; } | |
# Convert mC ops into base modified sequence | |
my $MM = $seq; | |
if (/\tmC:Z:([^\t]*)/) { | |
my @m = split(",",$1); | |
my @pos = (); | |
my $last = 0; | |
my $base = ""; | |
my $mod = ""; | |
my $strand = ""; | |
for (my $i = 0; $i < scalar(@m); $i++) { | |
if ($m[$i] !~ /\d/) { | |
$MM = edit($MM, $base, $mod, $strand, @pos) if ($base ne ""); | |
@pos = (); | |
$last = 0; | |
$base = substr($m[$i], 0, 1); | |
$mod = substr($m[$i], 2, 1); | |
$strand = substr($m[$i], 1, 1); | |
} else { | |
push(@pos, $last+$m[$i]); | |
$last += $m[$i]; | |
} | |
} | |
$MM = edit($MM, $base, $mod, $strand, @pos) if ($base ne ""); | |
} | |
print "$MM\n"; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment