Skip to content

Instantly share code, notes, and snippets.

@jkbonfield
Created May 31, 2019 16:36
Show Gist options
  • Save jkbonfield/bb600c57e710f60cbfd79407230c4a4c to your computer and use it in GitHub Desktop.
Save jkbonfield/bb600c57e710f60cbfd79407230c4a4c to your computer and use it in GitHub Desktop.
Adds methylation to sequences in SAM.
#!/usr/bin/perl -w
# Uses figures from https://www.biorxiv.org/content/early/2016/03/15/043794
# to add random C methylation to a SAM file, storing in a variety of ways.
use strict;
srand(0);
my $method = shift(@ARGV);
my $double_stranded = 0;
# From mouse, based on C content.
# C 492782672(+h+f+x below)
# x 40875807 7.653% 1 in 13.
# m 460257 0.8617% 1 in 1161
# h 20538 0.003845% 1 in 26007
# f 2790 0.0005223% 1 in 191449
# C: m h f c (complement 1 2 3 4)
my $pm = 3;
my $ph = 0.055;
my $pf = 0.0014;
my $pc = 0.000335;
# G: o (complement O, NB that's my made up symbol)
my $po = 0.05;
# 10x freq for a harder test
my $scale=10;
#my $scale=1;
#my $scale=0.1;
$pm *= $scale;
$ph *= $scale;
$pf *= $scale;
$pc *= $scale;
$po *= $scale;
# Turn into cumulative frequencies
$pf += $pc;
$ph += $pf;
$pm += $ph;
# Map base mod symbol into the original unmodified base type
my %fundamental = (
"A" => "A", "C" => "C", "G" => "G", "T" => "T", "N" => "N",
"m" => "C", "h" => "C", "f" => "C", "c" => "C", "O" => "C",
"1" => "G", "2" => "G", "3" => "G", "4" => "G", "o" => "G",
);
# Largest type size based for B array
sub asize {
my $max = [sort {$b <=> $a} @_]->[0];
if ($max < 256) {
return "C";
} elsif ($max < 65536) {
return "S";
} else {
return "I";
}
}
# Add methylation to a sequence base with a random chance
sub modify_base {
$_ = shift(@_);
my $strand = shift(@_);
my $seq;
my $r = rand()*100;
# Replace C with potential m,h,f,c
if ($_ eq "C" && ($strand & 1)) {
if ($r > $pm) {
$seq = "C";
} elsif ($r > $ph) {
$seq = "m";
} elsif ($r > $pf) {
$seq = "h";
} elsif ($r > $pc) {
$seq = "f";
} else {
$seq = "c";
}
} elsif ($_ eq "C" && ($strand & 2)) {
if ($r > $po) {
$seq = "C";
} else {
$seq = "O";
}
# Replace G with potential 1,2,3,4 (C mod to other strand)
} elsif ($_ eq "G" && ($strand & 1)) {
if ($r > $po) {
$seq = "G";
} else {
$seq = "o";
}
} elsif ($_ eq "G" && ($strand & 2)) {
if ($r > $pm) {
$seq = "G";
} elsif ($r > $ph) {
$seq = "1";
} elsif ($r > $pf) {
$seq = "2";
} elsif ($r > $pc) {
$seq = "3";
} else {
$seq = "4";
}
} else {
$seq = $_;
}
return $seq;
}
$"="\t";
while (<>) {
if (/^@/) {
print; next;
}
chomp();
my @F = split("\t", $_);
# Produce modified sequence for top and bottom strands
my $top = "";
my $bot = "";
foreach (split("", $F[9])) {
$top .= modify_base($_, 1);
$bot .= modify_base($_, 2);
}
# FIXME: implement $double_stranded here.
# For now we assume mods on either top OR bottom only.
#
# To cope with double stranded we'd need to expand our alphabet as
# some mods could rarely occur on both strands at the same time.
my $mod;
if ($F[1] & 16) {
$mod = $bot;
} else {
$mod = $top;
}
# Just modify the sequence field (SAM/CRAM only)
if ($method eq 1) {
$F[9] = $mod;
}
# Unmodified seqs are as-is
$mod =~ tr/ACGTN/...../; # mask out real bases
if ($mod =~ /^\.*$/) {
print "@F\n";
next;
}
# Meth delta as MM tag, dot style
if ($method eq 2) {
push(@F, "MM:Z:$mod");
}
# Meth coord as MM tag, MD style
if ($method eq 3) {
$mod =~ s/(\.+)/length($1)/ge;
push(@F, "MM:Z:$mod")
}
# Context dependent fields; A C G T N
# Near optimal when methylation is common.
#
# Primary cause of encoding loss is the fields are variable length, whereas
# we actually know the length from the sequence itself (how many Cs, how
# many Gs, etc).
if ($method eq 4) {
my %seq;
my $p = 0;
foreach (split("", $F[9])) {
my $m = substr($mod, $p, 1);
$p++;
my $f = $m;
$f = $_ if ($m eq ".");
$seq{$fundamental{$f}} .= $m;
}
push(@F, "mC:Z:".$seq{C}) unless ($seq{C} =~ /^\.*$/);
push(@F, "mG:Z:".$seq{G}) unless ($seq{G} =~ /^\.*$/);;
}
# Like 4, but swap 1234 for mhfc as we use the codes to indicate strand
# in conjunction with "G". As "m" is a C code, it's clear "G" means other strand.
if ($method eq 22) {
my %seq;
my $p = 0;
foreach (split("", $F[9])) {
my $m = substr($mod, $p, 1);
$p++;
my $f = $m;
$f = $_ if ($m eq ".");
$seq{$fundamental{$f}} .= $m;
}
$seq{C} =~ tr/O/o/;
push(@F, "mC:Z:".$seq{C}) unless ($seq{C} =~ /^\.*$/);
$seq{G} =~ tr/1234/mhfc/;
push(@F, "mG:Z:".$seq{G}) unless ($seq{G} =~ /^\.*$/);;
}
# As 4 but with CIGAR style
if ($method eq 7) {
my %seq;
my $p = 0;
foreach (split("", $F[9])) {
my $m = substr($mod, $p, 1);
$p++;
my $f = $m;
$f = $_ if ($m eq ".");
$seq{$fundamental{$f}} .= $m;
}
unless ($seq{C} =~ /^\.*$/) {
$seq{C} =~ tr/O/o/;
$seq{C} =~ s/(\.+)/length($1)/ge;
push(@F, "mC:Z:".$seq{C});
}
unless ($seq{G} =~ /^\.*$/) {
$seq{G} =~ tr/1234/mhfc/; # Strand disambuates it
$seq{G} =~ s/(\.+)/length($1)/ge;
push(@F, "mG:Z:".$seq{G});
}
}
# As 2 but mod is fundamental-aware and overlapping
# Eg C->m = a, C->h = b
# G->1 = a, G->2 = b
if ($method eq 5) {
$mod =~ tr/mhfcO/abcde/;
$mod =~ tr/1234o/abcde/;
push(@F, "MM:Z:$mod");
}
# As 5 but CIGAR
if ($method eq 8) {
$mod =~ tr/mhfcO/abcde/;
$mod =~ tr/1234o/abcde/;
$mod =~ s/(\.+)/length($1)/ge;
push(@F, "MM:Z:$mod");
}
# As 4: Dotty-seq x4 but with strand too
if ($method eq 16) {
my %seq;
my $p = 0;
foreach (split("", $F[9])) {
my $m = substr($mod, $p, 1);
$p++;
my $f = $m;
$f = $_ if ($m eq ".");
$m =~ s/([mhfco])/+$1/g;
$m =~ s/([1234O])/-$1/g;
$m =~ tr/1234O/mhfco/;
$seq{$fundamental{$f}} .= $m;
}
push(@F, "mC:Z:".$seq{C}) unless ($seq{C} =~ /^\.*$/);
push(@F, "mG:Z:".$seq{G}) unless ($seq{G} =~ /^\.*$/);;
}
# As 7, but also +/- strand
if ($method eq 9) {
my %seq;
my $p = 0;
foreach (split("", $F[9])) {
my $m = substr($mod, $p, 1);
$p++;
my $f = $m;
$f = $_ if ($m eq ".");
$m =~ s/([mhfco])/+$1/g;
$m =~ s/([1234O])/-$1/g;
$m =~ tr/1234O/mhfco/;
$seq{$fundamental{$f}} .= $m;
}
unless ($seq{C} =~ /^\.*$/) {
$seq{C} =~ s/(\.+)/length($1)/ge;
push(@F, "mC:Z:".$seq{C});
}
unless ($seq{G} =~ /^\.*$/) {
$seq{G} =~ tr/1234/pqrs/;
$seq{G} =~ s/(\.+)/length($1)/ge;
push(@F, "mG:Z:".$seq{G});
}
}
# As 16, but strand is remembered until toggled
if ($method eq 21) {
my %seq;
my $p = 0;
foreach (split("", $F[9])) {
my $m = substr($mod, $p, 1);
$p++;
my $f = $m;
$f = $_ if ($m eq ".");
$m =~ s/([mhfco])/+$1/g;
$m =~ s/([1234O])/-$1/g;
$m =~ tr/1234O/mhfco/;
$seq{$fundamental{$f}} .= $m;
}
unless ($seq{C} =~ /^\.*$/) {
while ($seq{C}=~s/(\+[^-+]*)\+/$1/) {}
while ($seq{C}=~s/(-[^-+]*)-/$1/) {}
$seq{C} =~ s/(\.+)/length($1)/ge;
push(@F, "mC:Z:".$seq{C});
}
unless ($seq{G} =~ /^\.*$/) {
while ($seq{G}=~s/(\+[^-+]*)\+/$1/) {}
while ($seq{G}=~s/(-[^-+]*)-/$1/) {}
$seq{G} =~ tr/1234O/mhfco/;
$seq{G} =~ s/(\.+)/length($1)/ge;
push(@F, "mG:Z:".$seq{G});
}
}
# As 23. One tag for base-type + codes, one for pos delta.
if ($method eq 23) {
my $codes = "";
my $delta = "";
my %pos;
my $p = 0; # pos in mod string
my %last;
my %fp; # fundamental pos
foreach (split("", $F[9])) {
my $m = substr($mod, $p++, 1);
# Find fundamental base
my $f = $m;
$f = $_ if ($m eq ".");
$f = $fundamental{$f};
# Switch to original strand
$m =~ tr/1234O/mhfco/;
my $fm = $f.$m;
$fp{$f} = 0 if !exists($fp{$f});
$fp{$f}++;
next if ($m eq ".");
# Add to delta
$last{$fm} = 0 if !exists($last{$fm});
push(@{$pos{$fm}}, $fp{$f} - $last{$fm});
$last{$fm}=$fp{$f};
}
my @delta = ();
foreach (keys(%pos)) {
$codes .= $_ . scalar(@{$pos{$_}});
push(@delta, @{$pos{$_}});
}
$delta .= join(",",@delta);
push(@F, "mC:Z:$codes");
push(@F, "mP:Z:$delta");
}
# AS 23, but B array
if ($method eq 24) {
my $codes = "";
my $delta = "";
my %pos;
my $p = 0; # pos in mod string
my %last;
my %fp; # fundamental pos
foreach (split("", $F[9])) {
my $m = substr($mod, $p++, 1);
# Find fundamental base
my $f = $m;
$f = $_ if ($m eq ".");
$f = $fundamental{$f};
# Switch to original strand
$m =~ tr/1234O/mhfco/;
my $fm = $f.$m;
$fp{$f} = 0 if !exists($fp{$f});
$fp{$f}++;
next if ($m eq ".");
# Add to delta
$last{$fm} = 0 if !exists($last{$fm});
push(@{$pos{$fm}}, $fp{$f} - $last{$fm});
$last{$fm}=$fp{$f};
}
my @delta = ();
foreach (keys(%pos)) {
$codes .= $_ . scalar(@{$pos{$_}});
push(@delta, @{$pos{$_}});
}
$delta .= join(",",@delta);
push(@F, "mC:Z:$codes");
push(@F, "mP:B:".asize(@delta).",$delta");
}
# Like 23, but one inline tag toggling code and delta.
if ($method eq 25) {
my $codes = "";
my %pos;
my $p = 0; # pos in mod string
my %last;
my %fp; # fundamental pos
foreach (split("", $F[9])) {
my $m = substr($mod, $p++, 1);
# Find fundamental base
my $f = $m;
$f = $_ if ($m eq ".");
$f = $fundamental{$f};
# Switch to original strand
$m =~ s/([mhfco])/+$1/;
$m =~ s/([1234O])/-$1/;
$m =~ tr/1234O/mhfco/;
my $fm = $f.$m;
#my $fm = "$f,$m";
$fp{$f} = 0 if !exists($fp{$f});
$fp{$f}++;
next if ($m eq ".");
# Add to delta
$last{$fm} = 0 if !exists($last{$fm});
push(@{$pos{$fm}}, $fp{$f} - $last{$fm});
$last{$fm}=$fp{$f};
}
foreach (keys(%pos)) {
$codes .= "," if ($codes ne "");
$codes .= $_ . "," . join(",", @{$pos{$_}});
}
push(@F, "mC:Z:$codes");
}
# As 25, but with single B array.
# -ve numbers are codes.
my %code2id = (
"m" => 0,
"h" => 1,
"f" => 2,
"c" => 3,
"o" => 4
);
my %base2id = (
"A" => 0,
"C" => 1,
"G" => 2,
"T" => 3,
"N" => 4,
);
my %strand2id = (
"+" => 0,
"-" => 1,
);
if ($method eq 26) {
my @codes = ();
my %pos;
my $p = 0; # pos in mod string
my %last;
my %fp; # fundamental pos
foreach (split("", $F[9])) {
my $m = substr($mod, $p++, 1);
# Find fundamental base
my $f = $m;
$f = $_ if ($m eq ".");
$f = $fundamental{$f};
# Switch to original strand
$m =~ s/([mhfco])/+$1/;
$m =~ s/([1234O])/-$1/;
$m =~ tr/1234O/mhfco/;
my $fm = $f.$m;
$fp{$f} = 0 if !exists($fp{$f});
$fp{$f}++;
next if ($m eq ".");
# Add to delta
$last{$fm} = 0 if !exists($last{$fm});
push(@{$pos{$fm}}, $fp{$f} - $last{$fm});
$last{$fm}=$fp{$f};
}
foreach (keys(%pos)) {
push(@codes, $base2id{substr($_, 0, 1)});
push(@codes, $strand2id{substr($_, 1, 1)});
push(@codes, $code2id{substr($_, 2, 1)});
push(@codes, scalar(@{$pos{$_}}));
push(@codes, @{$pos{$_}});
}
push(@F, "mC:B:".asize(@codes).",".join(",", @codes));
}
print "@F\n";
}
#!/usr/bin/perl -w
use strict;
my %pos;
sub edit {
my $MM = shift(@_);
my $base = shift(@_);
my $mod = shift(@_);
my $strand = shift(@_);
if ($strand eq "-") {
$mod =~ tr/mhfco/1234O/;
}
foreach (@_) {
substr($MM, $pos{$base}[$_], 1) = $mod;
}
return $MM;
}
# Turns methylation method 25 to modified seq notation (method 1).
while (<>) {
next if (/^@/);
my @F = split("\t", $_);
my $seq = $F[9];
# Find location of Nth occurrence of each seq base type
my $p = 0;
while ($seq =~ /(A)/g) { $pos{A}[++$p] = $-[0]; }
$p = 0;
while ($seq =~ /(C)/g) { $pos{C}[++$p] = $-[0]; }
$p = 0;
while ($seq =~ /(G)/g) { $pos{G}[++$p] = $-[0]; }
$p = 0;
while ($seq =~ /(T)/g) { $pos{T}[++$p] = $-[0]; }
# Convert mC ops into base modified sequence
my $MM = $seq;
if (/\tmC:Z:([^\t]*)/) {
my @m = split(",",$1);
my @pos = ();
my $last = 0;
my $base = "";
my $mod = "";
my $strand = "";
for (my $i = 0; $i < scalar(@m); $i++) {
if ($m[$i] !~ /\d/) {
$MM = edit($MM, $base, $mod, $strand, @pos) if ($base ne "");
@pos = ();
$last = 0;
$base = substr($m[$i], 0, 1);
$mod = substr($m[$i], 2, 1);
$strand = substr($m[$i], 1, 1);
} else {
push(@pos, $last+$m[$i]);
$last += $m[$i];
}
}
$MM = edit($MM, $base, $mod, $strand, @pos) if ($base ne "");
}
print "$MM\n";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment