-
-
Save pditommaso/d0153e128a40fb0296e9da4457c4ce2a to your computer and use it in GitHub Desktop.
Scripts and command lines to create universal mask for hs37d5
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# | |
# generate 01compositional.bed.gz | |
# | |
# low-complexity by mDUST | |
mdust hs37d5.fa -c -w7 -v28 \ | |
| hs37d5.mdust-w7-v28.txt \ | |
| cut -f1,3,4 \ | |
| gawk -vOFS="\t" '{--$2;print}' \ | |
| bgzip > 01compositional/hs37d5.mdust-w7-v28.bed.gz | |
# long homopolymer runs | |
seqtk hrun hs37d5.fa \ | |
| bgzip > 01compositional/hs37d5.hrun.bed.gz | |
# satellite sequences reported by RepeatMasker. rmsk.txt.gz is downloaded from UCSC. | |
zcat rmsk.txt.gz \ | |
| grep Satellite \ | |
| cut -f6,7,8 \ | |
| sed s,^chr,, \ | |
| perl -pe 's/^[^\s_]+_([^\s_]+)_random/$1.1/' \ | |
| tr "gl" "GL" \ | |
| sort-alt -k1,1N -k2,2n \ | |
| bgzip > 01compositional/hs37d5.satellite.bed.gz | |
# Low-complexity regions identified by RepeatMasker | |
zcat rmsk.txt.gz \ | |
| grep Low_complexity \ | |
| cut -f6,7,8 \ | |
| sed s,^chr,, \ | |
| perl -pe 's/^[^\s_]+_([^\s_]+)_random/$1.1/' \ | |
| tr "gl" "GL" \ | |
| sort-alt -k1,1N -k2,2n \ | |
| bgzip > 01compositional/hs37d5.rmsk-lc.bed.gz | |
# merge the four filters above with 10bp flanking sequences. bedmerge can be replaced by bedtools | |
zcat hs37d5.hrun.bed.gz hs37d5.rmsk-lc.bed.gz hs37d5.satellite.bed.gz hs37d5.mdust-w7-v28.bed.gz \ | |
| sort-alt -k1,1N -k2,2n \ | |
| gawk -v OFS="\t" '{$2=$2<10?0:$2-10;$3+=10;print}' \ | |
| bioutils.lua bedmerge \ | |
| bgzip > 01compositional.bed.gz | |
# | |
# generate 02structural.bed.gz | |
# | |
bcftools view -D hs37d5.fasta.fai -GeS unflt.vcf.gz \ | |
| ./filter.pl \ | |
| awk '$6<0&&($7>=100||($7>=50&&((/^[0-9]/&&$4>=22000)||(/^X/&&$4>=19000))))' \ | |
| k8 clustreg.js \ | |
| gawk -v OFS="\t" '{$2=$2<150?0:$2-150;$3+=150;print}' \ | |
| bioutils.lua bedmerge \ | |
| bgzip > 02structural.bed.gz | |
# | |
# Mappability filter is generated with instructions here: | |
# http://lh3lh3.users.sourceforge.net/snpable.shtml | |
# | |
# | |
# Merge compositional, structural and mappability filters | |
# | |
zcat 01compositional/01compositional.bed.gz 02structural/02structural.bed.gz 03mappable/hs37d5.mask75_50.bed.gz \ | |
| sort-alt -k1,1N -k2,2n \ | |
| bioutils.lua bedmerge \ | |
| k8 merge-close.js \ | |
| bgzip > um75-hs37d5.bed.gz |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use strict; | |
use warnings; | |
while (<>) { | |
next if /^#/; | |
next if /INDEL/; | |
my @t = split("\t"); | |
my ($hwe, $f) = (1, 0); | |
my @g3; | |
my $af = ($t[7] =~ /AF1=([^;]+)/)? $1 : 0; | |
if ($t[7] =~ /G3=([^;,]+),([^;,]+),([^;,]+).*HWE=([^;,]+)/) { | |
$hwe = $4 == 0? 1e-300 : $4; | |
my $p = $1 + .5 * $2; | |
$f = ($p > 0 && $p < 1)? 1 - $2 / (2 * $p * (1-$p)) : 0; | |
$hwe = -4.343 * log($hwe); | |
@g3 = ($1, $2, $3); | |
} else { | |
$hwe = 0; | |
} | |
$t[7] =~ /DP=(\d+)/; | |
my $dp = $1; | |
print join("\t", $t[0], $t[1], $t[5], $dp, sprintf("%.4g\t%.4g\t%.4f", $af, $f, $hwe), @g3), "\n"; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var getopt = function(args, ostr) { | |
var oli; // option letter list index | |
if (typeof(getopt.place) == 'undefined') | |
getopt.ind = 0, getopt.arg = null, getopt.place = -1; | |
if (getopt.place == -1) { // update scanning pointer | |
if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') { | |
getopt.place = -1; | |
return null; | |
} | |
if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--" | |
++getopt.ind; | |
getopt.place = -1; | |
return null; | |
} | |
} | |
var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity | |
if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) { | |
if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null. | |
if (getopt.place < 0) ++getopt.ind; | |
return '?'; | |
} | |
if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument | |
getopt.arg = null; | |
if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1; | |
} else { // need an argument | |
if (getopt.place >= 0 && getopt.place < args[getopt.ind].length) | |
getopt.arg = args[getopt.ind].substr(getopt.place); | |
else if (args.length <= ++getopt.ind) { // no arg | |
getopt.place = -1; | |
if (ostr.length > 0 && ostr.charAt(0) == ':') return ':'; | |
return '?'; | |
} else getopt.arg = args[getopt.ind]; // white space | |
getopt.place = -1; | |
++getopt.ind; | |
} | |
return optopt; | |
} | |
// parse command-line options | |
var c, w1 = 1000, w2 = 10000, min_n = 2; | |
while ((c = getopt(arguments, "w:W:n:")) != null) { | |
if (c == 'w') w1 = parseInt(getopt.arg); | |
else if (c == 'W') w2 = parseInt(getopt.arg); | |
else if (c == 'n') min_n = parseInt(getopt.arg); | |
} | |
// read data | |
var file = getopt.ind < arguments.length? new File(arguments[getopt.ind]) : new File(); | |
var buf = new Bytes(); | |
var seqs = {}; | |
while (file.readline(buf) >= 0) { | |
var t = buf.toString().split("\t"); | |
if (seqs[t[0]] == null) seqs[t[0]] = []; | |
seqs[t[0]].push(parseInt(t[1])); | |
} | |
buf.destroy(); | |
file.close(); | |
// cluster | |
function cluster(a, w1, w2) | |
{ | |
a.sort(function(b,c){return b-c}); | |
var x = [], y = []; | |
for (var i = 0; i < a.length; ++i) | |
x.push([i, a[i] - 1, a[i], 1]); | |
for (var i = 0; i < a.length - 1; ++i) | |
y.push([a[i+1] - a[i], i]); | |
y.sort(function(b,c){return b[0]!=c[0]?b[0]-c[0]:b[1]-c[1]}); | |
for (var j = 0; j < y.length && y[j][0] <= w2; ++j) { | |
var l = y[j][1] + 1; | |
if (x[l][0] != l) | |
throw Error("Bug!!!"); | |
// find the parent of y[j][1] | |
var k = y[j][1], tmp = []; | |
while (x[k][0] != k) | |
tmp.push((k = x[k][0])); | |
for (var i = 0; i < tmp.length; ++i) x[tmp[i]][0] = k; | |
// test merge | |
if ((x[l][2] - x[k][1]) / (x[l][3] + x[k][3]) > w1) continue; | |
// merge l into k | |
x[k][2] = x[l][2]; x[k][3] += x[l][3]; | |
x[l][0] = k; | |
} | |
y.length = 0; | |
return x; | |
} | |
for (var seq in seqs) { | |
var x = cluster(seqs[seq], w1, w2); | |
for (var i = 0; i < x.length; ++i) | |
if (x[i][0] == i && x[i][3] >= min_n) | |
print(seq, x[i][1], x[i][2], x[i][3]); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var s = 10; | |
var f = new File(); | |
var b = new Bytes(); | |
var last_chr = null, start = 0, end = 0; | |
while (f.readline(b) >= 0) { | |
var t = b.toString().split("\t"); | |
t[1] = parseInt(t[1]); | |
t[2] = parseInt(t[2]); | |
if (t[0] != last_chr) { | |
if (last_chr) print(last_chr, start, end); | |
last_chr = t[0], start = t[1], end = t[2]; | |
} else if (t[1] - end > s) { | |
print(last_chr, start, end); | |
start = t[1], end = t[2]; | |
} else { | |
end = end > t[2]? end : t[2]; | |
} | |
} | |
if (last_chr) print(last_chr, start, end); | |
b.destroy(); | |
f.close(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment