Last active
August 29, 2015 14:06
-
-
Save lh3/12a56904deebfcfaacb3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# generate the list of novel segments for each sample | |
seqtk seq -L200 001*.mag.gz | ./fermi2 match -dk101 hs38a.fmd /dev/stdin | k8 hs38d.js fm2seg -p 001 | gzip > 001.novel.gz | |
k8 hs38d.js tab2fa 001.novel.gz | bwa-r868 mem s38a.fa - | k8 hs38d.js dropsim | gzip -1 > 001.dropped.gz | |
tabtk isct -c 001.dropped.gz 001.novel.gz | gzip -1 > 001.div1.gz | |
# | |
zcat div1/*.div1.gz | cut -f1-5 | sort -k5,5 -k1,1 -S60G | k8 hs38d.js fixname | gzip -1 > 01ori.txt.gz | |
# tabtk isct -c gbk.novel.dropped gbk.novel.gz | sort -k5,5 -k1,1 -S60G | grep -v NNNNNNNNNN | gzip -1 > 01ori.txt.gz | |
tabtk isct -d_ -c contaminated.txt 01ori.txt.gz | gzip -1 > 02contam.txt.gz | |
# zcat 01ori.txt.gz|grep -vFf DoNotUseList_May2011.list | gzip -1 > 02contam.txt.gz | |
k8 hs38d.js nrexact 02contam.txt.gz | awk -f drop-sub.awk | gzip -1 > 03nr.txt.gz | |
zcat 03nr.txt.gz | cut -f5 | ropebwt2 -dLr > 03nr.fmd | |
k8 hs38d.js tab2fa 03nr.txt.gz | fermi2-r166 match -t16 03nr.fmd /dev/stdin | gzip -1 > 03nr.match.gz | |
zcat 03nr.match.gz | awk '/^SQ/{x=$2;getline;if($4==1)print x}' > 03nr.kept | |
tabtk isct 03nr.kept 03nr.txt.gz | gzip -1 > 04nr.txt.gz | |
k8 hs38d.js tab2fa 04nr.txt.gz|seqtk kfreq ggaat - > 04nr.kfreq | |
awk '{print int($4/$2*500+.499)}' 04nr.kfreq | sort -n | uniq -c > 04nr.khist | |
zcat 04nr.txt.gz|awk 'BEGIN{while((getline<"04nr.kfreq")>0)if($4*5/$2<=0.4)l[$1]=1}l[$1]' | gzip -1 > 05ggaat.txt.gz | |
k8 hs38d.js tab2fa 05ggaat.txt.gz | ./mdust -c > 05ggaat.mdust | |
awk '{print $1"\t"($3-1)"\t"$4"\t"$2}' 05ggaat.mdust|k8 hs38d.js bedsum|awk '{print int($3/$2*100+.499)}'|sort -n|uniq -c > 05ggaat.mhist | |
awk '{print $1"\t"($3-1)"\t"$4"\t"$2}' 05ggaat.mdust|k8 hs38d.js bedsum|awk '$3/$2>=.9||$2-$3<=100' > 05ggaat.dropped | |
tabtk isct -c 05ggaat.dropped 05ggaat.txt.gz | gzip -1 > 07dust.txt.gz | |
k8 hs38d.js tab2fa 07dust.txt.gz > 07dust.fa | |
bwa index 07dust.fa | |
bwa-r868 mem -aey100 -t16 07dust.fa 07dust.fa | gzip -1 > 07dust.sam.gz | |
k8 hs38d.js nr 07dust.sam.gz |sort|uniq > 07dust.dropped | |
tabtk isct -c 07dust.dropped 07dust.txt.gz | gzip -1 > 08nr.txt.gz | |
k8 hs38d.js tab2fa 08nr.txt.gz > 08nr.fa | |
bwa index 08nr.fa | |
bwa-r868 mem -aey1000 -t16 08nr.fa 08nr.fa | gzip -1 > 08nr.sam.gz | |
k8 hs38d.js nr 08nr.sam.gz |sort|uniq > 08nr.dropped | |
tabtk isct -c 08nr.dropped 08nr.txt.gz | gzip -1 > 09nr.txt.gz | |
k8 hs38d.js tab2fa 09nr.txt.gz > 09nr.fa | |
bwa index 09nr.fa | |
bwa-r868 mem -aey1000 -t16 09nr.fa 09nr.fa | gzip -1 > 09nr.sam.gz | |
k8 hs38d.js nr 09nr.sam.gz |sort|uniq > 09nr.dropped | |
tabtk isct -c 09nr.dropped 09nr.txt.gz | gzip -1 > 10nr.txt.gz | |
k8 hs38d.js tab2fa 10nr.txt.gz | bwa-r868 mem -y100 -t16 ../gbk/09nr.fa - | gzip -1 > 10nr.y.sam.gz | |
k8 hs38d.js tab2fa 10nr.txt.gz | bwa-r868 mem -t16 ../gbk/09nr.fa - | gzip -1 > 10nr.def.sam.gz | |
zcat 10nr.*.sam.gz|grep -v ^@|k8 hs38d.js dropsim |sort|uniq > 10nr.dropped | |
tabtk isct -c 10nr.dropped 10nr.txt.gz|gzip -1 > 11gbk.txt.gz | |
k8 hs38d.js tab2fa 11gbk.txt.gz | bwa-r868 mem -y 100 -VN100 -t16 -k17 -A1 -B1 -O1 -E1 -L0 ~/hengli/nt/nt - | gzip -1 > 11gbk.sam.gz | |
k8 hs38d.js ntanno 11gbk.sam.gz > 11gbk.anno | |
cut -f5 11gbk.anno|tr ":" "\t"|cut -f1|sort -n|uniq |grep ^[0-9] > 11gbk.all.gi | |
k8 hs38d.js genquery 11gbk.all.gi > 11gbk.all.querys | |
awk 'NR==1' 11gbk.all.querys > 11gbk.all.query1 | |
curl -d @11gbk.all.query1 http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi > 11gbk.all.rst1 | |
cat 11gbk.all.rst?|perl -ne '$id=$1 if/<Id>(\d+)</;if (/TaxId.*>(\d+)</){print "$id\t$1\n"}' > 11gbk.taxid | |
cat 11gbk.taxid|cut -f2|sort -n|uniq|awk 'BEGIN{printf("db=taxonomy&id=")}{if(f)printf(",");printf("%d",$1);f=1}' > 11gbk.taxid.query | |
curl -d @11gbk.taxid.query http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi > 11gbk.taxid.rst | |
k8 hs38d.js hslcs 11gbk.taxid.rst > 11gbk.taxid.lcs | |
awk -t 'BEGIN{while((getline<"11gbk.taxid.lcs")>0)l[$1]=$2}{print $1,$2,l[$2]}' 11gbk.taxid > 11gbk.gi.lcs | |
awk 'BEGIN{FS=OFS="\t";while((getline<"11gbk.gi.lcs")>0)l[$1]=$3}$5=="*"{print}$5!="*"{x=substr($5,1,index($5,":")-1);$5=$5":"l[x];print}' 11gbk.anno > 11gbk.lcs.anno | |
egrep -v ":(cellular|Eukaryota|Opisthokonta|organisms)" 11gbk.lcs.anno|awk '$5!="*"' |cut -f1 > 11gbk.kept | |
zcat 11gbk.txt.gz|awk 'BEGIN{while((getline<"11gbk.kept")>0)l[$1]=1}l[$1]||$2>0||$3>0'|gzip -1 > 12contam.txt.gz | |
k8 hs38d.js tab2fa 13mix.txt.gz > 13mix.fa | |
bwa index 13mix.fa | |
bwa-r868 mem -aey1000 -t32 13mix.fa 13mix.fa | gzip -1 > 13mix.sam.gz | |
k8 hs38d.js nr 13mix.sam.gz | sort |uniq > 13mix.dropped | |
(awk 'NR>3' 09nr.fa.out; awk 'NR>3' 11gbk.fa.out) > ../254/13mix.rm | |
./mdust 09nr.fa -c > 09nr.mdust # for gbk | |
cat ../rm-rst/09nr.mdust 05ggaat.mdust > 13mix.mdust | |
(awk '{print $1"\t"($3-1)"\t"$4"\t"$2"\tdust\tSimple_repeat"}' 13mix.mdust; k8 hs38d.js rm2bed 13mix.rm)|sort -k4,4nr -k1,1 -k2,2n -k3,3n|tabtk isct 13mix.txt.gz | gzip -1 > 13mix.rep.gz | |
zcat 13mix.rep.gz|k8 hs38d.js bedsum|awk '$3/$2>=.8||$2-$3<=100' > 13mix.rep.high | |
tabtk isct -c 13mix.rep.high 14nr.txt.gz | awk '$4>=500' | gzip -1 > 15split.uni.gz | |
tabtk isct 13mix.rep.high 14nr.txt.gz | awk '$4>=500' | gzip -1 > 15split.rep.gz | |
k8 hs38d.js tab2fa 14nr.txt.gz | bwa-r868 mem -t32 ~/hengli/db/hs38/hs38a.fa - | gzip -1 > 14nr.sam.gz | |
k8 hs38d.js dropsim -f 14nr.sam.gz > 14nr.dropped | |
tabtk isct -c 14nr.dropped 14nr.txt.gz | awk '$4>=500' | gzip -1 > 15div.txt.gz | |
k8 hs38d.js tab2fa 15div.txt.gz > 15div.fa | |
bwa index 15div.fa | |
bwa-r868 mem -aey1000 -t32 15div.fa 15div.fa | gzip -1 > 15div.sam.gz | |
tabtk isct -c 15div.dropped 15div.txt.gz|gzip -1 > 16nr.txt.gz | |
tabtk isct 13mix.lcsat.high 16nr.txt.gz| awk '$4>=2000' | gzip -1 > 17split.lcsat.txt.gz | |
tabtk isct -c 13mix.lcsat.high 16nr.txt.gz| tabtk isct 13mix.rep.high | awk '$4>=1000' | gzip -1 > 17split.inter.txt.gz | |
tabtk isct -c 13mix.rep.high 16nr.txt.gz| gzip -1 > 17split.uni.txt.gz | |
k8 hs38d.js tab2fa 17split.uni.txt.gz > 17split.uni.fa | |
bwa index 17split.uni.fa | |
bwa-r868 mem -aey100 -t32 17split.uni.fa 17split.uni.fa | gzip -1 > 17split.uni.sam.gz | |
k8 hs38d.js nr -s 400 17split.uni.sam.gz |sort |uniq > 17split.uni.nr | |
cat 17split.mgd-*.sub > 17split.dropped | |
tabtk isct 17split.uni.nr 18mgd.uni.txt.gz | gzip -1 > 18mgd.redun.txt.gz | |
tabtk isct -c 17split.uni.nr 18mgd.uni.txt.gz | gzip -1 > 18mgd.nr.txt.gz | |
zcat 18mgd.nr.txt.gz|awk '$4>=1000'|sort -k4,4nr | awk '{printf(">decoy0%.4d %s\n%s\n",NR,$1,$5)}' | fold -w120 > 19fa.0nr.fa | |
zcat 18mgd.redun.txt.gz|awk '$4>=1000'|sort -k4,4nr | awk '{printf(">decoy1%.4d %s\n%s\n",NR,$1,$5)}' | fold -w120 > 19fa.1redun.fa | |
zcat 18mgd.inter.txt.gz|awk '$4>=1000'|sort -k4,4nr | awk '{printf(">decoy2%.4d %s\n%s\n",NR,$1,$5)}' | fold -w120 > 19fa.2inter.fa | |
zcat 18mgd.lcsat.txt.gz|awk '$4>=1000'|sort -k4,4nr | awk '{printf(">decoy3%.4d %s\n%s\n",NR,$1,$5)}' | fold -w120 > 19fa.3lcsat.fa | |
./RepeatMasker -species human -pa 40 -xsmall -small -dir ../rm-rst ../rm-rst/hs38d4s.fa |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var getopt = function(args, ostr) { | |
var oli; // option letter list index | |
if (typeof(getopt.place) == 'undefined') | |
getopt.ind = 0, getopt.arg = null, getopt.place = -1; | |
if (getopt.place == -1) { // update scanning pointer | |
if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') { | |
getopt.place = -1; | |
return null; | |
} | |
if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--" | |
++getopt.ind; | |
getopt.place = -1; | |
return null; | |
} | |
} | |
var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity | |
if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) { | |
if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null. | |
if (getopt.place < 0) ++getopt.ind; | |
return '?'; | |
} | |
if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument | |
getopt.arg = null; | |
if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1; | |
} else { // need an argument | |
if (getopt.place >= 0 && getopt.place < args[getopt.ind].length) | |
getopt.arg = args[getopt.ind].substr(getopt.place); | |
else if (args.length <= ++getopt.ind) { // no arg | |
getopt.place = -1; | |
if (ostr.length > 0 && ostr.charAt(0) == ':') return ':'; | |
return '?'; | |
} else getopt.arg = args[getopt.ind]; // white space | |
getopt.place = -1; | |
++getopt.ind; | |
} | |
return optopt; | |
} | |
function hsd_fm2seg2(args) | |
{ | |
var c, min_len = 200, min_dp = 5, pre = null; | |
while ((c = getopt(args, "l:d:p:")) != null) | |
if (c == 'l') min_len = parseInt(getopt.arg); | |
else if (c == 'd') min_dp = parseInt(getopt.arg); | |
else if (c == 'p') pre = getopt.arg; | |
var file = getopt.ind == args.length? new File() : new File(args[getopt.ind]); | |
var buf = new Bytes(); | |
var name = null, len = 0; | |
while (file.readline(buf) >= 0) { | |
var m, line = buf.toString(); | |
if ((m = /^SQ\s(\S+)\s(\d+)/.exec(line)) != null) { | |
name = m[1], len = parseInt(m[2]); | |
} else if (/^NS/.test(line)) { | |
var t = line.split("\t"); | |
t[4] = parseInt(t[4]); | |
if (parseInt(t[4]) < min_len) continue; | |
t[1] = parseInt(t[1]), t[3] = parseInt(t[3]), t[5] = parseInt(t[5]); | |
var start_off = t[2] == '+'? t[3] : t[5]; | |
var s = t[9].substr(t[3], t[4]); | |
var o = t[2] == '+'? 'F' : 'R'; | |
if (t[10] != '*') { | |
var i, q = t[10].substr(t[3], t[4]); | |
for (i = 0; i < q.length; ++i) | |
if (q.charCodeAt(i) - 33 >= min_dp) break; | |
if (i == q.length) continue; | |
} | |
print((pre==null? "" : pre+"_") + [name, len, t[1]+start_off, t[1]+start_off+t[4], o].join("_"), t[3], t[5], t[4], s); | |
} | |
} | |
buf.destroy(); | |
file.close(); | |
} | |
function hsd_fm2seg(args) | |
{ | |
var c, min_len = 200, min_dp = 5, pre = null; | |
while ((c = getopt(args, "l:d:p:")) != null) | |
if (c == 'l') min_len = parseInt(getopt.arg); | |
else if (c == 'd') min_dp = parseInt(getopt.arg); | |
else if (c == 'p') pre = getopt.arg; | |
var file = getopt.ind == args.length? new File() : new File(args[getopt.ind]); | |
var buf = new Bytes(); | |
var name = null, len = 0; | |
while (file.readline(buf) >= 0) { | |
var m, line = buf.toString(); | |
if ((m = /^SQ\s(\S+)\s(\d+)/.exec(line)) != null) { | |
name = m[1], len = parseInt(m[2]); | |
} else if (/^NS/.test(line)) { | |
var t = line.split("\t"); | |
t[4] = parseInt(t[4]); | |
if (parseInt(t[4]) < min_len) continue; | |
t[1] = parseInt(t[1]), t[3] = parseInt(t[3]), t[5] = parseInt(t[5]); | |
var s = t[9].substr(t[3], t[4]); | |
var o = t[2] == '+'? 'F' : 'R'; | |
if (t[10] != '*') { | |
var i, q = t[10].substr(t[3], t[4]); | |
for (i = 0; i < q.length; ++i) | |
if (q.charCodeAt(i) - 33 >= min_dp) break; | |
if (i == q.length) continue; | |
} | |
print((pre==null? "" : pre+"_") + [name, len, t[1]+t[3], t[1]+t[3]+t[4], o].join("_"), t[3], t[5], t[4], s); | |
} | |
} | |
buf.destroy(); | |
file.close(); | |
} | |
function hsd_nr(args) | |
{ | |
var c, min_cov = .9, min_nm = 20, min_diff = 0.02, min_cov1 = .7, min_sc = null, use_formula = false; | |
while ((c = getopt(args, "c:n:d:C:s:f")) != null) { | |
if (c == 'c') min_cov = parseFloat(getopt.arg); | |
else if (c == 'n') min_nm = parseInt(getopt.arg); | |
else if (c == 'd') min_diff = parseFloat(getopt.arg); | |
else if (c == 'C') min_cov1 = parseFloat(getopt.arg); | |
else if (c == 's') min_sc = parseInt(getopt.arg); | |
else if (c == 'f') use_formula = true; | |
} | |
var re_cigar = /(\d+)([MIDHSN])/g; | |
var file = getopt.ind == args.length? new File() : new File(args[getopt.ind]); | |
var buf = new Bytes(); | |
while (file.readline(buf) >= 0) { | |
var m, line = buf.toString(); | |
if (line.charAt(0) == '@') continue; | |
var t = line.split("\t"); | |
if (t[2] == '*') continue; | |
if ((m = /\tNM:i:(\d+)/.exec(line)) == null) continue; | |
var nm = parseInt(m[1]); | |
var k = 0, M = 0, I = 0, D = 0, nI = 0, nD = 0, qclip = [0, 0]; | |
while ((m = re_cigar.exec(t[5])) != null) { | |
var l = parseInt(m[1]); | |
if (m[2] == 'M') M += l; | |
else if (m[2] == 'D') D += l, ++nD; | |
else if (m[2] == 'I') I += l, ++nI; | |
else if (m[2] == 'S' || m[2] == 'H') qclip[k==0? 0:1] = l; | |
++k; | |
} | |
var qlen = M + I + qclip[0] + qclip[1]; | |
var diff = nm / (M + I + D); | |
var s = t[2].split("_"); | |
var pos = parseInt(t[3]) - 1; | |
var tlen = parseInt(s[4]) - parseInt(s[3]); | |
var tclip = [pos, tlen - (pos + M + D)]; | |
var Smin = (qclip[0] < tclip[0]? qclip[0] : tclip[0]) + (qclip[1] < tclip[1]? qclip[1] : tclip[1]); | |
if (min_sc == null) { | |
var is_nr = false; | |
if (nm + Smin < min_nm) is_nr = true; | |
var min_len = qlen < tlen? qlen : tlen; | |
var diff_thres = use_formula? 1 - 1 / (1 + 0.055 * Math.exp(-8.5e-5 * min_len)) : min_diff; | |
if ((M + I) / (M + I + Smin) >= min_cov && (M + D) / (M + D + Smin) >= min_cov && (nm < min_nm || diff < diff_thres)) is_nr = true; | |
if (is_nr) { | |
if (qlen < tlen && (M + I) / qlen >= min_cov1) print(t[0]); | |
else if (qlen >= tlen && (M + D) / tlen >= min_cov1) print(t[2]); | |
} | |
} else { | |
if ((m = /\tAS:i:(\d+)/.exec(line)) == null) continue; | |
var score = parseInt(m[1]); | |
var min_len = qlen < tlen? qlen : tlen; | |
if (score >= min_sc && score >= min_len / 3) { | |
if (qlen < tlen) print(t[0]); | |
else print(t[2]); | |
} | |
} | |
} | |
buf.destroy(); | |
file.close(); | |
} | |
function hsd_dropsim(args) | |
{ | |
var c, min_cov = .95, min_nm = 20, min_diff = 0.01, use_formula = false; | |
while ((c = getopt(args, "c:n:d:f")) != null) { | |
if (c == 'c') min_cov = parseFloat(getopt.arg); | |
else if (c == 'n') min_nm = parseInt(getopt.arg); | |
else if (c == 'd') min_diff = parseFloat(getopt.arg); | |
else if (c == 'f') use_formula = true; | |
} | |
var re_cigar = /(\d+)([MIDHSN])/g; | |
var file = getopt.ind == args.length? new File() : new File(args[getopt.ind]); | |
var buf = new Bytes(); | |
while (file.readline(buf) >= 0) { | |
var m, line = buf.toString(); | |
if (line.charAt(0) == '@') continue; | |
var t = line.split("\t"); | |
if (t[2] == '*') continue; | |
if ((m = /\tNM:i:(\d+)/.exec(line)) == null) continue; | |
var nm = parseInt(m[1]); | |
var M = 0, I = 0, D = 0, nI = 0, nD = 0, S = 0; | |
while ((m = re_cigar.exec(t[5])) != null) { | |
var l = parseInt(m[1]); | |
if (m[2] == 'M') M += l; | |
else if (m[2] == 'D') D += l, ++nD; | |
else if (m[2] == 'I') I += l, ++nI; | |
else if (m[2] == 'S' || m[2] == 'H') S += l; | |
} | |
var qlen = M + I + S; | |
var diff = nm / (M + I + D); | |
if ((M + I) / qlen >= min_cov) { | |
if (nm < min_nm || diff < min_diff) | |
print(t[0], qlen, nm, diff, 'S'); | |
else if (use_formula) { | |
var thres = 1 / (1 + 0.055 * Math.exp(-8.5e-5 * qlen)); | |
if (thres > .99) thres = .99; | |
if (diff < 1 - thres) | |
print(t[0], qlen, nm, diff, 'F', 1-thres); | |
} | |
} else { | |
if (nm + S < min_nm) | |
print(t[0], qlen, nm, diff, 'C', S); | |
} | |
} | |
buf.destroy(); | |
file.close(); | |
} | |
function hsd_tab2fa(args) | |
{ | |
var file = args.length == 0? new File() : new File(args[0]); | |
var buf = new Bytes(); | |
while (file.readline(buf) >= 0) { | |
var t = buf.toString().split("\t"); | |
print(">"+t[0]); | |
print(t[4]); | |
} | |
buf.destroy(); | |
file.close(); | |
} | |
function hsd_fixname(args) | |
{ | |
var file = args.length == 0? new File() : new File(args[0]); | |
var buf = new Bytes(); | |
while (file.readline(buf) >= 0) { | |
var t = buf.toString().split("\t"); | |
t[2] = parseInt(t[2]); | |
if (/_R$/.test(t[0]) && t[2] != 0) { | |
var s = t[0].split("_"); | |
s[3] = parseInt(s[3]) + t[2]; | |
s[4] = parseInt(s[4]) + t[2]; | |
t[0] = s.join("_"); | |
var tmp = t[1]; | |
t[1] = t[2]; t[2] = tmp; | |
} | |
print(t.join("\t")); | |
} | |
buf.destroy(); | |
file.close(); | |
} | |
function hsd_nrexact(args) | |
{ | |
function choose_one(a) | |
{ | |
if (a.length == 1) { | |
print(a[0].join("\t")); | |
return; | |
} | |
var max = -1, max_i = -1; | |
for (var i = 0; i < a.length; ++i) { | |
var s = a[i][0].split("_"); | |
var key = parseInt(s[2])+Math.random(); | |
if (key > max) max = key, max_i = i; | |
} | |
print(a[max_i].join("\t")); | |
} | |
var file = args.length == 0? new File() : new File(args[0]); | |
var buf = new Bytes(); | |
var a = []; | |
while (file.readline(buf) >= 0) { | |
var t = buf.toString().split("\t"); | |
if (a.length && a[0][4] != t[4]) { | |
choose_one(a); | |
a = []; | |
} | |
a.push(t); | |
} | |
choose_one(a); | |
buf.destroy(); | |
file.close(); | |
} | |
function hsd_bedsum(args) | |
{ | |
var file = args.length == 0? new File() : new File(args[0]); | |
var buf = new Bytes(); | |
function get_longest(merged) | |
{ | |
var x = 0, max = 0, sum = 0; | |
for (var i = 0; i < merged.length; ++i) { | |
max = max > merged[i][0] - x? max : merged[i][0] - x; | |
x = merged[i][1]; | |
sum += merged[i][1] - merged[i][0]; | |
} | |
max = max > merged[0][2] - x? max : merged[0][2] - x; | |
return [merged[0][2], sum, max]; | |
} | |
var last = ['', 0, 0, 0]; | |
var merged = []; | |
while (file.readline(buf) >= 0) { | |
var t = buf.toString().split("\t"); | |
var name = t[0], start = parseInt(t[1]), end = parseInt(t[2]), len = parseInt(t[3]); | |
if (name != last[0]) { | |
if (last[0] != '') { | |
merged.push([last[1], last[2], last[3]]); | |
print(last[0], get_longest(merged).join("\t")); | |
} | |
last = [name, start, end, len]; | |
merged = []; | |
} else if (start > last[2]) { | |
merged.push([last[1], last[2], last[3]]); | |
last[1] = start, last[2] = end; | |
} else if (end > last[2]) { | |
last[2] = end; | |
} | |
} | |
merged.push([last[1], last[2], last[3]]); | |
print(last[0], get_longest(merged).join("\t")); | |
buf.destroy(); | |
file.close(); | |
} | |
function hsd_ntanno(args) | |
{ | |
var file = args.length == 0? new File() : new File(args[0]); | |
var buf = new Bytes(); | |
var re = /(\d+)([MIDSH])/g; | |
while (file.readline(buf) >= 0) { | |
var line = buf.toString(); | |
if (/^@/.test(line)) continue; | |
var m, NM = null, t = line.split("\t"); | |
var flag = parseInt(t[1]); | |
if ((flag&4) || t[2] == '*') { | |
print(t[0], t[9].length, -1, -1, '*', '*', 0, 0, 0, '*'); | |
continue; | |
} | |
if ((m = /\tNM:i:(\d+)/.exec(line)) != null) | |
NM = parseInt(m[1]); | |
if (NM == null) throw Error("No NM tag"); | |
var M = 0, I = 0, D = 0, clip = [0, 0]; | |
while ((m = re.exec(t[5])) != null) { | |
var l = parseInt(m[1]); | |
if (m[2] == 'M') M += l; | |
else if (m[2] == 'I') I += l; | |
else if (m[2] == 'D') D += l; | |
else if (m[2] == 'S' || m[2] == 'H') { | |
if (M == 0 && I == 0 && D == 0) clip[0] = l; | |
else clip[1] = l; | |
} | |
} | |
var strand = (flag & 16)? '-' : '+'; | |
var start, end, qlen; | |
qlen = clip[0] + clip[1] + M + I; | |
if (strand == '+') start = clip[0], end = qlen - clip[1]; | |
else start = clip[1], end = qlen - clip[0]; | |
var alen = M + I + D, hdr = null; | |
if ((m = /^gi\|(\d+)\|[^|\s]+\|([^|\s]+)/.exec(t[2])) != null) | |
t[2] = m[1]+':'+m[2]; | |
hdr = (m = /\tXR:Z:([^\t]+)/.exec(line)) != null? m[1] : '*'; | |
var diff = NM / alen; | |
print(t[0], qlen, start, end, t[2], strand, t[3], t[4], NM, diff.toFixed(4), hdr); | |
} | |
buf.destroy(); | |
file.close(); | |
} | |
function hsd_genquery(args) | |
{ | |
var file = args.length == 0? new File() : new File(args[0]); | |
var buf = new Bytes(); | |
var a = [], max = 9900; | |
while (file.readline(buf) >= 0) { | |
var t = buf.toString().split("\t"); | |
a.push(parseInt(t[0])); | |
} | |
buf.destroy(); | |
file.close(); | |
for (var i = 0; i < a.length; i += max) | |
print("db=nucleotide&id="+a.slice(i,i+max).join(",")); | |
} | |
var human_tax = [ | |
"cellular organisms", | |
"Eukaryota", | |
"Opisthokonta", | |
"Metazoa", | |
"Eumetazoa", | |
"Bilateria", | |
"Deuterostomia", | |
"Chordata", | |
"Craniata", | |
"Vertebrata", | |
"Gnathostomata", | |
"Teleostomi", | |
"Euteleostomi", | |
"Sarcopterygii", | |
"Dipnotetrapodomorpha", | |
"Tetrapoda", | |
"Amniota", | |
"Mammalia", | |
"Theria", | |
"Eutheria", | |
"Boreoeutheria", | |
"Euarchontoglires", | |
"Primates", | |
"Haplorrhini", | |
"Simiiformes", | |
"Catarrhini", | |
"Hominoidea", | |
"Hominidae", | |
"Homininae", | |
"Homo" | |
]; | |
function hsd_hslcs(args) | |
{ | |
var file = args.length == 0? new File() : new File(args[0]); | |
var buf = new Bytes(); | |
var state = 0, id = null, lcs = null; | |
while (file.readline(buf) >= 0) { | |
var m, line = buf.toString(); | |
if (state == 0 && (m = /<TaxId>(\d+)</.exec(line)) != null) { | |
id = m[1]; | |
} else if (/<LineageEx>/.test(line)) { | |
state = 1; | |
lcs = -1; | |
} else if (state == 1 && (m = /<ScientificName>([^<]+)</.exec(line)) != null) { | |
for (var i = human_tax.length - 1; i >= 0; --i) | |
if (human_tax[i] == m[1]) | |
lcs = lcs > i? lcs : i; | |
} else if (/<\/LineageEx>/.test(line)) { | |
var name = lcs == 0? "cellular_organisms" : lcs == -1? 'organisms' : human_tax[lcs]; | |
print(id, name); | |
state = 0; | |
} | |
} | |
buf.destroy(); | |
file.close(); | |
} | |
function hsd_rm2bed(args) | |
{ | |
var file = args.length == 0? new File() : new File(args[0]); | |
var buf = new Bytes(); | |
while (file.readline(buf) >= 0) { | |
var t = buf.toString().split(/\s+/); | |
if (t[0] == null || t[0] == '') t.shift(); | |
if (!/^\s*\d/.test(t[0])) continue; | |
var m, name = t[4], start = parseInt(t[5]) - 1, end = parseInt(t[6]), len = null; | |
if ((m = /^\((\d+)\)/.exec(t[7])) != null) len = end + parseInt(m[1]); | |
if (len == null) throw Error("No length!"); | |
print(name, start, end, len, t[9], t[10]); | |
} | |
buf.destroy(); | |
file.close(); | |
} | |
function main(args) | |
{ | |
var cmd = args.shift(); | |
if (cmd == 'fm2seg') hsd_fm2seg(args); | |
else if (cmd == 'fm2seg2') hsd_fm2seg2(args); | |
else if (cmd == 'fixname') hsd_fixname(args); | |
else if (cmd == 'dropsim') hsd_dropsim(args); | |
else if (cmd == 'tab2fa') hsd_tab2fa(args); | |
else if (cmd == 'nrexact') hsd_nrexact(args); | |
else if (cmd == 'bedsum') hsd_bedsum(args); | |
else if (cmd == 'nr') hsd_nr(args); | |
else if (cmd == 'ntanno') hsd_ntanno(args); | |
else if (cmd == 'genquery') hsd_genquery(args); | |
else if (cmd == 'hslcs') hsd_hslcs(args); | |
else if (cmd == 'rm2bed') hsd_rm2bed(args); | |
else warn("Unrecognized command"); | |
} | |
main(arguments); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment