Skip to content

Instantly share code, notes, and snippets.

@lh3
Last active August 29, 2015 14:06
Show Gist options
  • Save lh3/12a56904deebfcfaacb3 to your computer and use it in GitHub Desktop.
Save lh3/12a56904deebfcfaacb3 to your computer and use it in GitHub Desktop.
# generate the list of novel segments for each sample
seqtk seq -L200 001*.mag.gz | ./fermi2 match -dk101 hs38a.fmd /dev/stdin | k8 hs38d.js fm2seg -p 001 | gzip > 001.novel.gz
k8 hs38d.js tab2fa 001.novel.gz | bwa-r868 mem s38a.fa - | k8 hs38d.js dropsim | gzip -1 > 001.dropped.gz
tabtk isct -c 001.dropped.gz 001.novel.gz | gzip -1 > 001.div1.gz
#
zcat div1/*.div1.gz | cut -f1-5 | sort -k5,5 -k1,1 -S60G | k8 hs38d.js fixname | gzip -1 > 01ori.txt.gz
# tabtk isct -c gbk.novel.dropped gbk.novel.gz | sort -k5,5 -k1,1 -S60G | grep -v NNNNNNNNNN | gzip -1 > 01ori.txt.gz
tabtk isct -d_ -c contaminated.txt 01ori.txt.gz | gzip -1 > 02contam.txt.gz
# zcat 01ori.txt.gz|grep -vFf DoNotUseList_May2011.list | gzip -1 > 02contam.txt.gz
k8 hs38d.js nrexact 02contam.txt.gz | awk -f drop-sub.awk | gzip -1 > 03nr.txt.gz
zcat 03nr.txt.gz | cut -f5 | ropebwt2 -dLr > 03nr.fmd
k8 hs38d.js tab2fa 03nr.txt.gz | fermi2-r166 match -t16 03nr.fmd /dev/stdin | gzip -1 > 03nr.match.gz
zcat 03nr.match.gz | awk '/^SQ/{x=$2;getline;if($4==1)print x}' > 03nr.kept
tabtk isct 03nr.kept 03nr.txt.gz | gzip -1 > 04nr.txt.gz
k8 hs38d.js tab2fa 04nr.txt.gz|seqtk kfreq ggaat - > 04nr.kfreq
awk '{print int($4/$2*500+.499)}' 04nr.kfreq | sort -n | uniq -c > 04nr.khist
zcat 04nr.txt.gz|awk 'BEGIN{while((getline<"04nr.kfreq")>0)if($4*5/$2<=0.4)l[$1]=1}l[$1]' | gzip -1 > 05ggaat.txt.gz
k8 hs38d.js tab2fa 05ggaat.txt.gz | ./mdust -c > 05ggaat.mdust
awk '{print $1"\t"($3-1)"\t"$4"\t"$2}' 05ggaat.mdust|k8 hs38d.js bedsum|awk '{print int($3/$2*100+.499)}'|sort -n|uniq -c > 05ggaat.mhist
awk '{print $1"\t"($3-1)"\t"$4"\t"$2}' 05ggaat.mdust|k8 hs38d.js bedsum|awk '$3/$2>=.9||$2-$3<=100' > 05ggaat.dropped
tabtk isct -c 05ggaat.dropped 05ggaat.txt.gz | gzip -1 > 07dust.txt.gz
k8 hs38d.js tab2fa 07dust.txt.gz > 07dust.fa
bwa index 07dust.fa
bwa-r868 mem -aey100 -t16 07dust.fa 07dust.fa | gzip -1 > 07dust.sam.gz
k8 hs38d.js nr 07dust.sam.gz |sort|uniq > 07dust.dropped
tabtk isct -c 07dust.dropped 07dust.txt.gz | gzip -1 > 08nr.txt.gz
k8 hs38d.js tab2fa 08nr.txt.gz > 08nr.fa
bwa index 08nr.fa
bwa-r868 mem -aey1000 -t16 08nr.fa 08nr.fa | gzip -1 > 08nr.sam.gz
k8 hs38d.js nr 08nr.sam.gz |sort|uniq > 08nr.dropped
tabtk isct -c 08nr.dropped 08nr.txt.gz | gzip -1 > 09nr.txt.gz
k8 hs38d.js tab2fa 09nr.txt.gz > 09nr.fa
bwa index 09nr.fa
bwa-r868 mem -aey1000 -t16 09nr.fa 09nr.fa | gzip -1 > 09nr.sam.gz
k8 hs38d.js nr 09nr.sam.gz |sort|uniq > 09nr.dropped
tabtk isct -c 09nr.dropped 09nr.txt.gz | gzip -1 > 10nr.txt.gz
k8 hs38d.js tab2fa 10nr.txt.gz | bwa-r868 mem -y100 -t16 ../gbk/09nr.fa - | gzip -1 > 10nr.y.sam.gz
k8 hs38d.js tab2fa 10nr.txt.gz | bwa-r868 mem -t16 ../gbk/09nr.fa - | gzip -1 > 10nr.def.sam.gz
zcat 10nr.*.sam.gz|grep -v ^@|k8 hs38d.js dropsim |sort|uniq > 10nr.dropped
tabtk isct -c 10nr.dropped 10nr.txt.gz|gzip -1 > 11gbk.txt.gz
k8 hs38d.js tab2fa 11gbk.txt.gz | bwa-r868 mem -y 100 -VN100 -t16 -k17 -A1 -B1 -O1 -E1 -L0 ~/hengli/nt/nt - | gzip -1 > 11gbk.sam.gz
k8 hs38d.js ntanno 11gbk.sam.gz > 11gbk.anno
cut -f5 11gbk.anno|tr ":" "\t"|cut -f1|sort -n|uniq |grep ^[0-9] > 11gbk.all.gi
k8 hs38d.js genquery 11gbk.all.gi > 11gbk.all.querys
awk 'NR==1' 11gbk.all.querys > 11gbk.all.query1
curl -d @11gbk.all.query1 http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi > 11gbk.all.rst1
cat 11gbk.all.rst?|perl -ne '$id=$1 if/<Id>(\d+)</;if (/TaxId.*>(\d+)</){print "$id\t$1\n"}' > 11gbk.taxid
cat 11gbk.taxid|cut -f2|sort -n|uniq|awk 'BEGIN{printf("db=taxonomy&id=")}{if(f)printf(",");printf("%d",$1);f=1}' > 11gbk.taxid.query
curl -d @11gbk.taxid.query http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi > 11gbk.taxid.rst
k8 hs38d.js hslcs 11gbk.taxid.rst > 11gbk.taxid.lcs
awk -t 'BEGIN{while((getline<"11gbk.taxid.lcs")>0)l[$1]=$2}{print $1,$2,l[$2]}' 11gbk.taxid > 11gbk.gi.lcs
awk 'BEGIN{FS=OFS="\t";while((getline<"11gbk.gi.lcs")>0)l[$1]=$3}$5=="*"{print}$5!="*"{x=substr($5,1,index($5,":")-1);$5=$5":"l[x];print}' 11gbk.anno > 11gbk.lcs.anno
egrep -v ":(cellular|Eukaryota|Opisthokonta|organisms)" 11gbk.lcs.anno|awk '$5!="*"' |cut -f1 > 11gbk.kept
zcat 11gbk.txt.gz|awk 'BEGIN{while((getline<"11gbk.kept")>0)l[$1]=1}l[$1]||$2>0||$3>0'|gzip -1 > 12contam.txt.gz
k8 hs38d.js tab2fa 13mix.txt.gz > 13mix.fa
bwa index 13mix.fa
bwa-r868 mem -aey1000 -t32 13mix.fa 13mix.fa | gzip -1 > 13mix.sam.gz
k8 hs38d.js nr 13mix.sam.gz | sort |uniq > 13mix.dropped
(awk 'NR>3' 09nr.fa.out; awk 'NR>3' 11gbk.fa.out) > ../254/13mix.rm
./mdust 09nr.fa -c > 09nr.mdust # for gbk
cat ../rm-rst/09nr.mdust 05ggaat.mdust > 13mix.mdust
(awk '{print $1"\t"($3-1)"\t"$4"\t"$2"\tdust\tSimple_repeat"}' 13mix.mdust; k8 hs38d.js rm2bed 13mix.rm)|sort -k4,4nr -k1,1 -k2,2n -k3,3n|tabtk isct 13mix.txt.gz | gzip -1 > 13mix.rep.gz
zcat 13mix.rep.gz|k8 hs38d.js bedsum|awk '$3/$2>=.8||$2-$3<=100' > 13mix.rep.high
tabtk isct -c 13mix.rep.high 14nr.txt.gz | awk '$4>=500' | gzip -1 > 15split.uni.gz
tabtk isct 13mix.rep.high 14nr.txt.gz | awk '$4>=500' | gzip -1 > 15split.rep.gz
k8 hs38d.js tab2fa 14nr.txt.gz | bwa-r868 mem -t32 ~/hengli/db/hs38/hs38a.fa - | gzip -1 > 14nr.sam.gz
k8 hs38d.js dropsim -f 14nr.sam.gz > 14nr.dropped
tabtk isct -c 14nr.dropped 14nr.txt.gz | awk '$4>=500' | gzip -1 > 15div.txt.gz
k8 hs38d.js tab2fa 15div.txt.gz > 15div.fa
bwa index 15div.fa
bwa-r868 mem -aey1000 -t32 15div.fa 15div.fa | gzip -1 > 15div.sam.gz
tabtk isct -c 15div.dropped 15div.txt.gz|gzip -1 > 16nr.txt.gz
tabtk isct 13mix.lcsat.high 16nr.txt.gz| awk '$4>=2000' | gzip -1 > 17split.lcsat.txt.gz
tabtk isct -c 13mix.lcsat.high 16nr.txt.gz| tabtk isct 13mix.rep.high | awk '$4>=1000' | gzip -1 > 17split.inter.txt.gz
tabtk isct -c 13mix.rep.high 16nr.txt.gz| gzip -1 > 17split.uni.txt.gz
k8 hs38d.js tab2fa 17split.uni.txt.gz > 17split.uni.fa
bwa index 17split.uni.fa
bwa-r868 mem -aey100 -t32 17split.uni.fa 17split.uni.fa | gzip -1 > 17split.uni.sam.gz
k8 hs38d.js nr -s 400 17split.uni.sam.gz |sort |uniq > 17split.uni.nr
cat 17split.mgd-*.sub > 17split.dropped
tabtk isct 17split.uni.nr 18mgd.uni.txt.gz | gzip -1 > 18mgd.redun.txt.gz
tabtk isct -c 17split.uni.nr 18mgd.uni.txt.gz | gzip -1 > 18mgd.nr.txt.gz
zcat 18mgd.nr.txt.gz|awk '$4>=1000'|sort -k4,4nr | awk '{printf(">decoy0%.4d %s\n%s\n",NR,$1,$5)}' | fold -w120 > 19fa.0nr.fa
zcat 18mgd.redun.txt.gz|awk '$4>=1000'|sort -k4,4nr | awk '{printf(">decoy1%.4d %s\n%s\n",NR,$1,$5)}' | fold -w120 > 19fa.1redun.fa
zcat 18mgd.inter.txt.gz|awk '$4>=1000'|sort -k4,4nr | awk '{printf(">decoy2%.4d %s\n%s\n",NR,$1,$5)}' | fold -w120 > 19fa.2inter.fa
zcat 18mgd.lcsat.txt.gz|awk '$4>=1000'|sort -k4,4nr | awk '{printf(">decoy3%.4d %s\n%s\n",NR,$1,$5)}' | fold -w120 > 19fa.3lcsat.fa
./RepeatMasker -species human -pa 40 -xsmall -small -dir ../rm-rst ../rm-rst/hs38d4s.fa
var getopt = function(args, ostr) {
var oli; // option letter list index
if (typeof(getopt.place) == 'undefined')
getopt.ind = 0, getopt.arg = null, getopt.place = -1;
if (getopt.place == -1) { // update scanning pointer
if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
getopt.place = -1;
return null;
}
if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
++getopt.ind;
getopt.place = -1;
return null;
}
}
var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
if (getopt.place < 0) ++getopt.ind;
return '?';
}
if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
getopt.arg = null;
if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
} else { // need an argument
if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
getopt.arg = args[getopt.ind].substr(getopt.place);
else if (args.length <= ++getopt.ind) { // no arg
getopt.place = -1;
if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
return '?';
} else getopt.arg = args[getopt.ind]; // white space
getopt.place = -1;
++getopt.ind;
}
return optopt;
}
function hsd_fm2seg2(args)
{
var c, min_len = 200, min_dp = 5, pre = null;
while ((c = getopt(args, "l:d:p:")) != null)
if (c == 'l') min_len = parseInt(getopt.arg);
else if (c == 'd') min_dp = parseInt(getopt.arg);
else if (c == 'p') pre = getopt.arg;
var file = getopt.ind == args.length? new File() : new File(args[getopt.ind]);
var buf = new Bytes();
var name = null, len = 0;
while (file.readline(buf) >= 0) {
var m, line = buf.toString();
if ((m = /^SQ\s(\S+)\s(\d+)/.exec(line)) != null) {
name = m[1], len = parseInt(m[2]);
} else if (/^NS/.test(line)) {
var t = line.split("\t");
t[4] = parseInt(t[4]);
if (parseInt(t[4]) < min_len) continue;
t[1] = parseInt(t[1]), t[3] = parseInt(t[3]), t[5] = parseInt(t[5]);
var start_off = t[2] == '+'? t[3] : t[5];
var s = t[9].substr(t[3], t[4]);
var o = t[2] == '+'? 'F' : 'R';
if (t[10] != '*') {
var i, q = t[10].substr(t[3], t[4]);
for (i = 0; i < q.length; ++i)
if (q.charCodeAt(i) - 33 >= min_dp) break;
if (i == q.length) continue;
}
print((pre==null? "" : pre+"_") + [name, len, t[1]+start_off, t[1]+start_off+t[4], o].join("_"), t[3], t[5], t[4], s);
}
}
buf.destroy();
file.close();
}
function hsd_fm2seg(args)
{
var c, min_len = 200, min_dp = 5, pre = null;
while ((c = getopt(args, "l:d:p:")) != null)
if (c == 'l') min_len = parseInt(getopt.arg);
else if (c == 'd') min_dp = parseInt(getopt.arg);
else if (c == 'p') pre = getopt.arg;
var file = getopt.ind == args.length? new File() : new File(args[getopt.ind]);
var buf = new Bytes();
var name = null, len = 0;
while (file.readline(buf) >= 0) {
var m, line = buf.toString();
if ((m = /^SQ\s(\S+)\s(\d+)/.exec(line)) != null) {
name = m[1], len = parseInt(m[2]);
} else if (/^NS/.test(line)) {
var t = line.split("\t");
t[4] = parseInt(t[4]);
if (parseInt(t[4]) < min_len) continue;
t[1] = parseInt(t[1]), t[3] = parseInt(t[3]), t[5] = parseInt(t[5]);
var s = t[9].substr(t[3], t[4]);
var o = t[2] == '+'? 'F' : 'R';
if (t[10] != '*') {
var i, q = t[10].substr(t[3], t[4]);
for (i = 0; i < q.length; ++i)
if (q.charCodeAt(i) - 33 >= min_dp) break;
if (i == q.length) continue;
}
print((pre==null? "" : pre+"_") + [name, len, t[1]+t[3], t[1]+t[3]+t[4], o].join("_"), t[3], t[5], t[4], s);
}
}
buf.destroy();
file.close();
}
function hsd_nr(args)
{
var c, min_cov = .9, min_nm = 20, min_diff = 0.02, min_cov1 = .7, min_sc = null, use_formula = false;
while ((c = getopt(args, "c:n:d:C:s:f")) != null) {
if (c == 'c') min_cov = parseFloat(getopt.arg);
else if (c == 'n') min_nm = parseInt(getopt.arg);
else if (c == 'd') min_diff = parseFloat(getopt.arg);
else if (c == 'C') min_cov1 = parseFloat(getopt.arg);
else if (c == 's') min_sc = parseInt(getopt.arg);
else if (c == 'f') use_formula = true;
}
var re_cigar = /(\d+)([MIDHSN])/g;
var file = getopt.ind == args.length? new File() : new File(args[getopt.ind]);
var buf = new Bytes();
while (file.readline(buf) >= 0) {
var m, line = buf.toString();
if (line.charAt(0) == '@') continue;
var t = line.split("\t");
if (t[2] == '*') continue;
if ((m = /\tNM:i:(\d+)/.exec(line)) == null) continue;
var nm = parseInt(m[1]);
var k = 0, M = 0, I = 0, D = 0, nI = 0, nD = 0, qclip = [0, 0];
while ((m = re_cigar.exec(t[5])) != null) {
var l = parseInt(m[1]);
if (m[2] == 'M') M += l;
else if (m[2] == 'D') D += l, ++nD;
else if (m[2] == 'I') I += l, ++nI;
else if (m[2] == 'S' || m[2] == 'H') qclip[k==0? 0:1] = l;
++k;
}
var qlen = M + I + qclip[0] + qclip[1];
var diff = nm / (M + I + D);
var s = t[2].split("_");
var pos = parseInt(t[3]) - 1;
var tlen = parseInt(s[4]) - parseInt(s[3]);
var tclip = [pos, tlen - (pos + M + D)];
var Smin = (qclip[0] < tclip[0]? qclip[0] : tclip[0]) + (qclip[1] < tclip[1]? qclip[1] : tclip[1]);
if (min_sc == null) {
var is_nr = false;
if (nm + Smin < min_nm) is_nr = true;
var min_len = qlen < tlen? qlen : tlen;
var diff_thres = use_formula? 1 - 1 / (1 + 0.055 * Math.exp(-8.5e-5 * min_len)) : min_diff;
if ((M + I) / (M + I + Smin) >= min_cov && (M + D) / (M + D + Smin) >= min_cov && (nm < min_nm || diff < diff_thres)) is_nr = true;
if (is_nr) {
if (qlen < tlen && (M + I) / qlen >= min_cov1) print(t[0]);
else if (qlen >= tlen && (M + D) / tlen >= min_cov1) print(t[2]);
}
} else {
if ((m = /\tAS:i:(\d+)/.exec(line)) == null) continue;
var score = parseInt(m[1]);
var min_len = qlen < tlen? qlen : tlen;
if (score >= min_sc && score >= min_len / 3) {
if (qlen < tlen) print(t[0]);
else print(t[2]);
}
}
}
buf.destroy();
file.close();
}
function hsd_dropsim(args)
{
var c, min_cov = .95, min_nm = 20, min_diff = 0.01, use_formula = false;
while ((c = getopt(args, "c:n:d:f")) != null) {
if (c == 'c') min_cov = parseFloat(getopt.arg);
else if (c == 'n') min_nm = parseInt(getopt.arg);
else if (c == 'd') min_diff = parseFloat(getopt.arg);
else if (c == 'f') use_formula = true;
}
var re_cigar = /(\d+)([MIDHSN])/g;
var file = getopt.ind == args.length? new File() : new File(args[getopt.ind]);
var buf = new Bytes();
while (file.readline(buf) >= 0) {
var m, line = buf.toString();
if (line.charAt(0) == '@') continue;
var t = line.split("\t");
if (t[2] == '*') continue;
if ((m = /\tNM:i:(\d+)/.exec(line)) == null) continue;
var nm = parseInt(m[1]);
var M = 0, I = 0, D = 0, nI = 0, nD = 0, S = 0;
while ((m = re_cigar.exec(t[5])) != null) {
var l = parseInt(m[1]);
if (m[2] == 'M') M += l;
else if (m[2] == 'D') D += l, ++nD;
else if (m[2] == 'I') I += l, ++nI;
else if (m[2] == 'S' || m[2] == 'H') S += l;
}
var qlen = M + I + S;
var diff = nm / (M + I + D);
if ((M + I) / qlen >= min_cov) {
if (nm < min_nm || diff < min_diff)
print(t[0], qlen, nm, diff, 'S');
else if (use_formula) {
var thres = 1 / (1 + 0.055 * Math.exp(-8.5e-5 * qlen));
if (thres > .99) thres = .99;
if (diff < 1 - thres)
print(t[0], qlen, nm, diff, 'F', 1-thres);
}
} else {
if (nm + S < min_nm)
print(t[0], qlen, nm, diff, 'C', S);
}
}
buf.destroy();
file.close();
}
function hsd_tab2fa(args)
{
var file = args.length == 0? new File() : new File(args[0]);
var buf = new Bytes();
while (file.readline(buf) >= 0) {
var t = buf.toString().split("\t");
print(">"+t[0]);
print(t[4]);
}
buf.destroy();
file.close();
}
function hsd_fixname(args)
{
var file = args.length == 0? new File() : new File(args[0]);
var buf = new Bytes();
while (file.readline(buf) >= 0) {
var t = buf.toString().split("\t");
t[2] = parseInt(t[2]);
if (/_R$/.test(t[0]) && t[2] != 0) {
var s = t[0].split("_");
s[3] = parseInt(s[3]) + t[2];
s[4] = parseInt(s[4]) + t[2];
t[0] = s.join("_");
var tmp = t[1];
t[1] = t[2]; t[2] = tmp;
}
print(t.join("\t"));
}
buf.destroy();
file.close();
}
function hsd_nrexact(args)
{
function choose_one(a)
{
if (a.length == 1) {
print(a[0].join("\t"));
return;
}
var max = -1, max_i = -1;
for (var i = 0; i < a.length; ++i) {
var s = a[i][0].split("_");
var key = parseInt(s[2])+Math.random();
if (key > max) max = key, max_i = i;
}
print(a[max_i].join("\t"));
}
var file = args.length == 0? new File() : new File(args[0]);
var buf = new Bytes();
var a = [];
while (file.readline(buf) >= 0) {
var t = buf.toString().split("\t");
if (a.length && a[0][4] != t[4]) {
choose_one(a);
a = [];
}
a.push(t);
}
choose_one(a);
buf.destroy();
file.close();
}
function hsd_bedsum(args)
{
var file = args.length == 0? new File() : new File(args[0]);
var buf = new Bytes();
function get_longest(merged)
{
var x = 0, max = 0, sum = 0;
for (var i = 0; i < merged.length; ++i) {
max = max > merged[i][0] - x? max : merged[i][0] - x;
x = merged[i][1];
sum += merged[i][1] - merged[i][0];
}
max = max > merged[0][2] - x? max : merged[0][2] - x;
return [merged[0][2], sum, max];
}
var last = ['', 0, 0, 0];
var merged = [];
while (file.readline(buf) >= 0) {
var t = buf.toString().split("\t");
var name = t[0], start = parseInt(t[1]), end = parseInt(t[2]), len = parseInt(t[3]);
if (name != last[0]) {
if (last[0] != '') {
merged.push([last[1], last[2], last[3]]);
print(last[0], get_longest(merged).join("\t"));
}
last = [name, start, end, len];
merged = [];
} else if (start > last[2]) {
merged.push([last[1], last[2], last[3]]);
last[1] = start, last[2] = end;
} else if (end > last[2]) {
last[2] = end;
}
}
merged.push([last[1], last[2], last[3]]);
print(last[0], get_longest(merged).join("\t"));
buf.destroy();
file.close();
}
function hsd_ntanno(args)
{
var file = args.length == 0? new File() : new File(args[0]);
var buf = new Bytes();
var re = /(\d+)([MIDSH])/g;
while (file.readline(buf) >= 0) {
var line = buf.toString();
if (/^@/.test(line)) continue;
var m, NM = null, t = line.split("\t");
var flag = parseInt(t[1]);
if ((flag&4) || t[2] == '*') {
print(t[0], t[9].length, -1, -1, '*', '*', 0, 0, 0, '*');
continue;
}
if ((m = /\tNM:i:(\d+)/.exec(line)) != null)
NM = parseInt(m[1]);
if (NM == null) throw Error("No NM tag");
var M = 0, I = 0, D = 0, clip = [0, 0];
while ((m = re.exec(t[5])) != null) {
var l = parseInt(m[1]);
if (m[2] == 'M') M += l;
else if (m[2] == 'I') I += l;
else if (m[2] == 'D') D += l;
else if (m[2] == 'S' || m[2] == 'H') {
if (M == 0 && I == 0 && D == 0) clip[0] = l;
else clip[1] = l;
}
}
var strand = (flag & 16)? '-' : '+';
var start, end, qlen;
qlen = clip[0] + clip[1] + M + I;
if (strand == '+') start = clip[0], end = qlen - clip[1];
else start = clip[1], end = qlen - clip[0];
var alen = M + I + D, hdr = null;
if ((m = /^gi\|(\d+)\|[^|\s]+\|([^|\s]+)/.exec(t[2])) != null)
t[2] = m[1]+':'+m[2];
hdr = (m = /\tXR:Z:([^\t]+)/.exec(line)) != null? m[1] : '*';
var diff = NM / alen;
print(t[0], qlen, start, end, t[2], strand, t[3], t[4], NM, diff.toFixed(4), hdr);
}
buf.destroy();
file.close();
}
function hsd_genquery(args)
{
var file = args.length == 0? new File() : new File(args[0]);
var buf = new Bytes();
var a = [], max = 9900;
while (file.readline(buf) >= 0) {
var t = buf.toString().split("\t");
a.push(parseInt(t[0]));
}
buf.destroy();
file.close();
for (var i = 0; i < a.length; i += max)
print("db=nucleotide&id="+a.slice(i,i+max).join(","));
}
var human_tax = [
"cellular organisms",
"Eukaryota",
"Opisthokonta",
"Metazoa",
"Eumetazoa",
"Bilateria",
"Deuterostomia",
"Chordata",
"Craniata",
"Vertebrata",
"Gnathostomata",
"Teleostomi",
"Euteleostomi",
"Sarcopterygii",
"Dipnotetrapodomorpha",
"Tetrapoda",
"Amniota",
"Mammalia",
"Theria",
"Eutheria",
"Boreoeutheria",
"Euarchontoglires",
"Primates",
"Haplorrhini",
"Simiiformes",
"Catarrhini",
"Hominoidea",
"Hominidae",
"Homininae",
"Homo"
];
function hsd_hslcs(args)
{
var file = args.length == 0? new File() : new File(args[0]);
var buf = new Bytes();
var state = 0, id = null, lcs = null;
while (file.readline(buf) >= 0) {
var m, line = buf.toString();
if (state == 0 && (m = /<TaxId>(\d+)</.exec(line)) != null) {
id = m[1];
} else if (/<LineageEx>/.test(line)) {
state = 1;
lcs = -1;
} else if (state == 1 && (m = /<ScientificName>([^<]+)</.exec(line)) != null) {
for (var i = human_tax.length - 1; i >= 0; --i)
if (human_tax[i] == m[1])
lcs = lcs > i? lcs : i;
} else if (/<\/LineageEx>/.test(line)) {
var name = lcs == 0? "cellular_organisms" : lcs == -1? 'organisms' : human_tax[lcs];
print(id, name);
state = 0;
}
}
buf.destroy();
file.close();
}
function hsd_rm2bed(args)
{
var file = args.length == 0? new File() : new File(args[0]);
var buf = new Bytes();
while (file.readline(buf) >= 0) {
var t = buf.toString().split(/\s+/);
if (t[0] == null || t[0] == '') t.shift();
if (!/^\s*\d/.test(t[0])) continue;
var m, name = t[4], start = parseInt(t[5]) - 1, end = parseInt(t[6]), len = null;
if ((m = /^\((\d+)\)/.exec(t[7])) != null) len = end + parseInt(m[1]);
if (len == null) throw Error("No length!");
print(name, start, end, len, t[9], t[10]);
}
buf.destroy();
file.close();
}
function main(args)
{
var cmd = args.shift();
if (cmd == 'fm2seg') hsd_fm2seg(args);
else if (cmd == 'fm2seg2') hsd_fm2seg2(args);
else if (cmd == 'fixname') hsd_fixname(args);
else if (cmd == 'dropsim') hsd_dropsim(args);
else if (cmd == 'tab2fa') hsd_tab2fa(args);
else if (cmd == 'nrexact') hsd_nrexact(args);
else if (cmd == 'bedsum') hsd_bedsum(args);
else if (cmd == 'nr') hsd_nr(args);
else if (cmd == 'ntanno') hsd_ntanno(args);
else if (cmd == 'genquery') hsd_genquery(args);
else if (cmd == 'hslcs') hsd_hslcs(args);
else if (cmd == 'rm2bed') hsd_rm2bed(args);
else warn("Unrecognized command");
}
main(arguments);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment