fwhigh · June 5, 2017 05:47
diff --git a/auc_benchmark.sh b/auc_benchmark.sh
 #!/usr/bin/env bash
 # Benchmark perf's ROC computation on the subsampled evaluation file.

 eval_file=kddb.t_eval.subsample.txt

 # Three untimed runs with output discarded.
 # NOTE(review): presumably a cache warm-up before timing -- confirm intent.
 # The redirect must be a literal '>'; an HTML-escaped "&gt;" would background
 # perf and then try to run a command named "gt".
 for a in {1..3}; do perf -ROC < "$eval_file" > /dev/null; done
 # Ten timed runs; bash's `time` keyword covers the whole loop.
 time for a in {1..10}; do perf -ROC < "$eval_file"; done
diff --git a/auc_rmse.sh b/auc_rmse.sh
 #!/usr/bin/env bash
 # Print the root-mean-squared error between column 1 and column 2 of
 # kddb.t_eval.subsample.txt as a tab-separated "RMS" line.

 awk -v OFS=$'\t' '
 {
    err = $2 - $1
    sum_sq += err * err
 }
 END {
    # RMSE is sqrt(mean(err^2)). Subtracting the squared mean error and
    # dividing by NR*(NR-1) gives the sample standard deviation of the
    # errors instead, which differs whenever the mean error is non-zero.
    if (NR == 0) {
       print "RMS: no input records" > "/dev/stderr"
       exit 1
    }
    print "RMS", sqrt(sum_sq / NR)
 }' kddb.t_eval.subsample.txt
diff --git a/awk_auc_eval.sh b/awk_auc_eval.sh
 #!/usr/bin/env bash
 # Approximate ROC AUC from kddb.t_eval.subsample.txt (column 1: label,
 # positive when > 0; column 2: score) by bucketing scores into
 # 10^decimals bins and sweeping the threshold from the top bin down.
 # NOTE(review): assumes scores lie in [0, 1]; values outside that range
 # land in bins the sweep never visits -- confirm against the producer.

 awk -v OFS=$'\t' -v decimals=3 '
 BEGIN { max = 10^decimals }
 {
    bin = int(max * $2)
    if ($1 > 0) { pos[bin]++ } else { neg[bin]++ }
 }
 END {
    # Sweep every bin, including bin 0 (scores below 1/max); stopping at
    # bin 1 would drop those records from both the curve and the totals.
    for (i = max; i >= 0; i--) {
       ctp = ctp_prev + pos[i]
       cfp = cfp_prev + neg[i]
       # Trapezoid rule: records sharing a bin are ties and count as half.
       auc += (ctp + ctp_prev) / 2 * (cfp - cfp_prev)
       ctp_prev = ctp
       cfp_prev = cfp
    }
    if (ctp == 0 || cfp == 0) {
       print "ROC: need at least one positive and one negative record" > "/dev/stderr"
       exit 1
    }
    print "ROC", auc / (ctp * cfp)
 }' kddb.t_eval.subsample.txt
diff --git a/awk_fixed_thresh_eval.sh b/awk_fixed_thresh_eval.sh
 #!/usr/bin/env bash
 # Confusion-matrix metrics at a fixed 0.5 score threshold.
 # Reads kddb.t_eval.subsample.txt: column 1 counts as positive when > 0,
 # column 2 counts as a positive prediction when >= 0.5.

 awk -v OFS=$'\t' '
 {
    said_yes = ($2 >= 0.5)
    is_yes = ($1 > 0)
    if (said_yes && is_yes) {
       tp++
    } else if (said_yes) {
       fp++
    } else if (is_yes) {
       fn++
    } else {
       tn++
    }
 }
 END {
    flagged = tp + fp
    total = tp + fp + tn + fn
    actual_pos = tp + fn
    recall = tp / actual_pos
    reach = flagged / total
    precision = tp / flagged
    accuracy = (tp + tn) / total
    f1score = 2 * tp / (2 * tp + fp + fn)
    # Lift: recall relative to the fraction of records flagged positive.
    lift = recall / reach
    print "ACC", accuracy
    print "PRE", precision
    print "REC", recall
    print "PRF", f1score
    print "LFT", lift
 }' kddb.t_eval.subsample.txt
diff --git a/get_data.sh b/get_data.sh
 #!/usr/bin/env bash
 # Download and decompress the kddb training (kddb) and test (kddb.t) sets.
 # Abort on the first failure so bunzip2 never runs on a missing or
 # partially downloaded archive.
 set -euo pipefail

 base_url=http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary

 for archive in kddb.bz2 kddb.t.bz2; do
    wget "$base_url/$archive"
 done
 for archive in kddb.bz2 kddb.t.bz2; do
    bunzip2 "$archive"
 done
diff --git a/logistic_prob.sh b/logistic_prob.sh
 #!/usr/bin/env bash
 # Build kddb.t_eval.txt: one line per record holding the first
 # space-separated field of kddb.t and the logistic transform
 # 1/(1+exp(-score)) of the matching line in kddb.t_scores.txt.

 labels_src=kddb.t
 scores_src=kddb.t_scores.txt

 paste -d' ' \
   <(cut -d' ' -f 1 "$labels_src") \
   <(awk '{ print 1 / (1 + exp(-$1)) }' "$scores_src") \
   > kddb.t_eval.txt
diff --git a/perf_auc_eval.sh b/perf_auc_eval.sh
 #!/usr/bin/env bash
 # Run perf's ROC measure on the subsampled evaluation file.

 # The input is kddb.t_eval.subsample.txt; a stray "awk_auc_eval" suffix
 # fused onto the filename would point perf at a file that does not exist.
 perf -ROC < kddb.t_eval.subsample.txt
diff --git a/perf_fixed_thresh_eval.sh b/perf_fixed_thresh_eval.sh
 #!/usr/bin/env bash
 # Run perf's PRE, REC, ACC, LFT and PRF measures on the full evaluation file.

 metrics=(-PRE -REC -ACC -LFT -PRF)
 perf "${metrics[@]}" < kddb.t_eval.txt
diff --git a/perf_rmse.sh b/perf_rmse.sh
 #!/usr/bin/env bash
 # Run perf's RMS measure on the subsampled evaluation file.

 eval_file=kddb.t_eval.subsample.txt
 perf -RMS < "$eval_file"
diff --git a/score.sh b/score.sh
 #!/usr/bin/env bash
 # Score kddb.t with the saved model (model.vw) and write vw's predictions
 # to kddb.t_scores.txt, timing the whole pipeline.

 # awk rewrites each line's first field to "<2*label-1> |Features" before
 # the line is piped to vw.
 # -t (test only): vw must not learn from the labels it is fed here;
 # without it the model keeps updating while the test set is scored.
 time awk '{$1=$1*2-1" |Features"; print $0}' kddb.t | \
 vw -t --loss_function logistic --initial_regressor model.vw \
 -p kddb.t_scores.txt
diff --git a/sort.sh b/sort.sh
 #!/usr/bin/env bash
 # Time a descending general-numeric sort of the subsampled evaluation file
 # on its second space-separated field; the sorted output is discarded.

 input=kddb.t_eval.subsample.txt
 time sort -t ' ' -k 2,2 -g -r "$input" > /dev/null
diff --git a/subsample.sh b/subsample.sh
 #!/usr/bin/env bash
 # Randomly subsample kddb.t_eval.txt into kddb.t_eval.subsample.txt.
 # Each line is kept with probability target/total (fixed seed 42), and
 # output stops once target lines have been written.
 # Optional environment overrides:
 #   SUBSAMPLE_TARGET  maximum number of lines to keep (default 500000)
 #   SUBSAMPLE_TOTAL   denominator of the keep probability (default 748401;
 #                     presumably the line count of the input -- confirm)

 awk -v target="${SUBSAMPLE_TARGET:-500000}" \
     -v total="${SUBSAMPLE_TOTAL:-748401}" '
 BEGIN { srand(42) }
 rand() < target / total {
    # Print before testing the cap so the target-th selected line is kept;
    # exiting first would silently write target-1 lines.
    print
    if (++kept >= target) { exit 0 }
 }' kddb.t_eval.txt > kddb.t_eval.subsample.txt
 {% endhighlight %}

 Now evaluate with perf on the subsample.

 {% highlight bash linenos %}
 perf -PRE -REC -ACC -LFT -PRF < kddb.t_eval.subsample.txt
diff --git a/train.sh b/train.sh
 #!/usr/bin/env bash
 # Train a vw model with logistic loss on kddb and save it to model.vw.

 # awk program: rewrite each line's first field to "<2*label-1> |Features"
 # and print the modified line.
 to_vw='{$1=$1*2-1" |Features"; print $0}'

 awk "$to_vw" kddb \
   | vw --loss_function logistic --final_regressor model.vw
	#!/usr/bin/env bash
	# Benchmark perf's ROC measure on the subsampled evaluation file:
	# three untimed runs with output discarded, then ten runs under `time`.

	for ((run = 1; run <= 3; run++)); do
		perf -ROC < kddb.t_eval.subsample.txt > /dev/null
	done
	time for ((run = 1; run <= 10; run++)); do
		perf -ROC < kddb.t_eval.subsample.txt
	done
	#!/usr/bin/env bash
	# Print the root-mean-squared error between column 1 and column 2 of
	# kddb.t_eval.subsample.txt as a tab-separated "RMS" line.

	awk -v OFS=$'\t' '
	{
		err = $2 - $1
		sum_sq += err * err
	}
	END {
		# RMSE is sqrt(mean(err^2)). Every product needs an explicit "*":
		# awk reads "NRs2" as an unset variable and "s1 s1" as string
		# concatenation, so a formula missing them yields garbage.
		if (NR == 0) {
			print "RMS: no input records" > "/dev/stderr"
			exit 1
		}
		print "RMS", sqrt(sum_sq / NR)
	}' kddb.t_eval.subsample.txt
	#!/usr/bin/env bash
	# Approximate ROC AUC from kddb.t_eval.subsample.txt (column 1: label,
	# positive when > 0; column 2: score) by bucketing scores into
	# 10^decimals bins and sweeping the threshold from the top bin down.
	# NOTE(review): assumes scores lie in [0, 1]; values outside that range
	# land in bins the sweep never visits -- confirm against the producer.

	awk -v OFS=$'\t' -v decimals=3 '
	BEGIN { max = 10^decimals }
	{
		bin = int(max * $2)
		if ($1 > 0) { pos[bin]++ } else { neg[bin]++ }
	}
	END {
		# Sweep every bin, including bin 0 (scores below 1/max); stopping
		# at bin 1 would drop those records from the curve and the totals.
		for (i = max; i >= 0; i--) {
			ctp = ctp_prev + pos[i]
			cfp = cfp_prev + neg[i]
			# Trapezoid rule: records sharing a bin are ties, count as half.
			auc += (ctp + ctp_prev) / 2 * (cfp - cfp_prev)
			ctp_prev = ctp
			cfp_prev = cfp
		}
		if (ctp == 0 || cfp == 0) {
			print "ROC: need at least one positive and one negative record" > "/dev/stderr"
			exit 1
		}
		print "ROC", auc / (ctp * cfp)
	}' kddb.t_eval.subsample.txt
	#!/usr/bin/env bash
	# Confusion-matrix metrics at a fixed 0.5 score threshold.
	# Reads kddb.t_eval.subsample.txt: column 1 counts as positive when > 0,
	# column 2 counts as a positive prediction when >= 0.5.

	awk -v OFS=$'\t' '
	{
		if ($2 >= 0.5) {
			if ($1 > 0) { tp++ } else { fp++ }
		} else {
			if ($1 > 0) { fn++ } else { tn++ }
		}
	}
	END {
		n = tp + fp
		ntot = tp + fp + tn + fn
		pos = tp + fn
		recall = tp / pos
		reach = n / ntot
		precision = tp / n
		accuracy = (tp + tn) / ntot
		# The "*" is required: awk reads "2tp" as the string "2" concatenated
		# with tp (e.g. 23 for tp=3), not as a product.
		f1score = 2 * tp / (2 * tp + fp + fn)
		# Lift: recall relative to the fraction of records flagged positive.
		lift = recall / reach
		print "ACC", accuracy
		print "PRE", precision
		print "REC", recall
		print "PRF", f1score
		print "LFT", lift
	}' kddb.t_eval.subsample.txt
	#!/usr/bin/env bash
	# Download and decompress the kddb training (kddb) and test (kddb.t) sets.
	# Abort on the first failure so bunzip2 never runs on a missing or
	# partially downloaded archive.
	set -euo pipefail

	base_url=http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary

	for archive in kddb.bz2 kddb.t.bz2; do
		wget "$base_url/$archive"
	done
	for archive in kddb.bz2 kddb.t.bz2; do
		bunzip2 "$archive"
	done
	#!/usr/bin/env bash
	# Build kddb.t_eval.txt: one line per record holding the first
	# space-separated field of kddb.t and the logistic transform
	# 1/(1+exp(-score)) of the matching line in kddb.t_scores.txt.

	labels_src=kddb.t
	scores_src=kddb.t_scores.txt

	paste -d' ' \
		<(cut -d' ' -f 1 "$labels_src") \
		<(awk '{ print 1 / (1 + exp(-$1)) }' "$scores_src") \
		> kddb.t_eval.txt
	#!/usr/bin/env bash
	# Run perf's ROC measure on the subsampled evaluation file.

	# The input is kddb.t_eval.subsample.txt; a stray "awk_auc_eval" suffix
	# fused onto the filename would point perf at a file that does not exist.
	perf -ROC < kddb.t_eval.subsample.txt
	#!/usr/bin/env bash
	# Run perf's PRE, REC, ACC, LFT and PRF measures on the full evaluation file.

	metrics=(-PRE -REC -ACC -LFT -PRF)
	perf "${metrics[@]}" < kddb.t_eval.txt
	#!/usr/bin/env bash
	# Score kddb.t with the saved model (model.vw) and write vw's predictions
	# to kddb.t_scores.txt, timing the whole pipeline.

	# awk rewrites each line's first field to "<2*label-1> |Features" before
	# the line is piped to vw. The pipe must be a bare "|": a backslash-escaped
	# "\|" is handed to awk as a literal argument and vw never runs.
	# -t (test only): vw must not learn from the labels it is fed here;
	# without it the model keeps updating while the test set is scored.
	time awk '{$1=$1*2-1" |Features"; print $0}' kddb.t | \
	vw -t --loss_function logistic --initial_regressor model.vw \
	-p kddb.t_scores.txt
	#!/usr/bin/env bash
	# Time a descending general-numeric sort of the subsampled evaluation file
	# on its second space-separated field; the sorted output is discarded.

	input=kddb.t_eval.subsample.txt
	time sort -t ' ' -k 2,2 -g -r "$input" > /dev/null