Created
April 24, 2014 07:33
-
-
Save dbro/11245068 to your computer and use it in GitHub Desktop.
summary statistics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/awk -f | |
# input should be a set of numbers, one on each line. can be unsorted. | |
# non-numeric input will be ignored | |
# 2-pass algorithm, stores a copy of each number in an array in memory | |
# this could be changed to assume the input is sorted, but would still | |
# need to know in advance how many numbers to expect in the full set | |
# in order to calculate percentiles and the trimmed mean. | |
BEGIN { | |
FS="\t"; | |
OFS=FS; | |
n=0; # count of numbers | |
split("", nums); # initialize array. not necessary | |
rejectctr=0; | |
sum=0; | |
sumsq=0; | |
} | |
{ | |
v = $1 + 0; | |
if (v == $1) { # test for numeric | |
n += 1; | |
nums[n] = v; | |
sum += v; | |
sumsq += v*v; | |
} else { | |
rejectctr++; | |
} | |
} | |
END { | |
mean = sum / n; | |
vari = (sumsq - ((sum * sum) / n)) / (n - 1); # naiive approach, can lose precision (awk uses double precision numbers) | |
# see http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance for alternates | |
std_dev = sqrt(vari); | |
heapsort(n, nums); # sort in place | |
mid = int(n/2)+1; | |
median = (n % 2) ? nums[mid] : (nums[mid] + nums[mid-1])/2; | |
trim_sum = 0; | |
trim_start = int(n/20) + 1; # 5th percentile and below will be dropped | |
trim_end = n - trim_start; # 95th and up | |
for (i=trim_start; i<=trim_end; i++) { | |
trim_sum += nums[i]; | |
} | |
trim_mean = trim_sum / (trim_end - trim_start + 1); | |
min = nums[1]; | |
for (i=1; nums[i] == min; i++) {minctr = i;} | |
max = nums[n]; | |
for (i=n; nums[i] == max; i--) {maxctr = n - i + 1;} | |
print "count=" n, "rejected=" rejectctr, "sum=" sum, "mean=" mean, "min=" min, "min_count=" minctr, "max=" max, "max_count=" maxctr, "std_dev=" std_dev, "median=" median, "trim_mean=" trim_mean; | |
} | |
function heapsort (n, ra) { | |
# from http://www.bagley.org/~doug/shootout/ | |
l = int(0.5+n/2) + 1 | |
ir = n; | |
for (;;) { | |
if (l > 1) { | |
rra = ra[--l]; | |
} else { | |
rra = ra[ir]; | |
ra[ir] = ra[1]; | |
if (--ir == 1) { | |
ra[1] = rra; | |
return; | |
} | |
} | |
i = l; | |
j = l * 2; | |
while (j <= ir) { | |
if (j < ir && ra[j] < ra[j+1]) { ++j; } | |
if (rra < ra[j]) { | |
ra[i] = ra[j]; | |
j += (i = j); | |
} else { | |
j = ir + 1; | |
} | |
} | |
ra[i] = rra; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment