Created
April 24, 2014 07:35
-
-
Save dbro/11245114 to your computer and use it in GitHub Desktop.
correlation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/awk -f | |
# input should be parallel sets of numbers, one set on each line, tab-separated. | |
# the input does not need to be sorted | |
# non-numeric input anywhere on the line will cause the entire line to be ignored | |
# this uses a naive algorithm that may lose precision in some situations. | |
# see http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance for an alternate algorithm | |
BEGIN { | |
FS="\t"; | |
OFS=FS; | |
n=0; # count of rows (which contain a full set of data) | |
field_count=-1; # invalid, will be initialized by first row | |
split("", sums); # initialize array. not necessary | |
rejectctr=0; | |
exitcode=0; # OK | |
} | |
{ | |
if (field_count == -1) { | |
field_count = NF; # determined by the first row | |
if (field_count < 1) { | |
print "ERROR. first row has fewer than 1 column (they should be tab delimited)" > "/dev/stderr"; | |
exitcode = 1; | |
exit exitcode; | |
} | |
} | |
# first check all the data in the row to see if this row should be rejected | |
reject_this_row = 0; # false | |
if (NF != field_count) { | |
reject_this_row = 1; # true | |
} else { | |
for (i=1; i<=field_count; i++) { | |
v = $i + 0; | |
if (v != $i) { # test for numeric | |
reject_this_row = 1; # true | |
} | |
} | |
} | |
if (reject_this_row == 1) { | |
rejectedctr++; | |
} else { | |
# then update the accumulators | |
n += 1; | |
for (i=1; i<=field_count; i++) { | |
$i; | |
sums[i] += $i; # single-column accumulator | |
# pairwise accumulators | |
for (j=i; j<=field_count; j++) { | |
sums[i,j] += ($i * $j); # i is always less than or equal to j | |
} | |
} | |
} | |
} | |
END { | |
if (exitcode == 0) { | |
printf("col1\tcol2\tcorr\n"); | |
for (i=1; i<=field_count; i++) { | |
var_i = ((sums[i,i] - ((sums[i] * sums[i]) / n)) / n); | |
std_dev_i = sqrt(var_i); | |
#print "column " i " variance = " var_i "\tstd_dev = " std_dev_i; | |
for (j=i+1; j<=field_count; j++) { | |
var_j = ((sums[j,j] - ((sums[j] * sums[j]) / n)) / n); | |
std_dev_j = sqrt(var_j); | |
cov = ((sums[i,j] - ((sums[i] * sums[j]) / n)) / n); | |
corr = cov / (std_dev_i * std_dev_j); | |
printf("%d\t%d\t%.3f\n", i, j, corr); | |
} | |
} | |
print n " rows included, and " rejectctr " rows rejected" > "/dev/stderr"; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment