Last active
October 9, 2022 21:25
-
-
Save dustalov/0ce8e45a80b110e8c18e0f1b24ea2b36 to your computer and use it in GitHub Desktop.
Pairwise statistical significance test in AWK using Z-test.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/awk -f | |
BEGIN { | |
# significance level | |
if (length(ALPHA) == 0) ALPHA = 0.05; | |
# standard error estimation method: "basic" or "pooled" | |
if (length(SE) == 0) SE = "basic"; | |
# one-tailed or two-tailed? | |
if (TAILS != 2) TAILS = 1; | |
PI = atan2(0, -1); | |
} | |
{ | |
ls[NR] = $1; # label | |
xs[NR] = $2; # proportion | |
ns[NR] = $3; # sample size | |
} | |
END { | |
for (i = 1; i < NR; i++) { | |
for (j = i + 1; j <= NR; j++) { | |
# point estimate | |
pe = xs[i] - xs[j]; | |
# standard error | |
if (SE == "basic") { | |
se = sqrt(xs[i] * (1 - xs[i]) / ns[i] + xs[j] * (1 - xs[j]) / ns[j]); | |
} else if (SE == "pooled") { | |
# pooled proportion | |
pp = (xs[i] * ns[i] + xs[j] * ns[j]) / (ns[i] + ns[j]); | |
se = sqrt(pp * (1 - pp) / ns[i] + pp * (1 - pp) / ns[j]); | |
} else { | |
print "Unknown SE mode." > "/dev/stderr"; | |
exit 1; | |
} | |
# Z-score | |
z = pe / se; | |
if (z < 0) z = -z; | |
# pnorm is the value of the CDF for the normal distribution | |
value = z; | |
sum = z; | |
for (k = 1; k <= 100; k++) { | |
value *= z * z / (2 * k + 1); | |
sum += value; | |
} | |
if (z > 10) { | |
# awk is not so great at math | |
pnorm = 0; | |
} else { | |
# note that P(Z > z) is estimated, not P(Z <= z) | |
pnorm = 0.5 - sum / sqrt(2 * PI) * exp(-z**2 / 2); | |
} | |
pvalue = TAILS * pnorm; | |
if (pvalue > 1) { | |
print "p-value is computed incorrectly." > "/dev/stderr"; | |
exit 2; | |
} | |
print ls[i], ls[j], xs[i], ns[i], xs[j], ns[j], z, sprintf("%.6f", pvalue), pvalue < ALPHA; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, almost nothing here is specific to Z-test.
The formula for computing
pnorm
is based on the series expansion for the normal distribution CDF, e.g., https://math.stackexchange.com/questions/2223296/cdf-of-standard-normal. I used the simpler version for the sake of code simplicity.However, the original formula evaluates
P(Z ≤ z)
and we needP(Z > z)
, which can be trivially obtained as the normal distribution is symmetric.As this is an approximate numerical method, we have to run a reasonable number of iterations, the larger the better. However, after 100 iterations the estimation accuracy does not improve significantly, so I kept the limit of 100.
Hope it helps!