-
-
Save dustalov/0ce8e45a80b110e8c18e0f1b24ea2b36 to your computer and use it in GitHub Desktop.
#!/usr/bin/awk -f | |
BEGIN { | |
# significance level | |
if (length(ALPHA) == 0) ALPHA = 0.05; | |
# standard error estimation method: "basic" or "pooled" | |
if (length(SE) == 0) SE = "basic"; | |
# one-tailed or two-tailed? | |
if (TAILS != 2) TAILS = 1; | |
PI = atan2(0, -1); | |
} | |
{ | |
ls[NR] = $1; # label | |
xs[NR] = $2; # proportion | |
ns[NR] = $3; # sample size | |
} | |
END { | |
for (i = 1; i < NR; i++) { | |
for (j = i + 1; j <= NR; j++) { | |
# point estimate | |
pe = xs[i] - xs[j]; | |
# standard error | |
if (SE == "basic") { | |
se = sqrt(xs[i] * (1 - xs[i]) / ns[i] + xs[j] * (1 - xs[j]) / ns[j]); | |
} else if (SE == "pooled") { | |
# pooled proportion | |
pp = (xs[i] * ns[i] + xs[j] * ns[j]) / (ns[i] + ns[j]); | |
se = sqrt(pp * (1 - pp) / ns[i] + pp * (1 - pp) / ns[j]); | |
} else { | |
print "Unknown SE mode." > "/dev/stderr"; | |
exit 1; | |
} | |
# Z-score | |
z = pe / se; | |
if (z < 0) z = -z; | |
# pnorm is the value of the CDF for the normal distribution | |
value = z; | |
sum = z; | |
for (k = 1; k <= 100; k++) { | |
value *= z * z / (2 * k + 1); | |
sum += value; | |
} | |
if (z > 10) { | |
# awk is not so great at math | |
pnorm = 0; | |
} else { | |
# note that P(Z > z) is estimated, not P(Z <= z) | |
pnorm = 0.5 - sum / sqrt(2 * PI) * exp(-z**2 / 2); | |
} | |
pvalue = TAILS * pnorm; | |
if (pvalue > 1) { | |
print "p-value is computed incorrectly." > "/dev/stderr"; | |
exit 2; | |
} | |
print ls[i], ls[j], xs[i], ns[i], xs[j], ns[j], z, sprintf("%.6f", pvalue), pvalue < ALPHA; | |
} | |
} | |
} |
--Hi,
is the function: pnorm = 0.5 - sum / sqrt(2 * PI) * exp(-z**2 / 2) specific to z-test ?
I'm looking for the generic formula of pnorm in awk.
i don't understand the loop:
for (k = 1; k <= 100; k++) {
value *= z * z / (2 * k + 1);
sum += value;
}
why 100 ? is there a limit ?
thank you --
Hi, almost nothing here is specific to Z-test.
The formula for computing pnorm
is based on the series expansion for the normal distribution CDF, e.g., https://math.stackexchange.com/questions/2223296/cdf-of-standard-normal. I used the simpler version for the sake of code simplicity.
However, the original formula evaluates P(Z ≤ z)
and we need P(Z > z)
, which can be trivially obtained as the normal distribution is symmetric.
As this is an approximate numerical method, we have to run a reasonable number of iterations, the larger the better. However, after 100 iterations the estimation accuracy does not improve significantly, so I kept the limit of 100.
Hope it helps!
For instance, method
A
shows the precision of0.5
measured on1000
samples, methodB
shows0.501
on10000
, and methodC
shows0.3
on5000
samples. We are interested in conducting a Z-test to find out which result is statistically significant.According to the resulting table, both
A
andB
methods significantly outperformC
. However, neitherA
norB
outperforms each other.