-
-
Save dustalov/0ce8e45a80b110e8c18e0f1b24ea2b36 to your computer and use it in GitHub Desktop.
| #!/usr/bin/awk -f | |
| BEGIN { | |
| # significance level | |
| if (length(ALPHA) == 0) ALPHA = 0.05; | |
| # standard error estimation method: "basic" or "pooled" | |
| if (length(SE) == 0) SE = "basic"; | |
| # one-tailed or two-tailed? | |
| if (TAILS != 2) TAILS = 1; | |
| PI = atan2(0, -1); | |
| } | |
| { | |
| ls[NR] = $1; # label | |
| xs[NR] = $2; # proportion | |
| ns[NR] = $3; # sample size | |
| } | |
| END { | |
| for (i = 1; i < NR; i++) { | |
| for (j = i + 1; j <= NR; j++) { | |
| # point estimate | |
| pe = xs[i] - xs[j]; | |
| # standard error | |
| if (SE == "basic") { | |
| se = sqrt(xs[i] * (1 - xs[i]) / ns[i] + xs[j] * (1 - xs[j]) / ns[j]); | |
| } else if (SE == "pooled") { | |
| # pooled proportion | |
| pp = (xs[i] * ns[i] + xs[j] * ns[j]) / (ns[i] + ns[j]); | |
| se = sqrt(pp * (1 - pp) / ns[i] + pp * (1 - pp) / ns[j]); | |
| } else { | |
| print "Unknown SE mode." > "/dev/stderr"; | |
| exit 1; | |
| } | |
| # Z-score | |
| z = pe / se; | |
| if (z < 0) z = -z; | |
| # pnorm is the value of the CDF for the normal distribution | |
| value = z; | |
| sum = z; | |
| for (k = 1; k <= 100; k++) { | |
| value *= z * z / (2 * k + 1); | |
| sum += value; | |
| } | |
| if (z > 10) { | |
| # awk is not so great at math | |
| pnorm = 0; | |
| } else { | |
| # note that P(Z > z) is estimated, not P(Z <= z) | |
| pnorm = 0.5 - sum / sqrt(2 * PI) * exp(-z**2 / 2); | |
| } | |
| pvalue = TAILS * pnorm; | |
| if (pvalue > 1) { | |
| print "p-value is computed incorrectly." > "/dev/stderr"; | |
| exit 2; | |
| } | |
| print ls[i], ls[j], xs[i], ns[i], xs[j], ns[j], z, sprintf("%.6f", pvalue), pvalue < ALPHA; | |
| } | |
| } | |
| } |
--Hi,
is the function: pnorm = 0.5 - sum / sqrt(2 * PI) * exp(-z**2 / 2) specific to z-test ?
I'm looking for the generic formula of pnorm in awk.
i don't understand the loop:
for (k = 1; k <= 100; k++) {
value *= z * z / (2 * k + 1);
sum += value;
}
why 100 ? is there a limit ?
thank you --
Hi, almost nothing here is specific to Z-test.
The formula for computing pnorm is based on the series expansion for the normal distribution CDF, e.g., https://math.stackexchange.com/questions/2223296/cdf-of-standard-normal. I used the simpler version for the sake of code simplicity.
However, the original formula evaluates P(Z ≤ z) and we need P(Z > z), which can be trivially obtained as the normal distribution is symmetric.
As this is an approximate numerical method, we have to run a reasonable number of iterations, the larger the better. However, after 100 iterations the estimation accuracy does not improve significantly, so I kept the limit of 100.
Hope it helps!
For instance, method
Ashows the precision of0.5measured on1000samples, methodBshows0.501on10000, and methodCshows0.3on5000samples. We are interested in conducting a Z-test to find out which result is statistically significant.According to the resulting table, both
AandBmethods significantly outperformC. However, neitherAnorBoutperforms each other.