/**************** Goodness of Fit ************************ This library of goodness of fit statistics was written by David Baird and may be distributed as freeware for public non-commercial use. Please provide appropriate acknowledgment if this library supports published work. There are no performance guarantees for this code. Last update: 11/9/94 Dr David Baird Biometrician Mail: AgResearch, PO Box 60, Lincoln, NEW ZEALAND Phone: +64 3 325 6900 Fax: +64 3 325 2946 Direct Dial: +64 3 325 6901 3975# Internet: BairdD@AgResearch.CRI.NZ ***********************************************************/ /* EMPCDF.SRC Procedures for calculating Goodness of fit statistics for ** Empirical distribution functions and their critical values ** and P values ** David Baird AgResearch PO Box 60 Lincoln 1/7/92 ** Procedure Description Line *========================================================================== s = GF_AD(d) ; Anderson-Darling Goodness of Fit Statistic 25 c = CV_AD(p,n) ; Critical value for Anderson-Darling Statistic 57 p = PR_AD(c,n) ; P value for Anderson-Darling Statistic 90 s = GF_C2(d) ; Cramer-von Mises Goodness of Fit Statistic 128 c = CV_C2(p,n) ; Critical value for Cramer-von Mises Statistic 160 p = PR_C2(c,n) ; P value for Cramer-von Mises Statistic 193 s = GF_KS(d) ; Kolmogorov-Smirnov Goodness of Fit Statistic 230 c = CV_KS(p,n) ; Critical value for Kolmogorov-Smirnov Statistic 264 p = PR_KS(c,n) ; P value for Kolmogorov-Smirnov Statistic 300 s = GF_K0(d) ; K0 squared Goodness of Fit Statistic 341 c = CV_K0(p,n) ; Critical value for K0 squared Statistic 377 p = PR_K0(c,n) ; P value for K0 squared Statistic 415 s = GF_K2(d) ; K squared Goodness of Fit Statistic 486 c = CV_K2(p,n) ; Critical value for K squared Statistic 523 p = PR_K2(c,n) ; P value for K squared Statistic 560 **========================================================================*/ /* GF_AD - Calculate Anderson-Darling Goodness of Fit Statistic ** ** The AD statistic is a goodness of fit statistic to test whether two ** cumulative frequency distributions are significantly different ** ** Input: D - N x C matrix - Empirical distribution function ** ** Output: S - C x 1 matrix of test statistics ** ** Usage S = GF_AD(D) ; ** ** Notes: Use CV_AD or PR_AD to evaluate the significance of AD under the ** Null hypothesis that D and T come from the same distribution */ proc gf_ad(d) ; local n,c,i,t,ad,f:proc ; n = rows(d) ; c = cols(d) ; i = 1 ; do until(i > c) ; d[.,i] = sortc(d[.,i],1) ; i = i + 1 ; endo ; ad = -sumc(seqa(1,2,n).*ln(d.*(1-rev(d))))./n - n ; retp(ad) ; endp ; /* CV_AD - Calculate critical value for Anderson-Darling Statistic ** ** The AD statistic is a goodness of fit statistic to test whether two ** cumulative frequency distributions are significantly different ** ** Input: P - R x C matrix of probabilities ( 0 < p < 1 ) ** N - S x T matrix of distribution size, ExE conformable with P ** ** Output: C - R x C matrix of critical values ie Prob(AD > C) = P ** ** Usage C = CV_AD(0.05,80) ; ** ** Notes: C is only returned for N > 5 and P in (0.0001,0.2) ** C is invariant to the value of N ** PR_AD provides the inverse tranform, and estimates P given C,N */ proc cv_ad(p,n) ; local c0,c1,c2 ; if not (0.0001 <= p and p <= 0.2) ; errorlog "ERROR: CV_AD - p value out of range (0.0001,0.2)" ; retp(error(1)) ; elseif not (n > 4) ; errorlog "ERROR: CV_AD - n value too small (n <= 4)" ; retp(error(1)) ; endif ; p = ln(p) ; c0 = 0.164111752625 ; c1 = -0.719787337528 ; c2 = 0.020207904162 ; retp((c0 + p.*(c1 + p.*c2))+zeros(rows(n),cols(n))) ; endp ; /* PR_AD - Calculate probability values for Anderson-Darling Statistic ** ** The AD statistic is a goodness of fit statistic to test whether two ** cumulative frequency distributions are significantly different. ** ** Input: C - R x C matrix of critical values ** N - S x T matrix of distribution size, ExE conformable with C ** ** Output: P - R x C matrix of probabilities ie Prob(AD > C) = P ** ** Usage P = PR_AD(2.0,80) ; ** ** Notes: P is only returned for N > 5 ** P is invariant to the value of N ** Values of P are not calculated accurately for P < 0.0001 or P > 0.2 ** CV_AD provides the inverse transform, and estimates C given P,N */ proc pr_ad(z,n) ; local c0,c1,c2,d,p ; if not (n > 4) ; errorlog "ERROR: PR_AD - n value too small (n <= 4)" ; retp(error(1)) ; endif ; n = 1./n ; c0 = 0.164111752625 ; c1 = -0.719787337528 ; c2 = 0.020207904162 ; d = c1^2 + 4.*c2.*(z - c0) ; if not (d > 0) ; d = d.* (d .> 0) ; errorlog("WARNING: PR_AD - some p values only approximate") ; endif ; p = exp(-(c1+sqrt(d))./(2.*c2)) ; p = p.*(p .> 0) + (1 - p).*(p .> 1) ; retp(p) ; endp ; /* GF_C2 - Calculate Cramer-von Mises Goodness of Fit Statistic ** ** The C2 statistic is a goodness of fit statistic to test whether two ** cumulative frequency distributions are significantly different ** ** Input: D - N x C matrix - Empirical distribution function ** ** Output: S - C x 1 matrix of test statistics ** ** Usage S = GF_C2(D) ; ** ** Notes: Use CV_C2 or PR_C2 to evaluate the significance of AD under the ** Null hypothesis that D and T come from the same distribution */ proc gf_c2(d) ; local n,c,i,t,c2 ; n = rows(d) ; c = cols(d) ; i = 1 ; do until(i > c) ; d[.,i] = sortc(d[.,i],1) ; i = i + 1 ; endo ; c2 = sumc((d - seqa(1/(2.*n),1./n,n))^2) + 1/(12.*n) ; retp(c2) ; endp ; /* CV_C2 - Calculate critical value for Cramer-von Mises Statistic ** ** The C2 statistic is a goodness of fit statistic to test whether two ** cumulative frequency distributions are significantly different. ** ** Input: P - R x C matrix of probabilities ( 0 < p < 1 ) ** N - S x T matrix of distribution size, ExE conformable with P ** ** Output: C - R x C matrix of critical values ie Prob(AD > C) = P ** ** Usage C = CV_C2(0.05,80) ; ** ** Notes: C is only returned for N > 5 and P in (0.0001,0.2) ** PR_C2 provides the inverse tranform, and estimates P given C,N */ proc cv_c2(p,n) ; local c0,c1,c2 ; if not (0.0001 <= p and p <= 0.2) ; errorlog "ERROR: CV_C2 - p value out of range (0.0001,0.2)" ; retp(error(1)) ; elseif not n > 4 ; errorlog "ERROR: CV_C2 - n value too small (n <= 4)" ; retp(error(1)) ; endif ; p = ln(p) ; n = 1./sqrt(n) ; c0 = -0.02226684 ; c1 = -0.14766489 - n.*(0.03720187 + n.*(0.12325507 - n.*0.38246277)) ; c2 = 0.00414260 - n.*0.01159939 ; retp(c0 + p.*(c1 + p.*c2)) ; endp ; /* PR_C2 - Calculate probability values for Cramer-von Mises Statistic ** ** The C2 statistic is a goodness of fit statistic to test whether two ** cumulative frequency distributions are significantly different. ** ** Input: C - R x C matrix of critical values ** N - S x T matrix of distribution size, ExE conformable with C ** ** Output: P - R x C matrix of probabilities ie Prob(AD > C) = P ** ** Usage P = PR_C2(2.0,80) ; ** ** Notes: P is only returned for N > 5 ** Values of P are not calculated accurately for P < 0.0001 or P > 0.2 ** CV_AD provides the inverse transform, and estimates C given P,N */ proc pr_c2(z,n) ; local c0,c1,c2,d,p ; if not (n > 4) ; errorlog "ERROR: PR_C2 - n value too small (n <= 4)" ; retp(error(1)) ; endif ; n = 1./sqrt(n) ; c0 = -0.02226684 ; c1 = -0.14766489 - n.*(0.03720187 + n.*(0.12325507 - n.*0.38246277)) ; c2 = 0.00414260 - n.*0.01159939 ; d = c1^2 + 4.*c2.*(z - c0) ; if not (d > 0) ; d = d.* (d .> 0) ; errorlog("WARNING: PR_KS - some p values only approximate") ; endif ; p = exp(-(c1+sqrt(d))./(2.*c2)) ; p = p + (1 - p).*(p .> 1) ; retp(p) ; endp ; /* GF_KS - Calculate Kolmogorov-Smirnov Goodness of Fit Statistic ** ** The KS statistic is a goodness of fit statistic to test whether two ** cumulative frequency distributions are significantly different. ** The KS statistic is the maximum difference between the CDF's of the ** two distrubutions ** ** Input: D - N x C matrix - Empirical distribution function ** ** Output: S - C x 1 matrix of test statistics ** ** Usage S = GF_KS(D) ; ** ** Notes: Use CV_KS or PR_KS to evaluate the significance of AD under the ** Null hypothesis that D and T come from the same distribution */ proc gf_ks(d) ; local n,c,i,ks ; n = rows(d) ; c = cols(d) ; i = 1 ; do until(i > c) ; d[.,i] = sortc(d[.,i],1) ; i = i + 1 ; endo ; ks = maxc((seqa(1./n,1./n,n) - d)|(d - seqa(0,1./n,n))) ; retp(ks) ; endp ; /* CV_KS - Calculate critical value for Kolmogorov-Smirnov Statistic ** ** The KS statistic is a goodness of fit statistic to test whether two ** cumulative frequency distributions are significantly different. ** The KS statistic is the maximum difference between the CDF's of the ** two distrubutions ** ** Input: P - R x C matrix of probabilities ( 0 < p < 1 ) ** N - S x T matrix of distribution size, ExE conformable with P ** ** Output: C - R x C matrix of critical values ie Prob(AD > C) = P ** ** Usage C = CV_KS(0.05,80) ; ** ** Notes: C is only returned for N > 5 and P in (0.0001,0.2) ** C is invariant to the value of N ** PR_KS provides the inverse tranform, and estimates P given C,N */ proc cv_ks(p,n) ; local c0,c1,c2 ; if not (0.0001 <= p and p <= 0.2) ; errorlog "ERROR: CV_KS - p value out of range (0.0001,0.2)" ; retp(error(1)) ; elseif not (n > 4) ; errorlog "ERROR: CV_KS - n value too small (n <= 4)" ; retp(error(1)) ; endif ; p = ln(p) ; n = 1./sqrt(n) ; c0 = 0.733183379676 ; c1 = -0.229138856954 - n.*0.016445061725 ; c2 = -0.007245795230 - n.*0.005111922399 ; retp((c0 + p.*(c1 + p.*c2))./(1./n + 0.11 + 0.12.*n)) ; endp ; /* PR_KS - Calculate probability values for Cramer-von Mises Statistic ** ** The KS statistic is a goodness of fit statistic to test whether two ** cumulative frequency distributions are significantly different. ** ** Input: C - R x C matrix of critical values ** N - S x T matrix of distribution size, ExE conformable with C ** ** Output: P - R x C matrix of probabilities ie Prob(AD > C) = P ** ** Usage P = PR_KS(2.0,80) ; ** ** Notes: P is only returned for N > 5 ** Values of P are not calculated accurately for P < 0.0001 or P > 0.2 ** CV_KS provides the inverse transform, and estimates C given P,N */ proc pr_ks(z,n) ; local c0,c1,c2,d,p ; if not (n > 4) ; errorlog "ERROR: PR_KS - n value too small (n <= 4)" ; retp(error(1)) ; endif ; if not(0 < z and z < 1) ; errorlog "ERROR: PR_KS - z value outside range (0,1)" ; retp(error(1)) ; endif ; n = 1./sqrt(n) ; c0 = 0.733183379676 ; c1 = -0.229138856954 - n.*0.016445061725 ; c2 = -0.007245795230 - n.*0.005111922399 ; d = c1^2 + 4.*c2.*(z.*(1./n + 0.11 + 0.12.*n) - c0) ; if not (d > 0) ; d = d.* (d .> 0) ; errorlog("WARNING: PR_KS - some p values only approximate") ; endif ; p = exp(-(c1+sqrt(d))./(2.*c2)) ; p = p + (1 - p).*(p .> 1) ; retp(p) ; endp ; /* GF_K0 - Calculate K0 squared Goodness of Fit Statistic ** ** The K0 statistic is a goodness of fit statistic to test whether two ** cumulative frequency distributions are significantly different. ** It is the correlation coefficent between the cdf's of the two functions ** without the usual mean correction (which gives the K2 statistic) ** and so must take values between 0 and 1. ** ** Input: D - N x C matrix - Empirical distribution function ** ** Output: S - C x 1 matrix of test statistics ** ** Usage S = GF_K0(D) ; ** ** Notes: Use CV_K0 or PR_K0 to evaluate the significance of AD under the ** Null hypothesis that D and T come from the same distribution */ proc gf_k0(d) ; local n,c,i,px,k0 ; n = rows(d) ; c = cols(d) ; i = 1 ; do until(i > c) ; d[.,i] = sortc(d[.,i],1) ; i = i + 1 ; endo ; px = seqa(1,1,n)./(n+1) - 0.5 ; d = d - 0.5 ; k0 = (sumc(px.*d))^2 ./ (sumc(px^2).*sumc(d^2)) ; retp(k0) ; endp ; /* CV_K0 - Calculate critical value for k squared Statistic ** ** The K0 statistic is a goodness of fit statistic to test whether two ** cumulative frequency distributions are significantly different. ** It is the correlation coefficent between the cdf's of the two functions ** without the usual mean correction (which gives the K2 statistic) ** and so must take values between 0 and 1. ** ** Input: P - R x C matrix of probabilities ( 0 < p < 1 ) ** N - S x T matrix of distribution size, ExE conformable with P ** ** Output: C - R x C matrix of critical values ie Prob(AD > C) = P ** ** Usage C = CV_K0(0.05,80) ; ** ** Notes: C is only returned for N > 5 and P in (0.001,0.2) ** PR_K2 provides the inverse tranform, and estimates P given C,N */ proc cv_k0(p,n) ; local c0,c1,c2,c3,c4 ; if not (0.001 <= p and p <= 0.2) ; errorlog "ERROR: CV_K0 - p value out of range (0.001,0.2)" ; retp(error(1)) ; elseif not (n > 4) ; errorlog "ERROR: CV_K0 - n value too small (n <= 4)" ; retp(error(1)) ; endif ; p = ln(p) ; n = 1./n ; c0 = 0.8653139 + n.*(0.1926594 - n.*(0.0192220 + n.*0.7795628)) ; c1 = 0.4632579 - n.*(0.0206542 + n.* 0.2664381) ; c2 = 0.1143172 + n.* 0.0030095 ; c3 = 0.0134711 + n.* 0.0004322 ; c4 = 0.0006115 ; retp(1 - n./(c0 + p.*(c1 + p.*(c2 + p.*(c3 + p.*c4))))) ; endp ; /* PR_K0 - Calculate probability values for k0 squared Statistic ** ** The K0 statistic is a goodness of fit statistic to test whether two ** cumulative frequency distributions are significantly different. ** It is the correlation coefficent between the cdf's of the two functions ** without the usual mean correction (which gives the K2 statistic) ** and so must take values between 0 and 1. ** ** Input: C - R x C matrix of critical values ** N - S x T matrix of distribution size, ExE conformable with C ** ** Output: P - R x C matrix of probabilities ie Prob(AD > C) = P ** ** Usage P = PR_K0(.995,80) ; ** ** Notes: P is only returned for N > 5 ** Values of P are not calculated accurately for P < 0.001 or P > 0.2 ** CV_K0 provides the inverse transform, and estimates C given P,N */ proc pr_k0(z,n) ; local c0,c1,c2,c3,c4,p,mnz,mnp,mxz,mxp,tol,converge,f,df,pn,i,zn ; tol = 1e-8 ; if not (n > 4) ; errorlog "ERROR: PR_K0 - n value too small (n <= 4)" ; retp(error(1)) ; elseif not(0 < z and z < 1) ; errorlog "ERROR: PR_K0 - z value outside range (0,1)" ; retp(error(1)) ; endif ; n = 1./n ; c0 = 0.8653139 + n.*(0.1926594 - n.*(0.0192220 + n.*0.7795628)) ; c1 = 0.4632579 - n.*(0.0206542 + n.* 0.2664381) ; c2 = 0.1143172 + n.* 0.0030095 ; c3 = 0.0134711 + n.* 0.0004322 ; c4 = 0.0006115 ; mnp = ln(0.001) ; mnz = c0 + mnp.*(c1 + mnp.*(c2 + mnp.*(c3 + mnp.*c4))) ; mxp = ln(0.2) ; mxz = c0 + mxp.*(c1 + mxp.*(c2 + mxp.*(c3 + mxp.*c4))) ; z = n./(1-z) ; zn = z ; zn = substute(zn,zn .< (mnz-tol),mnz) ; zn = substute(zn,zn .> (mxz+tol),mxz) ; clear converge,i ; p = ln(0.1).*ones(rows(z+n),cols(z+n)) ; do until(converge or i > 50) ; f = c0 + p.*(c1 + p.*(c2 + p.*(c3 + p.*c4))) ; df = c1 + p.*(2.*c2 + p.*(3.*c3 + p.*4.*c4)) ; pn = p + (z - f)./df ; converge = abs(p-pn) < tol ; p = pn ; i = i + 1 ; endo ; if not converge ; errorlog("WARNING: PR_K0 - iterations have not converged.") ; endif ; if not (z == zn) ; errorlog("WARNING: PR_K0 - some p values outside range 0.001 - 0.2") ; p = substute(exp(p),zn.==mnz,0.001) ; p = substute(exp(p),zn.==mxz,0.200) ; else ; p = exp(p) ; endif ; if not(p > 0.002 and n > 6) ; errorlog("WARNING: PR_K0 - returned values inaccurate"\ " for p < 0.002, n <= 6)") ; endif ; retp(p) ; endp ; /* GF_K2 - Calculate K squared Goodness of Fit Statistic ** ** The K2 statistic is a goodness of fit statistic to test whether two ** cumulative frequency distributions are significantly different. ** It is the correlation coefficent between the cdf's of the two functions ** and so must take values between 0 and 1. ** ** Input: D - N x C matrix - Empirical distribution function ** ** Output: S - C x 1 matrix of test statistics ** ** Usage S = GF_K2(D) ; ** ** Notes: Use CV_K2 or PR_K2 to evaluate the significance of AD under the ** Null hypothesis that D and T come from the same distribution */ proc gf_k2(d) ; local n,c,i,px,k2 ; n = rows(d) ; c = cols(d) ; i = 1 ; do until(i > c) ; d[.,i] = sortc(d[.,i],1) ; i = i + 1 ; endo ; px = seqa(1,1,n)./(n+1) ; d = d - meanc(d)' ; px = px - meanc(px)' ; k2 = (sumc(px.*d))^2 ./ (sumc(px^2).*sumc(d^2)) ; retp(k2) ; endp ; /* CV_K2 - Calculate critical value for k squared Statistic ** ** The K2 statistic is a goodness of fit statistic to test whether two ** cumulative frequency distributions are significantly different. ** It is the correlation coefficent between the cdf's of the two functions ** and so must take values between 0 and 1. ** ** Input: P - R x C matrix of probabilities ( 0 < p < 1 ) ** N - S x T matrix of distribution size, ExE conformable with P ** ** Output: C - R x C matrix of critical values ie Prob(AD > C) = P ** ** Usage C = CV_K2(0.05,80) ; ** ** Notes: C is only returned for N > 5 and P in (0.001,0.2) ** PR_K2 provides the inverse tranform, and estimates P given C,N */ proc cv_k2(p,n) ; local c0,c1,c2,c3,c4 ; if not (p >= .001 and p <= 0.2) ; errorlog "ERROR: CV_K2 - p value out of range (0.001,0.2)" ; retp(error(1)) ; elseif not (n > 4) ; errorlog "ERROR: CV_K2 - n value too small (n <= 4)" ; retp(error(1)) ; endif ; p = ln(p) ; n = 1./n ; c0 = 1.7487259 + n.*(2.2565277 - n.*(0.3038449 - n.*16.1713960)) ; c1 = 0.7536611 + n.* 0.7059508 ; c2 = 0.1675119 + n.*(0.0609113 + n.*0.0141376) ; c3 = 0.0184351 ; c4 = 0.0007881 ; retp(1 - n./(c0 + p.*(c1 + p.*(c2 + p.*(c3 + p.*c4))))) ; endp ; /* PR_K2 - Calculate probability values for k squared Statistic ** ** The K2 statistic is a goodness of fit statistic to test whether two ** cumulative frequency distributions are significantly different. ** It is the correlation coefficent between the cdf's of the two functions ** and so must take values between 0 and 1. ** ** Input: C - R x C matrix of critical values ** N - S x T matrix of distribution size, ExE conformable with C ** ** Output: P - R x C matrix of probabilities ie Prob(AD > C) = P ** ** Usage P = PR_K2(2.0,80) ; ** ** Notes: P is only returned for N > 5 ** Values of P are not calculated accurately for P < 0.001 or P > 0.2 ** CV_K2 provides the inverse transform, and estimates C given P,N */ proc pr_k2(z,n) ; local c0,c1,c2,c3,c4,p,mnz,mnp,mxz,mxp,tol,converge,f,df,pn,i,zn ; tol = 1e-8 ; if not (n > 4) ; errorlog "ERROR: PR_K2 - n value too small (n <= 4)" ; retp(error(1)) ; elseif not(0 < z and z < 1) ; errorlog "ERROR: PR_K2 - z value outside range (0,1)" ; retp(error(1)) ; endif ; n = 1./n ; c0 = 1.7487259 + n.*(2.2565277 - n.*(0.3038449 - n.*16.1713960)) ; c1 = 0.7536611 + n.* 0.7059508 ; c2 = 0.1675119 + n.*(0.0609113 + n.*0.0141376) ; c3 = 0.0184351 ; c4 = 0.0007881 ; mnp = ln(0.001) ; mnz = c0 + mnp.*(c1 + mnp.*(c2 + mnp.*(c3 + mnp.*c4))) ; mxp = ln(0.2) ; mxz = c0 + mxp.*(c1 + mxp.*(c2 + mxp.*(c3 + mxp.*c4))) ; z = n./(1 - z) ; zn = z ; zn = substute(zn,zn .< (mnz-tol),mnz) ; zn = substute(zn,zn .> (mxz+tol),mxz) ; clear converge,i ; p = ln(0.1).*ones(rows(z+n),cols(z+n)) ; do until(converge or i > 50) ; f = c0 + p.*(c1 + p.*(c2 + p.*(c3 + p.*c4))) ; df = c1 + p.*(2.*c2 + p.*(3.*c3 + p.*4.*c4)) ; pn = p + (z - f)./df ; converge = abs(p-pn) < tol ; p = pn ; i = i + 1 ; endo ; if not converge ; errorlog("WARNING: PR_K2 - iterations have not converged.") ; endif ; if not (z == zn) ; errorlog("WARNING: PR_K2 - some p values outside range 0.001 - 0.2") ; p = substute(exp(p),zn.==mnz,0.200) ; p = substute(exp(p),zn.==mxz,0.001) ; else ; p = exp(p) ; endif ; retp(p) ; endp ;