Permutation Test

Water Pollution

> water <- data.frame(pollution=scan(), river=rep(1:2,6:5))

1: 12 8 13 13 15 10

7: 6 8 7 9 7

12:

Read 11 items

> water # print it (on screen)

   pollution river
1         12     1
2          8     1
3         13     1
4         13     1
5         15     1
6         10     1
7          6     2
8          8     2
9          7     2
10         9     2
11         7     2

> # The function below will be used to compute t statistics.

> t0 <- function(x,Data,N=length(Data),m=length(x),n=N-m,dof=N-2)

+ {y <- Data[-x]; x <- Data[x]

+ sp2 <- ((m-1)*var(x) + (n-1)*var(y)) / dof

+ (mean(x) - mean(y)) / sqrt(sp2*(1/m+1/n))

+ }

>

> T0 <- t0(1:6,water$pollution)

> T0 # the test statistic

[1] 3.659011

> choose(11,6) # binomial coefficient, built-in function

[1] 462

> # The computation for all 462 permutation t statistics makes

> # use of the built-in function combn (Release 2.4 and later)

> Tstats <- combn(11, 6, FUN=t0, Data=water[[1]],

+ N=11,m=6,n=5,dof=9)

> # Note that parameters N,m,n,dof above are not mandatory. However,

> # using them sped up the computation.

> length(Tstats) # 462 permutation t statistics were generated

[1] 462

> Pperm <- (sum(Tstats>T0)+0.5*sum(Tstats==T0)) / choose(11,6)

> Pperm # P-value of the permutation test (conservative)

[1] 0.004329004

> # can exclude T0 itself as in

> (sum(Tstats>T0)+0.5*(sum(Tstats==T0)-1))/462

[1] 0.003246753

> # Histogram of t statistics and superimposed t curve

>

> range(Tstats)

[1] -5.842987  4.593666

> breaks <- seq(-6.5,5.5,1)

> breaks # break points of class intervals for histogram

 [1] -6.5 -5.5 -4.5 -3.5 -2.5 -1.5 -0.5  0.5  1.5  2.5  3.5  4.5  5.5

> tden <- list(x=(tx <- seq(-6,5,.1)),

+ y=dt(tx, 9)) # (x,y) coordinates of superimposed t curve

> windows(width=6, height=5) # 6in by 5in graphics window (under MS)

> # Next generate probability histogram with designated x label and title

> hist(Tstats, breaks=breaks, probability=TRUE,

+ xlab="t",

+ main="Permutation t Statistics\nwith superimposed t curve")

> lines(tden,lwd=2,col="blue") # superimposed t curve

> title(sub="water pollution example (problem 15, HW1)",

+ adj=0) # left-justified subtitle (footnote)

> arrows(T0,0, T0,0.1, angle=10, col="red") # marker for T0

> text(T0,0.1, pos=3, col="red",

+ label=bquote(t[0]==.(round(T0,2))) ) # show value of T0

> # The next two commands display the two hypotheses

> text(-6,0.3, adj=c(0,-0.5), col="brown",

+ label=bquote(H[0]*": "*mu[1]==mu[2]) )

> text(-6,0.3, adj=c(0,1.5), col="brown",

+ label=bquote(H[1]*": "*mu[1]>mu[2]) )

> # The next two commands display the respective P values using

> # permutation test and t test.

> text(-6,0.2, adj=c(0,-0.5), col="darkgreen",

+ label=bquote(P["perm"]==.(round(Pperm,4))) )

> text(-6,0.2, adj=c(0,1.5), col="darkgreen",

+ label=bquote(P[t]==.(round(pt(T0,9,lower=F),4))) )

> dev.off() # turn off the graphical device

null device 
          1
(perm.png here)