R Code from STA2201s04: Applied Statistics II


Here are some pieces of S code from the overheads. If you want to copy-paste them, it's better to do it from this document than from the PDF files, because sometimes there are invisible characters in the PDF files, and these can cause errors that are hard to fix, because you cannot see what caused the error. There is one SAS program, but really the SAS programs should be in a separate document.

I will add to this file throughout the course.


First, a few pieces from the introductory computer lecture.


pdf("testor.pdf")
plot(x,y,type='l')      # That's a lower case L
set.seed(12345) # Be able to reproduce the stream of pseudo-random numbers.

# regart.R    Demonstrate Regression Artifact
###################### Setup #######################
N <- 10000 ; n <- 100
truevar <- 180 ; errvar <- 45
truesd <- sqrt(truevar) ; errsd <- sqrt(errvar)
# set.seed(44444)
# Now define the function ttest, which does a matched t-test

ttest <- function(d) # Matched t-test. It operates on differences.
    {
    ttest <- numeric(4)
    names(ttest) <- c("Mean Difference"," t "," df "," p-value ")
    ave <- mean(d) ; nn <- length(d) ; sd <- sqrt(var(d)) ; df <- nn-1
    tstat <- ave*sqrt(nn)/sd
    pval <- 2*(1-pt(abs(tstat),df))
    ttest[1] <- ave ; ttest[2] <- tstat; ttest[3] <- df; ttest[4] <- pval
    ttest # Return the value of the function
    }

#####################################################

error1 <- rnorm(N,0,errsd) ; error2 <- rnorm(N,0,errsd)
truescor <- rnorm(N,100,truesd)
pre <- truescor+error1 ; rankpre <- rank(pre)

# Based on their rank on the pre-test, we take the n worst students and
# place them in a special remedial program, but it does NOTHING.

# Based on their rank on the pre-test, we take the n best students and
# place them in a special program for the gifted, but it does NOTHING.

post <- truescor+error2
diff <- post-pre # Diff represents "improvement."
                 # But of course diff = error2-error1 = noise

cat("\n") # Skip a line
cat("------------------------------------ \n")
dtest <- ttest(diff)
cat("Test on diff (all scores) \n") ; print(dtest) ; cat("\n")

remedial <- diff[rankpre<=n] ; rtest <- ttest(remedial)
cat("Test on Remedial \n") ; print(rtest) ; cat("\n")

gifted <- diff[rankpre>=(N-n+1)] ; gtest <- ttest(gifted)
cat("Test on Gifted \n") ; print(gtest) ; cat("\n")
cat("------------------------------------ \n")

#################### End of regart.R ####################

Regression stuff from introductory computer lecture.


####################################################################
# lesson2.R: execute with    R --vanilla < lesson2.R > lesson2.out #
####################################################################

datalist <-  scan("mcars.dat",list(id=0,country=0,kpl=0,weight=0,length=0))
# datalist is a linked list.
datalist
# There are other ways to read raw data. See help(read.table).
weight <- datalist$weight ; length <- datalist$length ; kpl <- datalist$kpl
country <- datalist$country
cor(cbind(weight,length,kpl))
# The table command gives a bare-bones frequency distribution
table(country)
# That was a matrix. The numbers 1 2 3 are labels.
# You can save it, and you can get at its contents
countrytable <- table(country)
countrytable[2]
# There is an "if" function that you could use to make dummy variables,
# but it's easier to use factor.
countryfac <- factor(country,levels=c(1,2,3),
                     label=c("US","Japanese","European"))
# This makes a FACTOR corresponding to country, like declaring it
# to be categorical. How are dummy variables being set up?
contrasts(countryfac)
# The first level specified is the reference category. You can get a
# different reference category by specifying the levels in a different order.
cntryfac <- factor(country,levels=c(2,1,3),
                     label=c("Japanese","US","European"))
contrasts(cntryfac)
# Test interaction. For comparison, with SAS we got F = 11.5127, p < .0001
# First fit (and save!) the reduced model. lm stands for linear model.
redmod <- lm(kpl ~ weight+cntryfac)
# The object redmod is a linked list, including lots of stuff like all the
# residuals. You don't want to look at the whole thing, at least not now.
summary(redmod)

# Full model is same stuff plus interaction. You COULD specify the whole thing.
fullmod <- update(redmod,. ~ . + weight*cntryfac)
anova(redmod,fullmod)
# The ANOVA summary table is a matrix. You can get at its (i,j)th element.
aovtab <- anova(redmod,fullmod)
aovtab[2,5] # The F statistic
aovtab[2,6] < .05       #    p < .05 -- True or false?
1>6 # Another example of an expression taking the logical value true or false.
##################### End of lesson2.R #######################################

##################### lesson3.R ############################
# Lesson 3: Factorial ANOVA with the potato2 data
# Run with  R --vanilla < lesson3.R > lesson3.out
#
system("cat potato2.dat")
spuds <- read.table("potato2.dat")
spuds[1:10,] # Rows 1:10, all columns
# Table of means
mtable <- tapply(spuds$Rot, list(spuds$Temp,spuds$Bact),mean)
mtable
#
# Note that just specifying the interaction forces the main effects in
summary(aov(Rot~factor(Bact)*factor(Temp),data=spuds))
#
# For custom tests, make a combination variable
#
rot <- spuds$Rot ; temp <- spuds$Temp ; bact <- spuds$Bact
combo <- 10*temp + bact # Two levels of temp x 3 levels of bact
table(temp,combo)
table(bact,combo)
#
# Suppose we want to test diff among bact just for low temp=1
# We can do it with a dummy var setup like the default contrasts.
#
n <- length(combo)
d1 <- numeric(n) ; d2 <- d1; d3 <- d1; d4 <- d1; d5 <- d1
d1[combo==12] <- 1 ; d2[combo==13] <- 1 ; d3[combo==21] <- 1
d4[combo==22] <- 1 ; d5[combo==23] <- 1
table(d1,combo)
table(d2,combo)
#
redmod <- lm(rot~d3+d4+d5) ; fullmod <- lm(rot~d1+d2+d3+d4+d5)
anova(redmod,fullmod)
#
# Suppose the null hypothesis is no temp effect for any type of bacteria
# H0:  mu11=mu21 , mu12=mu22  mu13=mu23
#
# Fit a reduced model by transforming the independent variables
# Need one more dummy variable
d0 <- numeric(n) ; d0[combo==11] <- 1
v1 <- d0+d3 ; v2 <- d1+d4 ; v3 <- d2+d5
#
# I can't make lm fit a model with no intercept! Must use lsfit. Ugh!
#
redfit <- lsfit(cbind(v1,v2,v3),rot,intercept=F) # Not like an lm object
SSEr <- sum(residuals(redfit)^2)
#
# Now we need some stuff from the full model
#
fullmat <- anova(fullmod) # ANOVA summary table is a matrix
fullmat
SSEf <- fullmat[6,2] ; SSEf
MSEf <- fullmat[6,3] ; MSEf
F2 <- (SSEr-SSEf)/(3*MSEf) ; pval <- 1-pf(F2,3,48)
F2 ; pval


It's so much easier with SAS

/* spud1.sas */
options linesize=79 noovp formdlim='_';
title 'Rotten potato check';

data spud;
     infile 'potato2.dat' firstobs=2; /* Skip the first line that R uses */
     input id bact temp rot;
     if temp=1 and bact=1 then mu11=1; else mu11=0;
     if temp=1 and bact=2 then mu12=1; else mu12=0;
     if temp=1 and bact=3 then mu13=1; else mu13=0;
     if temp=2 and bact=1 then mu21=1; else mu21=0;
     if temp=2 and bact=2 then mu22=1; else mu22=0;
     if temp=2 and bact=3 then mu23=1; else mu23=0;
     combo = 10*temp+bact;

proc glm;
     title2 'Standard 2-way ANOVA with proc glm';
     class bact temp;
     model rot=temp|bact;

proc reg;
     title2 'Custom tests with proc reg';
     model rot = mu11--mu23 / noint;
     ex1: test mu11=mu12=mu13;
     ex2: test mu11=mu21, mu12=mu22, mu13=mu23;

proc glm;
     class combo;
     title2 'Specify contrast matrix directly with proc glm';
     model rot = combo;
     contrast 'Bact diff for low temp'
        combo 1 -1  0  0  0  0,
        combo 0  1 -1  0  0  0;
     contrast 'Temp diff in each bact'
        combo 1  0  0 -1  0  0,
        combo 0  1  0  0 -1  0,
        combo 0  0  1  0  0 -1;

Power

##############################################################
#       powerfun.R
# Collection of function definitions used for power analysis
# in STA2201. Activate with  source("powerfun.R")
#
#                       Table of Contents
#
#     signpow
#     size1
#     merror
#     fpow2
#     matpow1
#     matpow2
#     t1pow & size2
#     poprsq
#     samprsq1
#     T1pow1
#     T1pow
#     Waldlogpow1
#     Waldlogpow2
#     chipow1
#     chipow
##############################################################

signpow <- function(theta,n) # Power of the sign test
    {
    L <- sqrt(n)*(.5-theta)/sqrt(theta*(1-theta))
    R <- 1.96/(2*sqrt(theta*(1-theta)))
    signpow <- 1 - pnorm(L+R) + pnorm(L-R)
    signpow
    } # End of function signpow

size1 <- function(truet,needpow=0.80,nstart=1,nend=1000000)
    {    # Search for sample size needed to get desired power of sign test
    nn <- nstart ; pow <- 0
    while(pownend) stop("Too many iterations!")
        }
    cat("\n")
    cat("For true Theta of ",truet,", sign test requires a sample \n")
    cat("        size of ",nn-1," to have power of ",pow,"\n")
    cat("\n")
    } # End function size1


merror <- function(phat,m,alpha) # (1-alpha)*100% merror for a proportion
    {
    z <- qnorm(1-alpha/2)
    merror <- z * sqrt(phat*(1-phat)/m)  # m is (Monte Carlo) sample size
    merror
    } # End of function merror

fpow2 <- function(r,q,effsize,wantpow=0.80,alpha=0.05)
#############################################################################
# Power for the general multiple regression model, testing H0: C Beta = h   #
#       r       is the number of beta parameters                            #
#       q       Number rows in the C matrix = numerator df                  #
#       effsize is ncp/n, a squared distance between C Beta and h           #
#       wantpow is the desired power, default = 0.80                        #
#       alpha   is the significance level, default = 0.05                   #
#############################################################################
    {
    pow <- 0 ; nn <- r+1 ; oneminus <- 1 - alpha
    while(pow < wantpow)
        {
        nn <- nn+1
        phi <- nn * effsize
        ddf <- nn-r
        pow <- 1 - pf(qf(oneminus,q,ddf),q,ddf,phi)
        }#End while
    fpow2 <- nn
    fpow2  # Returns needed n
    }      # End of function fpow2

matpow1 <- function(C,eff,f,wantpow=0.80,alpha=0.05)
# H0: C Beta = 0
# Beta is r x 1
# C     is q x r contrast matrix
# eff   is vector of effects (non-zero h) in sd units, length r
# f     is vector of RELATIVE sample sizes, all non-negative
#
    {
    f <- f/sum(f)
    if(min(f)<=0) stop("Cell sample sizes must all be positive.")
    kore <- solve(C%*%diag(1/f)%*%t(C))
    effsize <- t(eff)%*%kore%*%eff
    q <- dim(C)[1] ; r <- dim(C)[2]
#    cat("r,q,effsize,wantpow,alpha = ",r,q,effsize,wantpow,alpha,"\n")
    matpow1 <- fpow2(r,q,effsize,wantpow,alpha)
    matpow1
    } # End of function matpow1

matpow2 <- function(C,beta,f,wantpow=0.80,alpha=0.05)
# H0: C Beta = 0
# Beta is r x 1
# C     is q x r contrast matrix
# eff   is vector of effects (non-zero h) in sd units, length r
# f     is vector of RELATIVE sample sizes, all non-negative
#
    {
    f <- f/sum(f)
    if(min(f)<=0) stop("Cell sample sizes must all be positive.")
    eff <- C%*%beta
    kore <- solve(C%*%diag(1/f)%*%t(C))
    effsize <- t(eff)%*%kore%*%eff
    q <- dim(C)[1] ; r <- dim(C)[2]
#    cat("r,q,effsize,wantpow,alpha = ",r,q,effsize,wantpow,alpha,"\n")
    matpow1 <- fpow2(r,q,effsize,wantpow,alpha)
    matpow1
    } # End of function matpow2

#########################################################################
# t1pow & size2: Power analysis to find MC sample size for Z test       #
# of departure from specified Type I error rate (usually 0.05). Do      #
#               source("type1power.R")                                  #
# and then use the function size2 interactively.                        #
#########################################################################

t1pow <- function(a0,a1,size,m)  # Power of Z test for proportion
            # a0        Hypothesized Type I error rate
            # a1        True (alternative) Type I error rate
            # size      Size of Z test of H0: alpha=a0 vs not equal 
            # m         Monte Carlo sample size
    {
    L <- sqrt(m)*(a0-a1)/sqrt(a1*(1-a1))
    R <- qnorm(1-size/2) * sqrt( (a0*(1-a0))/(a1*(1-a1)) )
    t1pow <- 1 - pnorm(L+R) + pnorm(L-R)
    t1pow
    } # End of function t1pow

size2 <- function(a0=0.05,a1,alpha=0.01,needpow=0.80,nstart=1,nend=1000000)
    {
    nn <- nstart ; pow <- 0
    while(pownend) stop("Too many iterations!")
        }
    cat("\n")
    cat("In a Z test of H0: alpha = ",a0," vs not equal at the ",alpha,"\n")
    cat("level when the true Type I error rate is ",a1,", a Monte Carlo \n")
    cat("sample size of ",nn-1," is required to have a power of ",pow,"\n")
    cat("\n")
    } # End function size2


poprsq <- function(r,q,a,wantpow=0.80,alpha=0.05)
# Cohen's Popularion R-squared Method 
#   r   Number of IVs in full model
#   q   Numerator df = number of linear constraints being tested
#   a   Population proportion of remaining variation explained.
#       This is Cohen's "effect size."
#   wantpow     Desired power (default = 0.80)
#   alpha       Significance level (default = 0.05)
    {
    pow <- 0 ; nn <- r+1 ; oneminus <- 1 - alpha
    while(pow < wantpow)
        {
        nn <- nn+1
        phi <- (nn-r) * a/(1-a)
        ddf <- nn-r
        pow <- 1 - pf(qf(oneminus,q,ddf),q,ddf,phi)
        }#End while
     poprsq <- nn
     poprsq
     } # End of function poprsq

samprsq1 <- function(r,q,a,alpha=0.05)
# Find n so remaining proportion of SS explained will be significant 
#   r   Number of IVs in full model
#   q   Numerator df = number of linear constraints being tested
#   a   Sample proportion of remaining variation explained.
#   alpha       Significance level (default = 0.05)
    {
    pval <- 1 ; n <- r+1 
    while(pval > alpha)
        {
        n <- n+1
        F <- (n-r)/q * a/(1-a)
        df2 <- n-r
        pval = 1-pf(F,q,df2)
        }#End while
     samprsq1 <- n
     samprsq1
     } # End of function samprsq1


T1pow1 <- function(n,mu,C,h,f,alpha=0.05,prop=TRUE,sigsq=NULL)
###################################################################
# Power for large sample test of H0: C mu = h
#   n       Sample size
#   mu      Vector of r population means. E[ybar] = mu
#   C       q by r hypothesis matrix
#   f       Vector of r relative sample sizes
#   alpha   Significance level: default = 0.05
#   prop    Is this a test on sample proportions? If TRUE (the default),
#           there is no need to supply variances.
#   sigsq    Vector of population varinces, not needed if prop=TRUE
###################################################################
    {
    q <- dim(C)[1] ; r <- dim(C)[2]
    # Error Checks
       if(length(mu)!=r) stop("Length of mu must be num of cols in C.")
       if(length(h) != q) stop("Length of h must be num of rows in C.")
       if(length(f) != r) stop("Length of f must be num of cols in C.")
       if(prop==FALSE & length(sigsq) != length(mu))
          stop("Lengths of mu and sigsq are unequal.")
    # End error checks
    oneminus <- 1 - alpha ; f <- f/sum(f)
    if(prop) sigsq <- mu*(1-mu)
    kore <- solve(C%*%diag(sigsq/f)%*%t(C))
    ncp <- n * t(C%*%mu-h)%*%kore%*%(C%*%mu-h)
    pow <- 1 - pchisq(qchisq(oneminus,q),q,ncp)    
    T1pow1 <- pow
    T1pow1
    } # End of function T1pow1

T1pow <- function(mu,C,h,f,wantpow=0.80,alpha=0.05,prop=TRUE,
                  sigsq=NULL,maxn=100000)
###################################################################
# Sample size required for desired power: large sample test of H0: C mu = h
#   mu      Vector of r population means. E[ybar] = mu
#   C       q by r hypothesis matrix
#   f       Vector of r relative sample sizes
#   wantpow Desired power
#   alpha   Significance level: default = 0.05
#   prop    Is this a test on sample proportions? If TRUE (the default),
#           there is no need to supply variances.
#   sigsq    Vector of population varinces, not needed if prop=TRUE
#   maxn     Maximum sample size, to avoid endless loop
###################################################################
    {
    q <- dim(C)[1] ; r <- dim(C)[2]
    # Error Checks
       if(length(mu)!=r) stop("Length of mu must be num of cols in C.")
       if(length(h) != q) stop("Length of h must be num of rows in C.")
       if(length(f) != r) stop("Length of f must be num of cols in C.")
       if(prop==FALSE & length(sigsq) != length(mu))
          stop("Lengths of mu and sigsq are unequal.")
    # End error checks
    oneminus <- 1 - alpha ; f <- f/sum(f)
    if(prop) sigsq <- mu*(1-mu)
    pow <- 0 ; nn <- r+1 ; oneminus <- 1 - alpha
    while(pow < wantpow)
        {
        nn <- nn+1
        if(nn>maxn) stop("Maxn exceeded!")
        kore <- solve(C%*%diag(sigsq/f)%*%t(C))
        ncp <- nn * t(C%*%mu-h)%*%kore%*%(C%*%mu-h)
        pow <- 1 - pchisq(qchisq(oneminus,q),q,ncp)
        }#End while
    T1pow <- nn
    T1pow
    } # End of function T1pow

waldlogpow1 <- function(n,beta,X,C,h,alpha=0.05)
################################################################
# Power of Wald test for logistic regression: Single sample size
#     n           Sample size
#     beta        Vector of regression coefficients
#     X           The X matrix, with intercept if appropriate
#     C           H0 is C Beta = h
#     h           C Beta under H0
#     alpha       Significance level (size of test): Default = 0.05
################################################################
      {
      q <- dim(C)[1] ; r <- dim(C)[2]
      xb <- X %*% beta
      Vbeta <- diag(as.vector(exp(xb)/(1+exp(xb))^2))
      Ibeta <- t(X) %*% Vbeta %*% X
      kore <- solve(C%*% solve(Ibeta) %*%t(C))
      ncp <- t(C%*%beta-h)%*%kore%*%(C%*%beta-h)
      pow <- 1 - pchisq(qchisq(1-alpha,q),q,ncp)
      waldlogpow1 <- pow
      waldlogpow1
      } # End of function waldlogpow1

waldlogpow2 <- function(beta,C,h,alpha=0.05,wantpow=0.80,
                        nstart=25,nmax=100000,display=FALSE)
################################################################
# Power analysis to find sample size for logistic regression.
# Intended to be a rough guess at sample size needed for a LR test.
# X matrix must be generated internally, but just once -- random or not.
# 
#     beta        Vector of regression coefficients
#     C           H0 is C Beta = h
#     h           C Beta under H0
#     alpha       Significance level (size of test): Default = 0.05
#     wantpow     Desired power: Default = 0.80
#     nstart      Starting sample size
#     nmax        To stop endless loop
#     display     If TRUE, print n power and power during search 
################################################################
    {
    # Generate the first nstart rows of the X matrix. This will vary from
    # problem to problem.
    # Calculate square root matrix A. A%*%Z will have var-cov Sigma
    # This is for MVN IVs
    sigma <- rbind( c(50,sqrt(2000)/2),
                    c(sqrt(2000)/2,40) )
    spec <- eigen(sigma)
    A <- spec$vectors %*% diag(sqrt(spec$values))
    Z <- rbind(rnorm(nstart),rnorm(nstart))
    X <- cbind(matrix(1,nstart),t(A%*%Z))
    X[,2] <- X[,2]+70 ; X[,3] <- X[,3]+80
    ########### Finished generating first nstart rows ################
    pow <- waldlogpow1(nstart,beta,X,C,h)
    n <- nstart
    if(display) cat("Starting n = ",n," Power = ",pow,"\n")
    while(pownmax) stop("Too many iterations!")
        } # End search over n
    walodlogpow2 <- n
    walodlogpow2
    } # End of function waldlogpow2

chipow1 <- function(n,pi,alpha=0.05)
# Power for Pearson's chi-square test of independence
#  n        Sample size
#  pi       Matrix of true probabilities
#  alpha    Significance level
    {
    oneminus <- 1 - alpha
    pi <- pi/sum(pi)
    r <- dim(pi)[1] ; c <- dim(pi)[2]
    df <- (r-1)*(c-1)
    rmarg <- apply(pi,1,sum) # Apply sum to dim 1 (rows)
    cmarg <- apply(pi,2,sum)
    piM <- as.matrix(rmarg)%*%t(as.matrix(cmarg))
    ncp <- n * sum((pi-piM)^2/piM)
    chipow1 <- 1 - pchisq(qchisq(oneminus,df),df,ncp)
    chipow1
    } # End of function chipow1

chipow <- function(pi,wantpow=0.80,alpha=0.05,maxn=100000,display=FALSE)
###################################################################
# Sample size required for desired power: 
# Chisquare test of Independence
#
#   pi          Matrix of true probabilities
#   wantpow     Desired power: default = 0.80
#   alpha       Significance level: default = 0.05
#   maxn        Maximum sample size, to avoid endless loop
#   display     See details?
###################################################################
    {
    ############ One-time only calculations #############
    pi <- pi/sum(pi)
    r <- dim(pi)[1] ; c <- dim(pi)[2]
    df <- (r-1)*(c-1)
    rmarg <- apply(pi,1,sum) # Apply sum to dim 1 (rows)
    cmarg <- apply(pi,2,sum)
    piM <- as.matrix(rmarg)%*%t(as.matrix(cmarg))
    nn <- round(5/min(piM)) ; pow <- 0 ; oneminus <- 1 - alpha
    ######################################################
    while(pow < wantpow)
        {
        nn <- nn+1
        if(nn>maxn) stop("Maxn exceeded!")
        ncp <- nn * sum((pi-piM)^2/piM)
        pow <- 1 - pchisq(qchisq(oneminus,df),df,ncp)
        if(display) cat("  n = ",nn,"  Power = ",pow,"\n")
        }#End while
    chipow <- nn
    if(display)
        {
        cat("\n")
        cat("True cell probabilities:\n")
        cat("\n")
        print(pi)
        cat("\n")
        cat("Probabilities under H0 of independence:\n")
        cat("\n")
        print(piM)
        cat("\n")
        }
    chipow
    } # End of function chipow



That was the end of the file powerfun.R. It will continue to grow some as the course progresses. Now here is some more power code from the overheads.


######################################################################
# powplot.R  -  Plot Power of sign test as a function of true Theta  #
#                                                                    #
# R --vanilla < powplot.R                                            #
#####################################################################

source("powerfun.R") # To define function signpow

Theta <- seq(from=.01,to=.99,by=.01)
Power <- signpow(Theta,50)
p100 <- signpow(Theta,100)
system("rm powplot.pdf")
pdf("powplot.pdf")

plot(Theta,Power,type="l")
lines(Theta,p100,lty=2)
title("Power of the Sign Test")
# I understand there is  a "legend" command that may do this better.
x1 <- c(.70,.80) ; y1 <- c(.2,.2) ; lines(x1,y1,lty=1)
text(.9,.2,"n =  50")
x2 <- c(.70,.80) ; y2 <- c(.1,.1) ; lines(x2,y2,lty=2)
text(.9,.1,"n = 100")


############## End of powplot.R ####################

# cvm.R
# Power for test of zero correlation for an entire matrix
# (Large Sample LR Test)
# Execute with   source("cvm.R")
#
M <- 10000
sim <- numeric(M)
set.seed(32448)
n <- 50 ; v1 <- .42 ;       v2 <- .18
          s1 <- sqrt(v1) ; s2 <- sqrt(v2)
G <- function(datamat)
   {
   nn <- dim(datamat)[1] ; kk <- dim(datamat)[2] ; df <- kk*(kk-1)/2
   G <- numeric(3)
   names(G) <- c("Chisq","df","P-value")
   S <- var(datamat)
   G[1] <- nn * ( sum(log(diag(S))) - sum(log(eigen(S)$values)) ) #$
   G[2] <- df
   G[3] <- 1 - pchisq(G[1],df)
   G
   } # End function G
merror <- function(phat,m,alpha=0.01) # (1-alpha)*100% merror for a proportion
    {
    z <- qnorm(1-alpha/2)
    merror <- z * sqrt(phat*(1-phat)/m)  # m is (Monte Carlo) sample size
    merror
    }

for(j in 1:M)
   {

e1 <- rnorm(n,0,s1) ; e2 <- rnorm(n,0,s2)
x1 <- rnorm(n)+e1 ; x2 <- rnorm(n)+e1 ;  x3 <- rnorm(n)+e1 ;
x4 <- rnorm(n)+e2 ;  x5 <- rnorm(n)+e2
dat <- cbind(x1,x2,x3,x4,x5)
# print(G(dat))
print(j)
sim[j] <- G(dat)[3] < .05

    }
poww <- length(sim[sim==1])/M
cat("Power = ", poww ,"\n")
cat("Plus or Minus 99% Margin of error: ",merror(poww,M),"\n")

############## End of cvm.R ####################

# Power for comparing 2 means
n <- seq(from=120,to=140,by=2) ; phi <- n/16 ; ddf <- n-2
cbind(n,pf(qf(.95,1,ddf),1,ddf,phi,FALSE))

# Comparing r means

3 * var(c(0,.25,.5,.75)) / 4
source("powerfun.R") # To define fpow2
fpow2(r=4,q=3,effsize=0.078125)

# Signal to noise ratio
source("powerfun.R") # To define fpow2
fpow2(r=6,q=5,effsize=0.10)

# Interaction example with fpow2
source("powerfun.R") # To define fpow2
fpow2(r=6,q=2,effsize=0.01388889,wantpow=0.80,alpha=0.05)

# Interaction example with matpow1
source("powerfun.R")
effmat <- rbind(c(1, -1, -1,  1,  0,  0),
                c(0,  0,  1, -1, -1,  1)  )
h <- c(0,-.5)
ssizes <- numeric(6) + 1
matpow1(effmat,h,ssizes) # Using default wantpower of 0.80 and alpha=0.05

# Interaction example with matpow2
source("powerfun.R") 
effmat <- rbind(c(1, -1, -1,  1,  0,  0),
                c(0,  0,  1, -1, -1,  1)  )
cellmeans <- c(0,.25,0,.25,0,-.25)
ssizes <- numeric(6) + 1
matpow2(effmat,cellmeans,ssizes) # Using defaults for wantpower and alpha

# Playing with relative sample sizes in a oneway design.
source("powerfun.R") # Defining matpow2
beta <- c(0,.25,.5,.75)
Cmat <- rbind( c(1,-1, 0, 0),
               c(0, 1,-1, 0),
               c(0, 0, 1,-1) )
f <- c(2,1,1,2)               
matpow2(Cmat,beta,f)


#######################################################################
# randpow0.R
#                 A brutal simulation to estimate Power for a 
#                 normal linear model with 3 random IVs.
#              
# Run with        source("randpow0.R")
#
#######################################################################

source("powerfun.R") # To define merror
#set.seed(12345)
n <- 100 ; m <- 1000 ; signif <- numeric(m)
b0 <- 0 ; b1 <- 1 ; b2 <- .1 ; b3 <- .2
sigma <- 5 ; confid <- .99

for(i in 1:m)
      {
      e1 <- rpois(n,5) ; e2 <- rpois(n,5) ; e3 <- rpois(n,5) 
      x1 <- e1+e3 ; x2 <- e2+e3 ; x3 <- rnorm(n,10,sqrt(10))
      epsilon <- rnorm(n,0,sigma)
      y <- b0 + b1*x1 + b2*x2 + b3*x3 + epsilon
      mod1 <- lm(y~x1) ; mod2 <- lm(y~x1+x2+x3)
      signif[i] <- anova(mod1,mod2)[2,6] < 0.05
      }
Phat <- mean(signif)
cat("\n")
cat("       A brutal simulation to estimate Power for a \n")
cat("       normal linear model with 3 random IVs.      \n")
cat("\n")
cat(" n,m = ",n,m,"\n")
cat(" b0, b1, b2, b3, sigma = ",b0, b1, b2, b3, sigma, "\n")
cat("\n")
cat(" Estimated power = ",Phat,", with ",confid*100,"% margin of error \n")
cat(merror(Phat,m,1-confid),".\n")
cat("\n")
############## End of randpow0.R ####################

howlong0 <- system.time(source("randpow0.R"))
howlong0
sum(howlong0[1:2])


#######################################################################
# randpow1.R
#                 Power for the normal linear model, with random 
#                 independent variables. Run with  source("randpow1.R")
#
#######################################################################

#set.seed(12345)

n <- 100                # Sample size
m <- 50                # Monte Carlo sample size
beta <- c(0,1,.1,.2)    # Parameters
sigma <- 5
C <- rbind( c(0,0,1,0), # H0: C Beta = 0
            c(0,0,0,1)  )
alpha <- 0.05           # Significance level of test
confid <- .99           # Want 100*confid percent margin of 
                        # error for MC estimate
#########################################################################
# One-time-only calculations
Y <- numeric(m)
eff <- C%*%beta
q <- dim(C)[1] ; r <- dim(C)[2]

################# Monte Carlo Loop ############################
for(i in 1:m)
      {
      # Generate Random IVs and bind into X matrix. This  will
      # vary from problem to problem.
            e1 <- rpois(n,5) ; e2 <- rpois(n,5) ; e3 <- rpois(n,5) 
            x1 <- e1+e3 ; x2 <- e2+e3 ; x3 <- rnorm(n,10,sqrt(10))
            X <- cbind(matrix(1,n),x1,x2,x3)
      # Calculate non-centrality parameter
            xpxinv <- solve(t(X)%*%X)
            kore <- solve(C%*%xpxinv%*%t(C))
            ncp <- t(eff)%*%kore%*%eff/sigma^2
       Y[i] <- 1 - pf(qf(1-alpha,q,n-r),q,n-r,ncp)
       } 
############## End of Monte Carlo Loop ############################

Phat <- mean(Y) ; SDhat <- sqrt(var(Y))
margin <- qnorm(1-(1-confid)/2)*SDhat/sqrt(m)

cat("\n")
cat("       Monte Carlo integration to estimate Power for a \n")
cat("                  normal linear model.      \n")
cat("\n")
cat(" n,m = ",n,m,"\n")
cat(" Beta = ",beta, "\n")
cat(" Sigma = ",sigma, "\n")
cat("\n")
cat(" Estimated power = ",Phat,", with ",confid*100,"% margin of error \n")
cat(margin,".\n")
cat("\n")
############## End of randpow1.R ####################


#######################################################################
# randpow2.R
#                 Approximate Power for the normal linear model,  
#                 with random independent variables. X-prime-X/n
#                 goes a.s. to a constant. Put the constant in.
#
#                 Run with  source("randpow2.R")
#
#######################################################################

n <- 100                # Sample size
beta <- c(0,1,.1,.2)    # Parameters
sigma <- 5
C <- rbind( c(0,0,1,0), # H0: C Beta = 0
            c(0,0,0,1)  )
alpha <- 0.05           # Significance level of test
                        # Call this the "MOMent MATrix"
                        # X-prime-X divided by n goes to this a.s.
mommat <- rbind(  c( 1, 10, 10, 10),  
                  c(10,110,105,100),
                  c(10,105,110,100),
                  c(10,100,100,110)   
               )
#
# Calculate non-centrality parameter
xpxinv <- solve(mommat)
kore <- solve(C%*%xpxinv%*%t(C))
eff <- C%*%beta
ncp <- n * t(eff)%*%kore%*%eff/sigma^2
q <- dim(C)[1] ; r <- dim(C)[2]
# Estimated Power is still called P-Hat
Phat <- 1 - pf(qf(1-alpha,q,n-r),q,n-r,ncp)
cat("\n")
cat("       Rough Power guess for a normal linear model.\n")
cat("\n")
cat(" n = ",n,"\n")
cat(" Beta = ",beta, "\n")
cat(" Sigma = ",sigma, "\n")
cat("\n")
cat(" Estimated power = ",Phat," \n")
cat("\n")
############## End of randpow2.R ####################


Logistic Regression

Give me some probabilities and I will give you logistic regression coefficients.


x1 <- c(70,90,90) ; x2 <- c(70,70,90) ; pi <- c(.2,.7,.8)
lodds <- log(pi/(1-pi))
tellme <- lsfit(cbind(x1,x2),lodds)
tellme$coef

In the example below, I gave a matrix of independent variables to glm, but you don't need to bind the independent variables into a matrix. Just list them. You can also use factors.


#######################################################################
# logpow0.R
#                 A brutal simulation to estimate Power for a 
#                 logistic regression model with 2 random IVs.
#              
# Run with        source("logpow0.R")
#
#######################################################################

source("powerfun.R") # To define merror
#set.seed(12345)
n <- 100 ; m <- 200 ; signif <- numeric(m)
beta <- c(-11.09035489,0.11167961,0.02694983)
confid <- .99
######## One-time-only calculations ############
critval <- qchisq(.95,1)
# Calculate square root matrix A. A%*%Z will have var-cov Sigma
sigma <- rbind( c(50,sqrt(2000)/2),
                c(sqrt(2000)/2,40) )
spec <- eigen(sigma)
A <- spec$vectors %*% diag(sqrt(spec$values))
################ Monte Carlo Loop ##############
for(i in 1:m)
      {
      # Simulate X
      Z <- rbind(rnorm(n),rnorm(n))
      X <- cbind(matrix(1,n),t(A%*%Z))
      X[,2] <- X[,2]+70 ; X[,3] <- X[,3]+80
      # Okay that's X. Now simulate Y
      xb <- X %*% beta
      pi <- exp(xb)/(1+exp(xb))
      Y <- rbinom(n,1,pi)
      fullmod <- glm(Y ~ X[,2:3], family=binomial ) # Full model
      redmod <- glm(Y ~ X[,2], family=binomial ) # Reduced model
      signif[i] <- anova(redmod,fullmod)[2,4]>critval
      } 
############## End Monte Carlo Loop ##############
Phat <- mean(signif)
cat("\n")
cat("       Simulation to estimate Power for a \n")
cat("     logistic regression model with 2 random IVs.  \n")
cat("\n")
cat(" n,m = ",n,m,"\n")
cat(" Beta = ",beta, "\n")
cat(" Sigma = \n") 
print(sigma)
cat("\n")
cat(" Estimated power = ",Phat,", with ",confid*100,"% margin of error \n")
cat(merror(Phat,m,1-confid),".\n")
cat("\n")
################### End of logpow0.R #######################################

howlong <- system.time(source("logpow0.R")) ; sum(howlong[1:2])

The Population R2 Method. Note that the function poprsq is also in powerfun.R, listed earlier.

poprsq <- function(r,q,a,wantpow=0.80,alpha=0.05)
# Cohen's Population R-squared Method 
#   r   Number of IVs in full model
#   q   Numerator df = number of linear constraints being tested
#   a   Population proportion of remaining variation explained.
#       This is Cohen's "effect size."
#   wantpow     Desired power (default = 0.80)
#   alpha       Significance level (default = 0.05)
    {
    pow <- 0 ; nn <- r+1 ; oneminus <- 1 - alpha
    while(pow < wantpow)
        {
        nn <- nn+1
        phi <- (nn-r) * a/(1-a)
        ddf <- nn-r
        pow <- 1 - pf(qf(oneminus,q,ddf),q,ddf,phi)
        }#End while
     poprsq <- nn
     poprsq
     } # End of function poprsq

The Sample R2 Method. Note that the function samprsq1 is also in powerfun.R, listed earlier.

samprsq1 <- function(r,q,a,alpha=0.05)
# Find n so remaining proportion of SS explained will be significant 
#   r   Number of IVs in full model
#   q   Numerator df = number of linear constraints being tested
#   a   Sample proportion of remaining variation explained.
#   alpha       Significance level (default = 0.05)
    {
    pval <- 1 ; n <- r+1 
    while(pval > alpha)
        {
        n <- n+1
        F <- (n-r)/q * a/(1-a)
        df2 <- n-r
        pval = 1-pf(F,q,df2)
        }#End while
     samprsq1 <- n
     samprsq1
     } # End of function samprsq1

Surrogate Population Method

# Want to detect diff betw means with oneway ANOVA when 
# Groups 1 and 2 are from urn1 and group3 is from urn2 
urn1 <- c(1,2,2,3,3,3,3,4,4,5)
urn2 <- c(1,1,1,1,2,2,4,4,5,5)
mu1 <- mean(urn1) ; mu2 <- mean(urn1) ; mu3 <- mean(urn2)
v1 <- 9*var(urn1)/10 ; v2 <- 9*var(urn2)/10
sigma <- sqrt((2*v1+v2)/3)

> mu1 ; mu2 ; mu3
[1] 3
[1] 3
[1] 2.6
> v1 ; v2
[1] 1.2
[1] 2.64
> sigma
[1] 1.296148

# First a full parametric analysis, to find a good starting value
source("powerfun.R")

>
> matpow1
function(C,eff,f,wantpow=0.80,alpha=0.05)
# H0: C Beta = 0
# Beta is r x 1
# C     is q x r contrast matrix
# eff   is vector of effects (non-zero h) in sd units, length r
# f     is vector of RELATIVE sample sizes, all non-negative
>

C <- rbind(c(1,-1,0),
           c(0,1,-1))
effect <- c(0,(mu2-mu3)/sigma)

matpow1(C,effect,f=c(1,1,1))
> matpow1(C,effect,f=c(1,1,1))
[1] 459
> 459/3
[1] 153

We could be done, but now use all the information we have about the distribution of Y.

# surrog1.R
# Specialized power estimate for the three-sample example
# A brutal simulation
# Want to detect diff betw means with oneway ANOVA when 
# Groups 1 and 2 are from urn1 and group3 is from urn2 
#  source("surrog1.R") and then use  sur1(n,m,f=c(1,1,1))
#
source("powerfun.R")
urn1 <- c(1,2,2,3,3,3,3,4,4,5)
urn2 <- c(1,1,1,1,2,2,4,4,5,5)
mu1 <- mean(urn1) ; mu2 <- mean(urn1) ; mu3 <- mean(urn2)
v1 <- 9*var(urn1)/10 ; v2 <- 9*var(urn2)/10
sigma <- sqrt((2*v1+v2)/3)
C <- rbind(c(1,-1,0),
           c(0,1,-1))
effect <- c(0,(mu2-mu3)/sigma)
nstart <- matpow1(C,effect,f=c(1,1,1))
cat("Start with n of ",nstart,"\n")

sur1 <- function(n,m,f=c(1,1,1))
        # Specialized power estimate for the three-sample example
        # urn1 and urn2 must already exist
        # A brutal simulation
        # uses function merror
    {
    sur1 <- numeric(2)
    names(sur1) <- c("Est Power","99% Margin of error")
    f <- f/sum(f)
    nn <- round(n*f)
    x <- c(numeric(nn[1])+1,numeric(nn[2])+2,numeric(nn[3])+3)
    xfac <- factor(x)
    nsig <- 0
    for(i in 1:m)  # Monte Carlo loop
        {
        samp1 <- sample(urn1,size=nn[1],replace=TRUE)
        samp2 <- sample(urn1,size=nn[2],replace=TRUE)
        samp3 <- sample(urn2,size=nn[3],replace=TRUE)
        y <- c(samp1,samp2,samp3)
        if(anova(lm(y~xfac))[1,5] < 0.05) nsig <- nsig+1
        } # End MC loop
        sur1[1] <- nsig/m
        sur1[2] <- merror(sur1[1],m,0.01)
        sur1
    } # End function sur1

Estimating Power: A simulation

# estpow1.R   
#
# For large M, run with   
#            nohup nice R --vanilla < estpow1.R > estpow1.out &
#
# Two-sample simulation: Version 2
# ncp = n * q(1-q) * delta^2
# For detecting delta = |mu1-mu2|/sigma = 1/2 with prob .8, need 
# n1 = n2 = 64 for n of 128

findn <- function(wantpow=.8,esize,nstart=3,q=.5) 
       # Specific to the 2-sample problem
       # ncp = n * q(1-q) * delta^2 
    {
    pow <- 0 ; nn <- nstart
    while(pow < wantpow)
        {
        nn <- nn+1
        ddf <- nn-2
        ncp <- nn * q*(1-q) * esize^2
        pow <- pf(qf(.95,1,ddf),1,ddf,ncp,FALSE)
        # cat("n=",nn," pow=",pow,"\n")
        }# End while
    findn <- nn
    if(q==.5 && 2*trunc(findn/2)!=findn) findn <- findn+1 
    # Make findn even if q = 1/2
    findn
    } # End function findn


# equal sample sizes n, half a standard dev. difference bet means
n <- 128 ; ddf <- n-2 ; vect <- numeric(n/2)+1 
x <- c(vect,numeric(n/2)) ; dif <- .5 ; eff <- x*dif

M <- 50 ; simp <- numeric(M) ; effsize <- numeric(M) 
estpow <- numeric(M) ; needn <- numeric(M)
set.seed(4444)

for(i in 1:M) 
    {
    y <- eff+rnorm(n)
    amat <- anova(lm(y~x))
    simp[i] <- amat[1,5]
    F <- amat[1,4]
    effsize[i] <- sqrt(4*F/n)
    estpow[i] <- pf(qf(.95,1,ddf),1,ddf,F,FALSE)
    needn[i] <- findn(esize=effsize[i])
    }# Next simulation

results <- rbind(simp,effsize,estpow,needn)
results <- rbind(1:M,results[,order(simp)])
cat("\n")
cat("Order  p-value  effsize  estpow  needn  \n")
cat("\n")

write(results,"",ncolumns=5) # Writes the transpose

#################### End of estpow1.R ###################

# pilot1.R
# Pilot Study: Want to detect mu1 - mu2 with prob 0.80, but
# don't know sigma=2. Estimate it from a pilot study, and then
# estimate n needed for desired power.
# Run with      source("pilot1.R")

sigma <- 2 ; diff <- 1; wishpow <- 0.80
n <- 128 ; M <- 25
x <- c(numeric(n/2)+1,numeric(n/2))
rr <- 2 ; qq <- 1
source("powerfun.R")
CC <- rbind(c(1,-1)); ff <- c(1,1) ; ff <- ff/sum(ff)
rightn <- matpow1(CC,diff/sigma,ff,wishpow)
needn <- realpow <- estsig <- numeric(M)
cat("\n")
cat(" diff = ",diff,"sigma = ",sigma,"wishpow = ",wishpow,"\n")
cat(" rightn = ",rightn,"\n")
cat("\n")
for(i in 1:M)
    {
    y <- rnorm(n,0,sigma) + diff*x
    sighat <- sqrt(sum((lsfit(x,y)$residuals)^2)/(n-2))
    estsig[i] <- sighat
    eff <- diff/sighat # Desired effect in estimated SD units
    needn[i] <- matpow1(CC,eff,ff,wishpow)
    ncp <- needn[i]*prod(ff)*(diff/sigma)^2
    ddf <- needn[i]-2
    realpow[i] <- pf(qf(.95,1,ddf),1,ddf,ncp,FALSE)
    # Actual power at that sample size
    }
pilot <- cbind(estsig,needn,realpow)
sim <- 1:M
pilot <- cbind(sim,pilot[order(estsig),]) # Sort the results
print(pilot)
###################### End of pilot1.R ######################


Large-sample test on means: T1


T1 <- function(ybar,C,h,nn,prop=TRUE,s2=NULL)
###################################################################
# Large sample test of H0: C mu = h
#   ybar    Vector of r sample means. E[ybar] = mu
#   C       q by r hypothesis matrix
#   nn      Vector of r sample sizes
#   prop    Are these sample proportions? If TRUE (the default),
#           there is no need to supply sample variances.
#   s2      Vector of sample varinces, not needed if prop=TRUE
###################################################################
    {
    q <- dim(C)[1] ; r <- dim(C)[2]
    # Error Checks
       if(length(ybar)!=r) stop("Length of ybar must be num of cols in C.")
       if(length(h) != q) stop("Length of h must be num of rows in C.")
       if(length(nn) != r) stop("Length of nn must be num of cols in C.")
       if(prop==FALSE & length(s2) != length(ybar))
          stop("Lengths of ybar and s2 are unequal.")
    # End error checks
    T1 <- numeric(3)
    names(T1) <- c("Chisq "," df "," p-value")
    if(prop) s2 <- nn/(nn-1) * ybar*(1-ybar)
    kore <- solve(C%*%diag(s2/nn)%*%t(C))
    T1[1] <- t(C%*%ybar-h)%*%kore%*%(C%*%ybar-h)
    T1[2] <- q
    T1[3] <- 1-pchisq(T1[1],q)
    T1
    } # End of function T1



# Do the surgery example with T1
dat <- read.table("surgery.dat")
method <- dat$method ; success <- dat$success
thetahat <- tapply(success,list(method),mean)
thetahat
table(method)

# Chisquare test of independence: chisquare = 10.421
# Logistic regression: chisquare = 10.347

CC <- rbind( c(1,-1,0),
             c(0,1,-1)  )
hh <- c(0,0)
sampsizes <- table(method)
T1(thetahat,CC,hh,sampsizes)

# Get another large-sample test for equality of proportions. Do a
# regular one-way ANOVA, and F * q is roughly Chisq(q).
# This works because under H0, all the VARIANCES are equal, and
# MSE is a consistent estimator of the common value.

anova(lm(success~factor(method)))

source("powerfun.R")
urn1 <- c(1,2,2,3,3,3,3,4,4,5)
urn2 <- c(1,1,1,1,2,2,4,4,5,5)
mu1 <- mean(urn1) ; mu2 <- mean(urn1) ; mu3 <- mean(urn2)
v1 <- 9*var(urn1)/10 ; v2 <- 9*var(urn2)/10
mu <- c(mu1,mu2,mu3) ; mu
v <- c(v1,v1,v2) ; v
sigma <-sqrt(mean(v))
C <- rbind(c(1,-1,0),
           c(0,1,-1))
effect <- c(0,(mu2-mu3)/sigma)
matpow1(C,effect,f=c(1,1,1))

T1pow(mu,C,h=c(0,0),f=c(1,1,1),prop=FALSE,sigsq=v)

# surrog2.R
# Another specialized power estimate for the three-sample example,
# using T1 instead of a regular ANOVA.
# A brutal simulation
# Groups 1 and 2 are from urn1 and group3 is from urn2
#  source("surrog2.R") and then use  sur2(n,m,f=c(1,1,1))
#
source("powerfun.R") ; source("T1.R")
urn1 <- c(1,2,2,3,3,3,3,4,4,5)
urn2 <- c(1,1,1,1,2,2,4,4,5,5)
mu1 <- mean(urn1) ; mu2 <- mean(urn1) ; mu3 <- mean(urn2)
v1 <- 9*var(urn1)/10 ; v2 <- 9*var(urn2)/10
mu <- c(mu1,mu2,mu3) ; cat("True mu = ",mu,"\n")
v <- c(v1,v1,v2) ;  cat("True variances = ",v,"\n")
C <- rbind(c(1,-1,0),
           c(0,1,-1))
nstart <- T1pow(mu,C,h=c(0,0),f=c(1,1,1),prop=FALSE,sigsq=v)
cat("T1pow suggests starting with n of ",nstart,"\n")
cat("\n")
sur2 <- function(n,m,f=c(1,1,1))
        # Specialized power estimate for the three-sample example
        # urn1 and urn2 must already exist
        # A brutal simulation
        # uses function merror
    {
    sur2 <- numeric(2)
    names(sur2) <- c("Est Power","99% Margin of error")
    f <- f/sum(f)
    nn <- round(n*f)
    nsig <- 0
    for(i in 1:m)  # Monte Carlo loop
        {
        samp1 <- sample(urn1,size=nn[1],replace=TRUE)
        samp2 <- sample(urn1,size=nn[2],replace=TRUE)
        samp3 <- sample(urn2,size=nn[3],replace=TRUE)
        meanvec <- c(mean(samp1),mean(samp2),mean(samp3))
        varvec <- c(var(samp1),var(samp2),var(samp3))
        if(T1(meanvec,C,c(0,0),nn,prop=FALSE,varvec)[3] < 0.05) nsig <- nsig+1
        } # End MC loop
        sur2[1] <- nsig/m
        sur2[2] <- merror(sur2[1],m,0.01)
        sur2
    } # End function sur2

Now try power of chisquare test of independence on the 3-urn surrogate population.

chipow <- function(pi,wantpow=0.80,alpha=0.05,maxn=100000,display=FALSE)
###################################################################
# Sample size required for desired power: 
# Chisquare test of Independence
#
#   pi          Matrix of true probabilities
#   wantpow     Desired power: default = 0.80
#   alpha       Significance level: default = 0.05
#   maxn        Maximum sample size, to avoid endless loop
#   display     See details?
###################################################################
    {
    ############ One-time only calculations #############
    pi <- pi/sum(pi)
    r <- dim(pi)[1] ; c <- dim(pi)[2]
    df <- (r-1)*(c-1)
    rmarg <- apply(pi,1,sum) # Apply sum to dim 1 (rows)
    cmarg <- apply(pi,2,sum)
    piM <- as.matrix(rmarg)%*%t(as.matrix(cmarg))
    nn <- round(5/min(piM)) 
          # Smallest expected frequency will be 5 to start
    pow <- 0 ; oneminus <- 1 - alpha
    ######################################################
    while(pow < wantpow)
        {
        nn <- nn+1
        if(nn>maxn) stop("Maxn exceeded!")
        ncp <- nn * sum((pi-piM)^2/piM)
        pow <- 1 - pchisq(qchisq(oneminus,df),df,ncp)
        if(display) cat("  n = ",nn,"  Power = ",pow,"\n")
        }#End while
    chipow <- nn
    if(display)
        {
        cat("\n")
        cat("True cell probabilities:\n")
        cat("\n")
        print(pi)
        cat("\n")
        cat("Probabilities under H0 of independence:\n")
        cat("\n")
        print(piM)
        cat("\n")
        }
    chipow
    } # End of function chipow
    
    
urn1 <- c(1,2,2,3,3,3,3,4,4,5)
urn2 <- c(1,1,1,1,2,2,4,4,5,5)
row1 <- table(urn1)/10
row3 <- c(.4,.2,0,.2,.2)
urns <- rbind(row1,row1,row3)/3
urns
chipow(urns,display=T)

# Market researchers would really be more likely to do "top box" 
# or "top two box." First top box.

topbox <- rbind( c(0.03333333,0.3),
                 c(0.03333333,0.3),
                 c(0.13333333,0.2) )
chipow(topbox,display=T)

# Pretty good compared to n = 513.  Now top 2 box

top2box <- rbind( c(.1,0.2333333),
                 c(.1,0.2333333),
                 c(.2,0.1333333) )
chipow(top2box,display=T)

Power of Wald Tests for Logistic Regression

waldlogpow1 <- function(n,beta,X,C,h,alpha=0.05)
################################################################
# Power of Wald test for logistic regression: Single sample size
#     n           Sample size
#     beta        Vector of regression coefficients
#     X           The (fixed) X matrix, with intercept if appropriate
#     C           H0 is C Beta = h
#     h           C Beta under H0
#     alpha       Significance level (size of test): Default = 0.05
################################################################
      {
      q <- dim(C)[1] ; r <- dim(C)[2]
      xb <- X %*% beta
      Vbeta <- diag(as.vector(exp(xb)/(1+exp(xb))^2))
      Ibeta <- t(X) %*% Vbeta %*% X
      kore <- solve(C%*% solve(Ibeta) %*%t(C))
      ncp <- t(C%*%beta-h)%*%kore%*%(C%*%beta-h)
      pow <- 1 - pchisq(qchisq(1-alpha,q),q,ncp)
      waldlogpow1 <- pow
      waldlogpow1
      } # End of function waldlogpow1


# Estimate power for Wald test. Same old tough logistic regression problem.
# Base it on a single X matrix
n <- 2135
beta <- c(-11.09035489,0.11167961,0.02694983)
CC <- rbind(c(0,0,1))
hh <- 0
# Like logpow0, but just one X matrix
set.seed(12345)
# Calculate square root matrix A. A%*%Z will have var-cov Sigma
# This is for MVN IVs
sigma <- rbind( c(50,sqrt(2000)/2),
                c(sqrt(2000)/2,40) )
spec <- eigen(sigma)
A <- spec$vectors %*% diag(sqrt(spec$values))
Z <- rbind(rnorm(n),rnorm(n))
XX <- cbind(matrix(1,n),t(A%*%Z))
XX[,2] <- XX[,2]+70 ; XX[,3] <- XX[,3]+80

waldlogpow1(n,beta,XX,CC,hh)


waldlogpow2 <- function(beta,C,h,alpha=0.05,wantpow=0.80,
                        nstart=25,nmax=100000,display=FALSE)
################################################################
# Power analysis to find sample size for logistic regression.
# Intended to be a rough guess at sample size needed for a LR test.
# X matrix must be generated internally, but just once -- random or not.
# 
#     beta        Vector of regression coefficients
#     C           H0 is C Beta = h
#     h           C Beta under H0
#     alpha       Significance level (size of test): Default = 0.05
#     wantpow     Desired power: Default = 0.80
#     nstart      Starting sample size
#     nmax        To stop endless loop
#     display     If TRUE, print n power and power during search 
################################################################
    {
    # Generate the first nstart rows of the X matrix. This will vary from
    # problem to problem.
    # Calculate square root matrix A. A%*%Z will have var-cov Sigma
    # This is for MVN IVs
    sigma <- rbind( c(50,sqrt(2000)/2),
                    c(sqrt(2000)/2,40) )
    spec <- eigen(sigma)
    A <- spec$vectors %*% diag(sqrt(spec$values))
    Z <- rbind(rnorm(nstart),rnorm(nstart))
    X <- cbind(matrix(1,nstart),t(A%*%Z))
    X[,2] <- X[,2]+70 ; X[,3] <- X[,3]+80
    ########### Finished generating first nstart rows ################
    pow <- waldlogpow1(nstart,beta,X,C,h)
    n <- nstart
    if(display) cat("Starting n = ",n," Power = ",pow,"\n")
    while(pownmax) stop("Too many iterations!")
        } # End search over n
    walodlogpow2 <- n
    walodlogpow2
    } # End of function waldlogpow2


source("powerfun.R")
ruebeta <- c(-11.09035489,0.11167961,0.02694983)
CC <- rbind(c(0,0,1))
hh <- 0
waldlogpow2(truebeta,CC,hh,nstart=2000,display=TRUE)



#######################################################################
# rwaldlogpow1.R
#   Power for a Wald test on a logistic regression model, with random
#   independent variables. Modelled on randpow1.R.
#   Run with        source("rwaldlogpow1.R")
#
#######################################################################

#set.seed(12345)
n <- 2135                # Sample size
m <- 50                 # Monte Carlo sample size
beta <- c(-11.09035489,0.11167961,0.02694983)    # Parameters
C <- rbind(c(0,0,1)) ;  # H0: C Beta = 0
h <- 0
alpha <- 0.05           # Significance level of test
confid <- .99           # Want 100*confid percent margin of
                        # error for MC estimate
#########################################################################
# One-time-only calculations
Y <- numeric(m)
q <- dim(C)[1] ; r <- dim(C)[2]
# Calculate square root matrix A. A%*%Z will have var-cov Sigma
# This is for MVN IVs, and will vary from problem to problem.
sigma <- rbind( c(50,sqrt(2000)/2),
                c(sqrt(2000)/2,40) )
spec <- eigen(sigma)
A <- spec$vectors %*% diag(sqrt(spec$values))


################# Monte Carlo Loop ############################
for(i in 1:m)
      {
      # Generate Random IVs and bind into X matrix. This  will
      # vary from problem to problem.
      Z <- rbind(rnorm(n),rnorm(n))
      X <- cbind(matrix(1,n),t(A%*%Z))
      X[,2] <- X[,2]+70 ; X[,3] <- X[,3]+80
      ###### Done generating X matrix ##########
      # Calculate non-centrality parameter, and power
      xb <- X %*% beta
      dd <- as.vector(exp(xb)/(1+exp(xb))^2)
      Vbeta <- diag(dd) # Working around a problem with diag
      Ibeta <- t(X) %*% Vbeta %*% X
      kore <- solve(C%*% solve(Ibeta) %*%t(C))
      ncp <- t(C%*%beta-h)%*%kore%*%(C%*%beta-h)
      Y[i] <- 1 - pchisq(qchisq(1-alpha,q),q,ncp)
       }
############## End of Monte Carlo Loop ############################

Phat <- mean(Y) ; SDhat <- sqrt(var(Y))
margin <- qnorm(1-(1-confid)/2)*SDhat/sqrt(m)
cat("\n")
cat("       Monte Carlo integration to estimate Power for a \n")
cat("            Wald test on a logistic regression model.  \n")
cat("\n")
cat(" n,m = ",n,m,"\n")
cat(" Beta = ",beta, "\n")
cat("\n")
cat(" Estimated power = ",Phat,", with ",confid*100,"% margin of error \n")
cat(margin,".\n")
cat("\n")
####################### End of rwaldlogpow1.R #####################

howlong <- system.time(source("rwaldlogpow1.R")) ; sum(howlong[1:2])

The F-test for variances again

# var2a.R
# Test difference between variances with F - test
# Data from t distribution
# Run with      source("var2a.R")
M <- 1000 ; alpha <- 0.05
df <- c(5,10,50,200)
n <- c(100,500,1000)
sigprob <- matrix(0,nrow=length(n),ncol=length(df))
dimnames(sigprob) <- list(n,df)
f <- c(1,1) ; f <- f/sum(f)

# Varying sample size and df
for(i in 1:length(n))
    {
    n1 <- f[1]*n[i] ; n2 <- f[2]*n[i]
    lowcrit <- qf(alpha/2,n1-1,n2-1)
    upcrit  <- qf(1-alpha/2,n1-1,n2-1)
    for(j in 1:length(df))
        {
        for(k in 1:M)
            {
            x <- rt(n1,df[j]) ; y <- rt(n2,df[j])
            F <- var(x)/var(y)
            if (Fupcrit) sigprob[i,j] <- sigprob[i,j] + 1
            } # End Monte Carlo loop
        } # End loop through df
    } # End loop through n
sigprob <- sigprob/M
cat("\n")
cat("n = ",n,"\n")
cat("df = ",df,"\n")
cat("M = ",M,"\n")
cat("\n")
print(sigprob)

# var2b.R
# Test difference between variances with F - test
# Data from chisqusare distribution
# Run with      source("var2b.R")
M <- 1000 ; alpha <- 0.05
df <- c(.1,.5,1,10,50)
n <- c(100,500,1000)
sigprob <- matrix(0,nrow=length(n),ncol=length(df))
dimnames(sigprob) <- list(n,df)
f <- c(1,1) ; f <- f/sum(f)

# Varying sample size and df
for(i in 1:length(n))
    {
    n1 <- f[1]*n[i] ; n2 <- f[2]*n[i]
    lowcrit <- qf(alpha/2,n1-1,n2-1)
    upcrit  <- qf(1-alpha/2,n1-1,n2-1)
    for(j in 1:length(df))
        {
        for(k in 1:M)
            {
            x <- rchisq(n1,df[j]) ; y <- rchisq(n2,df[j])
            F <- var(x)/var(y)
            if (Fupcrit) sigprob[i,j] <- sigprob[i,j] + 1
            } # End Monte Carlo loop
        } # End loop through df
    } # End loop through n
sigprob <- sigprob/M
cat("\n")
cat("n = ",n,"\n")
cat("df = ",df,"\n")
cat("M = ",M,"\n")
cat("\n")
print(sigprob)

##################################################################
# manytests.R
# Now let's do some tests on the results for chi-squared variables
# Execute with   R --vanilla < manytests.R
#
#   n =  100 500 1000
#   df =  0.1 0.5 1 10 50
#   M =  1000
#
#          0.1   0.5     1    10    50
#   100  0.766 0.559 0.391 0.095 0.064
#   500  0.798 0.581 0.447 0.134 0.077
#   1000 0.799 0.595 0.440 0.106 0.049
#
#################################################################

source("T1.R")
Ybar <- c(0.766, 0.559, 0.391, 0.095, 0.064,
          0.798, 0.581, 0.447, 0.134, 0.077,
          0.799, 0.595, 0.440, 0.106, 0.049)
M <- numeric(15)+1000

# Overall test: H0 says all the Type I error rates are 0.05
C1 <- diag(15) # 15x15 identity matrix
h1 <- numeric(15) + 0.05
T1(Ybar,C1,h1,M)
# Check it
Z <- sqrt(1000)*(Ybar-0.05)/sqrt(Ybar*(1-Ybar))
cat("Chi-square should equal ",sum(Z^2),"\n")

# Now test each one individually
zero <- rbind(numeric(15))
for(i in 1:15)
    {
    C2 <- zero ; C2[i] <- 1
    cat(" Test ",Ybar[i]," vs 0.05: ",T1(Ybar,C2,h=0.05,nn=M),"\n")
    }
# Check this too
print(prettyNum(Z^2),quote=F)
# Okay, it's working

# Main effect for df
C3 <- rbind(c(1,-1,0,0,0, 1,-1,0,0,0, 1,-1,0,0,0),
            c(0,1,-1,0,0, 0,1,-1,0,0, 0,1,-1,0,0),
            c(0,0,1,-1,0, 0,0,1,-1,0, 0,0,1,-1,0),
            c(0,0,0,1,-1, 0,0,0,1,-1, 0,0,0,1,-1)  )
T1(Ybar,C3,h=numeric(4),nn=M)

# Main effect for n
C4 <- rbind(c(1,1,1,1,1,-1,-1,-1,-1,-1,0,0,0,0,0),
            c(0,0,0,0,0,1,1,1,1,1,-1,-1,-1,-1,-1)  )
T1(Ybar,C4,h=numeric(2),nn=M)
# Print marginal for n
ybarmat <- matrix(Ybar,nrow=3,ncol=5,byrow=T) ; ybarmat
apply(ybarmat,1,mean)
# Pairwise marginal differences
C4a <- rbind(c(1,1,1,1,1,-1,-1,-1,-1,-1,0,0,0,0,0))
T1(Ybar,C4a,h=0,nn=M)
print(prettyNum(T1(Ybar,C4a,h=0,nn=M)),quote=F)

C4b <- rbind(c(0,0,0,0,0,1,1,1,1,1,-1,-1,-1,-1,-1))
print(prettyNum(T1(Ybar,C4b,h=0,nn=M)),quote=F)

C4c <- rbind(c(1,1,1,1,1,0,0,0,0,0,-1,-1,-1,-1,-1))
print(prettyNum(T1(Ybar,C4c,h=0,nn=M)),quote=F)

# Adjacent along rows
for(i in 1:4)
   {
   C5 <- zero ; C5[i] <- 1 ; C5[i+1] <- -1
   cat(" Test ",Ybar[i]," vs ",Ybar[i+1],": ",T1(Ybar,C5,h=0,nn=M),"\n")
   }
for(i in 6:9)
   {
   C5 <- zero ; C5[i] <- 1 ; C5[i+1] <- -1
   cat(" Test ",Ybar[i]," vs ",Ybar[i+1],": ",T1(Ybar,C5,h=0,nn=M),"\n")
   }
for(i in 11:14)
   {
   C5 <- zero ; C5[i] <- 1 ; C5[i+1] <- -1
   cat(" Test ",Ybar[i]," vs ",Ybar[i+1],": ",T1(Ybar,C5,h=0,nn=M),"\n")
   }

# Adjacent down columns
# Col 1
for(i in c(1,6))
   {
   C6 <- zero ; C6[i] <- 1 ; C6[i+5] <- -1
   cat(" Test ",Ybar[i]," vs ",Ybar[i+5],": ",T1(Ybar,C6,h=0,nn=M),"\n")
   }

# Col 2
for(i in c(2,7))
   {
   C6 <- zero ; C6[i] <- 1 ; C6[i+5] <- -1
   cat(" Test ",Ybar[i]," vs ",Ybar[i+5],": ",T1(Ybar,C6,h=0,nn=M),"\n")
   }

# Col 3
for(i in c(3,8))
   {
   C6 <- zero ; C6[i] <- 1 ; C6[i+5] <- -1
   cat(" Test ",Ybar[i]," vs ",Ybar[i+5],": ",T1(Ybar,C6,h=0,nn=M),"\n")
   }

# Col 4
for(i in c(4,9))
   {
   C6 <- zero ; C6[i] <- 1 ; C6[i+5] <- -1
   cat(" Test ",Ybar[i]," vs ",Ybar[i+5],": ",T1(Ybar,C6,h=0,nn=M),"\n")
   }

# Col 5
for(i in c(5,10))
   {
   C6 <- zero ; C6[i] <- 1 ; C6[i+5] <- -1
   cat(" Test ",Ybar[i]," vs ",Ybar[i+5],": ",T1(Ybar,C6,h=0,nn=M),"\n")
   }

#################### End of manytests.R ################

R --vanilla < manytests.R

Note that power functions (all of them, I hope) are in the earlier part of this file, where powerfun.R is listed.