R Code from STA2201s04: Applied Statistics II
Here are some pieces of S code from the overheads. If you want to copy-paste them, it's better to do it from this document than from the PDF files, because sometimes there are invisible characters in the PDF files, and these can cause errors that are hard to fix, because you cannot see what caused the error. There is one SAS program, but really the SAS programs should be in a separate document.
I will add to this file throughout the course.
First, a few pieces from the introductory computer lecture.
pdf("testor.pdf") plot(x,y,type='l') # That's a lower case L set.seed(12345) # Be able to reproduce the stream of pseudo-random numbers. # regart.R Demonstrate Regression Artifact ###################### Setup ####################### N <- 10000 ; n <- 100 truevar <- 180 ; errvar <- 45 truesd <- sqrt(truevar) ; errsd <- sqrt(errvar) # set.seed(44444) # Now define the function ttest, which does a matched t-test ttest <- function(d) # Matched t-test. It operates on differences. { ttest <- numeric(4) names(ttest) <- c("Mean Difference"," t "," df "," p-value ") ave <- mean(d) ; nn <- length(d) ; sd <- sqrt(var(d)) ; df <- nn-1 tstat <- ave*sqrt(nn)/sd pval <- 2*(1-pt(abs(tstat),df)) ttest[1] <- ave ; ttest[2] <- tstat; ttest[3] <- df; ttest[4] <- pval ttest # Return the value of the function } ##################################################### error1 <- rnorm(N,0,errsd) ; error2 <- rnorm(N,0,errsd) truescor <- rnorm(N,100,truesd) pre <- truescor+error1 ; rankpre <- rank(pre) # Based on their rank on the pre-test, we take the n worst students and # place them in a special remedial program, but it does NOTHING. # Based on their rank on the pre-test, we take the n best students and # place them in a special program for the gifted, but it does NOTHING. post <- truescor+error2 diff <- post-pre # Diff represents "improvement." # But of course diff = error2-error1 = noise cat("\n") # Skip a line cat("------------------------------------ \n") dtest <- ttest(diff) cat("Test on diff (all scores) \n") ; print(dtest) ; cat("\n") remedial <- diff[rankpre<=n] ; rtest <- ttest(remedial) cat("Test on Remedial \n") ; print(rtest) ; cat("\n") gifted <- diff[rankpre>=(N-n+1)] ; gtest <- ttest(gifted) cat("Test on Gifted \n") ; print(gtest) ; cat("\n") cat("------------------------------------ \n") #################### End of regart.R ####################
Regression stuff from introductory computer lecture.
#################################################################### # lesson2.R: execute with R --vanilla < lesson2.R > lesson2.out # #################################################################### datalist <- scan("mcars.dat",list(id=0,country=0,kpl=0,weight=0,length=0)) # datalist is a linked list. datalist # There are other ways to read raw data. See help(read.table). weight <- datalist$weight ; length <- datalist$length ; kpl <- datalist$kpl country <- datalist$country cor(cbind(weight,length,kpl)) # The table command gives a bare-bones frequency distribution table(country) # That was a matrix. The numbers 1 2 3 are labels. # You can save it, and you can get at its contents countrytable <- table(country) countrytable[2] # There is an "if" function that you could use to make dummy variables, # but it's easier to use factor. countryfac <- factor(country,levels=c(1,2,3), label=c("US","Japanese","European")) # This makes a FACTOR corresponding to country, like declaring it # to be categorical. How are dummy variables being set up? contrasts(countryfac) # The first level specified is the reference category. You can get a # different reference category by specifying the levels in a different order. cntryfac <- factor(country,levels=c(2,1,3), label=c("Japanese","US","European")) contrasts(cntryfac) # Test interaction. For comparison, with SAS we got F = 11.5127, p < .0001 # First fit (and save!) the reduced model. lm stands for linear model. redmod <- lm(kpl ~ weight+cntryfac) # The object redmod is a linked list, including lots of stuff like all the # residuals. You don't want to look at the whole thing, at least not now. summary(redmod) # Full model is same stuff plus interaction. You COULD specify the whole thing. fullmod <- update(redmod,. ~ . + weight*cntryfac) anova(redmod,fullmod) # The ANOVA summary table is a matrix. You can get at its (i,j)th element. aovtab <- anova(redmod,fullmod) aovtab[2,5] # The F statistic aovtab[2,6] < .05 # p < .05 -- True or false? 1>6 # Another example of an expression taking the logical value true or false. ##################### End of lesson2.R ####################################### ##################### lesson3.R ############################ # Lesson 3: Factorial ANOVA with the potato2 data # Run with R --vanilla < lesson3.R > lesson3.out # system("cat potato2.dat") spuds <- read.table("potato2.dat") spuds[1:10,] # Rows 1:10, all columns # Table of means mtable <- tapply(spuds$Rot, list(spuds$Temp,spuds$Bact),mean) mtable # # Note that just specifying the interaction forces the main effects in summary(aov(Rot~factor(Bact)*factor(Temp),data=spuds)) # # For custom tests, make a combination variable # rot <- spuds$Rot ; temp <- spuds$Temp ; bact <- spuds$Bact combo <- 10*temp + bact # Two levels of temp x 3 levels of bact table(temp,combo) table(bact,combo) # # Suppose we want to test diff among bact just for low temp=1 # We can do it with a dummy var setup like the default contrasts. # n <- length(combo) d1 <- numeric(n) ; d2 <- d1; d3 <- d1; d4 <- d1; d5 <- d1 d1[combo==12] <- 1 ; d2[combo==13] <- 1 ; d3[combo==21] <- 1 d4[combo==22] <- 1 ; d5[combo==23] <- 1 table(d1,combo) table(d2,combo) # redmod <- lm(rot~d3+d4+d5) ; fullmod <- lm(rot~d1+d2+d3+d4+d5) anova(redmod,fullmod) # # Suppose the null hypothesis is no temp effect for any type of bacteria # H0: mu11=mu21 , mu12=mu22 mu13=mu23 # # Fit a reduced model by transforming the independent variables # Need one more dummy variable d0 <- numeric(n) ; d0[combo==11] <- 1 v1 <- d0+d3 ; v2 <- d1+d4 ; v3 <- d2+d5 # # I can't make lm fit a model with no intercept! Must use lsfit. Ugh! # redfit <- lsfit(cbind(v1,v2,v3),rot,intercept=F) # Not like an lm object SSEr <- sum(residuals(redfit)^2) # # Now we need some stuff from the full model # fullmat <- anova(fullmod) # ANOVA summary table is a matrix fullmat SSEf <- fullmat[6,2] ; SSEf MSEf <- fullmat[6,3] ; MSEf F2 <- (SSEr-SSEf)/(3*MSEf) ; pval <- 1-pf(F2,3,48) F2 ; pval
It's so much easier with SAS
/* spud1.sas */ options linesize=79 noovp formdlim='_'; title 'Rotten potato check'; data spud; infile 'potato2.dat' firstobs=2; /* Skip the first line that R uses */ input id bact temp rot; if temp=1 and bact=1 then mu11=1; else mu11=0; if temp=1 and bact=2 then mu12=1; else mu12=0; if temp=1 and bact=3 then mu13=1; else mu13=0; if temp=2 and bact=1 then mu21=1; else mu21=0; if temp=2 and bact=2 then mu22=1; else mu22=0; if temp=2 and bact=3 then mu23=1; else mu23=0; combo = 10*temp+bact; proc glm; title2 'Standard 2-way ANOVA with proc glm'; class bact temp; model rot=temp|bact; proc reg; title2 'Custom tests with proc reg'; model rot = mu11--mu23 / noint; ex1: test mu11=mu12=mu13; ex2: test mu11=mu21, mu12=mu22, mu13=mu23; proc glm; class combo; title2 'Specify contrast matrix directly with proc glm'; model rot = combo; contrast 'Bact diff for low temp' combo 1 -1 0 0 0 0, combo 0 1 -1 0 0 0; contrast 'Temp diff in each bact' combo 1 0 0 -1 0 0, combo 0 1 0 0 -1 0, combo 0 0 1 0 0 -1;
Power
############################################################## # powerfun.R # Collection of function definitions used for power analysis # in STA2201. Activate with source("powerfun.R") # # Table of Contents # # signpow # size1 # merror # fpow2 # matpow1 # matpow2 # t1pow & size2 # poprsq # samprsq1 # T1pow1 # T1pow # Waldlogpow1 # Waldlogpow2 # chipow1 # chipow ############################################################## signpow <- function(theta,n) # Power of the sign test { L <- sqrt(n)*(.5-theta)/sqrt(theta*(1-theta)) R <- 1.96/(2*sqrt(theta*(1-theta))) signpow <- 1 - pnorm(L+R) + pnorm(L-R) signpow } # End of function signpow size1 <- function(truet,needpow=0.80,nstart=1,nend=1000000) { # Search for sample size needed to get desired power of sign test nn <- nstart ; pow <- 0 while(pownend) stop("Too many iterations!") } cat("\n") cat("For true Theta of ",truet,", sign test requires a sample \n") cat(" size of ",nn-1," to have power of ",pow,"\n") cat("\n") } # End function size1 merror <- function(phat,m,alpha) # (1-alpha)*100% merror for a proportion { z <- qnorm(1-alpha/2) merror <- z * sqrt(phat*(1-phat)/m) # m is (Monte Carlo) sample size merror } # End of function merror fpow2 <- function(r,q,effsize,wantpow=0.80,alpha=0.05) ############################################################################# # Power for the general multiple regression model, testing H0: C Beta = h # # r is the number of beta parameters # # q Number rows in the C matrix = numerator df # # effsize is ncp/n, a squared distance between C Beta and h # # wantpow is the desired power, default = 0.80 # # alpha is the significance level, default = 0.05 # ############################################################################# { pow <- 0 ; nn <- r+1 ; oneminus <- 1 - alpha while(pow < wantpow) { nn <- nn+1 phi <- nn * effsize ddf <- nn-r pow <- 1 - pf(qf(oneminus,q,ddf),q,ddf,phi) }#End while fpow2 <- nn fpow2 # Returns needed n } # End of function fpow2 matpow1 <- function(C,eff,f,wantpow=0.80,alpha=0.05) # H0: C Beta = 0 # Beta is r x 1 # C is q x r contrast matrix # eff is vector of effects (non-zero h) in sd units, length r # f is vector of RELATIVE sample sizes, all non-negative # { f <- f/sum(f) if(min(f)<=0) stop("Cell sample sizes must all be positive.") kore <- solve(C%*%diag(1/f)%*%t(C)) effsize <- t(eff)%*%kore%*%eff q <- dim(C)[1] ; r <- dim(C)[2] # cat("r,q,effsize,wantpow,alpha = ",r,q,effsize,wantpow,alpha,"\n") matpow1 <- fpow2(r,q,effsize,wantpow,alpha) matpow1 } # End of function matpow1 matpow2 <- function(C,beta,f,wantpow=0.80,alpha=0.05) # H0: C Beta = 0 # Beta is r x 1 # C is q x r contrast matrix # eff is vector of effects (non-zero h) in sd units, length r # f is vector of RELATIVE sample sizes, all non-negative # { f <- f/sum(f) if(min(f)<=0) stop("Cell sample sizes must all be positive.") eff <- C%*%beta kore <- solve(C%*%diag(1/f)%*%t(C)) effsize <- t(eff)%*%kore%*%eff q <- dim(C)[1] ; r <- dim(C)[2] # cat("r,q,effsize,wantpow,alpha = ",r,q,effsize,wantpow,alpha,"\n") matpow1 <- fpow2(r,q,effsize,wantpow,alpha) matpow1 } # End of function matpow2 ######################################################################### # t1pow & size2: Power analysis to find MC sample size for Z test # # of departure from specified Type I error rate (usually 0.05). Do # # source("type1power.R") # # and then use the function size2 interactively. # ######################################################################### t1pow <- function(a0,a1,size,m) # Power of Z test for proportion # a0 Hypothesized Type I error rate # a1 True (alternative) Type I error rate # size Size of Z test of H0: alpha=a0 vs not equal # m Monte Carlo sample size { L <- sqrt(m)*(a0-a1)/sqrt(a1*(1-a1)) R <- qnorm(1-size/2) * sqrt( (a0*(1-a0))/(a1*(1-a1)) ) t1pow <- 1 - pnorm(L+R) + pnorm(L-R) t1pow } # End of function t1pow size2 <- function(a0=0.05,a1,alpha=0.01,needpow=0.80,nstart=1,nend=1000000) { nn <- nstart ; pow <- 0 while(pow nend) stop("Too many iterations!") } cat("\n") cat("In a Z test of H0: alpha = ",a0," vs not equal at the ",alpha,"\n") cat("level when the true Type I error rate is ",a1,", a Monte Carlo \n") cat("sample size of ",nn-1," is required to have a power of ",pow,"\n") cat("\n") } # End function size2 poprsq <- function(r,q,a,wantpow=0.80,alpha=0.05) # Cohen's Popularion R-squared Method # r Number of IVs in full model # q Numerator df = number of linear constraints being tested # a Population proportion of remaining variation explained. # This is Cohen's "effect size." # wantpow Desired power (default = 0.80) # alpha Significance level (default = 0.05) { pow <- 0 ; nn <- r+1 ; oneminus <- 1 - alpha while(pow < wantpow) { nn <- nn+1 phi <- (nn-r) * a/(1-a) ddf <- nn-r pow <- 1 - pf(qf(oneminus,q,ddf),q,ddf,phi) }#End while poprsq <- nn poprsq } # End of function poprsq samprsq1 <- function(r,q,a,alpha=0.05) # Find n so remaining proportion of SS explained will be significant # r Number of IVs in full model # q Numerator df = number of linear constraints being tested # a Sample proportion of remaining variation explained. # alpha Significance level (default = 0.05) { pval <- 1 ; n <- r+1 while(pval > alpha) { n <- n+1 F <- (n-r)/q * a/(1-a) df2 <- n-r pval = 1-pf(F,q,df2) }#End while samprsq1 <- n samprsq1 } # End of function samprsq1 T1pow1 <- function(n,mu,C,h,f,alpha=0.05,prop=TRUE,sigsq=NULL) ################################################################### # Power for large sample test of H0: C mu = h # n Sample size # mu Vector of r population means. E[ybar] = mu # C q by r hypothesis matrix # f Vector of r relative sample sizes # alpha Significance level: default = 0.05 # prop Is this a test on sample proportions? If TRUE (the default), # there is no need to supply variances. # sigsq Vector of population varinces, not needed if prop=TRUE ################################################################### { q <- dim(C)[1] ; r <- dim(C)[2] # Error Checks if(length(mu)!=r) stop("Length of mu must be num of cols in C.") if(length(h) != q) stop("Length of h must be num of rows in C.") if(length(f) != r) stop("Length of f must be num of cols in C.") if(prop==FALSE & length(sigsq) != length(mu)) stop("Lengths of mu and sigsq are unequal.") # End error checks oneminus <- 1 - alpha ; f <- f/sum(f) if(prop) sigsq <- mu*(1-mu) kore <- solve(C%*%diag(sigsq/f)%*%t(C)) ncp <- n * t(C%*%mu-h)%*%kore%*%(C%*%mu-h) pow <- 1 - pchisq(qchisq(oneminus,q),q,ncp) T1pow1 <- pow T1pow1 } # End of function T1pow1 T1pow <- function(mu,C,h,f,wantpow=0.80,alpha=0.05,prop=TRUE, sigsq=NULL,maxn=100000) ################################################################### # Sample size required for desired power: large sample test of H0: C mu = h # mu Vector of r population means. E[ybar] = mu # C q by r hypothesis matrix # f Vector of r relative sample sizes # wantpow Desired power # alpha Significance level: default = 0.05 # prop Is this a test on sample proportions? If TRUE (the default), # there is no need to supply variances. # sigsq Vector of population varinces, not needed if prop=TRUE # maxn Maximum sample size, to avoid endless loop ################################################################### { q <- dim(C)[1] ; r <- dim(C)[2] # Error Checks if(length(mu)!=r) stop("Length of mu must be num of cols in C.") if(length(h) != q) stop("Length of h must be num of rows in C.") if(length(f) != r) stop("Length of f must be num of cols in C.") if(prop==FALSE & length(sigsq) != length(mu)) stop("Lengths of mu and sigsq are unequal.") # End error checks oneminus <- 1 - alpha ; f <- f/sum(f) if(prop) sigsq <- mu*(1-mu) pow <- 0 ; nn <- r+1 ; oneminus <- 1 - alpha while(pow < wantpow) { nn <- nn+1 if(nn>maxn) stop("Maxn exceeded!") kore <- solve(C%*%diag(sigsq/f)%*%t(C)) ncp <- nn * t(C%*%mu-h)%*%kore%*%(C%*%mu-h) pow <- 1 - pchisq(qchisq(oneminus,q),q,ncp) }#End while T1pow <- nn T1pow } # End of function T1pow waldlogpow1 <- function(n,beta,X,C,h,alpha=0.05) ################################################################ # Power of Wald test for logistic regression: Single sample size # n Sample size # beta Vector of regression coefficients # X The X matrix, with intercept if appropriate # C H0 is C Beta = h # h C Beta under H0 # alpha Significance level (size of test): Default = 0.05 ################################################################ { q <- dim(C)[1] ; r <- dim(C)[2] xb <- X %*% beta Vbeta <- diag(as.vector(exp(xb)/(1+exp(xb))^2)) Ibeta <- t(X) %*% Vbeta %*% X kore <- solve(C%*% solve(Ibeta) %*%t(C)) ncp <- t(C%*%beta-h)%*%kore%*%(C%*%beta-h) pow <- 1 - pchisq(qchisq(1-alpha,q),q,ncp) waldlogpow1 <- pow waldlogpow1 } # End of function waldlogpow1 waldlogpow2 <- function(beta,C,h,alpha=0.05,wantpow=0.80, nstart=25,nmax=100000,display=FALSE) ################################################################ # Power analysis to find sample size for logistic regression. # Intended to be a rough guess at sample size needed for a LR test. # X matrix must be generated internally, but just once -- random or not. # # beta Vector of regression coefficients # C H0 is C Beta = h # h C Beta under H0 # alpha Significance level (size of test): Default = 0.05 # wantpow Desired power: Default = 0.80 # nstart Starting sample size # nmax To stop endless loop # display If TRUE, print n power and power during search ################################################################ { # Generate the first nstart rows of the X matrix. This will vary from # problem to problem. # Calculate square root matrix A. A%*%Z will have var-cov Sigma # This is for MVN IVs sigma <- rbind( c(50,sqrt(2000)/2), c(sqrt(2000)/2,40) ) spec <- eigen(sigma) A <- spec$vectors %*% diag(sqrt(spec$values)) Z <- rbind(rnorm(nstart),rnorm(nstart)) X <- cbind(matrix(1,nstart),t(A%*%Z)) X[,2] <- X[,2]+70 ; X[,3] <- X[,3]+80 ########### Finished generating first nstart rows ################ pow <- waldlogpow1(nstart,beta,X,C,h) n <- nstart if(display) cat("Starting n = ",n," Power = ",pow,"\n") while(pow nmax) stop("Too many iterations!") } # End search over n walodlogpow2 <- n walodlogpow2 } # End of function waldlogpow2 chipow1 <- function(n,pi,alpha=0.05) # Power for Pearson's chi-square test of independence # n Sample size # pi Matrix of true probabilities # alpha Significance level { oneminus <- 1 - alpha pi <- pi/sum(pi) r <- dim(pi)[1] ; c <- dim(pi)[2] df <- (r-1)*(c-1) rmarg <- apply(pi,1,sum) # Apply sum to dim 1 (rows) cmarg <- apply(pi,2,sum) piM <- as.matrix(rmarg)%*%t(as.matrix(cmarg)) ncp <- n * sum((pi-piM)^2/piM) chipow1 <- 1 - pchisq(qchisq(oneminus,df),df,ncp) chipow1 } # End of function chipow1 chipow <- function(pi,wantpow=0.80,alpha=0.05,maxn=100000,display=FALSE) ################################################################### # Sample size required for desired power: # Chisquare test of Independence # # pi Matrix of true probabilities # wantpow Desired power: default = 0.80 # alpha Significance level: default = 0.05 # maxn Maximum sample size, to avoid endless loop # display See details? ################################################################### { ############ One-time only calculations ############# pi <- pi/sum(pi) r <- dim(pi)[1] ; c <- dim(pi)[2] df <- (r-1)*(c-1) rmarg <- apply(pi,1,sum) # Apply sum to dim 1 (rows) cmarg <- apply(pi,2,sum) piM <- as.matrix(rmarg)%*%t(as.matrix(cmarg)) nn <- round(5/min(piM)) ; pow <- 0 ; oneminus <- 1 - alpha ###################################################### while(pow < wantpow) { nn <- nn+1 if(nn>maxn) stop("Maxn exceeded!") ncp <- nn * sum((pi-piM)^2/piM) pow <- 1 - pchisq(qchisq(oneminus,df),df,ncp) if(display) cat(" n = ",nn," Power = ",pow,"\n") }#End while chipow <- nn if(display) { cat("\n") cat("True cell probabilities:\n") cat("\n") print(pi) cat("\n") cat("Probabilities under H0 of independence:\n") cat("\n") print(piM) cat("\n") } chipow } # End of function chipow
That was the end of the file powerfun.R. It will continue to grow some as the course progresses. Now here is some more power code from the overheads.
###################################################################### # powplot.R - Plot Power of sign test as a function of true Theta # # # # R --vanilla < powplot.R # ##################################################################### source("powerfun.R") # To define function signpow Theta <- seq(from=.01,to=.99,by=.01) Power <- signpow(Theta,50) p100 <- signpow(Theta,100) system("rm powplot.pdf") pdf("powplot.pdf") plot(Theta,Power,type="l") lines(Theta,p100,lty=2) title("Power of the Sign Test") # I understand there is a "legend" command that may do this better. x1 <- c(.70,.80) ; y1 <- c(.2,.2) ; lines(x1,y1,lty=1) text(.9,.2,"n = 50") x2 <- c(.70,.80) ; y2 <- c(.1,.1) ; lines(x2,y2,lty=2) text(.9,.1,"n = 100") ############## End of powplot.R #################### # cvm.R # Power for test of zero correlation for an entire matrix # (Large Sample LR Test) # Execute with source("cvm.R") # M <- 10000 sim <- numeric(M) set.seed(32448) n <- 50 ; v1 <- .42 ; v2 <- .18 s1 <- sqrt(v1) ; s2 <- sqrt(v2) G <- function(datamat) { nn <- dim(datamat)[1] ; kk <- dim(datamat)[2] ; df <- kk*(kk-1)/2 G <- numeric(3) names(G) <- c("Chisq","df","P-value") S <- var(datamat) G[1] <- nn * ( sum(log(diag(S))) - sum(log(eigen(S)$values)) ) #$ G[2] <- df G[3] <- 1 - pchisq(G[1],df) G } # End function G merror <- function(phat,m,alpha=0.01) # (1-alpha)*100% merror for a proportion { z <- qnorm(1-alpha/2) merror <- z * sqrt(phat*(1-phat)/m) # m is (Monte Carlo) sample size merror } for(j in 1:M) { e1 <- rnorm(n,0,s1) ; e2 <- rnorm(n,0,s2) x1 <- rnorm(n)+e1 ; x2 <- rnorm(n)+e1 ; x3 <- rnorm(n)+e1 ; x4 <- rnorm(n)+e2 ; x5 <- rnorm(n)+e2 dat <- cbind(x1,x2,x3,x4,x5) # print(G(dat)) print(j) sim[j] <- G(dat)[3] < .05 } poww <- length(sim[sim==1])/M cat("Power = ", poww ,"\n") cat("Plus or Minus 99% Margin of error: ",merror(poww,M),"\n") ############## End of cvm.R #################### # Power for comparing 2 means n <- seq(from=120,to=140,by=2) ; phi <- n/16 ; ddf <- n-2 cbind(n,pf(qf(.95,1,ddf),1,ddf,phi,FALSE)) # Comparing r means 3 * var(c(0,.25,.5,.75)) / 4 source("powerfun.R") # To define fpow2 fpow2(r=4,q=3,effsize=0.078125) # Signal to noise ratio source("powerfun.R") # To define fpow2 fpow2(r=6,q=5,effsize=0.10) # Interaction example with fpow2 source("powerfun.R") # To define fpow2 fpow2(r=6,q=2,effsize=0.01388889,wantpow=0.80,alpha=0.05) # Interaction example with matpow1 source("powerfun.R") effmat <- rbind(c(1, -1, -1, 1, 0, 0), c(0, 0, 1, -1, -1, 1) ) h <- c(0,-.5) ssizes <- numeric(6) + 1 matpow1(effmat,h,ssizes) # Using default wantpower of 0.80 and alpha=0.05 # Interaction example with matpow2 source("powerfun.R") effmat <- rbind(c(1, -1, -1, 1, 0, 0), c(0, 0, 1, -1, -1, 1) ) cellmeans <- c(0,.25,0,.25,0,-.25) ssizes <- numeric(6) + 1 matpow2(effmat,cellmeans,ssizes) # Using defaults for wantpower and alpha # Playing with relative sample sizes in a oneway design. source("powerfun.R") # Defining matpow2 beta <- c(0,.25,.5,.75) Cmat <- rbind( c(1,-1, 0, 0), c(0, 1,-1, 0), c(0, 0, 1,-1) ) f <- c(2,1,1,2) matpow2(Cmat,beta,f) ####################################################################### # randpow0.R # A brutal simulation to estimate Power for a # normal linear model with 3 random IVs. # # Run with source("randpow0.R") # ####################################################################### source("powerfun.R") # To define merror #set.seed(12345) n <- 100 ; m <- 1000 ; signif <- numeric(m) b0 <- 0 ; b1 <- 1 ; b2 <- .1 ; b3 <- .2 sigma <- 5 ; confid <- .99 for(i in 1:m) { e1 <- rpois(n,5) ; e2 <- rpois(n,5) ; e3 <- rpois(n,5) x1 <- e1+e3 ; x2 <- e2+e3 ; x3 <- rnorm(n,10,sqrt(10)) epsilon <- rnorm(n,0,sigma) y <- b0 + b1*x1 + b2*x2 + b3*x3 + epsilon mod1 <- lm(y~x1) ; mod2 <- lm(y~x1+x2+x3) signif[i] <- anova(mod1,mod2)[2,6] < 0.05 } Phat <- mean(signif) cat("\n") cat(" A brutal simulation to estimate Power for a \n") cat(" normal linear model with 3 random IVs. \n") cat("\n") cat(" n,m = ",n,m,"\n") cat(" b0, b1, b2, b3, sigma = ",b0, b1, b2, b3, sigma, "\n") cat("\n") cat(" Estimated power = ",Phat,", with ",confid*100,"% margin of error \n") cat(merror(Phat,m,1-confid),".\n") cat("\n") ############## End of randpow0.R #################### howlong0 <- system.time(source("randpow0.R")) howlong0 sum(howlong0[1:2]) ####################################################################### # randpow1.R # Power for the normal linear model, with random # independent variables. Run with source("randpow1.R") # ####################################################################### #set.seed(12345) n <- 100 # Sample size m <- 50 # Monte Carlo sample size beta <- c(0,1,.1,.2) # Parameters sigma <- 5 C <- rbind( c(0,0,1,0), # H0: C Beta = 0 c(0,0,0,1) ) alpha <- 0.05 # Significance level of test confid <- .99 # Want 100*confid percent margin of # error for MC estimate ######################################################################### # One-time-only calculations Y <- numeric(m) eff <- C%*%beta q <- dim(C)[1] ; r <- dim(C)[2] ################# Monte Carlo Loop ############################ for(i in 1:m) { # Generate Random IVs and bind into X matrix. This will # vary from problem to problem. e1 <- rpois(n,5) ; e2 <- rpois(n,5) ; e3 <- rpois(n,5) x1 <- e1+e3 ; x2 <- e2+e3 ; x3 <- rnorm(n,10,sqrt(10)) X <- cbind(matrix(1,n),x1,x2,x3) # Calculate non-centrality parameter xpxinv <- solve(t(X)%*%X) kore <- solve(C%*%xpxinv%*%t(C)) ncp <- t(eff)%*%kore%*%eff/sigma^2 Y[i] <- 1 - pf(qf(1-alpha,q,n-r),q,n-r,ncp) } ############## End of Monte Carlo Loop ############################ Phat <- mean(Y) ; SDhat <- sqrt(var(Y)) margin <- qnorm(1-(1-confid)/2)*SDhat/sqrt(m) cat("\n") cat(" Monte Carlo integration to estimate Power for a \n") cat(" normal linear model. \n") cat("\n") cat(" n,m = ",n,m,"\n") cat(" Beta = ",beta, "\n") cat(" Sigma = ",sigma, "\n") cat("\n") cat(" Estimated power = ",Phat,", with ",confid*100,"% margin of error \n") cat(margin,".\n") cat("\n") ############## End of randpow1.R #################### ####################################################################### # randpow2.R # Approximate Power for the normal linear model, # with random independent variables. X-prime-X/n # goes a.s. to a constant. Put the constant in. # # Run with source("randpow2.R") # ####################################################################### n <- 100 # Sample size beta <- c(0,1,.1,.2) # Parameters sigma <- 5 C <- rbind( c(0,0,1,0), # H0: C Beta = 0 c(0,0,0,1) ) alpha <- 0.05 # Significance level of test # Call this the "MOMent MATrix" # X-prime-X divided by n goes to this a.s. mommat <- rbind( c( 1, 10, 10, 10), c(10,110,105,100), c(10,105,110,100), c(10,100,100,110) ) # # Calculate non-centrality parameter xpxinv <- solve(mommat) kore <- solve(C%*%xpxinv%*%t(C)) eff <- C%*%beta ncp <- n * t(eff)%*%kore%*%eff/sigma^2 q <- dim(C)[1] ; r <- dim(C)[2] # Estimated Power is still called P-Hat Phat <- 1 - pf(qf(1-alpha,q,n-r),q,n-r,ncp) cat("\n") cat(" Rough Power guess for a normal linear model.\n") cat("\n") cat(" n = ",n,"\n") cat(" Beta = ",beta, "\n") cat(" Sigma = ",sigma, "\n") cat("\n") cat(" Estimated power = ",Phat," \n") cat("\n") ############## End of randpow2.R ####################
Logistic Regression
Give me some probabilities and I will give you logistic regression coefficients.
x1 <- c(70,90,90) ; x2 <- c(70,70,90) ; pi <- c(.2,.7,.8) lodds <- log(pi/(1-pi)) tellme <- lsfit(cbind(x1,x2),lodds) tellme$coef
In the example below, I gave a matrix of independent variables to glm, but you don't need to bind the independent variables into a matrix. Just list them. You can also use factors.
####################################################################### # logpow0.R # A brutal simulation to estimate Power for a # logistic regression model with 2 random IVs. # # Run with source("logpow0.R") # ####################################################################### source("powerfun.R") # To define merror #set.seed(12345) n <- 100 ; m <- 200 ; signif <- numeric(m) beta <- c(-11.09035489,0.11167961,0.02694983) confid <- .99 ######## One-time-only calculations ############ critval <- qchisq(.95,1) # Calculate square root matrix A. A%*%Z will have var-cov Sigma sigma <- rbind( c(50,sqrt(2000)/2), c(sqrt(2000)/2,40) ) spec <- eigen(sigma) A <- spec$vectors %*% diag(sqrt(spec$values)) ################ Monte Carlo Loop ############## for(i in 1:m) { # Simulate X Z <- rbind(rnorm(n),rnorm(n)) X <- cbind(matrix(1,n),t(A%*%Z)) X[,2] <- X[,2]+70 ; X[,3] <- X[,3]+80 # Okay that's X. Now simulate Y xb <- X %*% beta pi <- exp(xb)/(1+exp(xb)) Y <- rbinom(n,1,pi) fullmod <- glm(Y ~ X[,2:3], family=binomial ) # Full model redmod <- glm(Y ~ X[,2], family=binomial ) # Reduced model signif[i] <- anova(redmod,fullmod)[2,4]>critval } ############## End Monte Carlo Loop ############## Phat <- mean(signif) cat("\n") cat(" Simulation to estimate Power for a \n") cat(" logistic regression model with 2 random IVs. \n") cat("\n") cat(" n,m = ",n,m,"\n") cat(" Beta = ",beta, "\n") cat(" Sigma = \n") print(sigma) cat("\n") cat(" Estimated power = ",Phat,", with ",confid*100,"% margin of error \n") cat(merror(Phat,m,1-confid),".\n") cat("\n") ################### End of logpow0.R ####################################### howlong <- system.time(source("logpow0.R")) ; sum(howlong[1:2])
The Population R2 Method. Note that the function poprsq is also in powerfun.R, listed earlier.
poprsq <- function(r,q,a,wantpow=0.80,alpha=0.05) # Cohen's Population R-squared Method # r Number of IVs in full model # q Numerator df = number of linear constraints being tested # a Population proportion of remaining variation explained. # This is Cohen's "effect size." # wantpow Desired power (default = 0.80) # alpha Significance level (default = 0.05) { pow <- 0 ; nn <- r+1 ; oneminus <- 1 - alpha while(pow < wantpow) { nn <- nn+1 phi <- (nn-r) * a/(1-a) ddf <- nn-r pow <- 1 - pf(qf(oneminus,q,ddf),q,ddf,phi) }#End while poprsq <- nn poprsq } # End of function poprsq
The Sample R2 Method. Note that the function samprsq1 is also in powerfun.R, listed earlier.
samprsq1 <- function(r,q,a,alpha=0.05) # Find n so remaining proportion of SS explained will be significant # r Number of IVs in full model # q Numerator df = number of linear constraints being tested # a Sample proportion of remaining variation explained. # alpha Significance level (default = 0.05) { pval <- 1 ; n <- r+1 while(pval > alpha) { n <- n+1 F <- (n-r)/q * a/(1-a) df2 <- n-r pval = 1-pf(F,q,df2) }#End while samprsq1 <- n samprsq1 } # End of function samprsq1
Surrogate Population Method
# Want to detect diff betw means with oneway ANOVA when # Groups 1 and 2 are from urn1 and group3 is from urn2 urn1 <- c(1,2,2,3,3,3,3,4,4,5) urn2 <- c(1,1,1,1,2,2,4,4,5,5) mu1 <- mean(urn1) ; mu2 <- mean(urn1) ; mu3 <- mean(urn2) v1 <- 9*var(urn1)/10 ; v2 <- 9*var(urn2)/10 sigma <- sqrt((2*v1+v2)/3) > mu1 ; mu2 ; mu3 [1] 3 [1] 3 [1] 2.6 > v1 ; v2 [1] 1.2 [1] 2.64 > sigma [1] 1.296148 # First a full parametric analysis, to find a good starting value source("powerfun.R") > > matpow1 function(C,eff,f,wantpow=0.80,alpha=0.05) # H0: C Beta = 0 # Beta is r x 1 # C is q x r contrast matrix # eff is vector of effects (non-zero h) in sd units, length r # f is vector of RELATIVE sample sizes, all non-negative > C <- rbind(c(1,-1,0), c(0,1,-1)) effect <- c(0,(mu2-mu3)/sigma) matpow1(C,effect,f=c(1,1,1)) > matpow1(C,effect,f=c(1,1,1)) [1] 459 > 459/3 [1] 153
We could be done, but now use all the information we have about the distribution of Y.
# surrog1.R # Specialized power estimate for the three-sample example # A brutal simulation # Want to detect diff betw means with oneway ANOVA when # Groups 1 and 2 are from urn1 and group3 is from urn2 # source("surrog1.R") and then use sur1(n,m,f=c(1,1,1)) # source("powerfun.R") urn1 <- c(1,2,2,3,3,3,3,4,4,5) urn2 <- c(1,1,1,1,2,2,4,4,5,5) mu1 <- mean(urn1) ; mu2 <- mean(urn1) ; mu3 <- mean(urn2) v1 <- 9*var(urn1)/10 ; v2 <- 9*var(urn2)/10 sigma <- sqrt((2*v1+v2)/3) C <- rbind(c(1,-1,0), c(0,1,-1)) effect <- c(0,(mu2-mu3)/sigma) nstart <- matpow1(C,effect,f=c(1,1,1)) cat("Start with n of ",nstart,"\n") sur1 <- function(n,m,f=c(1,1,1)) # Specialized power estimate for the three-sample example # urn1 and urn2 must already exist # A brutal simulation # uses function merror { sur1 <- numeric(2) names(sur1) <- c("Est Power","99% Margin of error") f <- f/sum(f) nn <- round(n*f) x <- c(numeric(nn[1])+1,numeric(nn[2])+2,numeric(nn[3])+3) xfac <- factor(x) nsig <- 0 for(i in 1:m) # Monte Carlo loop { samp1 <- sample(urn1,size=nn[1],replace=TRUE) samp2 <- sample(urn1,size=nn[2],replace=TRUE) samp3 <- sample(urn2,size=nn[3],replace=TRUE) y <- c(samp1,samp2,samp3) if(anova(lm(y~xfac))[1,5] < 0.05) nsig <- nsig+1 } # End MC loop sur1[1] <- nsig/m sur1[2] <- merror(sur1[1],m,0.01) sur1 } # End function sur1
Estimating Power: A simulation
# estpow1.R # # For large M, run with # nohup nice R --vanilla < estpow1.R > estpow1.out & # # Two-sample simulation: Version 2 # ncp = n * q(1-q) * delta^2 # For detecting delta = |mu1-mu2|/sigma = 1/2 with prob .8, need # n1 = n2 = 64 for n of 128 findn <- function(wantpow=.8,esize,nstart=3,q=.5) # Specific to the 2-sample problem # ncp = n * q(1-q) * delta^2 { pow <- 0 ; nn <- nstart while(pow < wantpow) { nn <- nn+1 ddf <- nn-2 ncp <- nn * q*(1-q) * esize^2 pow <- pf(qf(.95,1,ddf),1,ddf,ncp,FALSE) # cat("n=",nn," pow=",pow,"\n") }# End while findn <- nn if(q==.5 && 2*trunc(findn/2)!=findn) findn <- findn+1 # Make findn even if q = 1/2 findn } # End function findn # equal sample sizes n, half a standard dev. difference bet means n <- 128 ; ddf <- n-2 ; vect <- numeric(n/2)+1 x <- c(vect,numeric(n/2)) ; dif <- .5 ; eff <- x*dif M <- 50 ; simp <- numeric(M) ; effsize <- numeric(M) estpow <- numeric(M) ; needn <- numeric(M) set.seed(4444) for(i in 1:M) { y <- eff+rnorm(n) amat <- anova(lm(y~x)) simp[i] <- amat[1,5] F <- amat[1,4] effsize[i] <- sqrt(4*F/n) estpow[i] <- pf(qf(.95,1,ddf),1,ddf,F,FALSE) needn[i] <- findn(esize=effsize[i]) }# Next simulation results <- rbind(simp,effsize,estpow,needn) results <- rbind(1:M,results[,order(simp)]) cat("\n") cat("Order p-value effsize estpow needn \n") cat("\n") write(results,"",ncolumns=5) # Writes the transpose #################### End of estpow1.R ################### # pilot1.R # Pilot Study: Want to detect mu1 - mu2 with prob 0.80, but # don't know sigma=2. Estimate it from a pilot study, and then # estimate n needed for desired power. # Run with source("pilot1.R") sigma <- 2 ; diff <- 1; wishpow <- 0.80 n <- 128 ; M <- 25 x <- c(numeric(n/2)+1,numeric(n/2)) rr <- 2 ; qq <- 1 source("powerfun.R") CC <- rbind(c(1,-1)); ff <- c(1,1) ; ff <- ff/sum(ff) rightn <- matpow1(CC,diff/sigma,ff,wishpow) needn <- realpow <- estsig <- numeric(M) cat("\n") cat(" diff = ",diff,"sigma = ",sigma,"wishpow = ",wishpow,"\n") cat(" rightn = ",rightn,"\n") cat("\n") for(i in 1:M) { y <- rnorm(n,0,sigma) + diff*x sighat <- sqrt(sum((lsfit(x,y)$residuals)^2)/(n-2)) estsig[i] <- sighat eff <- diff/sighat # Desired effect in estimated SD units needn[i] <- matpow1(CC,eff,ff,wishpow) ncp <- needn[i]*prod(ff)*(diff/sigma)^2 ddf <- needn[i]-2 realpow[i] <- pf(qf(.95,1,ddf),1,ddf,ncp,FALSE) # Actual power at that sample size } pilot <- cbind(estsig,needn,realpow) sim <- 1:M pilot <- cbind(sim,pilot[order(estsig),]) # Sort the results print(pilot) ###################### End of pilot1.R ######################
Large-sample test on means: T1
T1 <- function(ybar,C,h,nn,prop=TRUE,s2=NULL) ################################################################### # Large sample test of H0: C mu = h # ybar Vector of r sample means. E[ybar] = mu # C q by r hypothesis matrix # nn Vector of r sample sizes # prop Are these sample proportions? If TRUE (the default), # there is no need to supply sample variances. # s2 Vector of sample varinces, not needed if prop=TRUE ################################################################### { q <- dim(C)[1] ; r <- dim(C)[2] # Error Checks if(length(ybar)!=r) stop("Length of ybar must be num of cols in C.") if(length(h) != q) stop("Length of h must be num of rows in C.") if(length(nn) != r) stop("Length of nn must be num of cols in C.") if(prop==FALSE & length(s2) != length(ybar)) stop("Lengths of ybar and s2 are unequal.") # End error checks T1 <- numeric(3) names(T1) <- c("Chisq "," df "," p-value") if(prop) s2 <- nn/(nn-1) * ybar*(1-ybar) kore <- solve(C%*%diag(s2/nn)%*%t(C)) T1[1] <- t(C%*%ybar-h)%*%kore%*%(C%*%ybar-h) T1[2] <- q T1[3] <- 1-pchisq(T1[1],q) T1 } # End of function T1 # Do the surgery example with T1 dat <- read.table("surgery.dat") method <- dat$method ; success <- dat$success thetahat <- tapply(success,list(method),mean) thetahat table(method) # Chisquare test of independence: chisquare = 10.421 # Logistic regression: chisquare = 10.347 CC <- rbind( c(1,-1,0), c(0,1,-1) ) hh <- c(0,0) sampsizes <- table(method) T1(thetahat,CC,hh,sampsizes) # Get another large-sample test for equality of proportions. Do a # regular one-way ANOVA, and F * q is roughly Chisq(q). # This works because under H0, all the VARIANCES are equal, and # MSE is a consistent estimator of the common value. anova(lm(success~factor(method))) source("powerfun.R") urn1 <- c(1,2,2,3,3,3,3,4,4,5) urn2 <- c(1,1,1,1,2,2,4,4,5,5) mu1 <- mean(urn1) ; mu2 <- mean(urn1) ; mu3 <- mean(urn2) v1 <- 9*var(urn1)/10 ; v2 <- 9*var(urn2)/10 mu <- c(mu1,mu2,mu3) ; mu v <- c(v1,v1,v2) ; v sigma <-sqrt(mean(v)) C <- rbind(c(1,-1,0), c(0,1,-1)) effect <- c(0,(mu2-mu3)/sigma) matpow1(C,effect,f=c(1,1,1)) T1pow(mu,C,h=c(0,0),f=c(1,1,1),prop=FALSE,sigsq=v) # surrog2.R # Another specialized power estimate for the three-sample example, # using T1 instead of a regular ANOVA. # A brutal simulation # Groups 1 and 2 are from urn1 and group3 is from urn2 # source("surrog2.R") and then use sur2(n,m,f=c(1,1,1)) # source("powerfun.R") ; source("T1.R") urn1 <- c(1,2,2,3,3,3,3,4,4,5) urn2 <- c(1,1,1,1,2,2,4,4,5,5) mu1 <- mean(urn1) ; mu2 <- mean(urn1) ; mu3 <- mean(urn2) v1 <- 9*var(urn1)/10 ; v2 <- 9*var(urn2)/10 mu <- c(mu1,mu2,mu3) ; cat("True mu = ",mu,"\n") v <- c(v1,v1,v2) ; cat("True variances = ",v,"\n") C <- rbind(c(1,-1,0), c(0,1,-1)) nstart <- T1pow(mu,C,h=c(0,0),f=c(1,1,1),prop=FALSE,sigsq=v) cat("T1pow suggests starting with n of ",nstart,"\n") cat("\n") sur2 <- function(n,m,f=c(1,1,1)) # Specialized power estimate for the three-sample example # urn1 and urn2 must already exist # A brutal simulation # uses function merror { sur2 <- numeric(2) names(sur2) <- c("Est Power","99% Margin of error") f <- f/sum(f) nn <- round(n*f) nsig <- 0 for(i in 1:m) # Monte Carlo loop { samp1 <- sample(urn1,size=nn[1],replace=TRUE) samp2 <- sample(urn1,size=nn[2],replace=TRUE) samp3 <- sample(urn2,size=nn[3],replace=TRUE) meanvec <- c(mean(samp1),mean(samp2),mean(samp3)) varvec <- c(var(samp1),var(samp2),var(samp3)) if(T1(meanvec,C,c(0,0),nn,prop=FALSE,varvec)[3] < 0.05) nsig <- nsig+1 } # End MC loop sur2[1] <- nsig/m sur2[2] <- merror(sur2[1],m,0.01) sur2 } # End function sur2
Now try power of chisquare test of independence on the 3-urn surrogate population.
chipow <- function(pi,wantpow=0.80,alpha=0.05,maxn=100000,display=FALSE) ################################################################### # Sample size required for desired power: # Chisquare test of Independence # # pi Matrix of true probabilities # wantpow Desired power: default = 0.80 # alpha Significance level: default = 0.05 # maxn Maximum sample size, to avoid endless loop # display See details? ################################################################### { ############ One-time only calculations ############# pi <- pi/sum(pi) r <- dim(pi)[1] ; c <- dim(pi)[2] df <- (r-1)*(c-1) rmarg <- apply(pi,1,sum) # Apply sum to dim 1 (rows) cmarg <- apply(pi,2,sum) piM <- as.matrix(rmarg)%*%t(as.matrix(cmarg)) nn <- round(5/min(piM)) # Smallest expected frequency will be 5 to start pow <- 0 ; oneminus <- 1 - alpha ###################################################### while(pow < wantpow) { nn <- nn+1 if(nn>maxn) stop("Maxn exceeded!") ncp <- nn * sum((pi-piM)^2/piM) pow <- 1 - pchisq(qchisq(oneminus,df),df,ncp) if(display) cat(" n = ",nn," Power = ",pow,"\n") }#End while chipow <- nn if(display) { cat("\n") cat("True cell probabilities:\n") cat("\n") print(pi) cat("\n") cat("Probabilities under H0 of independence:\n") cat("\n") print(piM) cat("\n") } chipow } # End of function chipow urn1 <- c(1,2,2,3,3,3,3,4,4,5) urn2 <- c(1,1,1,1,2,2,4,4,5,5) row1 <- table(urn1)/10 row3 <- c(.4,.2,0,.2,.2) urns <- rbind(row1,row1,row3)/3 urns chipow(urns,display=T) # Market researchers would really be more likely to do "top box" # or "top two box." First top box. topbox <- rbind( c(0.03333333,0.3), c(0.03333333,0.3), c(0.13333333,0.2) ) chipow(topbox,display=T) # Pretty good compared to n = 513. Now top 2 box top2box <- rbind( c(.1,0.2333333), c(.1,0.2333333), c(.2,0.1333333) ) chipow(top2box,display=T)
Power of Wald Tests for Logistic Regression
waldlogpow1 <- function(n,beta,X,C,h,alpha=0.05) ################################################################ # Power of Wald test for logistic regression: Single sample size # n Sample size # beta Vector of regression coefficients # X The (fixed) X matrix, with intercept if appropriate # C H0 is C Beta = h # h C Beta under H0 # alpha Significance level (size of test): Default = 0.05 ################################################################ { q <- dim(C)[1] ; r <- dim(C)[2] xb <- X %*% beta Vbeta <- diag(as.vector(exp(xb)/(1+exp(xb))^2)) Ibeta <- t(X) %*% Vbeta %*% X kore <- solve(C%*% solve(Ibeta) %*%t(C)) ncp <- t(C%*%beta-h)%*%kore%*%(C%*%beta-h) pow <- 1 - pchisq(qchisq(1-alpha,q),q,ncp) waldlogpow1 <- pow waldlogpow1 } # End of function waldlogpow1 # Estimate power for Wald test. Same old tough logistic regression problem. # Base it on a single X matrix n <- 2135 beta <- c(-11.09035489,0.11167961,0.02694983) CC <- rbind(c(0,0,1)) hh <- 0 # Like logpow0, but just one X matrix set.seed(12345) # Calculate square root matrix A. A%*%Z will have var-cov Sigma # This is for MVN IVs sigma <- rbind( c(50,sqrt(2000)/2), c(sqrt(2000)/2,40) ) spec <- eigen(sigma) A <- spec$vectors %*% diag(sqrt(spec$values)) Z <- rbind(rnorm(n),rnorm(n)) XX <- cbind(matrix(1,n),t(A%*%Z)) XX[,2] <- XX[,2]+70 ; XX[,3] <- XX[,3]+80 waldlogpow1(n,beta,XX,CC,hh) waldlogpow2 <- function(beta,C,h,alpha=0.05,wantpow=0.80, nstart=25,nmax=100000,display=FALSE) ################################################################ # Power analysis to find sample size for logistic regression. # Intended to be a rough guess at sample size needed for a LR test. # X matrix must be generated internally, but just once -- random or not. # # beta Vector of regression coefficients # C H0 is C Beta = h # h C Beta under H0 # alpha Significance level (size of test): Default = 0.05 # wantpow Desired power: Default = 0.80 # nstart Starting sample size # nmax To stop endless loop # display If TRUE, print n power and power during search ################################################################ { # Generate the first nstart rows of the X matrix. This will vary from # problem to problem. # Calculate square root matrix A. A%*%Z will have var-cov Sigma # This is for MVN IVs sigma <- rbind( c(50,sqrt(2000)/2), c(sqrt(2000)/2,40) ) spec <- eigen(sigma) A <- spec$vectors %*% diag(sqrt(spec$values)) Z <- rbind(rnorm(nstart),rnorm(nstart)) X <- cbind(matrix(1,nstart),t(A%*%Z)) X[,2] <- X[,2]+70 ; X[,3] <- X[,3]+80 ########### Finished generating first nstart rows ################ pow <- waldlogpow1(nstart,beta,X,C,h) n <- nstart if(display) cat("Starting n = ",n," Power = ",pow,"\n") while(pownmax) stop("Too many iterations!") } # End search over n walodlogpow2 <- n walodlogpow2 } # End of function waldlogpow2 source("powerfun.R") ruebeta <- c(-11.09035489,0.11167961,0.02694983) CC <- rbind(c(0,0,1)) hh <- 0 waldlogpow2(truebeta,CC,hh,nstart=2000,display=TRUE) ####################################################################### # rwaldlogpow1.R # Power for a Wald test on a logistic regression model, with random # independent variables. Modelled on randpow1.R. # Run with source("rwaldlogpow1.R") # ####################################################################### #set.seed(12345) n <- 2135 # Sample size m <- 50 # Monte Carlo sample size beta <- c(-11.09035489,0.11167961,0.02694983) # Parameters C <- rbind(c(0,0,1)) ; # H0: C Beta = 0 h <- 0 alpha <- 0.05 # Significance level of test confid <- .99 # Want 100*confid percent margin of # error for MC estimate ######################################################################### # One-time-only calculations Y <- numeric(m) q <- dim(C)[1] ; r <- dim(C)[2] # Calculate square root matrix A. A%*%Z will have var-cov Sigma # This is for MVN IVs, and will vary from problem to problem. sigma <- rbind( c(50,sqrt(2000)/2), c(sqrt(2000)/2,40) ) spec <- eigen(sigma) A <- spec$vectors %*% diag(sqrt(spec$values)) ################# Monte Carlo Loop ############################ for(i in 1:m) { # Generate Random IVs and bind into X matrix. This will # vary from problem to problem. Z <- rbind(rnorm(n),rnorm(n)) X <- cbind(matrix(1,n),t(A%*%Z)) X[,2] <- X[,2]+70 ; X[,3] <- X[,3]+80 ###### Done generating X matrix ########## # Calculate non-centrality parameter, and power xb <- X %*% beta dd <- as.vector(exp(xb)/(1+exp(xb))^2) Vbeta <- diag(dd) # Working around a problem with diag Ibeta <- t(X) %*% Vbeta %*% X kore <- solve(C%*% solve(Ibeta) %*%t(C)) ncp <- t(C%*%beta-h)%*%kore%*%(C%*%beta-h) Y[i] <- 1 - pchisq(qchisq(1-alpha,q),q,ncp) } ############## End of Monte Carlo Loop ############################ Phat <- mean(Y) ; SDhat <- sqrt(var(Y)) margin <- qnorm(1-(1-confid)/2)*SDhat/sqrt(m) cat("\n") cat(" Monte Carlo integration to estimate Power for a \n") cat(" Wald test on a logistic regression model. \n") cat("\n") cat(" n,m = ",n,m,"\n") cat(" Beta = ",beta, "\n") cat("\n") cat(" Estimated power = ",Phat,", with ",confid*100,"% margin of error \n") cat(margin,".\n") cat("\n") ####################### End of rwaldlogpow1.R ##################### howlong <- system.time(source("rwaldlogpow1.R")) ; sum(howlong[1:2])
The F-test for variances again
# var2a.R # Test difference between variances with F - test # Data from t distribution # Run with source("var2a.R") M <- 1000 ; alpha <- 0.05 df <- c(5,10,50,200) n <- c(100,500,1000) sigprob <- matrix(0,nrow=length(n),ncol=length(df)) dimnames(sigprob) <- list(n,df) f <- c(1,1) ; f <- f/sum(f) # Varying sample size and df for(i in 1:length(n)) { n1 <- f[1]*n[i] ; n2 <- f[2]*n[i] lowcrit <- qf(alpha/2,n1-1,n2-1) upcrit <- qf(1-alpha/2,n1-1,n2-1) for(j in 1:length(df)) { for(k in 1:M) { x <- rt(n1,df[j]) ; y <- rt(n2,df[j]) F <- var(x)/var(y) if (Fupcrit) sigprob[i,j] <- sigprob[i,j] + 1 } # End Monte Carlo loop } # End loop through df } # End loop through n sigprob <- sigprob/M cat("\n") cat("n = ",n,"\n") cat("df = ",df,"\n") cat("M = ",M,"\n") cat("\n") print(sigprob) # var2b.R # Test difference between variances with F - test # Data from chisqusare distribution # Run with source("var2b.R") M <- 1000 ; alpha <- 0.05 df <- c(.1,.5,1,10,50) n <- c(100,500,1000) sigprob <- matrix(0,nrow=length(n),ncol=length(df)) dimnames(sigprob) <- list(n,df) f <- c(1,1) ; f <- f/sum(f) # Varying sample size and df for(i in 1:length(n)) { n1 <- f[1]*n[i] ; n2 <- f[2]*n[i] lowcrit <- qf(alpha/2,n1-1,n2-1) upcrit <- qf(1-alpha/2,n1-1,n2-1) for(j in 1:length(df)) { for(k in 1:M) { x <- rchisq(n1,df[j]) ; y <- rchisq(n2,df[j]) F <- var(x)/var(y) if (F upcrit) sigprob[i,j] <- sigprob[i,j] + 1 } # End Monte Carlo loop } # End loop through df } # End loop through n sigprob <- sigprob/M cat("\n") cat("n = ",n,"\n") cat("df = ",df,"\n") cat("M = ",M,"\n") cat("\n") print(sigprob) ################################################################## # manytests.R # Now let's do some tests on the results for chi-squared variables # Execute with R --vanilla < manytests.R # # n = 100 500 1000 # df = 0.1 0.5 1 10 50 # M = 1000 # # 0.1 0.5 1 10 50 # 100 0.766 0.559 0.391 0.095 0.064 # 500 0.798 0.581 0.447 0.134 0.077 # 1000 0.799 0.595 0.440 0.106 0.049 # ################################################################# source("T1.R") Ybar <- c(0.766, 0.559, 0.391, 0.095, 0.064, 0.798, 0.581, 0.447, 0.134, 0.077, 0.799, 0.595, 0.440, 0.106, 0.049) M <- numeric(15)+1000 # Overall test: H0 says all the Type I error rates are 0.05 C1 <- diag(15) # 15x15 identity matrix h1 <- numeric(15) + 0.05 T1(Ybar,C1,h1,M) # Check it Z <- sqrt(1000)*(Ybar-0.05)/sqrt(Ybar*(1-Ybar)) cat("Chi-square should equal ",sum(Z^2),"\n") # Now test each one individually zero <- rbind(numeric(15)) for(i in 1:15) { C2 <- zero ; C2[i] <- 1 cat(" Test ",Ybar[i]," vs 0.05: ",T1(Ybar,C2,h=0.05,nn=M),"\n") } # Check this too print(prettyNum(Z^2),quote=F) # Okay, it's working # Main effect for df C3 <- rbind(c(1,-1,0,0,0, 1,-1,0,0,0, 1,-1,0,0,0), c(0,1,-1,0,0, 0,1,-1,0,0, 0,1,-1,0,0), c(0,0,1,-1,0, 0,0,1,-1,0, 0,0,1,-1,0), c(0,0,0,1,-1, 0,0,0,1,-1, 0,0,0,1,-1) ) T1(Ybar,C3,h=numeric(4),nn=M) # Main effect for n C4 <- rbind(c(1,1,1,1,1,-1,-1,-1,-1,-1,0,0,0,0,0), c(0,0,0,0,0,1,1,1,1,1,-1,-1,-1,-1,-1) ) T1(Ybar,C4,h=numeric(2),nn=M) # Print marginal for n ybarmat <- matrix(Ybar,nrow=3,ncol=5,byrow=T) ; ybarmat apply(ybarmat,1,mean) # Pairwise marginal differences C4a <- rbind(c(1,1,1,1,1,-1,-1,-1,-1,-1,0,0,0,0,0)) T1(Ybar,C4a,h=0,nn=M) print(prettyNum(T1(Ybar,C4a,h=0,nn=M)),quote=F) C4b <- rbind(c(0,0,0,0,0,1,1,1,1,1,-1,-1,-1,-1,-1)) print(prettyNum(T1(Ybar,C4b,h=0,nn=M)),quote=F) C4c <- rbind(c(1,1,1,1,1,0,0,0,0,0,-1,-1,-1,-1,-1)) print(prettyNum(T1(Ybar,C4c,h=0,nn=M)),quote=F) # Adjacent along rows for(i in 1:4) { C5 <- zero ; C5[i] <- 1 ; C5[i+1] <- -1 cat(" Test ",Ybar[i]," vs ",Ybar[i+1],": ",T1(Ybar,C5,h=0,nn=M),"\n") } for(i in 6:9) { C5 <- zero ; C5[i] <- 1 ; C5[i+1] <- -1 cat(" Test ",Ybar[i]," vs ",Ybar[i+1],": ",T1(Ybar,C5,h=0,nn=M),"\n") } for(i in 11:14) { C5 <- zero ; C5[i] <- 1 ; C5[i+1] <- -1 cat(" Test ",Ybar[i]," vs ",Ybar[i+1],": ",T1(Ybar,C5,h=0,nn=M),"\n") } # Adjacent down columns # Col 1 for(i in c(1,6)) { C6 <- zero ; C6[i] <- 1 ; C6[i+5] <- -1 cat(" Test ",Ybar[i]," vs ",Ybar[i+5],": ",T1(Ybar,C6,h=0,nn=M),"\n") } # Col 2 for(i in c(2,7)) { C6 <- zero ; C6[i] <- 1 ; C6[i+5] <- -1 cat(" Test ",Ybar[i]," vs ",Ybar[i+5],": ",T1(Ybar,C6,h=0,nn=M),"\n") } # Col 3 for(i in c(3,8)) { C6 <- zero ; C6[i] <- 1 ; C6[i+5] <- -1 cat(" Test ",Ybar[i]," vs ",Ybar[i+5],": ",T1(Ybar,C6,h=0,nn=M),"\n") } # Col 4 for(i in c(4,9)) { C6 <- zero ; C6[i] <- 1 ; C6[i+5] <- -1 cat(" Test ",Ybar[i]," vs ",Ybar[i+5],": ",T1(Ybar,C6,h=0,nn=M),"\n") } # Col 5 for(i in c(5,10)) { C6 <- zero ; C6[i] <- 1 ; C6[i+5] <- -1 cat(" Test ",Ybar[i]," vs ",Ybar[i+5],": ",T1(Ybar,C6,h=0,nn=M),"\n") } #################### End of manytests.R ################ R --vanilla < manytests.R
Note that power functions (all of them, I hope) are in the earlier part of this file, where powerfun.R is listed.