S code from Chapters 6 and 7. Pieces of code are separated by dashed lines.

1+1
2^3                           # Two to the power 3
1:30
x <- 1
y <- 2
x+y
z <- x+y
z
x <- c(1,2,3,4,5,6)           # Collect these numbers; x is now a vector
y <- 1 + 2*x
cbind(x,y)
z <- y[x>4]                   # z gets y such that x > 4
y[c(6,5,4,3,2,1)]             # y in opposite order
y[c(2,2,2,3,4)]               # Repeats are okay
y[7]                          # There is no seventh element. NA is the missing value code
z <- x/y                      # Most operations are performed element by element
cbind(x,y,z)
x <- seq(from=0,to=3,by=.1)   # A sequence of numbers
y <- sqrt(x)
pdf("testor.pdf")
plot(x,y,type='l')            # That's a lower case L
q()
ls()
max(x)
---------------------------------------------------------------
####################################################################
# lesson2.R: execute with  R --vanilla < lesson2.R > lesson2.out   #
####################################################################
datalist <- scan("mcars.dat",list(id=0,country=0,kpl=0,weight=0,length=0))
# datalist is a linked list.
datalist
# There are other ways to read raw data. See help(read.table).
weight <- datalist$weight ; length <- datalist$length ; kpl <- datalist$kpl
country <- datalist$country
cor(cbind(weight,length,kpl))
# The table command gives a bare-bones frequency distribution
table(country)
# That was a matrix. The numbers 1 2 3 are labels.
# You can save it, and you can get at its contents
countrytable <- table(country)
countrytable[2]
# There is an "if" function that you could use to make dummy variables,
# but it's easier to use factor.
countryfac <- factor(country,levels=c(1,2,3), label=c("US","Japanese","European"))
# This makes a FACTOR corresponding to country, like declaring it
# to be categorical. How are dummy variables being set up?
contrasts(countryfac)
# The first level specified is the reference category. You can get a
# different reference category by specifying the levels in a different order.
cntryfac <- factor(country,levels=c(2,1,3), label=c("Japanese","US","European"))
contrasts(cntryfac)
# Test interaction. For comparison, with SAS we got F = 11.5127, p < .0001
# First fit (and save!) the reduced model. lm stands for linear model.
redmod <- lm(kpl ~ weight+cntryfac)
# The object redmod is a linked list, including lots of stuff like all the
# residuals. You don't want to look at the whole thing, at least not now.
summary(redmod)
# Full model is same stuff plus interaction. You COULD specify the whole thing.
fullmod <- update(redmod,. ~ . + weight*cntryfac)
anova(redmod,fullmod)
# The ANOVA summary table is a matrix. You can get at its (i,j)th element.
aovtab <- anova(redmod,fullmod)
aovtab[2,5]        # The F statistic
aovtab[2,6] < .05  # p < .05 -- True or false?
1>6                # Another example of an expression taking the logical value true or false.
---------------------------------------------------------------
rnorm(20)          # 20 standard normals
set.seed(12345)    # Be able to reproduce the stream of pseudo-random numbers.
help(rnorm)
help(sample)
---------------------------------------------------------------
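A quick illustration (my addition, not part of the original lessons) of what set.seed buys you: setting the same seed again reproduces the same stream of pseudo-random numbers.

set.seed(12345)
rnorm(3)           # Three standard normals
set.seed(12345)
rnorm(3)           # The same three numbers again
---------------------------------------------------------------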
# regart.R     Demonstrate Regression Artifact
###################### Setup #######################
N <- 10000 ; n <- 100
truevar <- 180 ; errvar <- 45
truesd <- sqrt(truevar) ; errsd <- sqrt(errvar)
# set.seed(44444)
# Now define the function ttest, which does a matched t-test
ttest <- function(d) # Matched t-test. It operates on differences.
     {
     ttest <- numeric(4)
     names(ttest) <- c("Mean Difference"," t "," df "," p-value ")
     ave <- mean(d) ; nn <- length(d) ; sd <- sqrt(var(d)) ; df <- nn-1
     tstat <- ave*sqrt(nn)/sd
     pval <- 2*(1-pt(abs(tstat),df))
     ttest[1] <- ave ; ttest[2] <- tstat ; ttest[3] <- df ; ttest[4] <- pval
     ttest                    # Return the value of the function
     }
#####################################################
error1 <- rnorm(N,0,errsd) ; error2 <- rnorm(N,0,errsd)
truescor <- rnorm(N,100,truesd)
pre <- truescor+error1 ; rankpre <- rank(pre)
# Based on their rank on the pre-test, we take the n worst students and
# place them in a special remedial program, but it does NOTHING.
# Based on their rank on the pre-test, we take the n best students and
# place them in a special program for the gifted, but it does NOTHING.
post <- truescor+error2
diff <- post-pre             # Diff represents "improvement."
# But of course diff = error2-error1 = noise
cat("\n")                    # Skip a line
cat("------------------------------------ \n")
dtest <- ttest(diff)
cat("Test on diff (all scores) \n") ; print(dtest) ; cat("\n")
remedial <- diff[rankpre<=n] ; rtest <- ttest(remedial)
cat("Test on Remedial \n") ; print(rtest) ; cat("\n")
gifted <- diff[rankpre>=(N-n+1)] ; gtest <- ttest(gifted)
cat("Test on Gifted \n") ; print(gtest) ; cat("\n")
cat("------------------------------------ \n")
---------------------------------------------------------------
source("regart.R")
---------------------------------------------------------------
merror <- function(phat,m,alpha)   # (1-alpha)*100% merror for a proportion
     {
     z <- qnorm(1-alpha/2)
     merror <- z * sqrt(phat*(1-phat)/m)  # m is (Monte Carlo) sample size
     merror
     }
mmargin <- function(p,cc,alpha)
# Choose m to get (1-alpha)*100% margin of error equal to cc
     {
     mmargin <- p*(1-p)*qnorm(1-alpha/2)^2/cc^2
     mmargin <- trunc(mmargin+1)          # Round up to next integer
     mmargin
     } # End definition of function mmargin
---------------------------------------------------------------
wpow <- c(.7,.75,.8,.85,.9,.99)
mmargin(wpow,.1,.01)
mmargin(wpow,.05,.01)
mmargin(wpow,.01,.01)
mmargin(wpow,.005,.01)
mmargin(wpow,.001,.01)
---------------------------------------------------------------
n <- 125:135
pow <- 1-pf(qf(.95,1,(n-2)),1,(n-2),(n/16))
cbind(n,pow)
---------------------------------------------------------------
n1 <- 64 ; mu1 <- 1 ; sd1 <- 2     # Control Group
n2 <- 64 ; mu2 <- 3 ; sd2 <- 6     # Experimental Group
con <- rnorm(n1,mu1,sd1) ; exp <- rnorm(n2,mu2,sd2)
help(t.test)
t.test(con,exp)
t.test(con,exp)[1]
t.test(con,exp)[3]
---------------------------------------------------------------
m <- 500        # Monte Carlo sample size (Number of simulations)
numsig <- 0     # Initializing
for(i in 1:m)
     {
     con <- rnorm(n1,mu1,sd1) ; exp <- rnorm(n2,mu2,sd2)
     numsig <- numsig+(t.test(con,exp)[3]<.05)
     }
pow <- numsig/m
cat ("Monte Carlo Power = ",pow,"\n") ; cat ("\n")
---------------------------------------------------------------
n1 <- 80 ; mu1 <- 1 ; sd1 <- 2     # Control Group
n2 <- 80 ; mu2 <- 3 ; sd2 <- 6     # Experimental Group
m <- 500        # Monte Carlo sample size (Number of simulations)
numsig <- 0     # Initializing
for(i in 1:m)
     {
     con <- rnorm(n1,mu1,sd1) ; exp <- rnorm(n2,mu2,sd2)
     numsig <- numsig+(t.test(con,exp)[3]<.05)
     }
pow <- numsig/m
cat ("Monte Carlo Power = ",pow,"\n") ; cat ("\n")
---------------------------------------------------------------
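The two-sample power simulation above is repeated several times below with different sample sizes and Monte Carlo sample sizes; here is a sketch (my addition; the function name mcpower and its argument list are made up) of how the loop could be wrapped in a function so that only the settings change.

mcpower <- function(m, n1, mu1, sd1, n2, mu2, sd2, alpha=0.05)
     # Monte Carlo estimate of the power of the two-sample t-test
     {
     numsig <- 0
     for(i in 1:m)
          {
          con <- rnorm(n1,mu1,sd1) ; exp <- rnorm(n2,mu2,sd2)
          numsig <- numsig + (t.test(con,exp)$p.value < alpha)
          }
     numsig/m       # Estimated power
     }
mcpower(m=500, n1=64, mu1=1, sd1=2, n2=64, mu2=3, sd2=6)
---------------------------------------------------------------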
m <- 10000      # Monte Carlo sample size (Number of simulations)
numsig <- 0     # Initializing
for(i in 1:m)
     {
     con <- rnorm(n1,mu1,sd1) ; exp <- rnorm(n2,mu2,sd2)
     numsig <- numsig+(t.test(con,exp)[3]<.05)
     }
pow <- numsig/m
cat ("Monte Carlo Power = ",pow,"\n") ; cat ("\n")
margin <- merror(.8001,10000,.01) ; margin
cat("99% CI from ",(pow-margin)," to ",(pow+margin),"\n")
---------------------------------------------------------------
n1 <- 40 ; mu1 <- 1 ; sd1 <- 2     # Control Group
n2 <- 120 ; mu2 <- 3 ; sd2 <- 6    # Experimental Group
m <- 500        # Monte Carlo sample size (Number of simulations)
numsig <- 0     # Initializing
for(i in 1:m)
     {
     con <- rnorm(n1,mu1,sd1) ; exp <- rnorm(n2,mu2,sd2)
     numsig <- numsig+(t.test(con,exp)[3]<.05)
     }
pow <- numsig/m
cat ("Monte Carlo Power = ",pow,"\n") ; cat ("\n")
margin <- merror(pow,m,.01)
cat("99% CI from ",(pow-margin)," to ",(pow+margin),"\n")
---------------------------------------------------------------
n1 <- 40 ; mu1 <- 1 ; sd1 <- 2     # Control Group
n2 <- 120 ; mu2 <- 3 ; sd2 <- 6    # Experimental Group
m <- 10000      # Monte Carlo sample size (Number of simulations)
numsig <- 0     # Initializing
for(i in 1:m)
     {
     con <- rnorm(n1,mu1,sd1) ; exp <- rnorm(n2,mu2,sd2)
     numsig <- numsig+(t.test(con,exp)[3]<.05)
     }
pow <- numsig/m
cat ("Monte Carlo Power = ",pow,"\n") ; cat ("\n")
margin <- merror(pow,m,.01)
cat("99% CI from ",(pow-margin)," to ",(pow+margin),"\n")
---------------------------------------------------------------
library(help=ctest)
---------------------------------------------------------------
---------------------------------------------------------------
##########################################################
# Choose Monte Carlo sample size for a randomization     #
# test. Estimate p (p-value of permutation test) with    #
# p-hat. For a given true p (default = 0.04) and         #
# a given alpha (default = 0.05), returns the MC sample  #
# size needed to get p-hat < alpha with probability cc   #
# (default = .99).                                       #
##########################################################
randm <- function(p=.04,alpha=0.05,cc=.99)
     {
     randm <- pnorm(cc)^2 * p*(1-p) / (alpha-p)^2
     randm <- trunc(randm+1)    # Round up to next integer
     randm
     } # End of function randm
---------------------------------------------------------------
probs <- c(.01,.02,.03,.04,.045,.049)
cbind(probs,randm(p=probs))     # Use default values of alpha and cc
---------------------------------------------------------------
# randex1.R : First randomization test example, with Student's Sleep Data
# Monte Carlo sample size M may be set interactively
set.seed(4444)       # Set seed for random number generation
# Define margin of error functions
merror <- function(phat,M,alpha)   # (1-alpha)*100% merror for a proportion
     {
     z <- qnorm(1-alpha/2)
     merror <- z * sqrt(phat*(1-phat)/M)  # M is (Monte Carlo) sample size
     merror
     }
mmargin <- function(p,cc,alpha)
# Choose m to get (1-alpha)*100% margin of error equal to cc
     {
     mmargin <- p*(1-p)*qnorm(1-alpha/2)^2/cc^2
     mmargin <- trunc(mmargin+1)          # Round up to next integer
     mmargin
     } # End definition of function mmargin
##############
sleepy <- read.table("sleep.dat")
t.test(extra ~ group, var.equal=TRUE, data = sleepy)
t.test(extra ~ group, var.equal=TRUE, data = sleepy)[1]   # It's a list element, not a number
ObsT <- t.test(extra ~ group, var.equal=TRUE, data = sleepy)[[1]]
ObsT
# If M is not assigned, it's 1210
if(length(objects(pattern="M"))==0) M <- 1210
cat("Monte Carlo Sample size M = ",M,"\n")
dv <- sleepy$extra ; iv <- sleepy$group
trand <- numeric(M)
for(i in 1:M)
     {
     trand[i] <- t.test(sample(dv) ~ iv, var.equal=TRUE)[[1]]
     }
randp <- length(trand[abs(trand)>=abs(ObsT)])/M
margin <- merror(randp,M,.01)
cat ("\n")
cat ("Randomization p-value = ",randp,"\n")
cat("99% CI from ",(randp-margin)," to ",(randp+margin),"\n")
cat ("\n")
# Now try difference between medians
cat("\n")
cat("Median extra sleep for Group = 1: ",median(dv[iv==1]),"\n")
cat("Median extra sleep for Group = 2: ",median(dv[iv==2]),"\n")
ObsMedDif <- abs(median(dv[iv==1])-median(dv[iv==2]))
cat("Absolute difference is ",ObsMedDif,"\n")
cat("\n")
trand2 <- numeric(M)
for(i in 1:M)
     {
     rdv <- sample(dv)
     trand2[i] <- abs(median(rdv[iv==1])-median(rdv[iv==2]))
     }
randp2 <- length(trand2[abs(trand2)>=abs(ObsMedDif)])/M
margin <- merror(randp2,M,.01)
cat ("\n")
cat ("Randomization p-value for diff bet medians = ",randp2,"\n")
cat("99% CI from ",(randp2-margin)," to ",(randp2+margin),"\n")
cat ("\n")
---------------------------------------------------------------
# Power for detecting p-hat significantly different from alpha at
# significance level L, given true p and MC sample size M.
randmpow <- function(M,alpha=0.05,p=0.04,L=0.01)
     {
     z <- qnorm(1-L/2)
     left <- sqrt(M)*(alpha-p)/sqrt(p*(1-p))
     right <- sqrt( alpha/p * (1-alpha)/(1-p) )
     randmpow <- 1 - pnorm(left+z*right) + pnorm(left-z*right)
     randmpow
     } # End function randmpow
---------------------------------------------------------------
findm <- function(wantpow=.8,mstart=1,aa=0.05,pp=0.04,LL=0.01)
     {
     pow <- 0
     mm <- mstart
     while(pow < wantpow)
          {
          mm <- mm+1
          pow <- randmpow(mm,aa,pp,LL)
          } # End while
     findm <- mm
     findm
     } # End function findm
---------------------------------------------------------------
green <- read.table("green.dat")
plant <- factor(green$PLANT) ; mcg <- factor(green$MCG)
meanlng <- green$MEANLNG #$
obs <- anova(lm(meanlng ~ plant*mcg))
obsF <- obs[1:3,4]
set.seed(4444)
M <- 500 ; simf <- NULL
for(i in 1:M)
     {
     simf <- rbind(simf,anova(lm(sample(meanlng)~plant*mcg))[1:3,4])
     } # Next i (next simulation)
---------------------------------------------------------------
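The loop above only accumulates the permuted F statistics in simf; here is a sketch (my addition, not part of the original listing; randp and the loop index j are just illustrative names) of how the randomization p-values for the three effects could then be computed.

randp <- numeric(3)
names(randp) <- c("plant","mcg","plant:mcg")
for(j in 1:3) randp[j] <- sum(simf[,j] >= obsF[j])/M
randp      # Proportion of permuted data sets with F at least as large as observed
---------------------------------------------------------------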
# twins.R
# Just do the analysis - no examples or explanation - with source("twins.R")
twinframe <- read.table("smalltwin.dat")
sex <- twinframe$sex ; ident <- twinframe$ident
mental <- twinframe[,3:8]    # All rows, cols 3 to 8
phys <- twinframe[,9:14]     # All rows, cols 9 to 14
n <- length(sex)
mf <- (1:n)[sex==0&ident==0] # mf are indices of male fraternal pairs
mi <- (1:n)[sex==0&ident==1] # mi are indices of male identical pairs
ff <- (1:n)[sex==1&ident==0] # ff are indices of female fraternal pairs
fi <- (1:n)[sex==1&ident==1] # fi are indices of female identical pairs
# Sub-sample sizes
nmf <- length(mf) ; nmi <- length(mi)
nff <- length(ff) ; nfi <- length(fi)
# mentalmf are mental scores of male fraternal pairs, etc.
mentalmf <- mental[mf,] ; physmf <- phys[mf,]
mentalmi <- mental[mi,] ; physmi <- phys[mi,]
mentalff <- mental[ff,] ; physff <- phys[ff,]
mentalfi <- mental[fi,] ; physfi <- phys[fi,]
cat("Male Fraternal \n")
cat(" Twin 1 \n")
print(cor(mentalmf[,1:3],physmf[,1:3]))
cat(" Twin 2 \n")
print(cor(mentalmf[,4:6],physmf[,4:6]))
cat(" \n")
cat("Male Identical \n")
cat(" Twin 1 \n")
print(cor(mentalmi[,1:3],physmi[,1:3]))
cat(" Twin 2 \n")
print(cor(mentalmi[,4:6],physmi[,4:6]))
cat(" \n")
cat("Female Fraternal \n")
cat(" Twin 1 \n")
print(cor(mentalff[,1:3],physff[,1:3]))
cat(" Twin 2 \n")
print(cor(mentalff[,4:6],physff[,4:6]))
cat(" \n")
cat("Female Identical \n")
cat(" Twin 1 \n")
print(cor(mentalfi[,1:3],physfi[,1:3]))
cat(" Twin 2 \n")
print(cor(mentalfi[,4:6],physfi[,4:6]))
cat(" \n")
# Test statistic will be absobs = 0.6682264
# Keep track of minimum (neg corr: obsmin = -0.5371517) and max too.
obsmax <- max( c( cor(mentalmf[,1:3],physmf[,1:3]), cor(mentalmf[,4:6],physmf[,4:6]),
                  cor(mentalmi[,1:3],physmi[,1:3]), cor(mentalmi[,4:6],physmi[,4:6]),
                  cor(mentalff[,1:3],physff[,1:3]), cor(mentalff[,4:6],physff[,4:6]),
                  cor(mentalfi[,1:3],physfi[,1:3]), cor(mentalfi[,4:6],physfi[,4:6]) ) )
obsmin <- min( c( cor(mentalmf[,1:3],physmf[,1:3]), cor(mentalmf[,4:6],physmf[,4:6]),
                  cor(mentalmi[,1:3],physmi[,1:3]), cor(mentalmi[,4:6],physmi[,4:6]),
                  cor(mentalff[,1:3],physff[,1:3]), cor(mentalff[,4:6],physff[,4:6]),
                  cor(mentalfi[,1:3],physfi[,1:3]), cor(mentalfi[,4:6],physfi[,4:6]) ) )
absobs <- max(abs(obsmax),abs(obsmin))   # Test Statistic
rmin <- NULL ; rmax <- NULL ; rabs <- NULL
# Now simulate. Want p-hat sig diff from 0.07. Use findm(pp=.07), get 1506, so use M = 1600
M <- 1600 ; set.seed(4444)
for(i in 1:M)
     {
     rmentalmf <- mental[sample(mf),]
     rmentalmi <- mental[sample(mi),]
     rmentalff <- mental[sample(ff),]
     rmentalfi <- mental[sample(fi),]
     rcorrs <- c( cor(rmentalmf[,1:3],physmf[,1:3]), cor(rmentalmf[,4:6],physmf[,4:6]),
                  cor(rmentalmi[,1:3],physmi[,1:3]), cor(rmentalmi[,4:6],physmi[,4:6]),
                  cor(rmentalff[,1:3],physff[,1:3]), cor(rmentalff[,4:6],physff[,4:6]),
                  cor(rmentalfi[,1:3],physfi[,1:3]), cor(rmentalfi[,4:6],physfi[,4:6]) )
     rmin <- c(rmin,min(rcorrs))
     rmax <- c(rmax,max(rcorrs))
     rabs <- c(rabs,max(abs(min(rcorrs)),abs(max(rcorrs))))
     }
twot <- length(rabs[rabs>=absobs])/M     # Two sided
lowt <- length(rmin[rmin<=obsmin])/M     # Lower tailed
upt  <- length(rmax[rmax>=obsmax])/M     # Upper tailed
merror <- function(phat,M,alpha)   # (1-alpha)*100% merror for a proportion
     {
     z <- qnorm(1-alpha/2)
     merror <- z * sqrt(phat*(1-phat)/M)  # M is (Monte Carlo) sample size
     merror
     } # End function merror
cat("Correlations Between Mental and Physical \n")
cat(" \n") ; cat(" \n")
cat(" Minimum Observed Correlation: ",obsmin,"\n")
cat(" Randomization p-value (one-sided): p-hat = ",lowt," \n")
cat(" Plus or minus 99% Margin of error = ",merror(lowt,M,0.01),"\n")
cat(" \n")
cat(" Maximum Observed Correlation: ",obsmax,"\n")
cat(" Randomization p-value (one-sided): p-hat = ",upt," \n")
cat(" Plus or minus 99% Margin of error = ",merror(upt,M,0.01),"\n")
cat(" \n")
cat(" Maximum Observed Absolute Correlation: ",absobs,"\n")
cat(" Randomization p-value (two-sided): p-hat = ",twot," \n")
cat(" Plus or minus 99% Margin of error = ",merror(twot,M,0.01),"\n")
cat(" \n")
---------------------------------------------------------------
# boot1.R     Working on the bootstrap
# Run with    R --vanilla < boot1.R > boot1.out &
# grades.dat has 4 columns: ID, Verbal SAT, Math SAT and 1st year GPA
marks <- read.table("grades.dat")
n <- length(marks$verbal)
n
marks[1:10,]
obscorr <- cor(marks)
obscorr
# Question: Is the correlation between Verbal SAT and GPA the same as
# the correlation between math SAT and GPA?
# What is the sampling distribution of the difference between correlation
# coefficients?
#
obsdiff <- obscorr[3,1]-obscorr[3,2]    # Verbal minus math
obsdiff
# The strategy will be to obtain a 95% bootstrap confidence interval for
# the difference between verbal correlation and math correlation. This
# confidence interval will be approximately centered around the observed
# difference obsdiff = .128. If the confidence interval does not include
# zero, we will conclude that the observed difference is significantly
# different from zero.
BOOT <- 100 ; bsdiff <- NULL ; set.seed(9999)
# Accumulate bootstrap values in bsdiff
# For clarity, do operations in several separate steps inside the loop
for(i in 1:BOOT)
     {
     bootmarks <- marks[sample(1:n,replace=TRUE),]   # Sample rows with replacement
     bcorr <- cor(bootmarks)            # Correlation matrix of bootstrap sample
     bdiffer <- bcorr[3,1]-bcorr[3,2]   # Difference between correlation coefficients
     bsdiff <- c(bsdiff,bdiffer)        # Add bdiffer to the end of bsdiff
     } # Next bootstrap sample
bsdiff <- sort(bsdiff)
# Now get lower and upper limits of 95% CI
low <- bsdiff[.025*BOOT] ; up <- bsdiff[.975*BOOT + 1]
low ; up
(low+up)/2
obsdiff
write(bsdiff,"bsdiff.dat")    # Maybe for later analysis
pdf("bsdiff.pdf")             # Send graphics output to pdf file
hist(bsdiff)
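---------------------------------------------------------------
Two optional finishing touches (my additions, not part of the original boot1.R listing): the quantile function gives a percentile-style interval directly, and dev.off() closes the pdf file once no further plots are to be added.

quantile(bsdiff, c(0.025, 0.975))   # Another way to get a 95% bootstrap percentile interval
dev.off()                           # Close the pdf file when plotting is finished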