/* potato.sas */
title 'Rotten Potatoes: Factorial ANOVA several different ways';
title2 'First, illustrate a 2-factor ANOVA';

proc format;
     value tfmt 1 = '10 Degrees'  2 = '16 Degrees';
     value ofmt 1 = '2 percent' 2 = '6 percent' 3 = '10%';

data spud;
     infile '/folders/myfolders/potato.data.txt' firstobs=2; 
                                        /* Skip the header */
     input id bact temp oxygen rot;
     label bact   = 'Bacteria Type'
           temp   = 'Storage Temperature'
           oxygen = 'Oxygen level';
     /* Cell means coding for all 6 treatment combinations
        of temperature and bacteria type                  */
     if temp=1 and bact=1 then mu11=1; else mu11=0;
     if temp=1 and bact=2 then mu12=1; else mu12=0;
     if temp=1 and bact=3 then mu13=1; else mu13=0;
     if temp=2 and bact=1 then mu21=1; else mu21=0;
     if temp=2 and bact=2 then mu22=1; else mu22=0;
     if temp=2 and bact=3 then mu23=1; else mu23=0;
     combo = 10*temp+bact; /* 11 12 13 21 22 23 */
     format temp tfmt.; format oxygen ofmt.;
     run;
     
proc freq;
     tables temp*bact*oxygen / norow nocol nopercent;

proc means;
     class bact temp;
     var rot;

/* Better looking output from proc tabulate */

proc tabulate;
     class bact temp;
     var rot;
     table (temp all),(bact all) * (mean*rot);

ods trace on; /* Writes names of output tables on the log file. */
proc glm;
     title3 'Standard 2-way ANOVA with proc glm';
     class bact temp;
     model rot=temp|bact; /* Could have said   bact temp bact*temp */
     means temp|bact;
run; /* Need run with ODS trace */
ods trace off;

/* Proportion of remaining variation for the default tests
   a = sF/(sF + n-p)  */

proc iml;
     title3 'Proportion of remaining variation';
     NminusP = 53; /* From the overall F test */
     s1 = 1; F1 = 38.61; /* Main effect for Temperature */
     s2 = 2; F2 = 14.84; /* Main effect for Bacteria */
     s3 = 2; F3 = 3.84;  /* Interaction */
     Temperature  = s1*F1/(s1*F1 + NminusP);
     Bacteria     = s2*F2/(s2*F2 + NminusP);
     Temp_by_Bact = s3*F3/(s3*F3 + NminusP);
     print Temperature Bacteria Temp_by_Bact;

/* Automate the procedure with ODS. */

proc glm data = spud;
     title3 'Standard 2-way ANOVA with proc glm AGAIN';
     class bact temp;
     model rot=temp|bact; /* Could have said   bact temp bact*temp */
     means temp|bact;
     /* ODS output table name = My data table name (Backwards)*/
     ods output OverallANOVA = Overall
                ModelANOVA   = TwoWay;
run;

/* Warning: The most recently created data dable is now TwoWay, not spud. */

proc print data=TwoWay;
     title3 'Look at an ODS output table';

proc iml;
     title3 'Automated proportion of remaining variation';
     use overall;  read all into anova1; /* "all" means all cases */
     use TwoWay;   read all into anova2; 
     print "Take a look at the matrices";
     print anova1; print anova2;
     print "Proportion of remaining variation";
     NminusP = anova1[3,1]; 
     s1 = anova2[4,2]; F1 = anova2[4,5]; /* Main effect for Temperature */
     s2 = anova2[5,2]; F2 = anova2[5,5]; /* Main effect for Bacteria */
     s3 = anova2[6,2]; F3 = anova2[6,5]; /* Interaction */
     Temperature  = s1*F1/(s1*F1 + NminusP);
     Bacteria     = s2*F2/(s2*F2 + NminusP);
     Temp_by_Bact = s3*F3/(s3*F3 + NminusP);
     print Temperature Bacteria Temp_by_Bact;

/* Now generate the tests for main effects and interaction using cell means
   coding. 

               BACTERIA TYPE
TEMP      1         2        3
Cool    mu11      mu12      mu13
Warm    mu21      mu22      mu23                      */


/* The test statement of proc reg uses variable names to stand for the
corresponding regression coefficients. By naming the effect cell mean 
coding dummy variables the same as the population cell means, I can 
just state the null hypothesis. Isn't this a cute SAS trick? */

proc reg  data = spud plots = none;
     title3 'Using the proc reg test statement and cell means coding';
     model rot = mu11--mu23 / noint;
     Overall:       test mu11=mu12=mu13=mu21=mu22=mu23;
     Temperature:   test mu11+mu12+mu13 = mu21+mu22+mu23;
     Bacteria:      test mu11+mu21 = mu12+mu22 = mu13+mu23;
     Bact_by_Temp1: test mu11-mu21 = mu12-mu22 = mu13-mu23;
     Bact_by_Temp2: test mu12-mu11 = mu22-mu21,
                         mu13-mu12 = mu23-mu22;

/* Bact_by_Temp1 checks equality of temperature effects. 
   Bact_by_Temp2 checks parallel line segments. They are equivalent. */


proc glm data = spud;
     title3 'Proc glm: Using contrasts on the combination variable';
     class combo;      /* 11 12 13  21 22 23 */
     model rot=combo;
     contrast 'Main Effect for Temperature'
        combo  1  1  1  -1 -1 -1;
     contrast 'Main Effect for Bacteria'
        combo  1 -1  0   1 -1  0,
        combo  0  1 -1   0  1 -1;
     contrast 'Temperature by Bacteria Interaction'
        combo  1 -1  0  -1  1  0,
        combo  0  1 -1   0 -1  1;

/* Illustrate effect coding */

data mashed;
     set spud;
     /* Effect coding, with interactions */
        if bact = 1 then b1 =  1;
           else if bact = 2 then b1 =  0;
           else if bact = 3 then b1 = -1;
        if bact = 1 then b2 =  0;
           else if bact = 2 then b2 =  1;
           else if bact = 3 then b2 = -1;
        if temp = 1 then t =  1;
           else if temp = 2 then t = -1;
     /* Interaction terms */
        tb1 = t*b1; tb2 = t*b2;

proc reg plots = none;
     title3 'Effect coding';
     model rot = b1 b2  t  tb1 tb2;
     Temperature:   test t=0;
     Bacteria:      test b1=b2=0;
     Bact_by_Temp:  test tb1=tb2=0;
     
/* Do some exploration to follow up the interaction. The regression
   with cell means coding is easiest. The final product of several runs
   is shown below. For reference, here is the table of population means again.

               BACTERIA TYPE
TEMP      1         2        3
Cool    mu11      mu12      mu13
Warm    mu21      mu22      mu23                      */

proc reg plots = none;
     title3 'Further exploration using cell means coding';
     model rot = mu11--mu23 / noint;
     /* Pairwise comparisons of marginal means for Bacteria Type */
     Bact1vs2: test mu11+mu21=mu12+mu22;
     Bact1vs3: test mu11+mu21=mu13+mu23;
     Bact2vs3: test mu12+mu22=mu13+mu23;
     /* Effect of temperature for each bacteria type */
     Temp_for_Bac1: test mu11=mu21;
     Temp_for_Bac2: test mu12=mu22;
     Temp_for_Bac3: test mu13=mu23;
     /* Effect of bacteria type for each temperature */
     Bact_for_CoolTemp: test mu11=mu12=mu13;
     Bact_for_WarmTemp: test mu21=mu22=mu23;
     /* Pairwise comparisons of bacteria types at warm temperature */
     Bact1vs2_for_WarmTemp: test mu21=mu22;
     Bact1vs3_for_WarmTemp: test mu21=mu23;
     Bact2vs3_for_WarmTemp: test mu22=mu23;

/* We have done a lot of tests. Concerned about buildup of Type I
error? We can make ALL the tests into Scheffe follow-ups of the
initial test for equality of the 6 cell means. The Scheffe family
includes all COLLECTIONS of contrasts, not just all contrasts. */

proc iml;
     title3 'Table of critical values for all possible Scheffe tests';
     numdf = 5;   /* Numerator degrees of freedom for initial test */
     dendf =48;   /* Denominator degrees of freedom for initial test */
     alpha = 0.05;
     critval = finv(1-alpha,numdf,dendf); 
     zero = {0 0}; S_table = repeat(zero,numdf,1);  /* Make empty matrix */
     /* Label the columns */
     namz = {"Number of Contrasts in followup test" 
             "    Scheffe Critical Value"}; mattrib S_table colname=namz;
     do i = 1 to numdf;
        s_table(|i,1|) = i; 
        s_table(|i,2|) = numdf/i *  critval;
     end;
     reset noname; /* Makes output look nicer in this case */
     print "Initial test has"  numdf " and " dendf "degrees of freedom."
           "Using significance level alpha = " alpha;
     print s_table;

/* The potato data are balanced; they have equal sample sizes. This yields
   the same F-tessts for Type I and Type III sums of squares in proc glm.
   Look at an unbalanced example. */

%include '/folders/myfolders/441s16/Lecture/readmath2b.sas'; 
/* Creates the data set mathex */

proc freq;
     title2 'An unbalanced example: The math data';
     tables sex*course2 / nocol nopercent chisq;

proc glm;
     title2 'An unbalanced example: The math data';
     class course2 sex ;
     model hscalc = sex|course2;
   
/* In Type III sums of squares, each effect is tested controlling for 
   all the other effects. This is what we get from the standard 
   regresssion tests. 
   
   In Type I sums of squares, the numerator SS is improvement in
   explained sum of squares after allowing for all the preceding 
   effects. But in every case the denominator is MSE from the 
   full model.  */

/* Now back to the potato data. t's actually a 3-factor design. 
   First look at the cell means.
*/

proc tabulate data=spud;
     title2 'Oxygen by temperature by bactieria type';
     class bact temp oxygen;
     var rot;
     table (oxygen all),(temp all),(bact all) * (mean*rot);

proc glm data=spud;
     title2 'Three-way ANOVA with proc glm';
     class oxygen bact temp;
     model rot=temp|bact|oxygen; 
     means temp|bact; /* Based on the tests */