/*  permutation.sas  */

%include '/home/u1407221/441s24/SAS02/senicread.sas';
title "Permutation and randomization tests";

proc print data=senic (obs=10);

proc freq data=senic;
     title2 'The SENIC data';
     title3 "Fisher's exact test is part of the default output";
     tables mdschl*agecat / nocol nopercent fisher;
     
proc freq data=senic; 
     title3 'An exact permutation test';
     tables mdschl*region / norow nopercent;
     exact pchi / maxtime=120; /* Unnecessary 2 minute limit */

/******************************************************************************/
/* Illustrate anova-like tests with the scab disease data */
/******************************************************************************/

proc import datafile="/home/u1407221/441s24/SAS03/ScabDisease.xlsx" 
            out=scab dbms=xlsx replace;
            getnames=yes;
/* Input data file is ScabDisease.xlsx
   Ouput data table is called scab.
   dbms=xlsx The input file is an Excel spreadsheet.
             Necessary to read an Excel spreadsheet directly under unix/linux
             Works in PC environment too except for Excel 4.0 spreadsheets
             If there are multiple sheets, use sheet="sheet1" or something.
   replace	 If the data table already exists, replace it.  Use this!
   getnames=yes   Use column names as variable names. Beware of embedded,
                  leading and trailing blanks.    
   By default, blanks in a spreadsheet are missing.  */

proc print data=scab; 
     title2 'The scab disease data';
run;

data potato;
     set scab;
     label infection = 'Ave percent surface covered';

proc glm data=potato order=data;
     title3 'Overall F-test with proc glm for Comparison';
     class condition;
     model infection=condition;
     ods select OverallANOVA;  /* Just the ANOVA summary table */

/* How to find out the names of the tables: 
   ods stands for Output Delivery System.
   Information is written on the log file as the tables are produced.
   ods trace is esier than looking in the documentation.

ods trace on; 
proc glm data=potato order=data;
     class condition;
     model infection=condition;
run; 
ods trace off;
*/

/* Wilcoxon scores are the ranks of the observations. */
proc npar1way wilcoxon data=potato;
     title3 'Classical Kruskal-Wallis Test: Compare F = 4.26, p = 0.0043';
     class condition;
     var infection;

/* Permutation test using the raw data. The usual F-test statistic is 
   computed for each permutation. The anova option displays the ANOVA
   summary table and suppresses some other statistics. The anova 
   option does not affect the permutation or randomization p-value. */

proc npar1way anova scores=data data=potato;
     title3 'Randomization Test';
     class condition;
     var infection;
     exact / mc seed=88888 maxtime=300;

/* Follow up with proc multtest, which randomizes the t statistic for testing
   a single linear combination equal to zero. 
   Requesting all 21 pairwise comparisons plus
      * Control vs. Average of Spring
      * Control vs. Average of Fall
      * Average of Spring vs. Average of Fall
 */

proc multtest data=potato order=data permutation nsample=20000 
              seed=99999 out=pvals;
     title2 'Randomization multiple comparisons with proc multtest';
     class condition;
     /* Pairwise                                                 Bonferroni */
     contrast 'Control vs. Spring300'    1 -1  0  0  0  0  0;    /* 1.0000  */
     contrast 'Control vs. Spring600'    1  0 -1  0  0  0  0;    /* 1.0000  */
     contrast 'Control vs. Spring1200'   1  0  0 -1  0  0  0;    /* 1.0000  */
     contrast 'Control vs. Fall300'      1  0  0  0 -1  0  0;    /* 0.0888  */
     contrast 'Control vs. Fall600'      1  0  0  0  0 -1  0;    /* 1.0000  */
     contrast 'Control vs. Fall1200'     1  0  0  0  0  0 -1;    /* 0.0096  */
     contrast 'Spring300 vs. Spring600'  0  1 -1  0  0  0  0;
     contrast 'Spring300 vs. Spring1200' 0  1  0 -1  0  0  0;
     contrast 'Spring300 vs. Fall300'    0  1  0  0 -1  0  0;
     contrast 'Spring300 vs. Fall600'    0  1  0  0  0 -1  0;
     contrast 'Spring300 vs. Fall1200'   0  1  0  0  0  0 -1;
     contrast 'Spring600 vs. Spring1200' 0  0  1 -1  0  0  0;
     contrast 'Spring600 vs. Fall300'    0  0  1  0 -1  0  0;
     contrast 'Spring600 vs. Fall600'    0  0  1  0  0 -1  0;
     contrast 'Spring600 vs. Fall1200'   0  0  1  0  0  0 -1;
     contrast 'Spring1200 vs. Fall300'   0  0  0  1 -1  0  0;
     contrast 'Spring1200 vs. Fall600'   0  0  0  1  0 -1  0;
     contrast 'Spring1200 vs. Fall1200'  0  0  0  1  0  0 -1;
     contrast 'Fall300 vs. Fall600'      0  0  0  0  1 -1  0;
     contrast 'Fall300 vs. Fall1200'     0  0  0  0  1  0 -1;
     contrast 'Fall600 vs. Fall1200'     0  0  0  0  0  1 -1;
     /* Averages */
     contrast 'Control vs. Spring'       3 -1 -1 -1  0  0  0;
     contrast 'Control vs. Fall'         3  0  0  0 -1 -1 -1;
     contrast 'Spring vs. Fall'          0  1  1  1 -1 -1 -1;
     test mean(infection); /* Requests t-tests */
run;

/* Calculate 99% confidence intervals for the permutation p-values,
   based on the output data set pvals. Reject the null hypothesis
   that the true permutation p-value = 0.05 at the 0.01 level if 
   and only if the confidence interval does not include 0.05. */ 
  
proc print data=pvals;
     title3 'Output data set with permutation p-values';

/* Need the alpha = 0.01 critical value for the normal distribution. Use
   R, or ... */
  
proc iml;
     title3 'What is that critical value?';
     CritVal = probit(0.995);
     print CritVal;

data CI; 
     set pvals;
     Lower99 = perm_p - 2.5758*sim_se;
     Upper99 = perm_p + 2.5758*sim_se;
     keep _contrast_ perm_p Lower99 Upper99;
proc print data=CI;
     title3 '99% confidence intervals for the permutation p-values';

/******************************************************************************/
/* Rank correlation with the SENIC data */
/******************************************************************************/

proc corr data=senic nosimple;
     title2 'The SENIC Data';
     title3 'Pearson Product-moment Correlations';
     var nurses lngstay age infpercent;
proc corr data=senic nosimple spearman;
     title3 'Spearman Rank Correlations';
     var nurses lngstay age infpercent;

/* Length of stay and age is interesting because it's barely significant 
   with Pearson but n.s. wth Spearman. */
  
/* The usual p-values for Spearman correlations from proc corr assume 
bivariate normality before ranking, and this assumption matters. It looks like 
proc freq with the scorr test will do permutation/randomization. 
Can use pcorr for the Pearson correlation. */

/* Permutation test for length of stay and age 
   Compare rho = 0.12404, p = 0.2212 */
proc freq data=senic; 
     title3 'Randomization test on ranks: Compare p = 0.2212';
     tables lngstay*age / noprint; /* No frequency table */
     exact scorr / mc seed = 7777;

proc freq data=senic; 
     title3 'Randomization test on raw numbers: Compare p = 0.0491';
     tables lngstay*age / noprint measures; /* No frequency table */
     exact pcorr / mc seed = 7777;


/* This ran out of memory 
proc freq data=senic; 
     title3 'Full exact test';
     tables lngstay*age / noprint; 
     exact pcorr / maxtime = 120;
*/

/* Try a bigger MC sample size: 2 million */
proc freq data=senic; 
     title3 'Randomization test on raw numbers: Big Monte Carlo sample size';
     tables lngstay*age / noprint; /* No frequency table */
     exact pcorr / mc n=2000000 maxtime=120 seed = 7777;
     ods select PearsonCorrMC;

run;