/* permutation.sas */ %include '/home/u1407221/441s24/SAS02/senicread.sas'; title "Permutation and randomization tests"; proc print data=senic (obs=10); proc freq data=senic; title2 'The SENIC data'; title3 "Fisher's exact test is part of the default output"; tables mdschl*agecat / nocol nopercent fisher; proc freq data=senic; title3 'An exact permutation test'; tables mdschl*region / norow nopercent; exact pchi / maxtime=120; /* Unnecessary 2 minute limit */ /******************************************************************************/ /* Illustrate anova-like tests with the scab disease data */ /******************************************************************************/ proc import datafile="/home/u1407221/441s24/SAS03/ScabDisease.xlsx" out=scab dbms=xlsx replace; getnames=yes; /* Input data file is ScabDisease.xlsx Ouput data table is called scab. dbms=xlsx The input file is an Excel spreadsheet. Necessary to read an Excel spreadsheet directly under unix/linux Works in PC environment too except for Excel 4.0 spreadsheets If there are multiple sheets, use sheet="sheet1" or something. replace If the data table already exists, replace it. Use this! getnames=yes Use column names as variable names. Beware of embedded, leading and trailing blanks. By default, blanks in a spreadsheet are missing. */ proc print data=scab; title2 'The scab disease data'; run; data potato; set scab; label infection = 'Ave percent surface covered'; proc glm data=potato order=data; title3 'Overall F-test with proc glm for Comparison'; class condition; model infection=condition; ods select OverallANOVA; /* Just the ANOVA summary table */ /* How to find out the names of the tables: ods stands for Output Delivery System. Information is written on the log file as the tables are produced. ods trace is esier than looking in the documentation. ods trace on; proc glm data=potato order=data; class condition; model infection=condition; run; ods trace off; */ /* Wilcoxon scores are the ranks of the observations. */ proc npar1way wilcoxon data=potato; title3 'Classical Kruskal-Wallis Test: Compare F = 4.26, p = 0.0043'; class condition; var infection; /* Permutation test using the raw data. The usual F-test statistic is computed for each permutation. The anova option displays the ANOVA summary table and suppresses some other statistics. The anova option does not affect the permutation or randomization p-value. */ proc npar1way anova scores=data data=potato; title3 'Randomization Test'; class condition; var infection; exact / mc seed=88888 maxtime=300; /* Follow up with proc multtest, which randomizes the t statistic for testing a single linear combination equal to zero. Requesting all 21 pairwise comparisons plus * Control vs. Average of Spring * Control vs. Average of Fall * Average of Spring vs. Average of Fall */ proc multtest data=potato order=data permutation nsample=20000 seed=99999 out=pvals; title2 'Randomization multiple comparisons with proc multtest'; class condition; /* Pairwise Bonferroni */ contrast 'Control vs. Spring300' 1 -1 0 0 0 0 0; /* 1.0000 */ contrast 'Control vs. Spring600' 1 0 -1 0 0 0 0; /* 1.0000 */ contrast 'Control vs. Spring1200' 1 0 0 -1 0 0 0; /* 1.0000 */ contrast 'Control vs. Fall300' 1 0 0 0 -1 0 0; /* 0.0888 */ contrast 'Control vs. Fall600' 1 0 0 0 0 -1 0; /* 1.0000 */ contrast 'Control vs. Fall1200' 1 0 0 0 0 0 -1; /* 0.0096 */ contrast 'Spring300 vs. Spring600' 0 1 -1 0 0 0 0; contrast 'Spring300 vs. Spring1200' 0 1 0 -1 0 0 0; contrast 'Spring300 vs. Fall300' 0 1 0 0 -1 0 0; contrast 'Spring300 vs. Fall600' 0 1 0 0 0 -1 0; contrast 'Spring300 vs. Fall1200' 0 1 0 0 0 0 -1; contrast 'Spring600 vs. Spring1200' 0 0 1 -1 0 0 0; contrast 'Spring600 vs. Fall300' 0 0 1 0 -1 0 0; contrast 'Spring600 vs. Fall600' 0 0 1 0 0 -1 0; contrast 'Spring600 vs. Fall1200' 0 0 1 0 0 0 -1; contrast 'Spring1200 vs. Fall300' 0 0 0 1 -1 0 0; contrast 'Spring1200 vs. Fall600' 0 0 0 1 0 -1 0; contrast 'Spring1200 vs. Fall1200' 0 0 0 1 0 0 -1; contrast 'Fall300 vs. Fall600' 0 0 0 0 1 -1 0; contrast 'Fall300 vs. Fall1200' 0 0 0 0 1 0 -1; contrast 'Fall600 vs. Fall1200' 0 0 0 0 0 1 -1; /* Averages */ contrast 'Control vs. Spring' 3 -1 -1 -1 0 0 0; contrast 'Control vs. Fall' 3 0 0 0 -1 -1 -1; contrast 'Spring vs. Fall' 0 1 1 1 -1 -1 -1; test mean(infection); /* Requests t-tests */ run; /* Calculate 99% confidence intervals for the permutation p-values, based on the output data set pvals. Reject the null hypothesis that the true permutation p-value = 0.05 at the 0.01 level if and only if the confidence interval does not include 0.05. */ proc print data=pvals; title3 'Output data set with permutation p-values'; /* Need the alpha = 0.01 critical value for the normal distribution. Use R, or ... */ proc iml; title3 'What is that critical value?'; CritVal = probit(0.995); print CritVal; data CI; set pvals; Lower99 = perm_p - 2.5758*sim_se; Upper99 = perm_p + 2.5758*sim_se; keep _contrast_ perm_p Lower99 Upper99; proc print data=CI; title3 '99% confidence intervals for the permutation p-values'; /******************************************************************************/ /* Rank correlation with the SENIC data */ /******************************************************************************/ proc corr data=senic nosimple; title2 'The SENIC Data'; title3 'Pearson Product-moment Correlations'; var nurses lngstay age infpercent; proc corr data=senic nosimple spearman; title3 'Spearman Rank Correlations'; var nurses lngstay age infpercent; /* Length of stay and age is interesting because it's barely significant with Pearson but n.s. wth Spearman. */ /* The usual p-values for Spearman correlations from proc corr assume bivariate normality before ranking, and this assumption matters. It looks like proc freq with the scorr test will do permutation/randomization. Can use pcorr for the Pearson correlation. */ /* Permutation test for length of stay and age Compare rho = 0.12404, p = 0.2212 */ proc freq data=senic; title3 'Randomization test on ranks: Compare p = 0.2212'; tables lngstay*age / noprint; /* No frequency table */ exact scorr / mc seed = 7777; proc freq data=senic; title3 'Randomization test on raw numbers: Compare p = 0.0491'; tables lngstay*age / noprint measures; /* No frequency table */ exact pcorr / mc seed = 7777; /* This ran out of memory proc freq data=senic; title3 'Full exact test'; tables lngstay*age / noprint; exact pcorr / maxtime = 120; */ /* Try a bigger MC sample size: 2 million */ proc freq data=senic; title3 'Randomization test on raw numbers: Big Monte Carlo sample size'; tables lngstay*age / noprint; /* No frequency table */ exact pcorr / mc n=2000000 maxtime=120 seed = 7777; ods select PearsonCorrMC; run;