SAS Examples from STA441s16
Here are the SAS programs from lecture, in chronological order.
This handout, including the program code, is copyright Jerry Brunner, 2016. It is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License. Use and share it freely.
/********************* cars1.sas ***************************/ title 'Metric Cars Data'; title2 'Basic Descriptive Statistics'; data auto; infile '/folders/myfolders/441s16/Lecture/mcars4b.data.txt'; input ID country $ lper100k weight length; mpg = 100/lper100k * 0.6214/0.2642; label Country = 'Location of Head Office' lper100k = 'Litres per 100 kilometers' mpg = 'Miles per Gallon' weight = 'Weight in kg' length = 'Length in meters'; proc freq; tables country; proc means; var lper100k mpg weight length;
/********************* scab1.sas ***************************/ title 'Scab Disease Data'; data potato; infile '/folders/myfolders/ScabDisease.data.txt'; length condition $ 10.; /* Default length of character values is 8. */ input condition $ infection; label infection = 'Ave percent surface covered'; proc freq; tables condition; proc means; class condition; var infection; proc glm; class condition; model infection = condition / clparm; /* clparm gives CIs for contrasts down in the estimate statements. */ /* Pairwise multiple comparisons */ lsmeans condition / pdiff tdiff adjust = tukey; lsmeans condition / pdiff tdiff adjust = bon; lsmeans condition / pdiff tdiff adjust = scheffe; /* Test some custom contrasts. Beware of alphabetical order: Control Fall1200 Fall300 Fall600 Spring1200 Spring300 Spring600*/ contrast 'Overall test for comparison' condition 1 -1 0 0 0 0 0, condition 1 0 -1 0 0 0 0, condition 1 0 0 -1 0 0 0, condition 1 0 0 0 -1 0 0, condition 1 0 0 0 0 -1 0, condition 1 0 0 0 0 0 -1; contrast 'Av. of Fall vs. Control' condition 3 -1 -1 -1 0 0 0; contrast 'Av. of Spring vs. Control' condition 3 0 0 0 -1 -1 -1; contrast 'Fall vs. Spring' condition 0 1 1 1 -1 -1 -1; contrast 'Spring Amount' condition 0 0 0 0 1 -1 0, condition 0 0 0 0 0 1 -1; contrast 'Fall Amount' condition 0 1 -1 0 0 0 0, condition 0 0 1 -1 0 0 0; contrast 'Spr vs. Fall for 300 lbs.' condition 0 0 1 0 0 -1 0; contrast 'Spr vs. Fall for 600 lbs.' condition 0 0 0 1 0 0 -1; contrast 'Spr vs. Fall, 1200 lbs' condition 0 1 0 0 -1 0 0; /* Estimate and CI for Control vs. Fall 1200 (F = t^2) */ estimate 'Control vs. Fall 1200' condition 1 -1 0 0 0 0 0; estimate 'Control vs. Mean of Fall' condition 3 -1 -1 -1 0 0 0 / divisor = 3; /* Scheffe critical value for a test of s contrasts is critval * (p-1)/s. For p=7 means and a single contrast, it's critval * (7-1)/1 */ proc iml; title2 'Critical value for all possible 1-df Scheffe F-tests'; numdf = 6; /* p-1 = Numerator degrees of freedom for initial test */ dendf = 25; /* n-p = Denominator degrees of freedom for initial test */ alpha = 0.05; critval = finv(1-alpha,numdf,dendf); print critval; ScheffeCritval = critval*numdf; print ScheffeCritval; proc iml; title2 'Table of Scheffe critical values for COLLECTIONS of contrasts'; numdf = 6; /* Numerator degrees of freedom for initial test */ dendf = 25; /* Denominator degrees of freedom for initial test */ alpha = 0.05; critval = finv(1-alpha,numdf,dendf); zero = {0 0}; S_table = repeat(zero,numdf,1); /* Make empty matrix */ /* Label the columns */ namz = {"Number of Contrasts in followup F-test" " Scheffe Critical Value"}; mattrib S_table colname=namz; do i = 1 to numdf; s_table(|i,1|) = i; s_table(|i,2|) = numdf/i * critval; end; reset noname; /* Makes output look nicer in this case */ print "Initial test has" numdf " and " dendf "degrees of freedom." "Using significance level alpha = " alpha; print s_table; /* Example of subsetting vs. contrasts */ proc glm; title2 'Test differences among treatments excluding control'; title3 'Using contrasts of all means (traditional)'; class condition; model infection = condition; contrast 'Differences excluding control' condition 0 1 -1 0 0 0 0, condition 0 0 1 -1 0 0 0, condition 0 0 0 1 -1 0 0, condition 0 0 0 0 1 -1 0, condition 0 0 0 0 0 1 -1; proc glm; title2 'Test differences among treatments excluding control'; title3 'Subset the data by excluding control'; where condition ne 'Control'; /* Case sensitive */ class condition; model infection = condition;
/*************************** berkeley.sas *********************************/ title 'Berkeley Graduate Admissions Data: '; proc format; value sexfmt 1 = 'Female' 0 = 'Male'; value ynfmt 1 = 'Yes' 0 = 'No'; data berkley; input line sex dept $ admit count; format sex sexfmt.; format admit ynfmt.; datalines; 1 0 A 1 512 2 0 B 1 353 3 0 C 1 120 4 0 D 1 138 5 0 E 1 53 6 0 F 1 22 7 1 A 1 89 8 1 B 1 17 9 1 C 1 202 10 1 D 1 131 11 1 E 1 94 12 1 F 1 24 13 0 A 0 313 14 0 B 0 207 15 0 C 0 205 16 0 D 0 279 17 0 E 0 138 18 0 F 0 351 19 1 A 0 19 20 1 B 0 8 21 1 C 0 391 22 1 D 0 244 23 1 E 0 299 24 1 F 0 317 ; proc freq; tables sex*admit / nopercent nocol chisq; tables dept*sex*admit / nopercent nocol chisq; tables dept*sex / nopercent nocol chisq; tables dept*admit / nopercent nocol chisq; weight count; /* Get p-value */ proc iml; x = 17.2480 + 0.2537 + 0.7535 + 0.2980 + 1.0011 + 0.3841; pval = 1-probchi(x,6); print "Pooled chisquare = " x "df=6, p = " pval; Bonferroni = 0.05/6; print "Reject with Bonferroni correction if p <" Bonferroni;
/* describemath.sas */ title 'Gender, Ethnicity and Math performance'; title2 'Basic descriptive statistics on the exploratory sample'; proc format; value ynfmt 0 = 'No' 1 = 'Yes'; value crsfmt 4 = 'No Resp'; value nfmt 1 = 'Asian' 2 = 'Eastern European' 3 = 'European not Eastern' 4 = 'Middle-Eastern and Pakistani' 5 = 'East Indian' 6 = 'Other and DK' ; data math; infile '/folders/myfolders/exploremath.data.txt'; input id course precalc calc gpa calculus english mark lang $ sex $ nation1 nation2 sample; /* Computed Variables: totscore, passed, grade, hsgpa, hscalc, hsengl, tongue, ethnic */ totscore = precalc+calc; if (50<=mark<=100) then passed=1; else passed=0; /* Some missing final marks were zero, and 998=SDF and 999=WDR */ if mark=0 then grade=.; else if mark > 100 then grade=.; else grade=mark; /* Missing HS marks were zeros */ if 65 le gpa le 100 then hsgpa = gpa; /* Else missing is automatic */ if 0 < calculus < 101 then hscalc = calculus; if 0 < english < 101 then hsengl = english; /* There were just a few French speakers */ if lang='French' then tongue='Other '; else tongue=lang; label tongue = 'Mother Tongue (Eng or Other)'; /* Rater 1 knows Middle Eastern names -- otherwise believe Rater 2 */ if nation1=4 then ethnic=nation1; else ethnic=nation2; /********************************************************************/ label precalc = 'Number precalculus correct' calc = 'Number calculus correct' totscore = 'Total # right on diagnostic test' passed = 'Passed the course' grade = 'Final mark' hsgpa = 'High School GPA' hscalc = 'HS Calculus' hsengl = 'HS English' lang = 'Mother Tongue' nation1 = 'Nationality of name acc to rater1' nation2 = 'Nationality of name acc to rater2' tongue = 'Mother Tongue (Eng or Other)' ethnic = 'Judged Nationality of name' ; format course crsfmt.; format passed ynfmt.; format nation1 nation2 ethnic nfmt.; /*********************************************************************/ proc means; title3 'Quantitative Variables'; var precalc calc totscore hsgpa hscalc hsengl grade; proc freq; title3 'Categorical variables'; tables course sex ethnic tongue passed; /* Bar chart of ethnic would be nice. */ proc univariate normal plot; /* The normal option gives tests of H0: Data are normal. Plot produces graphics, some of which are informative. */ title3 'Detailed look at grade'; var grade; proc freq; title3 'Frequency Distribution of Final Grade'; tables grade; /* The following suggests that profs in Courses 1 and 3 bumped the marks discontinuously, while the prof in Course 2 did something more sophisticated. */ proc freq; title3 'Grade separately by course'; where course < 4; /* Exclude no response */ tables grade*course / norow nopercent nopercent;
/* readmath2b.sas Just read the data and do basic transformations */ title 'Prediction of Performance in First-year Calculus'; proc format; value ynfmt 0 = 'No' 1 = 'Yes'; value crsfmt 1 = 'Catch-up' 2 = 'Mainstrm' 3 = 'Elite' 4 = 'No Resp'; value nfmt 1 = 'Asian' 2 = 'Eastern European' 3 = 'European not Eastern' 4 = 'Middle-Eastern and Pakistani' 5 = 'East Indian' 6 = 'Other and DK' ; data mathex; infile '/folders/myfolders/exploremath.data.txt'; input id course precalc calc gpa calculus english mark lang $ sex $ nation1 nation2 sample; /* Computed Variables: totscore, passed, grade, hsgpa, hscalc, hsengl, tongue, ethnic */ totscore = precalc+calc; if (50<=mark<=100) then passed=1; else passed=0; /* Some missing final marks were zero, and 998=SDF and 999=WDR */ if mark=0 then grade=.; else if mark > 100 then grade=.; else grade=mark; /* Missing HS marks were zeros */ if 65 le gpa le 100 then hsgpa = gpa; /* Else missing is automatic */ if 0 < calculus < 101 then hscalc = calculus; if 0 < english < 101 then hsengl = english; /* There were just a few French speakers */ if lang='French' then tongue='Other '; else tongue=lang; label tongue = 'Mother Tongue (Eng or Other)'; /* Rater 1 knows Middle Eastern names -- otherwise believe Rater 2 */ if nation1=4 then ethnic=nation1; else ethnic=nation2; /********************************************************************/ label precalc = 'Number precalculus correct' calc = 'Number calculus correct' totscore = 'Total # right on diagnostic test' passed = 'Passed the course' grade = 'Final mark (if any)' hsgpa = 'High School GPA' hscalc = 'HS Calculus' hsengl = 'HS English' lang = 'Mother Tongue' nation1 = 'Nationality of name acc to rater1' nation2 = 'Nationality of name acc to rater2' tongue = 'Mother Tongue (Eng or Other)' ethnic = 'Judged Nationality of name'; format course crsfmt.; format passed ynfmt.; format nation1 nation2 ethnic nfmt.; diff = (100 * precalc/9) - (100 * calc/11); label diff = 'Percentage correct: Precalc minus calc'; /* And a couple more useful variables */ if course=4 then course2=.; else course2=course; /* Eliminate 'No Resp' */ if 0 le grade le 60 then gsplit='60orLess'; else if 60 lt grade le 100 then gsplit='Over60'; /* Got median=60 from proc univariate */ label gsplit = 'Median split on final grade'; /* Dummy variables for ethnic background */ if ethnic=. then e1=.; else if ethnic=1 then e1=1; else e1=0; if ethnic=. then e2=.; else if ethnic=2 then e2=1; else e2=0; if ethnic=. then e3=.; else if ethnic=3 then e3=1; else e3=0; if ethnic=. then e4=.; else if ethnic=4 then e4=1; else e4=0; if ethnic=. then e6=.; else if ethnic=6 then e6=1; else e6=0; label e1 = 'Asian vs East Ind.' e2 = 'East Eur. vs East Ind.' e3 = 'Other Eur. vs East Ind.' e4 = 'Mid. East & Pak. vs East Ind.' e6 = 'Other/DK vs East Ind.'; if sex = 'Female' then gender=1; else if sex = 'Male' then gender=0; if tongue = 'English' then mtongue=1; else if tongue='Other' then mtongue=0; label mtongue = 'English vs. Other'; /* Only use 2 of these if the model has an intercept! */ if course=. then c1=.; else if course=1 then c1=1; else c1=0; if course=. then c2=.; else if course=2 then c2=1; else c2=0; if course=. then c3=.; else if course=3 then c3=1; else c3=0; label c1 = 'Catch-up' c2 = 'Mainstream' c3 = 'Elite';
/* MathReg1.sas */ %include '/folders/myfolders/441s16/Lecture/readmath2b.sas'; /* readmath2b has dummy variable definitions e1-e4,e6 for ethnic (Reference category is East Indian) gender=1 for Female mtongue=1 for English c1-c3: c1 = 'Catch-up' c2 = 'Mainstream' c3 = 'Elite' */ title2 'Variable Selection for Predicting Grade'; proc freq; title3 'Check dummy variables'; tables sex*gender / norow nocol nopercent missing; tables tongue*mtongue / norow nocol nopercent missing; tables (e1-e4 e6) * ethnic / norow nocol nopercent missing; tables (c1-c3) * course / norow nocol nopercent missing; proc reg plots = none; /* Suppress diagnostic plots for now*/ title3 'Model A: Predict University Calculus Grade from HS Information'; model grade = hsgpa hscalc hsengl; /* It is very interesting to know what proportion of the remaining variation is explained by each variable, controlling for the other two. F = t-squared, and a = sF/(n-p + sF) */ proc iml; title3 'Proportion of remaining variation for HS information'; n = 323; p = 4; s = 1; print "hsgpa controlling for hscalc and hsengl"; t = 8.00; F = t**2; a = s*F/(n-p + s*F); print a; print "hscalc controlling for hsgpa and hsengl"; t = 3.14; F = t**2; a = s*F/(n-p + s*F); print a; print "hsengl controlling for hsgpa and hscalc"; t = -3.26; F = t**2; a = s*F/(n-p + s*F); print a; proc reg plots = none; title3 'Model B: Predict University Calculus Grade from Diagnostic Test'; model grade = precalc calc; proc reg plots = none; title3 'Model C: Do the diagnostic test and HS info both contribute?'; model grade = hsgpa hscalc hsengl precalc calc; Diagnostic_Test: test precalc=calc=0; HS_Information: test hsgpa=hscalc=hsengl=0; proc iml; title3 'Proportion of remaining variation explained by diagnostic test'; print "Precalc and calc controlling for hsgpa hscalc hsengl"; n = 289; p = 6; s = 2; F = 8.28; a = s*F/(n-p + s*F); print a; proc reg plots = none; title3 'Model D: See if Course makes a contribution'; model grade = hsgpa hscalc hsengl precalc calc c1 c3; Course: test c1=c3=0; Diagnostic_Test: test precalc=calc=0; proc glm; title3 'Model D again with proc glm'; class course; model grade = hsgpa hscalc hsengl precalc calc course; contrast 'Replicate Test of Course' course 1 -1 0, course 0 1 -1; contrast 'Diagnostic Test F = 9.06' precalc 1, calc 1; proc reg plots = none; title3 'Model E: Include Language, Sex and Ethnic Background'; model grade = hsgpa hscalc hsengl precalc calc mtongue gender e1-e4 e6; TroubleVars: test mtongue=gender=e1=e2=e3=e4=e6=0; Nationality: test e1=e2=e3=e4=e6=0; proc reg plots = none; title3 'Model F: Discarding Gender and Nationality'; model grade = hsgpa hscalc hsengl precalc calc mtongue; EnglishTongue: test hsengl=mtongue=0; proc iml; title3 'Proportion of remaining variation explained by mother tongue'; print "Mtongue controlling for hsgpa hscalc hsengl precalc calc"; n = 287; p = 7; s = 1; t = -2.23 ; F = t**2; a = s*F/(n-p + s*F); print a; proc reg plots = none; title3 'Model G: Drop mtongue and calc'; title4 'Compare R-Square = 0.4556, Adj R-Sq = 0.4460 From Model 3'; model grade = hsgpa hscalc hsengl precalc; proc iml; title3 'Proportion of remaining variation explained by Pre-calculus'; print "precalc controlling for hsgpa hscalc hsengl"; n = 289; p = 5; s = 1; t = 3.63 ; F = t**2; a = s*F/(n-p + s*F); print a; proc reg plots(only) = ResidualPlot ; title3 'Model H: Combine precalc and calc instead of dropping calc'; title4 'Compare R-Square = 0.4492 from Model 7'; model grade = hsgpa hscalc hsengl totscore; proc iml; title3 'Proportion of remaining variation explained by Pre-calculus'; print "totscore controlling for hsgpa hscalc hsengl"; n = 289; p = 5; s = 1; t = 3.92 ; F = t**2; a = s*F/(n-p + s*F); print a; print "For prediction, I am happy with Model 8: hsgpa hscalc hsengl totscore"; proc reg plots = none; title3 'Model I: Same as Model H but including Mother Tongue'; model grade = hsgpa hscalc hsengl totscore mtongue; proc reg plots = none; title3 'Try automatic (stepwise) selection'; model grade = hsgpa hscalc hsengl precalc calc mtongue gender e1-e4 e6 / selection = stepwise slentry = 0.05 slstay = 0.05 ; /* Default slentry = slstay = 0.15 */
/* MathReg2.sas */ %include '/folders/myfolders/441s16/Lecture/readmath2b.sas'; title2 'Residual analysis'; proc reg plots(only) = ResidualPlot; title3 'Model H: hsgpa hscalc hsengl totscore'; model grade = hsgpa hscalc hsengl totscore; output out = Explor predicted = yhat residual = resid rstudent = delstud; /* Deleted Studentized Residual */ /* Could have included LCL and UCL for upper and lower limits of a 95% prediction interval for each case in the file */ /* What is a big (Studentized deleted) residual? If the model is correct, each one has a t distribution with n-p-1 = 283 df (practically standard normal), so the Studentized deleted residual can be treated directly as a t-test statistic. Values that are too big in absolute value will cause H0: mu=0 to be rejected. Tests are NOT independent, but use a Bonferroni correction for n = 289 tests. Get the critical value from proc iml. */ proc iml; title3 'Critical value for Joint t-test on Studentized Residuals'; Alpha = 0.05/289; print Alpha; Critval = tinv(1-Alpha/2,283); print Critval; proc univariate normal plot; var delstud; /* Tests for normality indicate residuals are not normal. No st resids greater than critical value. Next, a few more plots. */ proc sgplot; title3 'Plot of Y-hat by Y'; scatter y=grade x=yhat; proc sgplot; title3 'Calculus sub-test by deleted studentized residual'; scatter x=calc y=delstud; proc sgplot; title3 'Pre-calculus sub-test by deleted studentized residual'; scatter x=precalc y=delstud; proc sgplot; title3 'Mother tongue by deleted studentized residual'; scatter x=mtongue y=delstud;
/* readexplor.sas Just read the exploratory math data and do basic transformations. This version is based on readmath2c.sas. */ title 'Prediction of Performance in First-year Calculus'; proc format; value ynfmt 0 = 'No' 1 = 'Yes'; value crsfmt 1 = 'Catch-up' 2 = 'Mainstrm' 3 = 'Elite' 4 = 'No Resp'; value nfmt 1 = 'Asian' 2 = 'Eastern European' 3 = 'European not Eastern' 4 = 'Middle-Eastern and Pakistani' 5 = 'East Indian' 6 = 'Other and DK' ; data explore; infile '/folders/myfolders/exploremath.data.txt'; input id course precalc calc gpa calculus english mark lang $ sex $ nation1 nation2 sample; /* Computed Variables: totscore, passed, grade, hsgpa, hscalc, hsengl, tongue, ethnic */ totscore = precalc+calc; if (50<=mark<=100) then passed=1; else passed=0; if 0 100 then grade=.; else grade=mark; /* Missing HS marks were zeros */ if 65 le gpa le 100 then hsgpa = gpa; /* Else missing is automatic */ if 0 < calculus < 101 then hscalc = calculus; if 0 < english < 101 then hsengl = english; /* There were just a few French speakers */ if lang='French' then tongue='Other '; else tongue=lang; label tongue = 'Mother Tongue (Eng or Other)'; /* Rater 1 knows Middle Eastern names -- otherwise believe Rater 2 */ if nation1=4 then ethnic=nation1; else ethnic=nation2; /********************************************************************/ label precalc = 'Number precalculus correct' calc = 'Number calculus correct' totscore = 'Total # right on diagnostic test' passed = 'Passed the course' grade = 'Final mark (if any)' hsgpa = 'High School GPA' hscalc = 'HS Calculus' hsengl = 'HS English' lang = 'Mother Tongue' nation1 = 'Nationality of name acc to rater1' nation2 = 'Nationality of name acc to rater2' tongue = 'Mother Tongue (Eng or Other)' ethnic = 'Judged Nationality of name'; diff = (100 * precalc/9) - (100 * calc/11); label diff = 'Percentage correct: Precalc minus calc'; /* And a couple more useful variables */ if course=4 then course2=.; else course2=course; /* Eliminate 'No Resp' */ if 0 le grade le 60 then gsplit='60orLess'; else if 60 lt grade le 100 then gsplit='Over60'; /* Got median=60 from proc univariate */ label gsplit = 'Median split on final grade'; format course course2 crsfmt.; format passed ynfmt.; format nation1 nation2 ethnic nfmt.; /* Dummy variables for ethnic background */ if ethnic=. then e1=.; else if ethnic=1 then e1=1; else e1=0; if ethnic=. then e2=.; else if ethnic=2 then e2=1; else e2=0; if ethnic=. then e3=.; else if ethnic=3 then e3=1; else e3=0; if ethnic=. then e4=.; else if ethnic=4 then e4=1; else e4=0; if ethnic=. then e6=.; else if ethnic=6 then e6=1; else e6=0; label e1 = 'Asian vs East Ind.' e2 = 'East Eur. vs East Ind.' e3 = 'Other Eur. vs East Ind.' e4 = 'Mid. East & Pak. vs East Ind.' e6 = 'Other/DK vs East Ind.'; if sex = 'Female' then gender=1; else if sex = 'Male' then gender=0; if tongue = 'English' then mtongue=1; else if tongue='Other' then mtongue=0; label mtongue = 'English vs. Other'; /* Only use 2 of these if the model has an intercept! */ if course2=. then c1=.; else if course2=1 then c1=1; else c1=0; if course2=. then c2=.; else if course2=2 then c2=1; else c2=0; if course2=. then c3=.; else if course2=3 then c3=1; else c3=0; label c1 = 'Catch-up' c2 = 'Mainstream' c3 = 'Elite'; /* proc freq; title2 'Check outcome'; tables outcome*passed / norow nocol nopercent missing; tables (course course2) * outcome / nocol nopercent chisq; */
/* readreplic.sas Just read the exploratory math data and do basic transformations. This version is based on readmath2c.sas. IT READS FROM replicmath2.data, WHICH HAS ID VALUES DIFFERENT FROM THOSE IN THE EXPLORATORY SAMPLE! */ title 'Prediction of Performance in First-year Calculus'; proc format; value ynfmt 0 = 'No' 1 = 'Yes'; value crsfmt 1 = 'Catch-up' 2 = 'Mainstrm' 3 = 'Elite' 4 = 'No Resp'; value nfmt 1 = 'Asian' 2 = 'Eastern European' 3 = 'European not Eastern' 4 = 'Middle-Eastern and Pakistani' 5 = 'East Indian' 6 = 'Other and DK' ; data replic; infile '/folders/myfolders/replicmath2.data.txt'; input id course precalc calc gpa calculus english mark lang $ sex $ nation1 nation2 sample; /* Computed Variables: totscore, passed, grade, hsgpa, hscalc, hsengl, tongue, ethnic */ totscore = precalc+calc; if (50<=mark<=100) then passed=1; else passed=0; if 0 100 then grade=.; else grade=mark; /* Missing HS marks were zeros */ if 65 le gpa le 100 then hsgpa = gpa; /* Else missing is automatic */ if 0 < calculus < 101 then hscalc = calculus; if 0 < english < 101 then hsengl = english; /* There were just a few French speakers */ if lang='French' then tongue='Other '; else tongue=lang; label tongue = 'Mother Tongue (Eng or Other)'; /* Rater 1 knows Middle Eastern names -- otherwise believe Rater 2 */ if nation1=4 then ethnic=nation1; else ethnic=nation2; /********************************************************************/ label precalc = 'Number precalculus correct' calc = 'Number calculus correct' totscore = 'Total # right on diagnostic test' passed = 'Passed the course' grade = 'Final mark (if any)' hsgpa = 'High School GPA' hscalc = 'HS Calculus' hsengl = 'HS English' lang = 'Mother Tongue' nation1 = 'Nationality of name acc to rater1' nation2 = 'Nationality of name acc to rater2' tongue = 'Mother Tongue (Eng or Other)' ethnic = 'Judged Nationality of name'; diff = (100 * precalc/9) - (100 * calc/11); label diff = 'Percentage correct: Precalc minus calc'; /* And a couple more useful variables */ if course=4 then course2=.; else course2=course; /* Eliminate 'No Resp' */ if 0 le grade le 60 then gsplit='60orLess'; else if 60 lt grade le 100 then gsplit='Over60'; /* Got median=60 from proc univariate */ label gsplit = 'Median split on final grade'; format course course2 crsfmt.; format passed ynfmt.; format nation1 nation2 ethnic nfmt.; /* Dummy variables for ethnic background */ if ethnic=. then e1=.; else if ethnic=1 then e1=1; else e1=0; if ethnic=. then e2=.; else if ethnic=2 then e2=1; else e2=0; if ethnic=. then e3=.; else if ethnic=3 then e3=1; else e3=0; if ethnic=. then e4=.; else if ethnic=4 then e4=1; else e4=0; if ethnic=. then e6=.; else if ethnic=6 then e6=1; else e6=0; label e1 = 'Asian vs East Ind.' e2 = 'East Eur. vs East Ind.' e3 = 'Other Eur. vs East Ind.' e4 = 'Mid. East & Pak. vs East Ind.' e6 = 'Other/DK vs East Ind.'; if sex = 'Female' then gender=1; else if sex = 'Male' then gender=0; if tongue = 'English' then mtongue=1; else if tongue='Other' then mtongue=0; label mtongue = 'English vs. Other'; /* Only use 2 of these if the model has an intercept! */ if course2=. then c1=.; else if course2=1 then c1=1; else c1=0; if course2=. then c2=.; else if course2=2 then c2=1; else c2=0; if course2=. then c3=.; else if course2=3 then c3=1; else c3=0; label c1 = 'Catch-up' c2 = 'Mainstream' c3 = 'Elite';
/* MathReg3.sas */ %include '/folders/myfolders/441s16/Lecture/readexplor.sas'; /* Creates data table explore */ %include '/folders/myfolders/441s16/Lecture/readreplic.sas'; /* Creates data table replic */ title2 'Predict Grade for Replication Sample'; /* Plan: 1. Non-obvious findings from the exploration (based on Model I, which predicts grade from hsgpa hscalc hsengl totscore mtongue) were a. HS Engl neg b. mtongue neg c. totscore positive (diagnostic test matters) Test these on the replication with a Bonferroni correction for 3 tests. The other two results (HS GPA and HS Calculus) were obvious. 2. See if prediction intervals work as advertised for Model H, which predicts grade from hsgpa hscalc hsengl totscore. 3. Compare prediction of letter grade for the models with and without the diagnostic test. First, just illustrate use of different data tables in the same run. */ proc freq data = explore; title3 'Exploratory Sample'; tables outcome; proc freq data = replic; title3 'Replication Sample'; tables outcome; /* Now test the three findings: Point 1 above */ proc reg data = replic plots = none; title3 'Try to replicate HS Engl neg, mtongue neg, totscore pos'; title4 'with a Bonferroni correction (check p < 0.05/3 = 0.01666667)'; model grade = hsgpa hscalc hsengl totscore mtongue; /* Make combined data table, look at prediction intervals: Point 2 */ data predict; set explore replic; keeper = grade+hsgpa+hscalc+hsengl+totscore; /* keeper will be missing if any of the vars are missing */ if keeper ne .; /* Discards all other cases */ grade2 = grade; /* Save value of grade for future use */ if sample=2 then grade=. ; /* Response variable is now missing for replication sample. But it is preserved in grade2 */ proc reg plots = none data = predict; /* Data table predict is the default anyway */ title3 'Model H: hsgpa hscalc hsengl totscore: R-sq = 0.4532'; model grade = hsgpa hscalc hsengl totscore; output out = predataH predicted = Yhat L95 = lowpred U95 = hipred; /* Data table predataH has everything in predict plus Yhat and the lower and upper 95% predictoipn limits. */ proc print; title3 'Look at predictions for the replication sample'; var id sample grade2 Yhat lowpred hipred; where sample = 2; /* Should predicted marks be used to advise students? */ /* Does 95 Percent Prediction Interval really contain 95 percent of grades? Recall that the data fail all tests for normality, and the prediction intervals are based on normal theory. */ data predictB; set predataH; if (lowpred < grade2 < hipred) then ininterval='Yes'; else ininterval='No'; proc freq; title3 'Does 95 Percent Prediction Interval Work?'; tables sample * ininterval / nocol nopercent; /* Keep trying. Try to predict letter grade. */ data predictC; set predictB; if 80 <= grade2 <= 100 then lgrade = 'A'; else if 70 <= grade2 <= 79 then lgrade = 'B'; else if 60 <= grade2 <= 69 then lgrade = 'C'; else if 50 <= grade2 <= 59 then lgrade = 'D'; else if 0 <= grade2 <= 49 then lgrade = 'F'; label lgrade = 'Letter Grade'; pregrade = round(Yhat); if 80 <= pregrade <= 100 then prelgrade = 'A'; else if 70 <= pregrade <= 79 then prelgrade = 'B'; else if 60 <= pregrade <= 69 then prelgrade = 'C'; else if 50 <= pregrade <= 59 then prelgrade = 'D'; else if 0 <= pregrade <= 49 then prelgrade = 'F'; label prelgrade = 'Predicted Letter Grade'; proc freq; title3 'Accuracy of predicting Letter Grades From Model H'; tables sample*prelgrade*lgrade / nocol nopercent; /* Will yield separate table for each sample. */ /* Predict grade for a new student with hsgpa=80 hscalc=90 hsengl=70 totscore=15. For just a prediction (no interval), proc glm is easier. */ proc glm data = explor; model grade = hsgpa hscalc hsengl totscore; estimate 'New Student' intercept 1 hsgpa 80 hscalc 90 hsengl 70 totscore 15; /* Prediction for Y_{n+1} is the same as estimate of E[Y|X]. CI from proc glm is for E[Y|X]. PREDICTION interval for Y_{n+1} is wider. */ data student; hsgpa=80; hscalc=90; hsengl=70; totscore=15; id = -27; data together; set explore student; /* All variables not assigned will be missing for observation -27 */ proc reg plots = none; title3 'Model H: hsgpa hscalc hsengl totscore: R-sq = 0.4532'; model grade = hsgpa hscalc hsengl totscore; output out = guess predicted = PredictedY L95 = LowerLimit U95 = UpperLimit; data newguess; set guess; if id < 0; /* Discard all other cases */ proc print; title3 'hsgpa=80 hscalc=90 hsengl=70 totscore=15'; var predictedY LowerLimit UpperLimit;
/* training1.sas */ title 'Centering and Interactions (Customer Support Training)'; /* FIRST, DEMONSTRATE CENTERING THE SMART WAY, AND CREATION OF INTERACTION TERMS. */ data support; infile '/folders/myfolders/training.data' firstobs=2; input id Group Skill Performance; cskill = skill; label cskill = 'Skill Minus Mean'; /* Center it below in proc standard */ proc standard out=support2 mean=0; /* Could also say std=1 to standardize */ var cskill; proc means; var skill cskill; data support3; set support2; if group=. then g1=.; else if group=1 then g1=1; else g1=0; if group=. then g2=.; else if group=2 then g2=1; else g2=0; g1skill = g1*skill; g2skill = g2*skill; g1cskill = g1*cskill; g2cskill = g2*cskill; /* NOW VERIFY THAT CENTERING AFFECTS ONLY THE INTERCEPTS, AND THAT FOR AN EQUAL SLOPES MODEL, PROC GLM GIVES THE SAME RESULTS WHETHER THE COVARIATE IS CENTERED OR NOT. */ proc reg; title2 'Equal slopes uncentered with proc reg'; model performance = skill g1 g2; Group: test g1=g2=0; proc reg; title2 'Equal slopes Centered with proc reg'; model performance = cskill g1 g2; Group: test g1=g2=0; proc glm; title2 'Equal slopes uncentered with proc glm'; class group; model performance = skill group; lsmeans group / pdiff adjust=bon; proc glm; title2 'Equal slopes Centered with proc glm'; class group; model performance = cskill group; lsmeans group / pdiff adjust=bon; /* NOW THE MODEL INCLUDES AN INTERACTION, ALLOWING FOR UNEQUAL SLOPES. COMPARE RESULTS WITH THE COVARIATE CENTERED AND UNCENTERED. THE MORAL OF THE STORY IS THAT YOU CAN GET WHAT YOU WANT EITHER WAY IF YOU KNOW WHAT YOU'RE DOING, BUT IT CAN BE EASIER WITH CENTERED IVs. */ proc reg; title2 'Unequal Slopes Uncentered with proc reg'; model performance = skill g1 g2 g1skill g2skill; GroupAtZero: test g1=g2=0; Interaction: test g1skill=g2skill=0; GroupAtMean: test g1 + 75*g1skill = g2 + 75*g2skill = 0; Group1vs2AtMean: test g1 + 75*g1skill = g2 + 75*g2skill; proc reg; title2 'Unequal Slopes Centered with proc reg'; model performance = cskill g1 g2 g1cskill g2cskill; GroupAtMean: test g1=g2=0; Group1vs2AtMean: test g1=g2; Interaction: test g1cskill=g2cskill=0; /* HERE, WE SEE THAT WHEN PROC GLM TESTS A CATEGORICAL INDEPENDENT VARIABLE IN THE PRESENCE OF COVARIATES, IT IS TESTING FOR DIFFERENCES BETWEEN INTERCEPTS. THIS IS TRUE EVEN IF THE MODEL HAS AN INTERACTION BETWEEN THE CATEGORICAL VARIABLE AND THE COVARIATE(S). WHEN THE COVARIATES ARE UNCENTERED, THIS IS SELDOM WHAT YOU WANT, AND YOU NEED TO WATCH OUT. */ proc glm; title2 'Unequal slopes uncentered with proc glm'; class group; model performance = skill group skill*group; lsmeans group / pdiff adjust=bon; proc glm; title2 'Unequal slopes Centered with proc glm'; class group; model performance = cskill group cskill*group; lsmeans group / pdiff adjust=bon;
/* training2.sas */ options linesize=79 noovp formdlim=' '; title 'Customer Support Training'; /* THIS PROGRAM IS FOCUSED ON UNDERSTANDING THE DATA. training1.sas WAS MORE ABOUT STATISTICAL IDEAS AND SOME SAS DETAILS */ data support; infile 'training.data' firstobs=2; input id Group Skill Performance; cskill = skill; label cskill = 'Skill Minus Mean'; /* Center it below in proc standard */ proc standard out=support2 mean=0; /* Could also say std=1 to standardize */ var cskill; data support3; set support2; if group=. then g1=.; else if group=1 then g1=1; else g1=0; if group=. then g2=.; else if group=2 then g2=1; else g2=0; g1skill = g1*skill; g2skill = g2*skill; g1cskill = g1*cskill; g2cskill = g2*cskill; /* EQUAL REGRESSIONS SAYS THAT THE THREE REGRESSION LINES ARE RIGHT ON TOP OF EACH OTHER. THAT IS, THERE ARE NO DIFFERENCES AMONG TRAINING PROGRAMMES FOR ANY SKILL LEVEL. THIS IS THE NULL HYPOTHESIS FOR THE EQUAL SLOPES MODEL, TOO. */ proc reg data = support3; title2 'Test Equal Regressions (and equal slopes)'; model performance = skill g1 g2 g1skill g2skill; EqualRegressions: test g1=g2=g1skill=g2skill = 0; Interaction: test g1skill=g2skill = 0; proc iml; title2 'Proportion of remaining variation explained by'; title3 'Unequal regressions and Unequal slopes'; F = 41.74; s = 4; NminusP = 144; a1 = s*F/(NminusP + s*F); F = 4.33; s = 2; NminusP = 144; a2 = s*F/(NminusP + s*F); print "Unequal Regressions: " a1 ", Unequal Slopes: " a2; /* LOOK TO SEE WHAT'S GOING ON. A CONVENIENT WAY TO GET THE THREE REGRESSIONS IS TO JUST FIT THEM DIRECTLY. ONLY THE ESTIMATED REGRESSION COEFFICIENTS ARE WHAT WE WOULD GET FROM A SINGLE REGRESSION MODEL WITH PRODUCT TERMS. EVERYTHING ELSE IS "WRONG." */ proc sort; by group; proc reg; title2 'Separate regressions'; model performance = skill; by group; /* Data must be sorted by this varIale. */ /* LOOK AT A ROUGH PLOT OF THE REGRESSION LINES OVER THE RANGE OF THE DATA. OUTPUT FROM THE SEPARATE REGRESSIONS GIVES Group 1: Yhat = -23.09708 + 0.98349*X Group 2: Yhat = -12.16946 + 0.76137*X Group 3: Yhat = -1.79639 + 0.52395*X WHAT IS THE RANGE OF THE DATA? A REAL DATA ANALYSIS JOB WOULD START WITH DESCRIPTIVE STATISTICS AND YOU'D KNOW THIS ALREADY. */ proc univariate; var skill; /* CREATE A SAS DATA SET OF POINTS TO PLOT. THIS WOULD BE BETTER IN R. */ data pts; do skill = 60 to 95; group = 1; Yhat = -23.09708 + 0.98349*skill; output; group = 2; Yhat = -12.16946 + 0.76137*skill; output; group = 3; Yhat = -1.79639 + 0.52395*skill; output; end; label Yhat = 'Predicted Performance'; options pagesize=500; proc print; title2 'Look at the data set pts'; options pagesize=35; proc plot; title2 'Rough Plot of the Three Regression Lines'; plot Yhat * skill = group; /* Plotting symbol is the value of group. */ /* ARE THE DIFFERENCES BETWEEN TRAINING PROGRAMMES SIGNIFICANT EVEN AT LOW SKILL LEVELS? FROM PROC UNIVARIATE, MINIMUM IS 58 AND 25TH PERCENTILE IS 71. */ proc reg data = support3; title2 'Test group differences at lower skill levels'; model performance = skill g1 g2 g1skill g2skill; DiffAt58: test g1 + 58*g1skill = g2 + 58*g2skill = 0; DiffAt71: test g1 + 71*g1skill = g2 + 71*g2skill = 0; /* 67 is the 10th percentile. */ DiffAt67: test g1 + 67*g1skill = g2 + 67*g2skill = 0; /* 64 is the 5th percentile. */ DiffAt64: test g1 + 64*g1skill = g2 + 64*g2skill = 0; Group1vs2At64: test g1 + 64*g1skill = g2 + 64*g2skill; Group1vs3At64: test g1 + 64*g1skill = 0; Group2vs3At64: test g2 + 64*g2skill = 0; /* HOW MIGHT THESE RESULTS BE DESCRIBED IN PLAIN LANGUAGE? YOU DON'T HAVE TO (AND SHOULD NOT) SAY EVERYTHING. HERE'S A POSSIBILITY. "Average job performance depends on level of skill and technical knowledge prior to training. Naturally, those with higher prior levels of skill tend to perform better. Overall, average job performance was best for employees receiving Training Programme 1, followed by 2 and 3 in that order. The advantage of Programme 1 was greatest for those with higher levels of prior skill, but was still apparent for those with relatively low skill levels." A HIGH-RESOLUTION PLOT OF THE THREE REGRESSION LINES WOULD BE GOOD, EVEN FOR A NON-TECHNICAL AUDIENCE. THERE ARE SOME MORE INTERESTNG ISSUES THAT COULD BE EXPLORED WITH THESE DATA. ONE EXAMPLE IS TESTS OF PAIRWISE DIFFERENCES BETWEEN SLOPES. ANOTHER ONE IS TO LOCATE THE EXACT SKILL LEVEL AT WHICH GROUP DIFFERNCES BECOME CLEAR, TREATING ALL TESTS AS SCHEFFE FOLLOW-UPS TO THE INITIAL TEST OF EQUAL REGRESSIONS. IT'S LIKELY NOT GOING TO BE AT THE 5TH PERCENTILE, BECAUSE OF THE PENALTY FOR PROTECTING INFINITELY MANY TESTS AT A SINGLE JOINT SIGNIFICANCE LEVEL. */
/* MathLogReg1.sas */ %include '/folders/myfolders/441s16/Lecture/readmath2b.sas'; title2 'Logistic Regression with dummy variables on the Math data'; /* Recall definition of passed if (50<=mark<=100) then passed=1; else passed=0; And if course=4 then course2=.; else course2=course; if course2=. then c1=.; else if course2=1 then c1=1; else c1=0; if course2=. then c2=.; else if course2=2 then c2=1; else c2=0; if course2=. then c3=.; else if course2=3 then c3=1; else c3=0; label c1 = 'Catch-up' c2 = 'Mainstream' c3 = 'Elite'; */ proc freq; title3 'Check course2 and dummy vars -- and why so many no course?'; tables (course c1-c3) * course2 / norow nocol nopercent missing; proc freq; title3 'A few simple Chi-squared tests to predict passed'; tables (course2 sex ethnic tongue) * passed / nocol nopercent chisq; proc logistic; title3 'Course2 by passed with dummy vars: Compare LR Chisq = 34.4171'; model passed (event='Yes') = c1 c3; /* Mainstream is reference category */ Course1_vs_2: test c1=0; Course1_vs_3: test c1=c3; Course2_vs_3: test c3=0; /* A few details: The higher the minus 2 Log Likelihood, the lower the (estimated) maximum probability of observing these responses. It is a meaure of lack of model fit. The Akaike information criterion and Schwarz's Bayesian criterion both impose a further penalty for number of explanatory variables. Small is good. Association of Predicted Probabilities and Observed Responses: * Every case has Y=0 or Y=1. * Every case has a p-hat. * Pick a case with Y=0, and another case with Y=1. That's a pair. * If the case with Y=0 has a lower p-hat than the case with Y=1, the pair is concordant. */ proc iml; title3 'Estimate prob. of passing for for course=3: Compare 31/39 = 0.7949'; b0 = 0.4077; b1 = -1.4838; b2 = 0.9468; c1 = 0; c3=1; lcombo = b0 + b1*c1 + b2*c3; probpass = exp(lcombo) / (1+exp(lcombo)); print "Estimated probability of passing course 3 is " probpass; proc logistic; title3 'Use the Class statement'; class course2 / param=ref; /* This param option makes the ALPHABETICALLY last category (Mainstream) the reference category */ model passed (event='Yes') = course2; contrast 'Catch-up vs Mainstream' course2 1 0; contrast 'Elite vs Mainstream' course2 0 1; contrast 'Catch-up vs Elite' course2 1 -1; /* Contrast is a little tricky in proc logistic. It lets you specify a set of linear combinations (not necessarily contrasts) to test on the regression coefficients. It is essential to know exactly what the dummy variable coding scheme is. This can still be more convenient than defining your own dummy variables in the data step. */
/* MathLogReg2.sas */ %include '/folders/myfolders/441s16/Lecture/readmath2b.sas'; title2 'Predict Passing the course (Y-N) with Logistic Regression'; /* c1 = 'Catch-up' c2 = 'Mainstream' c3 = 'Elite' */ proc logistic; title3 'HS variables'; model passed (event='Yes') = hsgpa hscalc hsengl; /* Decision: Drop hsengl */ proc logistic; title3 'HS gpa and calc, course2 and diagnostic test'; class course2 / param=ref; model passed (event='Yes') = hsgpa hscalc course2 precalc calc; contrast 'Course' course2 1 0, course2 0 1; proc logistic; title3 'HS gpa and calc, precalc and total score'; model passed (event='Yes') = hsgpa hscalc precalc totscore; precalc_n_totscore: test precalc = totscore = 0; /* Decision: Keep precalc rather than totscore */ proc logistic; title3 'Try gender, ethnic and mother tongue controlling for good stuff'; class ethnic (param=ref ref='East Indian'); /* Specifying a reference category that's not the last value */ model passed (event='Yes') = hsgpa hscalc precalc ethnic gender mtongue; contrast 'Demographics' ethnic 1 0 0 0 0, ethnic 0 1 0 0 0, ethnic 0 0 1 0 0, ethnic 0 0 0 1 0, ethnic 0 0 0 0 1, gender 1, mtongue 1 / e; /* Display the effect matrix */ /* Decision: Forget about gender and ethnicity. */
/* MathLogReg4.sas */ %include '/folders/myfolders/441s16/Lecture/readmath2b.sas'; /* Creates data table mathex */ title2 'Logistic regression with more than 2 resp. categories'; /*************** Data step continues ************************/ hsutil = hsgpa+hscalc+hsengl; if hsutil = . then hsmiss=1; else hsmiss=0; label hsmiss = 'Missing Any High School Data'; format hsmiss ynfmt.; if (0<=mark<=49) then outcome = 'Fail'; else if (50<=mark<=100) then outcome = 'Pass'; else outcome = 'Gone'; /*************************************************************/ proc freq; tables outcome*passed / norow nocol nopercent missing; proc freq; title3 'One at a time cat IVs with proc freq'; tables (course2 sex ethnic tongue hsmiss) * outcome / nocol nopercent chisq; /* Multinomial Logit model is ln(pi1/pi3) = beta01 + beta11 x Fail vs. Pass ln(pi2/pi3) = beta02 + beta12 x Gone vs. Pass */ proc logistic data=mathex outest=ParmNames; title3 'Multinomial logit model with proc logistic'; model outcome (ref='Pass') = hsmiss / link = glogit; contrast 'HS Missing method 1' hsmiss 1; /* Find out the parameter names. */ proc transpose data=ParmNames; run; proc print noobs; run; proc logistic data = mathex; title3 'Multinomial logit model with proc logistic'; model outcome (ref='Pass') = hsmiss / link = glogit; contrast 'HS Missing method 1' hsmiss 1; HS_MissingMethod2: test hsmiss_Fail = hsmiss_Gone = 0; proc iml; title3 'Estimate Probabilities using output from proc catmod'; b01 = -1.3594; b11 = 0.6663; b02 = -0.8306; b12 = 1.2360; hsmiss = 0; L1 = b01 + b11*hsmiss; L2 = b02 + b12*hsmiss; denom = 1 + exp(L1) + exp(L2); Fail = exp(L1)/denom; Gone = exp(L2)/denom; Pass = 1/denom; print "No Missing HS Data:" Fail Gone Pass; hsmiss = 1; L1 = b01 + b11*hsmiss; L2 = b02 + b12*hsmiss; denom = 1 + exp(L1) + exp(L2); Fail = exp(L1)/denom; Gone = exp(L2)/denom; Pass = 1/denom; print "Yes Missing HS Data:" Fail Gone Pass; proc freq data = mathex; title3 'Hsmiss by outcome again for comparison'; tables hsmiss * outcome / nocol nopercent; /* Now seek a good predictive model */ proc logistic data = mathex; title3 'HS variables'; model outcome (ref='Pass') = hsgpa hscalc hsengl / link = glogit; /* Drop HS English */ proc logistic data = mathex; title3 'HS gpa and calc + course2 '; class course2 / param=ref; /* Last category is reference by default. */ model outcome (ref='Pass') = hsgpa hscalc course2 / link = glogit; /* Forget course2 */ proc logistic data = mathex; title3 'HS gpa and calc + diagnostic test'; model outcome (ref='Pass') = hsgpa hscalc precalc calc / link = glogit; /* Drop calc subtest, keep precalc */ proc logistic data = mathex; title3 'Try gender, ethnic and mother tongue controlling for good stuff'; class ethnic (param=ref ref='East Indian'); /* Specifying a reference category that's not the last value */ model outcome (ref='Pass') = hsgpa hscalc precalc ethnic gender mtongue / link = glogit; contrast 'Demographics' ethnic 1 0 0 0 0, ethnic 0 1 0 0 0, ethnic 0 0 1 0 0, ethnic 0 0 0 1 0, ethnic 0 0 0 0 1, gender 1, mtongue 1; contrast 'Ethnic and Gender' ethnic 1 0 0 0 0, ethnic 0 1 0 0 0, ethnic 0 0 1 0 0, ethnic 0 0 0 1 0, ethnic 0 0 0 0 1, gender 1; /* Mother tongue is significant. Still true when we drop ethnic and gender? */ proc logistic data = mathex; title3 'hsgpa hscalc precalc mtongue'; model outcome (ref='Pass') = hsgpa hscalc precalc mtongue / link = glogit; proc logistic data = mathex; title3 'hsgpa hscalc precalc mtongue'; model outcome (ref='Pass') = hsgpa hscalc precalc mtongue / link = glogit; /* Allowing for academic background, students whose first language is English are more likely to fail the course as opposed to passing, and less likely to disappear as opposed to passing. If this is replicated, it will be very interesting. Now explore in more detail. Recall the response categories are 1=Fail 2=Gone 3=Pass. We want to know whether failing is different from disappearing in terms of their relationship to the explanatory variables. We are getting advanced here. What is H0? Model (using b instead of beta) is ln(pi1/pi3) = b01 + b11 hsgpa + b21 hscalc + b31 precalc + b41 mtongue ln(pi2/pi3) = b02 + b12 hsgpa + b22 hscalc + b32 precalc + b42 mtongue The null hypothesis is b11=b12, b21=b22, b31=b32, b41=b42 Parameter names are easy to guess. */ proc logistic data = mathex; title3 'Different coefficients for Gone and Fail?'; model outcome (ref='Pass') = hsgpa hscalc precalc mtongue / link = glogit; DiffOverall: test hsgpa_Fail = hsgpa_Gone, hscalc_Fail = hscalc_Gone, precalc_Fail = precalc_Gone, mtongue_Fail = mtongue_Gone; Diff_hsgpa: test hsgpa_Fail = hsgpa_Gone; Diff_hscalc: test hscalc_Fail = hscalc_Gone; Diff_precalc: test precalc_Fail = precalc_Gone; Diff_mtongue: test mtongue_Fail = mtongue_Gone; run; /************************** Replication *********************** For interpretation, want to replicate 8 findings: Gone vs. Pass and Fail vs. Pass for each explanatory variable. ***************************************************************/ %include '/folders/myfolders/441s16/Lecture/readreplic.sas'; if (0<=mark<=49) then outcome = 'Fail'; else if (50<=mark<=100) then outcome = 'Pass'; else outcome = 'Gone'; proc logistic data = replic; /* That's the default anyway. */ title2 'Replicate hsgpa hscalc precalc calc mtongue 0.05/8 = .00625'; model outcome (ref='Pass') = hsgpa hscalc precalc mtongue / link = glogit; Diff_mtongue: test mtongue_Fail = mtongue_Gone; /* Final conclusions: Students with higher High School GPA were less likely to fail as opposed to passing and less likely to disappear as opposed to passing. Students with higher High School Calculus marks were less likely to disappear as opposed to passing. Students with higher scores on the pre-calculus portion of the diagnostic test were less likely to disappear as opposed to passing. There was no convincing evidence of a connection between Mother Tongue (English vs. Other) and outcome. */
/* potato.sas */ title 'Rotten Potatoes: Factorial ANOVA several different ways'; title2 'First, illustrate a 2-factor ANOVA'; proc format; value tfmt 1 = '10 Degrees' 2 = '16 Degrees'; value ofmt 1 = '2 percent' 2 = '6 percent' 3 = '10%'; data spud; infile '/folders/myfolders/potato.data.txt' firstobs=2; /* Skip the header */ input id bact temp oxygen rot; label bact = 'Bacteria Type' temp = 'Storage Temperature' oxygen = 'Oxygen level'; /* Cell means coding for all 6 treatment combinations of temperature and bacteria type */ if temp=1 and bact=1 then mu11=1; else mu11=0; if temp=1 and bact=2 then mu12=1; else mu12=0; if temp=1 and bact=3 then mu13=1; else mu13=0; if temp=2 and bact=1 then mu21=1; else mu21=0; if temp=2 and bact=2 then mu22=1; else mu22=0; if temp=2 and bact=3 then mu23=1; else mu23=0; combo = 10*temp+bact; /* 11 12 13 21 22 23 */ format temp tfmt.; format oxygen ofmt.; run; proc freq; tables temp*bact*oxygen / norow nocol nopercent; proc means; class bact temp; var rot; /* Better looking output from proc tabulate */ proc tabulate; class bact temp; var rot; table (temp all),(bact all) * (mean*rot); ods trace on; /* Writes names of output tables on the log file. */ proc glm; title3 'Standard 2-way ANOVA with proc glm'; class bact temp; model rot=temp|bact; /* Could have said bact temp bact*temp */ means temp|bact; run; /* Need run with ODS trace */ ods trace off; /* Proportion of remaining variation for the default tests a = sF/(sF + n-p) */ proc iml; title3 'Proportion of remaining variation'; NminusP = 53; /* From the overall F test */ s1 = 1; F1 = 38.61; /* Main effect for Temperature */ s2 = 2; F2 = 14.84; /* Main effect for Bacteria */ s3 = 2; F3 = 3.84; /* Interaction */ Temperature = s1*F1/(s1*F1 + NminusP); Bacteria = s2*F2/(s2*F2 + NminusP); Temp_by_Bact = s3*F3/(s3*F3 + NminusP); print Temperature Bacteria Temp_by_Bact; /* Automate the procedure with ODS. */ proc glm data = spud; title3 'Standard 2-way ANOVA with proc glm AGAIN'; class bact temp; model rot=temp|bact; /* Could have said bact temp bact*temp */ means temp|bact; /* ODS output table name = My data table name (Backwards)*/ ods output OverallANOVA = Overall ModelANOVA = TwoWay; run; /* Warning: The most recently created data dable is now TwoWay, not spud. */ proc print data=TwoWay; title3 'Look at an ODS output table'; proc iml; title3 'Automated proportion of remaining variation'; use overall; read all into anova1; /* "all" means all cases */ use TwoWay; read all into anova2; print "Take a look at the matrices"; print anova1; print anova2; print "Proportion of remaining variation"; NminusP = anova1[3,1]; s1 = anova2[4,2]; F1 = anova2[4,5]; /* Main effect for Temperature */ s2 = anova2[5,2]; F2 = anova2[5,5]; /* Main effect for Bacteria */ s3 = anova2[6,2]; F3 = anova2[6,5]; /* Interaction */ Temperature = s1*F1/(s1*F1 + NminusP); Bacteria = s2*F2/(s2*F2 + NminusP); Temp_by_Bact = s3*F3/(s3*F3 + NminusP); print Temperature Bacteria Temp_by_Bact; /* Now generate the tests for main effects and interaction using cell means coding. BACTERIA TYPE TEMP 1 2 3 Cool mu11 mu12 mu13 Warm mu21 mu22 mu23 */ /* The test statement of proc reg uses variable names to stand for the corresponding regression coefficients. By naming the effect cell mean coding dummy variables the same as the population cell means, I can just state the null hypothesis. Isn't this a cute SAS trick? */ proc reg data = spud plots = none; title3 'Using the proc reg test statement and cell means coding'; model rot = mu11--mu23 / noint; Overall: test mu11=mu12=mu13=mu21=mu22=mu23; Temperature: test mu11+mu12+mu13 = mu21+mu22+mu23; Bacteria: test mu11+mu21 = mu12+mu22 = mu13+mu23; Bact_by_Temp1: test mu11-mu21 = mu12-mu22 = mu13-mu23; Bact_by_Temp2: test mu12-mu11 = mu22-mu21, mu13-mu12 = mu23-mu22; /* Bact_by_Temp1 checks equality of temperature effects. Bact_by_Temp2 checks parallel line segments. They are equivalent. */ proc glm data = spud; title3 'Proc glm: Using contrasts on the combination variable'; class combo; /* 11 12 13 21 22 23 */ model rot=combo; contrast 'Main Effect for Temperature' combo 1 1 1 -1 -1 -1; contrast 'Main Effect for Bacteria' combo 1 -1 0 1 -1 0, combo 0 1 -1 0 1 -1; contrast 'Temperature by Bacteria Interaction' combo 1 -1 0 -1 1 0, combo 0 1 -1 0 -1 1; /* Illustrate effect coding */ data mashed; set spud; /* Effect coding, with interactions */ if bact = 1 then b1 = 1; else if bact = 2 then b1 = 0; else if bact = 3 then b1 = -1; if bact = 1 then b2 = 0; else if bact = 2 then b2 = 1; else if bact = 3 then b2 = -1; if temp = 1 then t = 1; else if temp = 2 then t = -1; /* Interaction terms */ tb1 = t*b1; tb2 = t*b2; proc reg plots = none; title3 'Effect coding'; model rot = b1 b2 t tb1 tb2; Temperature: test t=0; Bacteria: test b1=b2=0; Bact_by_Temp: test tb1=tb2=0; /* Do some exploration to follow up the interaction. The regression with cell means coding is easiest. The final product of several runs is shown below. For reference, here is the table of population means again. BACTERIA TYPE TEMP 1 2 3 Cool mu11 mu12 mu13 Warm mu21 mu22 mu23 */ proc reg plots = none; title3 'Further exploration using cell means coding'; model rot = mu11--mu23 / noint; /* Pairwise comparisons of marginal means for Bacteria Type */ Bact1vs2: test mu11+mu21=mu12+mu22; Bact1vs3: test mu11+mu21=mu13+mu23; Bact2vs3: test mu12+mu22=mu13+mu23; /* Effect of temperature for each bacteria type */ Temp_for_Bac1: test mu11=mu21; Temp_for_Bac2: test mu12=mu22; Temp_for_Bac3: test mu13=mu23; /* Effect of bacteria type for each temperature */ Bact_for_CoolTemp: test mu11=mu12=mu13; Bact_for_WarmTemp: test mu21=mu22=mu23; /* Pairwise comparisons of bacteria types at warm temperature */ Bact1vs2_for_WarmTemp: test mu21=mu22; Bact1vs3_for_WarmTemp: test mu21=mu23; Bact2vs3_for_WarmTemp: test mu22=mu23; /* We have done a lot of tests. Concerned about buildup of Type I error? We can make ALL the tests into Scheffe follow-ups of the initial test for equality of the 6 cell means. The Scheffe family includes all COLLECTIONS of contrasts, not just all contrasts. */ proc iml; title3 'Table of critical values for all possible Scheffe tests'; numdf = 5; /* Numerator degrees of freedom for initial test */ dendf =48; /* Denominator degrees of freedom for initial test */ alpha = 0.05; critval = finv(1-alpha,numdf,dendf); zero = {0 0}; S_table = repeat(zero,numdf,1); /* Make empty matrix */ /* Label the columns */ namz = {"Number of Contrasts in followup test" " Scheffe Critical Value"}; mattrib S_table colname=namz; do i = 1 to numdf; s_table(|i,1|) = i; s_table(|i,2|) = numdf/i * critval; end; reset noname; /* Makes output look nicer in this case */ print "Initial test has" numdf " and " dendf "degrees of freedom." "Using significance level alpha = " alpha; print s_table; /* The potato data are balanced; they have equal sample sizes. This yields the same F-tessts for Type I and Type III sums of squares in proc glm. Look at an unbalanced example. */ %include '/folders/myfolders/441s16/Lecture/readmath2b.sas'; /* Creates the data set mathex */ proc freq; title2 'An unbalanced example: The math data'; tables sex*course2 / nocol nopercent chisq; proc glm; title2 'An unbalanced example: The math data'; class course2 sex ; model hscalc = sex|course2; /* In Type III sums of squares, each effect is tested controlling for all the other effects. This is what we get from the standard regresssion tests. In Type I sums of squares, the numerator SS is improvement in explained sum of squares after allowing for all the preceding effects. But in every case the denominator is MSE from the full model. */ /* Now back to the potato data. t's actually a 3-factor design. First look at the cell means. */ proc tabulate data=spud; title2 'Oxygen by temperature by bactieria type'; class bact temp oxygen; var rot; table (oxygen all),(temp all),(bact all) * (mean*rot); proc glm data=spud; title2 'Three-way ANOVA with proc glm'; class oxygen bact temp; model rot=temp|bact|oxygen; means temp|bact; /* Based on the tests */
/************* classicalmixed.sas ********************* Three levels of factor A, four levels of B Both fixed Both random A fixed, B random B nested within A ***************************************************/ data mixedup; infile '/folders/myfolders/mixedup.data.txt'; input ID Y A B; proc glm plots=none; title 'Default proc glm: Both Fixed'; class A B ; model y = a | b; means a*b; proc glm plots=none; title 'Both Random with proc glm'; class A B ; model y = a | b; random a b a*b / test ; /* Have to specify interaction random too, and explicitly ask for tests */ proc glm plots=none; title 'A Fixed, B Random'; class A B ; model y = a | b; random b a*b / test; proc glm plots=none; title 'A fixed, B random and nested within A'; class A B ; model y = a b(a); random b(a) / test; proc mixed cl; /*Confidence Limits for variances (There are no tests) */ title 'A fixed, B random and nested within A'; title2 'Using proc mixed'; title3 'Compare F for A = 7.32, F for B(A) = 3.11, p = 0.0126'; class A B ; model y = a ; random b(a); proc glm plots=none; title 'Both random, B nested within A'; class A B ; model y = a b(a); random a b(a) / test; proc sort; by A B; /* Data must be sorted in order of nesting*/ proc nested; title 'Nested random effects with proc nested'; title2 'Compare F for A = 7.32, F for B(A) = 3.11'; class A B; var Y; proc mixed cl; title 'Pure nested with proc mixed'; class A B ; model y = ; random a b(a); /****** What happens when the data are unbalanced? ******/ proc glm plots=none; title 'UNBALANCED: Both random, B nested within A'; title2 'Using proc glm'; title3 'Compare F for A = 7.32, F for B(A) = 3.11'; where id ne 1; /* Throw out one case */ class A B ; model y = a b(a); random a b(a) / test; proc sort; by A B; /* Data must be sorted in order of nesting*/ proc nested; title 'UNBALANCED: Nested random effects with proc nested'; title2 'Proc glm gave F for A = 7.38, F for B(A) = 2.77'; where id ne 1; /* Throw out one case */ class A B; var Y; proc mixed cl; title 'UNBALANCED: Pure nested with proc mixed'; title2 'Proc nested estimated Var[A]=14.39, Var[B(A)]=6.25'; where id ne 1; /* Throw out one case */ class A B ; model y = ; random a b(a); /* Conclusion: Proc glm tries to fix the problem, proc nested gives up and proc mixed never cared in the first place. */
/* classicalgrapefruit.sas */ title 'Grapefruit Data'; data fruit; /* Skip the header */ infile '/folders/myfolders/unigrapefruit.data.txt' firstobs=2; input store price sales; proc glm; title2 'Classical mixed model repeated measures'; class price store; model sales = price store; means price; random store / test; /* Follow up with matched t-tests, but they require a multivariate format. Read the data in again, with multiple lines of data per case. */ data mvfruit; infile '/folders/myfolders/unigrapefruit.data.txt' firstobs=2; input store1 price1 sales1 store2 price2 sales2 store3 price3 sales3; One_vs_Two = sales1-sales2; One_vs_Three = sales1-sales3; Two_vs_Three = sales2-sales3; proc means n mean stddev t probt; title2 'Matched t-tests: 0.05/3 = 0.0167'; var One_vs_Two One_vs_Three Two_vs_Three;
/* SleepTheHardWay.sas */ title "Student's Sleep data with a classical mixed model"; data bedtime; infile '/folders/myfolders/studentsleep.data.txt' firstobs=2; /* Skip the header */ input patient xsleep1 xsleep2; sleepdif = xsleep2-xsleep1; proc print; title2 'Data in multivariate format'; proc means n mean stddev t probt; title2 'Matched t-test'; var xsleep1 xsleep2 sleepdif; /* Make a data table in univariate format */ data unisleep; set bedtime; /* output writes a case */ Subject=patient; Drug=1; ExtraSleep=xsleep1; output; Subject=patient; Drug=2; ExtraSleep=xsleep2; output; keep Subject Drug ExtraSleep; proc print; title2 'Data in univariate format'; proc glm plots=none; title2 'Repeated measures ANOVA: Compare t = 4.06, p = 0.0028'; class Subject Drug; model ExtraSleep = Subject Drug; random Subject / test; /* So F = t-squared except for rounding error. */
/******************** ClassicalNoise.sas ************************** * Participants listened to political discussions under different * * levels of background noise. "Discrimination score" is a measure * * of how well they could tell what was being said. * ******************************************************************/ title 'Noise data'; proc format; value sexfmt 0 = 'Male' 1 = 'Female' ; data loud; infile '/folders/myfolders/noise.data.txt'; /* Univariate data read */ input ident interest sex age noise time discrim ; format sex sexfmt.; label interest = 'Interest in topic (politics)' time = 'Order of presenting noise level'; proc freq; tables sex*age*noise / norow nocol nopercent; proc glm; title2 'Classical mixed model repeated measures'; class ident age sex noise; model discrim = age|sex|noise ident(age*sex); random ident(age*sex) / test; means noise age; /* Expected mean squares show that not all the hypotheses are testable with the classical mixed model approach. SAS is buying testability of the hypotheses by assuming that you're only interested in an effect if all the higher-order interactions involving that effect are absent. Also, we can't use the covariate and it's not clear how to do multiple comparisons. */
/******************** RepeatedNoise.sas *********************** * Participants listened to political discussions under different * * levels of background noise. "Discrimination score" is a measure * * of how well they could tell what was being said. * ******************************************************************/ title 'Noise data'; proc format; value sexfmt 0 = 'Male' 1 = 'Female' ; data loud; infile '/folders/myfolders/noise.data.txt'; /* Univariate data read */ input ident interest sex age noise time discrim ; format sex sexfmt.; label interest = 'Interest in topic (politics)' time = 'Order of presenting noise level'; proc freq; tables sex*age*noise / norow nocol nopercent; proc glm; title2 'Classical mixed model repeated measures'; class ident age sex noise; model discrim = age|sex|noise ident(age*sex); random ident(age*sex) / test; means noise age; /* Expected mean squares show that not all the hypotheses are testable with the classical mixed model approach. SAS is buying testability of the hypotheses by assuming that you're only interested in an effect if all the higher-order interactions involving that effect are absent. Also, we can't use the covariate and it's not clear how to do multiple comparisons. */ proc mixed; title2 'Covariance structure (cs) with proc mixed'; title3 'Replicate the classical tests'; class age sex noise; model discrim = age|sex|noise; repeated / type=cs subject=ident; proc mixed; title2 'Include covariate, multiple comparisons, contrasts'; class age sex noise; model discrim = interest age|sex|noise / solution; /* solution shows the beta-hat values (interest) */ repeated / type=cs subject=ident r; lsmeans age / pdiff tdiff adjust = bon; lsmeans noise / pdiff tdiff adjust = bon; contrast 'Noise Linear' noise 1 -2 1 0 0, noise 0 1 -2 1 0, noise 0 0 1 -2 1; /* In the test for departure from linearity, mu1-mu2 = mu2-mu3 => 1 -2 1 0 0 mu2-mu3 = mu3-mu4 => 0 1 -2 1 0 mu3-mu4 = mu4-mu5 => 0 0 1 -2 1 */ /* Compare results with unknown covariance structure */ proc mixed cl; title2 'Unknown covariance structure'; class age sex noise; model discrim = interest age|sex|noise; repeated / type=un subject=ident r; lsmeans age / pdiff tdiff adjust = bon; lsmeans noise / pdiff tdiff adjust = bon; contrast 'Noise Linear' noise 1 -2 1 0 0, noise 0 1 -2 1 0, noise 0 0 1 -2 1; /* Test for difference between covariance structures using a full versus reduced model approach. Difference between Likelihood Ratio chisquares is chi-squared, with df = difference of degrees of freedom. H0 is compound symmetry, alternative is unstructured. Any model for the covariance matrix is reduced compared to the unknown structure. */ proc iml; title2 'Test for difference between covariance structures'; title3 'Compound symmetry versus unknown'; ChiSquared = 51.44-42.90; df = 14-1; pvalue = 1-probchi(ChiSquared,df); print ChiSquared df pvalue;
/* RepeatedMonkey.sas */ title 'Primate hippocampal function: Zola-Morgan and Squire, 1990'; title2 'Covariance Structure approach to repeated measures (within-cases)'; data memory; infile '/folders/myfolders/monkey.data.txt' firstobs=2; input monkey $ treatment $ week2 week4 week8 week12 week16; /* Make 5 "cases" in the data set for each line in the raw data file. The output command generates a case. */ id = _n_; time = 2; score = week2; output; time = 4; score = week4; output; time = 8; score = week8; output; time = 12; score = week12; output; time = 16; score = week16; output; keep monkey treatment id time score; proc print; proc freq; tables time*treatment /norow nocol nopercent; tables monkey; proc mixed cl; title3 'Unstructured Covariance Matrix'; class treatment time; model score = treatment|time; repeated / type=un subject=id r; /* Could have used subject=monkey, but then monkey must be declared in class because it's character-valued. */ /* We did not reject the null hypothesis that observations from the same subject are independent. Try an ordinary 2-way ANOVA, mostly for the interaction plot. */ proc glm; title3 'Ordinary 2-way'; class time treatment; /* This order determines the plot. */ model score = treatment|time; /* Could justify compound symmetry: Each monkey brings her own special talent for discrimination learning. */ proc mixed cl; title3 'Compound Symmetry'; class treatment time; model score = treatment|time; repeated / type=cs subject=id r; /* Contrast statements to follow up the interaction are possible but challenging. */ data unimemory; infile '/folders/myfolders/monkey.data.txt' firstobs=2; input monkey $ treatment $ week2 week4 week8 week12 week16; twovs16 = week2-week16; /* Could do all 15 pairwise differences*/ proc glm; class treatment; model twovs16=treatment; proc glm data=memory; title3 'Classical (Same results as CS)'; class treatment time monkey; model score = treatment|time monkey(treatment); random monkey(treatment) / test;
/* RepeatedMantids.sas */ title 'Mantids Data: Thanks to Stephanie Hill'; proc format; value sexfmt 0 = 'Male' 1 = 'Female'; value dfmt 1 = '1=8cm' 2 = '2=13cm' 3 = '3=18cm'; value birdfmt 1='Vinny' 2='Chickpea' 3='Emily' 4='Bramble'; data bugs; /* Univariate Data Read. Skip the first line. */ infile '/folders/myfolders/mantids.data.txt' firstobs=2; input id Sex Order Predator Distance Calls; format sex sexfmt. distance dfmt. Predator birdfmt. ; proc freq; tables sex*predator*distance / norow nocol nopercent missing; proc mixed; title2 'Predator is a fixed effect'; class Sex Predator Distance; model calls = Sex|Predator|Distance; repeated / type=un subject=id; lsmeans sex distance Predator*Distance; proc mixed cl; /* Confidence Limits for random effects */ title2 'Predator is a random effect'; class Sex Predator Distance; model calls = Sex|Distance; random Predator Predator*Sex Predator*Distance Predator*Sex*Distance; repeated / type=cs subject=id; proc mixed cl; title2 'No interactions of fixed effects factors with Predator'; class Sex Predator Distance; model calls = Sex|Distance; random Predator; repeated / type=cs subject=id; lsmeans sex; lsmeans distance / pdiff tdiff adjust = bon;
/* ReadNasal.sas: Read and re-format. This is like Nasalance1.sas but without all the proc prints. Use with %include '/folders/myfolders/441s16/Lecture/ReadNasal.sas'; */ title "Gillian DeBoer's Nasalance Data"; proc import datafile="/folders/myfolders/NasalanceTest.xls" out=wide dbms=xls replace; getnames=yes; /* Search and replace NA with 999 in the spreadsheet, because proc transpose used below will omit any columns with missing values (columns become rows in the transpose). This is critical. */ proc transpose data=wide out=long1 prefix=Subject; /* Labelling variables */ id participant; /* in the transposed data set. */ data long2; set long1; Feedback = SubjectSubheading_dail_rep; /* More infomative name */ Condition = _name_; /* Useful name of experimental condition */ Time = _n_; array Sub Subject4 -- Subject14; array id[10] ( 4 5 6 7 8 9 10 12 13 14 ); /* Optional */ j = 0; /* Give consecutive numbers to subjects */ do over Sub; j = j + 1; Participant = id[j]; /* Optional */ Nasalance = Sub[j]; if Nasalance=999 then Nasalance=.; /* Fix the missing values. */ Subject = j; output; /* Create a new case. */ end; /* Reorder the variables. This method seems a bit strange to me. */ data long2; retain Participant Condition Feedback Nasalance Time Subject; set long2; /* Keep only the variables we want. Order below does not matter.*/ keep Participant Condition Feedback Nasalance Time Subject; run; proc sort data = long2 out=long3; by Subject; /* Could sort by Time as well, but data are already in time order. */
/* Nasalance.sas */ %include '/folders/myfolders/441s16/Lecture/ReadNasal.sas'; /* Variables in long3 are Participant Condition Feedback Nasalance Time Subject */ proc print data=long3(obs=190); /* Just the first 190 lines */ proc freq data=long3; tables participant feedback; proc means data=long3; class Feedback; var Nasalance; output out=meanz mean=AveNasalance; proc print data = meanz; proc sgplot data=meanz; title2 'Mean Nasalance as a Function of Feedback'; scatter x=Feedback y=AveNasalance; proc reg data=long3 plots=none; title2 'Naive regression but with Durbin-Watson'; model Nasalance = Feedback / dw; output out = longer residual = resid rstudent = delstud; /* Deleted Studentized Residual */ proc arima data=longer; title2 'Time series diagnostics on the residals'; identify var=resid; proc mixed cl data=long3; title2 'CS directly with proc mixed'; /* class Feedback; */ model Nasalance = Feedback; repeated / type=cs subject=Participant; proc mixed cl data=long3; title2 'CS random shock per subject with proc mixed'; class Participant; model Nasalance = Feedback; random Participant; /* The numbers match up PERFECTLY, except of course there is no LR chi-square. */ proc mixed cl data=long3; title2 'Just AR(1) with proc mixed'; model Nasalance = Feedback; repeated / type=ar(1) subject=Participant; proc mixed cl data=long3; title2 'Random shock from subject and AR(1) for errors'; class Participant; model Nasalance = Feedback / solution; /* Solution gives beta-hat */ random Participant; repeated / type=ar(1) subject=Participant;
/* Lynch1.sas */ title 'Hovland and Sears Lynching data'; data lynch; infile '/folders/myfolders/Lynch.data.txt' firstobs=7; /* Skipping the header */ input Year Ayres Cotton1 Cotton2 Blynch Totlynch; label ayres = 'Ayres'' composite economic index' cotton1 = 'Per-acre value of cotton' cotton2 = 'Farm value of cotton' blynch = 'Negro lynchings' totlynch = 'Total lynchings'; olynch = totlynch-blynch; label olynch = 'Non-Black lynchings'; one=1; /* Used below -- one "subject." */ proc sgplot; title2 'Lynchings and Cotton Prices over Time'; series x=year y=cotton2; series x=year y=totlynch / y2axis; proc means; var ayres -- olynch; proc corr; var year -- olynch; proc reg plots=none; title2 'Naive regression, but request Durbin-Watson statistic'; title3 'Lower Durbin-Watson critical value = 1.5'; model totlynch = year cotton2 / dw; output out = Dixie residual = epsilonhat; /* Save residuals */ proc arima; title2 'Autocorrelations and partial autocorrelations'; identify var = epsilonhat; /* That looks like an AR(1) */ /* Apparently SAS university Edition does not include proc autoreg, so this does not work. ********************************************************* proc autoreg; title2 'Fit AR(1) with proc autoreg'; model totlynch = year cotton2 / nlag=1 method=ml; *********************************************************/ proc mixed; title2 'Fit AR(1) with proc mixed'; model totlynch = year cotton2 / solution; repeated / type=ar(1) subject=one; /* Test for difference between AR(1) and ARMA(1,1), using a likelihood ratio chi-squared test. AR(1) is the reduced model. Fit the models with maximum likelihood. */ proc mixed method=ml; title2 'Reduced model is AR(1)'; model totlynch = year cotton2 / solution; repeated / type=ar(1) subject=one; proc mixed method=ml; title2 'Full model is Autoregressive Moving Average: ARMA(1,1)'; model totlynch = year cotton2 / solution; repeated / type=arma(1,1) subject=one; proc iml; title2 'Difference between -2 Log Likelihood values is chi-squared'; Chisquare = 443.7-441.5; df=1; pvalue = 1-probchi(Chisquare,df); print Chisquare df pvalue;
/********************* tuberead.sas ***********************/ title 'Fungus Tube data'; /* Data definition file */ data mould; infile '/folders/myfolders/tubes.data.txt' firstobs=2; input line1 mcg replic1 day1 amlng1 amscl1 amlead1 pmlng1 pmscl1 pmlead1 line2 mcg2 replic2 day2 amlng2 amscl2 amlead2 pmlng2 pmscl2 pmlead2 line3 mcg3 replic3 day3 amlng3 amscl3 amlead3 pmlng3 pmscl3 pmlead3 line4 mcg4 replic4 day4 amlng4 amscl4 amlead4 pmlng4 pmscl4 pmlead4 line5 mcg5 replic5 day5 amlng5 amscl5 amlead5 pmlng5 pmscl5 pmlead5 line6 mcg6 replic6 day6 amlng6 amscl6 amlead6 pmlng6 pmscl6 pmlead6 line7 mcg7 replic7 day7 amlng7 amscl7 amlead7 pmlng7 pmscl7 pmlead7 line8 mcg8 replic8 day8 amlng8 amscl8 amlead8 pmlng8 pmscl8 pmlead8 line9 mcg9 replic9 day9 amlng9 amscl9 amlead9 pmlng9 pmscl9 pmlead9 line10 mcg10 replic10 day10 amlng10 amscl10 amlead10 pmlng10 pmscl10 pmlead10 line11 mcg11 replic11 day11 amlng11 amscl11 amlead11 pmlng11 pmscl11 pmlead11 line12 mcg12 replic12 day12 amlng12 amscl12 amlead12 pmlng12 pmscl12 pmlead12 line13 mcg13 replic13 day13 amlng13 amscl13 amlead13 pmlng13 pmscl13 pmlead13 line14 mcg14 replic14 day14 amlng14 amscl14 amlead14 pmlng14 pmscl14 pmlead14 amslope pmslope weight; rate=(amslope+pmslope)/2; label mcg = 'Mycelial Compatibility Group'; label weight = 'Sclerotial Weight'; label rate = 'Regression Growth Rate'; /* Average of am & pm slope */ /***** Average morning and evening observations ***/ array am{28} amlng1-amlng14 amscl1-amscl14; array pm{28} pmlng1-pmlng14 pmscl1-pmscl14; array aver{28} length1-length14 sclr1-sclr14; do i=1 to 28; /* Length and sclerotia at the same time */ aver{i}=(am{i}+pm{i})/2; end;
************************ tubeclean.sas *******************************/ /* Data cleaning for TUBES Data: There is no point in doing the right */ /* statistical analysis on data that are full of errors. */ /**********************************************************************/ title2 'Data cleaning for tubes data'; %include 'tuberead.sas'; /* More data step */ /* Error check variables (internal consistency) */ /* MCG must be the same on each line */ if mcg ne mcg2 then mcger1=line1; if mcg2 ne mcg3 then mcger2=line2; if mcg3 ne mcg4 then mcger3=line3; if mcg4 ne mcg5 then mcger4=line4; if mcg5 ne mcg6 then mcger5=line5; if mcg6 ne mcg7 then mcger6=line6; if mcg7 ne mcg8 then mcger7=line7; if mcg8 ne mcg9 then mcger8=line8; if mcg9 ne mcg10 then mcger9=line9; if mcg10 ne mcg11 then mcger10=line10; if mcg11 ne mcg12 then mcger11=line11; if mcg12 ne mcg13 then mcger12=line12; if mcg13 ne mcg14 then mcger13=line13; /* REPLIC must be the same on each line */ if replic1 ne replic2 then replicer1=line1; if replic2 ne replic3 then replicer2=line2; if replic3 ne replic4 then replicer3=line3; if replic4 ne replic5 then replicer4=line4; if replic5 ne replic6 then replicer5=line5; if replic6 ne replic7 then replicer6=line6; if replic7 ne replic8 then replicer7=line7; if replic8 ne replic9 then replicer8=line8; if replic9 ne replic10 then replicer9=line9; if replic10 ne replic11 then replicer10=line10; if replic11 ne replic12 then replicer11=line11; if replic12 ne replic13 then replicer12=line12; if replic13 ne replic14 then replicer13=line13; /* Increase in length and number of sclerotia from am to pm */ array diff{28} ldiff1-ldiff14 sdiff1-sdiff14; do i=1 to 28; diff{i}=pm{i}-am{i}; /* am and pm are defined in tuberead */ end; proc freq; title3 'Frequency distributions'; tables line1-line14 mcg mcg2-mcg14 replic1-replic14 day1-day14 mcger1-mcger13 replicer1-replicer13; proc means n mean min max; title3 'Means of quantitative variables'; var amlng1-amlng14 pmlng1-pmlng14 length1-length14 amscl1-amscl14 pmscl1-pmscl14 sclr1-sclr14 amslope pmslope rate weight; proc freq; title3 'Look at am to pm change each day'; tables ldiff1-ldiff14 sdiff1-sdiff14; /* At this point it looks like length10 is the primary DV. Data set is small, so look at the whole thing */ proc sort; by mcg length10; proc print; var line1 mcg length10 sclr10 weight rate; /* Notes: Commented out Freq dist for line1: entries must all be odd, but I see a line 238. Checking the raw data file, see -- ahha! two line 227s. But the rest of the data look okay. This is just cosmetic. Forget it. Looking at proc means: The last day with no missing observations for am length is day 11, but max is 31.2. These are 30cm race tubes, so we'd better stick with day 10. pmlng11 has a max length of 32.8. It's growing beyond the end of the tube. Stick to day 10. It looks like they recorded missing values instead of zero for sclerotia. I could fix this, but I'm not sure I need to. if amscl1=. then amscl1=0; if pmscl1=. then pmscl1=0; if amscl2=. then amscl2=0; if pmscl2=. then pmscl2=0; Looking at difference variables. This is a careful lab study, but still there is measurement error. Especially look at the -17 for sdiff12. We could track it down and hide it, which is something a Biologist might do. But to a statistician, everything has a piece of random error attached. It's a fact of life. So we model it or live with it. In this case, we live with it. At this point I'm looking at length10 as my primary dependent variable. There is a good justification, but I don't want to type it now. Other good DVs are sclr10, weight and rate. Linda wanted: data fixed; set mould; if line1 ne 113;
/********************* uvtubes.sas ***********************/ title 'Fungus Tube data in univariate format'; data mould1; infile '/folders/myfolders/tubes.data.txt' firstobs=2 missover; /* missover makes data omitted at the end of a line missing. */ input line mcg rep day AML AMS AMld PML PMS PMld AMslp PMslp SWeight; label mcg = 'Fungus Type' rep = 'Replication (1-4)' AML = 'AM Length' AMS = 'AM Number of Sclerotia' AMld = 'AM Leading Edge' PML = 'PM Length' PMS = 'PM Number of Sclerotia' PMld = 'PM Leading Edge' AMslp = 'AM Least-squares Slope' PMslp = 'PM Least-squares Slope' SWeight = 'Slerotial Weight'; Tube = 1000*rep + mcg; /* 4-digit tube identifier */ Length = (AML+PML)/2; Sclerotia = (AMS+PMS)/2; /* Zero sclerotia were recorded as missing. */ if Sclerotia=. then Sclerotia=0; Slope = (AMslp+PMslp)/2; if Tube ne 1213; /* Getting rid of the outlier, based on science. */ /* Fungus grew past the end of some tubes after day 10 so it was really a 10-day experiment, but day 14 has slopes and weight of sclerotia. These variables will still be available in mould1, with one non-missing value for each tube. */ data mould2; set mould1; if day < 11; proc print; var Tube mcg rep day Length Sclerotia; proc freq; title2 'Sample sizes'; tables day*mcg / norow nocol nopercent missing; proc tabulate; title2 'Mean Length'; class day mcg; var Length; table (day all),(mcg all) * (mean*Length); proc glm; title2 'Proc glm just for the mean plot'; class day mcg; model Length = day|mcg; proc mixed cl; title2 'Unstructured Covariance Matrix'; class day mcg; model Length = day|mcg; repeated / type=un subject=tube; lsmeans mcg / pdiff adjust=bon; /* Test linear growth starting with day 2. H0 is mu1 mu2 mu3 mu4 mu5 mu6 mu7 mu8 mu9 mu10 mu2-mu3 = mu3-mu4 <=> 0 1 -2 1 0 0 0 0 0 0 mu3-mu4 = mu4-mu5 <=> 0 0 1 -2 1 0 0 0 0 0 mu4-mu5 = mu5-mu6 <=> 0 0 0 1 -2 1 0 0 0 0 mu5-mu6 = mu6-mu7 <=> 0 0 0 0 1 -2 1 0 0 0 mu6-mu7 = mu7-mu8 <=> 0 0 0 0 0 1 -2 1 0 0 mu7-mu8 = mu8-mu9 <=> 0 0 0 0 0 0 1 -2 1 0 mu8-mu9 = mu9-mu10 <=> 0 0 0 0 0 0 0 1 -2 1 */ contrast 'LinearAfter2' day 0 1 -2 1 0 0 0 0 0 0, day 0 0 1 -2 1 0 0 0 0 0, day 0 0 0 1 -2 1 0 0 0 0, day 0 0 0 0 1 -2 1 0 0 0, day 0 0 0 0 0 1 -2 1 0 0, day 0 0 0 0 0 0 1 -2 1 0, day 0 0 0 0 0 0 0 1 -2 1; /* Finally, test whether an ar(1) is good enough */ proc mixed method=ml; title2 'Full model for Sigma is unstructured'; class day mcg; model Length = day|mcg; repeated / type=un subject=tube; proc mixed method=ml; title2 'Reduced model for Sigma is AR(1)'; class day mcg; model Length = day|mcg; repeated / type=ar(1) subject=tube; proc iml; title2 'Difference between chi-squared values is chi-squared'; Chisquare = 382.99-219.86; df=54-1; pvalue = 1-probchi(Chisquare,df); print Chisquare df pvalue;