/* MathReg3.sas */ %include '/home/u1407221/441s24/SAS08/ReadLabelMath2.sas'; title2 'Replication and Prediction'; /* Plan: 1. Non-obvious findings from the exploratory analysis (based on Model I, which predicts grade from hsgpa hscalc hsengl totscore mtongue) were a. HS Engl negative b. mtongue negative c. totscore positive (diagnostic test matters controlling for HS) Test these on the replication with a Bonferroni correction for 3 tests. The other two results (HS GPA and HS Calculus) were obvious. 2. See if prediction intervals work as advertised on replication data. 3. Try prediction of letter grade. 4. Try predictions of grade for some imaginary students. /* Test the three findings: Point 1 above */ proc reg data = replic plots = none; title3 'Try to replicate HS Engl neg, mtongue neg, totscore pos'; title4 'with a Bonferroni correction (check p < 0.05/3 = 0.01666667)'; model grade = hsgpa hscalc hsengl totscore mtongue; /* Point 2: Look at prediction intervals */ data predict; set math; /* Combined data set */ keeper = grade+hsgpa+hscalc+hsengl+totscore+mtongue; /* keeper will be missing if any of the vars are missing */ if keeper ne .; /* Discards all other cases */ grade2 = grade; /* Save value of grade for future use */ if sample=2 then grade=. ; /* Response variable is now missing for replication sample. But it is preserved in grade2 */ proc reg plots=none data=predict; /* Data table predict is the default anyway */ title3 'Re-running Model I to generate y-hat and prediction intervals'; model grade = hsgpa hscalc hsengl totscore mtongue; output out = predataI predicted = Yhat L95 = lowpred U95 = hipred; /* Data set predataI has everything in predict plus Yhat and the lower and upper 95% prediction limits. */ /* Does 95 Percent Prediction Interval really contain 95 percent of grades? Recall that the data fail all tests for normality, and the prediction intervals are based on normal theory. */ data predictB; set predataI; if (lowpred < grade2 < hipred) then ininterval='Yes'; else ininterval='No'; proc freq data=predictB; title3 'Does 95 Percent Prediction Interval Work?'; tables sample * ininterval / nocol nopercent; proc print data=predataI; title3 'Look at predictions for the replication sample'; var id sample grade2 Yhat lowpred hipred; where sample = 2; /* Should predicted marks be used to advise students? */ /* Keep trying. Try to predict letter grade. */ data predictC; set predictB; if 80 <= grade2 <= 100 then lgrade = 'A'; else if 70 <= grade2 <= 79 then lgrade = 'B'; else if 60 <= grade2 <= 69 then lgrade = 'C'; else if 50 <= grade2 <= 59 then lgrade = 'D'; else if 0 <= grade2 <= 49 then lgrade = 'F'; label lgrade = 'Letter Grade'; pregrade = round(Yhat); if 80 <= pregrade <= 100 then prelgrade = 'A'; else if 70 <= pregrade <= 79 then prelgrade = 'B'; else if 60 <= pregrade <= 69 then prelgrade = 'C'; else if 50 <= pregrade <= 59 then prelgrade = 'D'; else if 0 <= pregrade <= 49 then prelgrade = 'F'; label prelgrade = 'Predicted Letter Grade'; proc freq; title3 'Accuracy of predicting Letter Grades From Model I'; tables sample*prelgrade*lgrade / nocol nopercent; /* Will yield separate table for each sample. */ /* Predict grade for a new student with hsgpa=80 hscalc=90 hsengl=70 totscore=15. For just a prediction (no interval), proc glm is easier. */ proc glm data = explore; model grade = hsgpa hscalc hsengl mtongue totscore; estimate 'New Student 1' intercept 1 hsgpa 80 hscalc 90 hsengl 70 mtongue 1 totscore 15; estimate 'New Student 2' intercept 1 hsgpa 80 hscalc 90 hsengl 0 mtongue 1 totscore 15; /* Prediction for Y_{n+1} is the same as estimate of E[Y|X]. CI from proc glm is for E[Y|X]. PREDICTION interval for Y_{n+1} is wider. */ data student; hsgpa=80; hscalc=90; hsengl=70; mtongue=1; totscore=15; id = -1; output; hsgpa=80; hscalc=90; hsengl=0; mtongue=1; totscore=15; id = -2; output; proc print; data together; set explore student; /* All variables not assigned will be missing for new observations */ proc reg noprint data=together; title3 'Fit Model I to predict new student data'; model grade = hsgpa hscalc hsengl mtongue totscore; output out = guess predicted = PredictedY L95 = LowerLimit U95 = UpperLimit; data newguess; set guess; if id < 0; /* Discard all other cases */ proc print; title3 'Prediction intervals for new students'; var id hsgpa hscalc hsengl totscore predictedY LowerLimit UpperLimit; quit;