/* MathReg3.sas */ %include '/folders/myfolders/441s16/Lecture/readexplor.sas'; /* Creates data table explore */ %include '/folders/myfolders/441s16/Lecture/readreplic.sas'; /* Creates data table replic */ title2 'Predict Grade for Replication Sample'; /* Plan: 1. Non-obvious findings from the exploration (based on Model I, which predicts grade from hsgpa hscalc hsengl totscore mtongue) were a. HS Engl neg b. mtongue neg c. totscore positive (diagnostic test matters) Test these on the replication with a Bonferroni correction for 3 tests. The other two results (HS GPA and HS Calculus) were obvious. 2. See if prediction intervals work as advertised for Model H, which predicts grade from hsgpa hscalc hsengl totscore. 3. Compare prediction of letter grade for the models with and without the diagnostic test. First, just illustrate use of different data tables in the same run. */ proc freq data = explore; title3 'Exploratory Sample'; tables outcome; proc freq data = replic; title3 'Replication Sample'; tables outcome; /* Now test the three findings: Point 1 above */ proc reg data = replic plots = none; title3 'Try to replicate HS Engl neg, mtongue neg, totscore pos'; title4 'with a Bonferroni correction (check p < 0.05/3 = 0.01666667)'; model grade = hsgpa hscalc hsengl totscore mtongue; /* Make combined data table, look at prediction intervals: Point 2 */ data predict; set explore replic; keeper = grade+hsgpa+hscalc+hsengl+totscore; /* keeper will be missing if any of the vars are missing */ if keeper ne .; /* Discards all other cases */ grade2 = grade; /* Save value of grade for future use */ if sample=2 then grade=. ; /* Response variable is now missing for replication sample. But it is preserved in grade2 */ proc reg plots = none data = predict; /* Data table predict is the default anyway */ title3 'Model H: hsgpa hscalc hsengl totscore: R-sq = 0.4532'; model grade = hsgpa hscalc hsengl totscore; output out = predataH predicted = Yhat L95 = lowpred U95 = hipred; /* Data table predataH has everything in predict plus Yhat and the lower and upper 95% predictoipn limits. */ proc print; title3 'Look at predictions for the replication sample'; var id sample grade2 Yhat lowpred hipred; where sample = 2; /* Should predicted marks be used to advise students? */ /* Does 95 Percent Prediction Interval really contain 95 percent of grades? Recall that the data fail all tests for normality, and the prediction intervals are based on normal theory. */ data predictB; set predataH; if (lowpred < grade2 < hipred) then ininterval='Yes'; else ininterval='No'; proc freq; title3 'Does 95 Percent Prediction Interval Work?'; tables sample * ininterval / nocol nopercent; /* Keep trying. Try to predict letter grade. */ data predictC; set predictB; if 80 <= grade2 <= 100 then lgrade = 'A'; else if 70 <= grade2 <= 79 then lgrade = 'B'; else if 60 <= grade2 <= 69 then lgrade = 'C'; else if 50 <= grade2 <= 59 then lgrade = 'D'; else if 0 <= grade2 <= 49 then lgrade = 'F'; label lgrade = 'Letter Grade'; pregrade = round(Yhat); if 80 <= pregrade <= 100 then prelgrade = 'A'; else if 70 <= pregrade <= 79 then prelgrade = 'B'; else if 60 <= pregrade <= 69 then prelgrade = 'C'; else if 50 <= pregrade <= 59 then prelgrade = 'D'; else if 0 <= pregrade <= 49 then prelgrade = 'F'; label prelgrade = 'Predicted Letter Grade'; proc freq; title3 'Accuracy of predicting Letter Grades From Model H'; tables sample*prelgrade*lgrade / nocol nopercent; /* Will yield separate table for each sample. */ /* Predict grade for a new student with hsgpa=80 hscalc=90 hsengl=70 totscore=15. For just a prediction (no interval), proc glm is easier. */ proc glm data = explor; model grade = hsgpa hscalc hsengl totscore; estimate 'New Student' intercept 1 hsgpa 80 hscalc 90 hsengl 70 totscore 15; /* Prediction for Y_{n+1} is the same as estimate of E[Y|X]. CI from proc glm is for E[Y|X]. PREDICTION interval for Y_{n+1} is wider. */ data student; hsgpa=80; hscalc=90; hsengl=70; totscore=15; id = -27; data together; set explore student; /* All variables not assigned will be missing for observation -27 */ proc reg plots = none; title3 'Model H: hsgpa hscalc hsengl totscore: R-sq = 0.4532'; model grade = hsgpa hscalc hsengl totscore; output out = guess predicted = PredictedY L95 = LowerLimit U95 = UpperLimit; data newguess; set guess; if id < 0; /* Discard all other cases */ proc print; title3 'hsgpa=80 hscalc=90 hsengl=70 totscore=15'; var predictedY LowerLimit UpperLimit;