1 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK;
55
56 /* MathReg3.sas */
57 %include '/folders/myfolders/441s16/Lecture/readexplor.sas';
NOTE: Format YNFMT is already on the library WORK.FORMATS.
NOTE: Format YNFMT has been output.
NOTE: Format CRSFMT is already on the library WORK.FORMATS.
NOTE: Format CRSFMT has been output.
NOTE: Format NFMT is already on the library WORK.FORMATS.
NOTE: Format NFMT has been output.
NOTE: PROCEDURE FORMAT used (Total process time):
real time 0.00 seconds
cpu time 0.00 seconds
174 /* Creates data table explore */
175 %include '/folders/myfolders/441s16/Lecture/readreplic.sas';
NOTE: The infile '/folders/myfolders/exploremath.data.txt' is:
Filename=/folders/myfolders/exploremath.data.txt,
Owner Name=root,Group Name=vboxsf,
Access Permission=-rwxrwx---,
Last Modified=18Jan2016:17:34:49,
File Size (bytes)=44583
NOTE: 579 records were read from the infile '/folders/myfolders/exploremath.data.txt'.
The minimum record length was 75.
The maximum record length was 75.
NOTE: Missing values were generated as a result of performing an operation on missing values.
Each place is given by: (Number of times) at (Line):(Column).
99 at 83:24 99 at 119:18
NOTE: The data set WORK.EXPLORE has 579 observations and 35 variables.
NOTE: DATA statement used (Total process time):
real time 0.02 seconds
cpu time 0.01 seconds
NOTE: Format YNFMT is already on the library WORK.FORMATS.
NOTE: Format YNFMT has been output.
NOTE: Format CRSFMT is already on the library WORK.FORMATS.
NOTE: Format CRSFMT has been output.
NOTE: Format NFMT is already on the library WORK.FORMATS.
NOTE: Format NFMT has been output.
NOTE: PROCEDURE FORMAT used (Total process time):
real time 0.00 seconds
cpu time 0.00 seconds
288 /* Creates data table replic */
289 title2 'Predict Grade for Replication Sample';
290
291 /* Plan:
292
293 1. Non-obvious findings from the exploration (based on Model I,
294 which predicts grade from hsgpa hscalc hsengl totscore mtongue) were
295 a. HS Engl neg
296 b. mtongue neg
297 c. totscore positive (diagnostic test matters)
298 Test these on the replication with a Bonferroni correction for 3 tests.
299 The other two results (HS GPA and HS Calculus) were obvious.
300
301 2. See if prediction intervals work as advertised for Model H, which
302 predicts grade from hsgpa hscalc hsengl totscore.
303
304 3. Compare prediction of letter grade for the models with and without
305 the diagnostic test.
306
307
308 First, just illustrate use of different data tables in the same run. */
309
NOTE: The infile '/folders/myfolders/replicmath2.data.txt' is:
Filename=/folders/myfolders/replicmath2.data.txt,
Owner Name=root,Group Name=vboxsf,
Access Permission=-rwxrwx---,
Last Modified=30Jan2016:15:16:16,
File Size (bytes)=38214
NOTE: 579 records were read from the infile '/folders/myfolders/replicmath2.data.txt'.
The minimum record length was 64.
The maximum record length was 64.
NOTE: Missing values were generated as a result of performing an operation on missing values.
Each place is given by: (Number of times) at (Line):(Column).
81 at 203:24 81 at 239:18
NOTE: The data set WORK.REPLIC has 579 observations and 35 variables.
NOTE: DATA statement used (Total process time):
real time 0.01 seconds
cpu time 0.01 seconds
310 proc freq data = explore;
311 title3 'Exploratory Sample';
312 tables outcome;
NOTE: There were 579 observations read from the data set WORK.EXPLORE.
NOTE: PROCEDURE FREQ used (Total process time):
real time 0.04 seconds
cpu time 0.05 seconds
313 proc freq data = replic;
314 title3 'Replication Sample';
315 tables outcome;
316
317 /* Now test the three findings: Point 1 above */
318
NOTE: There were 579 observations read from the data set WORK.REPLIC.
NOTE: PROCEDURE FREQ used (Total process time):
real time 0.02 seconds
cpu time 0.02 seconds
319 proc reg data = replic plots = none;
320 title3 'Try to replicate HS Engl neg, mtongue neg, totscore pos';
321 title4 'with a Bonferroni correction (check p < 0.05/3 = 0.01666667)';
322 model grade = hsgpa hscalc hsengl totscore mtongue;
323
324 /* Make combined data table, look at prediction intervals: Point 2 */
325
NOTE: PROCEDURE REG used (Total process time):
real time 0.07 seconds
cpu time 0.06 seconds
326 data predict;
327 set explore replic;
328 keeper = grade+hsgpa+hscalc+hsengl+totscore;
329 /* keeper will be missing if any of the vars are missing */
330 if keeper ne .; /* Discards all other cases */
331 grade2 = grade; /* Save value of grade for future use */
332 if sample=2 then grade=. ;
333 /* Response variable is now missing for replication sample.
334 But it is preserved in grade2 */
335
NOTE: Missing values were generated as a result of performing an operation on missing values.
Each place is given by: (Number of times) at (Line):(Column).
486 at 328:21 21 at 328:27 4 at 328:34 65 at 328:41
NOTE: There were 579 observations read from the data set WORK.EXPLORE.
NOTE: There were 579 observations read from the data set WORK.REPLIC.
NOTE: The data set WORK.PREDICT has 582 observations and 37 variables.
NOTE: DATA statement used (Total process time):
real time 0.01 seconds
cpu time 0.01 seconds
336 proc reg plots = none data = predict;
337 /* Data table predict is the default anyway */
338 title3 'Model H: hsgpa hscalc hsengl totscore: R-sq = 0.4532';
339 model grade = hsgpa hscalc hsengl totscore;
340 output out = predataH predicted = Yhat
341 L95 = lowpred
342 U95 = hipred;
343 /* Data table predataH has everything in predict plus
344 Yhat and the lower and upper 95% prediction limits. */
345
NOTE: The data set WORK.PREDATAH has 582 observations and 40 variables.
NOTE: PROCEDURE REG used (Total process time):
real time 0.06 seconds
cpu time 0.06 seconds
346 proc print;
347 title3 'Look at predictions for the replication sample';
348 var id sample grade2 Yhat lowpred hipred;
349 where sample = 2;
350 /* Should predicted marks be used to advise students? */
351
352 /* Does 95 Percent Prediction Interval really contain 95 percent of grades?
353 Recall that the data fail all tests for normality, and the prediction
354 intervals are based on normal theory. */
355
NOTE: There were 293 observations read from the data set WORK.PREDATAH.
WHERE sample=2;
NOTE: PROCEDURE PRINT used (Total process time):
real time 0.48 seconds
cpu time 0.48 seconds
356 data predictB;
357 set predataH;
358 if (lowpred < grade2 < hipred) then ininterval='Yes';
359 else ininterval='No';
360
NOTE: There were 582 observations read from the data set WORK.PREDATAH.
NOTE: The data set WORK.PREDICTB has 582 observations and 41 variables.
NOTE: DATA statement used (Total process time):
real time 0.00 seconds
cpu time 0.01 seconds
361 proc freq;
362 title3 'Does 95 Percent Prediction Interval Work?';
363 tables sample * ininterval / nocol nopercent;
364
365 /* Keep trying. Try to predict letter grade. */
366
NOTE: There were 582 observations read from the data set WORK.PREDICTB.
NOTE: PROCEDURE FREQ used (Total process time):
real time 0.03 seconds
cpu time 0.03 seconds
367 data predictC;
368 set predictB;
369 if 80 <= grade2 <= 100 then lgrade = 'A';
370 else if 70 <= grade2 <= 79 then lgrade = 'B';
371 else if 60 <= grade2 <= 69 then lgrade = 'C';
372 else if 50 <= grade2 <= 59 then lgrade = 'D';
373 else if 0 <= grade2 <= 49 then lgrade = 'F';
374 label lgrade = 'Letter Grade';
375 pregrade = round(Yhat);
376 if 80 <= pregrade <= 100 then prelgrade = 'A';
377 else if 70 <= pregrade <= 79 then prelgrade = 'B';
378 else if 60 <= pregrade <= 69 then prelgrade = 'C';
379 else if 50 <= pregrade <= 59 then prelgrade = 'D';
380 else if 0 <= pregrade <= 49 then prelgrade = 'F';
381 label prelgrade = 'Predicted Letter Grade';
382
NOTE: There were 582 observations read from the data set WORK.PREDICTB.
NOTE: The data set WORK.PREDICTC has 582 observations and 44 variables.
NOTE: DATA statement used (Total process time):
real time 0.01 seconds
cpu time 0.01 seconds
383 proc freq;
384 title3 'Accuracy of predicting Letter Grades From Model H';
385 tables sample*prelgrade*lgrade / nocol nopercent;
386 /* Will yield separate table for each sample. */
387
388
389 /* Predict grade for a new student with hsgpa=80 hscalc=90 hsengl=70
390 totscore=15. For just a prediction (no interval), proc glm is easier. */
391
NOTE: There were 582 observations read from the data set WORK.PREDICTC.
NOTE: PROCEDURE FREQ used (Total process time):
real time 0.08 seconds
cpu time 0.08 seconds
392 proc glm data = explore;
393 model grade = hsgpa hscalc hsengl totscore;
394 estimate 'New Student' intercept 1 hsgpa 80 hscalc 90 hsengl 70
395 totscore 15;
396
397 /* Prediction for Y_{n+1} is the same as estimate of E[Y|X]. CI from proc glm
398 is for E[Y|X]. PREDICTION interval for Y_{n+1} is wider. */
399
NOTE: PROCEDURE GLM used (Total process time):
real time 0.08 seconds
cpu time 0.08 seconds
400 data student;
401 hsgpa=80; hscalc=90; hsengl=70; totscore=15; id = -27;
402
NOTE: The data set WORK.STUDENT has 1 observations and 5 variables.
NOTE: DATA statement used (Total process time):
real time 0.00 seconds
cpu time 0.02 seconds
403 data together;
404 set explore student;
405 /* All variables not assigned will be missing for the new observation (id = -27) */
406
NOTE: There were 579 observations read from the data set WORK.EXPLORE.
NOTE: There were 1 observations read from the data set WORK.STUDENT.
NOTE: The data set WORK.TOGETHER has 580 observations and 35 variables.
NOTE: DATA statement used (Total process time):
real time 0.00 seconds
cpu time 0.00 seconds
407 proc reg plots = none;
408 title3 'Model H: hsgpa hscalc hsengl totscore: R-sq = 0.4532';
409 model grade = hsgpa hscalc hsengl totscore;
410 output out = guess predicted = PredictedY
411 L95 = LowerLimit
412 U95 = UpperLimit;
413
NOTE: The data set WORK.GUESS has 580 observations and 38 variables.
NOTE: PROCEDURE REG used (Total process time):
real time 0.06 seconds
cpu time 0.05 seconds
414 data newguess;
415 set guess;
416 if id < 0; /* Discard all other cases */
417
NOTE: There were 580 observations read from the data set WORK.GUESS.
NOTE: The data set WORK.NEWGUESS has 1 observations and 38 variables.
NOTE: DATA statement used (Total process time):
real time 0.01 seconds
cpu time 0.01 seconds
418 proc print;
419 title3 'hsgpa=80 hscalc=90 hsengl=70 totscore=15';
420 var predictedY LowerLimit UpperLimit;
421
422
423
424
425
426
427
428
429
430
431 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK;
443