/* mathlogreg2.sas */
/*
   Model building for a binary response: predict whether a student passed
   the course (passed = 'Yes') with logistic regression.

   Outline of the script:
     1. Hand-guided model selection (course, high-school variables,
        diagnostic test subscales, demographics).
     2. Automatic stepwise selection, for comparison.
     3. A look at whether missingness on the dropped variables is
        itself predictive.
     4. Fit the current model and save the estimated probabilities (pihat).

   NOTE(review): the %include below presumably creates the data set
   "explore" and the format ynfmt. -- neither is defined in this file;
   confirm against ReadLabelMath2.sas.
*/

%include '/home/u1407221/441s24/SAS08/ReadLabelMath2.sas';

title2 'Predict Passing the course (Y-N) with Logistic Regression';

/* We know course is useful:
       c1 = 'Catch-up'   c2 = 'Mainstream'   c3 = 'Elite'
   c2 is left out of the model below.                              */

proc logistic data = explore;
     title3 'Course and HS variables';
     model passed (event='Yes') = c1 c3 hsgpa hscalc hsengl;
     /* Labelled Wald tests of several coefficients at once */
     course: test c1=c3=0;
     HSvars: test hsgpa=hscalc=hsengl=0;
run;

/* Decision: Drop course */

/* Limit the output. An ODS SELECT without the PERSIST option applies
   only to the next procedure step, so it is repeated before each step
   that needs trimmed output.                                          */
ods select ParameterEstimates;
proc logistic data = explore;
     title3 'Just HS variables';
     model passed (event='Yes') = hsgpa hscalc hsengl;
run;

/* Decision: Drop HS English. Does the diagnostic test add anything? */

ods select ParameterEstimates;
proc logistic data = explore;
     title3 'HS GPA, HS Calculus and Diagnostic Test';
     model passed (event='Yes') = hsgpa hscalc calc precalc;
run;

/* Decision: Drop the calc subscale, but which is better,
   precalc or total score? */

ods select ParameterEstimates TestStmts; /* I ran a trace to find out the name */
proc logistic data = explore;
     /* Title made distinct from the previous step, which fits a
        different model (calc and precalc rather than precalc and
        totscore).                                                  */
     title3 'HS GPA, HS Calculus, Pre-calculus and Total Score';
     model passed (event='Yes') = hsgpa hscalc precalc totscore;
     precalc_n_totscore: test precalc = totscore = 0;
run;

/* Decision: Keep precalc rather than totscore. Confirm */

ods select ParameterEstimates;
proc logistic data = explore;
     title3 'HS GPA, HS Calculus and Pre-calculus test';
     model passed (event='Yes') = hsgpa hscalc precalc;
run;

proc logistic data = explore;
     title3 'Try gender, ethnic and mother tongue controlling for good stuff';
     /* Specifying a reference category that's not the last value */
     class ethnic (param=ref ref='East Indian');
     model passed (event='Yes') = hsgpa hscalc precalc ethnic gender mtongue;
     /* One row per coefficient being tested: five rows for the ethnic
        indicator variables, then gender and mtongue.                  */
     contrast 'Demographics' ethnic 1 0 0 0 0,
                             ethnic 0 1 0 0 0,
                             ethnic 0 0 1 0 0,
                             ethnic 0 0 0 1 0,
                             ethnic 0 0 0 0 1,
                             gender 1,
                             mtongue 1 / e; /* Display the effect matrix */
run;

/* Decision: Forget about ethnicity. */

ods select ParameterEstimates;
proc logistic data = explore;
     title3 'HS GPA, HS Calculus, Pre-calculus test, Gender and Mother tongue';
     model passed (event='Yes') = hsgpa hscalc precalc gender mtongue;
run;

/* Decision: Drop Gender and Mother tongue too. My model now has just
   HS GPA, HS Calculus and Pre-calculus test. */

proc logistic data = explore;
     title3 'Try automatic (stepwise) selection';
     model passed (event='Yes') = gender mtongue e1-e6 hsgpa hscalc hsengl
                                  c1-c3 precalc calc totscore
           / selection = stepwise slentry = 0.05 slstay = 0.05 ;
           /* Default slentry = slstay = 0.15 */
run;

/* Note 211 observations lost to missingness for stepwise, compared to
   204 for the earlier model with hsgpa, hscalc and precalc. */

/* Perhaps missingness on the variables we dropped could be useful. */

/* data= is given explicitly so this step does not depend on whichever
   data set happened to be created last (the _LAST_ default).          */
proc freq data = explore;
     title2 'Explore missingness on omitted variables';
     tables gender mtongue ethnic;
     tables gender*mtongue / norow nocol nopercent missing;
     tables gender*course2 / norow nocol nopercent missing;
run;

/* Add 0/1 indicators for missing gender and missing course. */
data explore2;
     set explore;
     if gender = . then sexmiss = 1; else sexmiss=0; /* Includes mtongue */
     if course = . then coursemiss = 1; else coursemiss=0;
     format sexmiss coursemiss ynfmt.;
     label sexmiss    = 'Gender and mother tongue missing'
           coursemiss = 'Course missing';
run; /* Explicit step boundary; previously the next PROC ended this step. */

/* Checks are commented out
proc freq;
     tables gender*sexmiss / norow nocol nopercent missing;
     tables course*coursemiss / norow nocol nopercent missing;
     tables sexmiss*coursemiss / norow nocol nopercent missing chisq;
*/

proc logistic data = explore2;
     title3 'Try adding missingness on gender/mtongue and course';
     model passed (event='Yes') = hsgpa hscalc precalc sexmiss coursemiss;
run;

/* All the cases with course missing were deleted because of missingness
   on other variables. */

proc logistic data = explore2;
     /* Title corrected: this model has sexmiss only, not coursemiss. */
     title3 'Try adding missingness on gender/mtongue only';
     model passed (event='Yes') = hsgpa hscalc precalc sexmiss;
run;

/* Here's the current model. */

/* Restore the study-level title2. Otherwise the final model and the
   listing below would be printed under the PROC FREQ heading
   'Explore missingness on omitted variables'.                        */
title2 'Predict Passing the course (Y-N) with Logistic Regression';

proc logistic data = explore;
     title3 'HS GPA, HS Calculus and Pre-calculus test';
     model passed (event='Yes') = hsgpa hscalc precalc;
     /* Save the input data plus the estimated probability pihat. */
     output out=explore3 prob=pihat;
run;

proc print data=explore3 (obs=13); /* List only the first 13 observations */
     var hsgpa hscalc precalc pihat passed;
run;

/* Based on invariance and the Law of Total Probability (double
   expectation), I predict that the mean pihat will be 234/375 = 0.624,
   the proportion of students with non-missing data who passed. */

proc univariate normal plot data=explore3;
     title2 'Explore the distribution of estimated probabilities';
     where pihat ne .;
     var pihat; /* Should have n=375 non-missing. */
run;

/* Goal: Develop a prediction model that uses all the data and makes a
   prediction for every case. Base on estimated probabilities. */