/* MathLogReg4.sas */
/* Multinomial (generalized logit) regression of course outcome
   (Fail / Gone / Pass) on high school and diagnostic-test variables,
   using proc catmod, followed by a replication on a second data set.
   Both data sets are read by the %include files, whose data steps are
   left open so that this file can add variables to them.              */

%include '/folders/myfolders/441s16/Lecture/readmath2b.sas';
title2 'Logistic regression with more than 2 resp. categories using proc catmod';

/*************** Data step continues ************************/
/* hsutil is missing if any one of the three HS variables is missing. */
hsutil = hsgpa+hscalc+hsengl;
if hsutil = . then hsmiss=1;
else hsmiss=0;
label hsmiss = 'Missing Any High School Data';
format hsmiss ynfmt.;

/* Three-category response. A missing mark fails both range checks and
   is coded 'Gone'.
   NOTE(review): a fractional mark strictly between 49 and 50 would also
   be coded 'Gone' -- presumably marks are whole numbers; confirm.      */
if (0<=mark<=49) then outcome = 'Fail';
else if (50<=mark<=100) then outcome = 'Pass';
else outcome = 'Gone';
run;
/*************************************************************/

/* Check the new response variable against the existing binary one. */
proc freq;
     tables outcome*passed / norow nocol nopercent missing;
run;

proc freq;
     title3 'One at a time cat IVs with proc freq';
     tables (course2 sex ethnic tongue hsmiss) * outcome
            / nocol nopercent chisq;
run;

proc logistic descending order=internal;
     title3 'Simple logistic regression: Reproduce this';
     model passed = hsgpa;
run;

proc catmod;
     title3 'Hsgpa Reproduce b1 = 0.2089, Wald Chisq = 76.5326';
     direct hsgpa; /* Direct means no dummy vars please */
     model passed = hsgpa / noprofile;
     /* Always suppress the profile when there are quantitative
        explanatory variables. */
     /* The last response category (Y=1) is the reference (denominator)
        category so the sign of the regression coefficient is reversed,
        but we can live with this. */
run;
quit;

proc catmod;
     title3 'Hsmiss by outcome';
     direct hsmiss; /* It's already a dummy variable. */
     model outcome = hsmiss;
     contrast 'HS Missing method 1' hsmiss 1;
     contrast 'HS Missing method 2' all_parms 0 0 1 0,
                                    all_parms 0 0 0 1;
     /* Model is
            ln(pi1/pi3) = beta01 + beta11 x     Fail vs. Pass
            ln(pi2/pi3) = beta02 + beta12 x     Gone vs. Pass

        all_parms reads down the columns

                    beta01 beta02 beta11 beta12
        all_parms      0      0      1      0,
                       0      0      0      1

        Specifying 2 linear combinations of the betas equal to zero,
        so there is no relationship with x.
     */
run;
quit;

/* The b values below are hard-coded estimates copied from the
   'Hsmiss by outcome' proc catmod output; Pass is the reference
   category, so its linear predictor is zero.                     */
proc iml;
     title3 'Estimate Probabilities using output from proc catmod';
     b01 = -1.3594; b11 = 0.6663;
     b02 = -0.8306; b12 = 1.2360;

     hsmiss = 0;
     L1 = b01 + b11*hsmiss;
     L2 = b02 + b12*hsmiss;
     denom = 1 + exp(L1) + exp(L2);
     Fail = exp(L1)/denom;
     Gone = exp(L2)/denom;
     Pass = 1/denom;
     print "No Missing HS Data:" Fail Gone Pass;

     hsmiss = 1;
     L1 = b01 + b11*hsmiss;
     L2 = b02 + b12*hsmiss;
     denom = 1 + exp(L1) + exp(L2);
     Fail = exp(L1)/denom;
     Gone = exp(L2)/denom;
     Pass = 1/denom;
     print "Yes Missing HS Data:" Fail Gone Pass;
quit;

proc freq;
     title3 'Hsmiss by outcome again for comparison';
     tables hsmiss * outcome / nocol nopercent;
run;

/* Now seek a good predictive model */

proc catmod;
     title3 'HS variables';
     direct hsgpa hscalc hsengl;
     model outcome = hsgpa hscalc hsengl / noprofile;
run;
quit;

/* Drop HS English */

proc catmod;
     title3 'HS gpa and calc + course2 ';
     direct hsgpa hscalc;
     model outcome = hsgpa hscalc course2 / noprofile;
     /* Dummy vars for course2 use effect coding */
run;
quit;

/* Forget course2 */

proc catmod;
     title3 'HS gpa and calc + diagnostic test';
     direct hsgpa hscalc precalc calc;
     model outcome = hsgpa hscalc precalc calc / noprofile;
run;
quit;

/* Drop calc subtest, keep precalc */

proc catmod;
     title3 'Try gender, ethnic and mother tongue controlling for good stuff';
     /* ethnic is not listed in direct, so catmod builds its own
        dummy variables for it; calc is no longer in the model.   */
     direct hsgpa hscalc precalc gender mtongue;
     model outcome = hsgpa hscalc precalc
                     ethnic gender mtongue / noprofile;
     contrast 'Demographics'
              ethnic 1 0 0 0 0,
              ethnic 0 1 0 0 0,
              ethnic 0 0 1 0 0,
              ethnic 0 0 0 1 0,
              ethnic 0 0 0 0 1,
              gender 1,
              mtongue 1;
     contrast 'Ethnic and Gender'
              ethnic 1 0 0 0 0,
              ethnic 0 1 0 0 0,
              ethnic 0 0 1 0 0,
              ethnic 0 0 0 1 0,
              ethnic 0 0 0 0 1,
              gender 1;
     /* Got this in the log file: "WARNING: The formatted values of one
        or more variables are truncated to 16" */
run;
quit;

/* Mother tongue is significant. Still true when we drop ethnic and
   gender?
*/

proc catmod;
     title3 'hsgpa hscalc precalc mtongue';
     direct hsgpa hscalc precalc mtongue;
     model outcome = hsgpa hscalc precalc mtongue / noprofile;
run;
quit;

/* Allowing for academic background, students whose first language is
   English are more likely to fail the course as opposed to passing,
   and less likely to disappear as opposed to passing. If this is
   replicated, it will be very interesting.

   Now explore in more detail. Recall the response categories are
   1=Fail 2=Gone 3=Pass. We want to know whether failing is different
   from disappearing in terms of their relationship to the explanatory
   variables. We are getting advanced here. What is H0?

   Model (using b instead of beta) is

   ln(pi1/pi3) = b01 + b11 hsgpa + b21 hscalc + b31 precalc + b41 mtongue
   ln(pi2/pi3) = b02 + b12 hsgpa + b22 hscalc + b32 precalc + b42 mtongue

   The null hypothesis is b11=b12, b21=b22, b31=b32, b41=b42

   all_parms reads down the columns, so

   beta = b01 b02 b11 b12 b21 b22 b31 b32 b41 b42

   And H0 says that 4 linear combinations of the betas equal zero:

   b01 b02 b11 b12 b21 b22 b31 b32 b41 b42
    0   0   1  -1   0   0   0   0   0   0
    0   0   0   0   1  -1   0   0   0   0
    0   0   0   0   0   0   1  -1   0   0
    0   0   0   0   0   0   0   0   1  -1
*/

proc catmod;
     title3 'Different coefficients for Gone and Fail?';
     direct hsgpa hscalc precalc mtongue;
     model outcome = hsgpa hscalc precalc mtongue / noprofile;
     contrast 'Diff Relationships Overall'
              all_parms 0 0 1 -1 0 0 0 0 0 0,
              all_parms 0 0 0 0 1 -1 0 0 0 0,
              all_parms 0 0 0 0 0 0 1 -1 0 0,
              all_parms 0 0 0 0 0 0 0 0 1 -1;
     contrast 'Diff Relationships for hsgpa'
              all_parms 0 0 1 -1 0 0 0 0 0 0;
     contrast 'Diff Relationships for hscalc'
              all_parms 0 0 0 0 1 -1 0 0 0 0;
     contrast 'Diff Relationships for precalc'
              all_parms 0 0 0 0 0 0 1 -1 0 0;
     contrast 'Diff Relationships for mtongue'
              all_parms 0 0 0 0 0 0 0 0 1 -1;
run;
quit;

/************************** Replication ***********************
   For interpretation, want to replicate 8 findings: Gone vs. Pass
   and Fail vs. Pass for each explanatory variable.
***************************************************************/

%include '/folders/myfolders/441s16/Lecture/readreplic.sas';
/* Data step continues: same response coding as the exploratory data. */
if (0<=mark<=49) then outcome = 'Fail';
else if (50<=mark<=100) then outcome = 'Pass';
else outcome = 'Gone';
run;

proc catmod data=replic; /* That's the default anyway */
     title3 'Replicate hsgpa hscalc precalc mtongue 0.05/8 = .00625';
     direct hsgpa hscalc precalc mtongue;
     model outcome = hsgpa hscalc precalc mtongue / noprofile;
     contrast 'Diff Relationships for mtongue'
              all_parms 0 0 0 0 0 0 0 0 1 -1;
run;
quit;

/* Final conclusions:

   Students with higher High School GPA were more likely to pass as
   opposed to failing and more likely to pass as opposed to
   disappearing.

   Students with higher High School Calculus marks were more likely to
   pass as opposed to disappearing.

   Students with higher scores on the pre-calculus portion of the
   diagnostic test were more likely to pass as opposed to disappearing.

   There was no convincing evidence of a connection between Mother
   Tongue (English vs. Other) and outcome.
*/