1 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK;
55
56 /* MathLogReg4.sas */
57 %include '/folders/myfolders/441s16/Lecture/readmath2b.sas';
NOTE: Format YNFMT is already on the library WORK.FORMATS.
NOTE: Format YNFMT has been output.
NOTE: Format CRSFMT is already on the library WORK.FORMATS.
NOTE: Format CRSFMT has been output.
NOTE: Format NFMT is already on the library WORK.FORMATS.
NOTE: Format NFMT has been output.
NOTE: PROCEDURE FORMAT used (Total process time):
real time 0.00 seconds
cpu time 0.00 seconds
166 /* Creates data table mathex */
167 title2 'Logistic regression with more than 2 resp. categories';
168 /*************** Data step continues ************************/
169 hsutil = hsgpa+hscalc+hsengl;
170 if hsutil = . then hsmiss=1; else hsmiss=0;
171 label hsmiss = 'Missing Any High School Data'; format hsmiss ynfmt.;
172
173 if (0<=mark<=49) then outcome = 'Fail';
174 else if (50<=mark<=100) then outcome = 'Pass';
175 else outcome = 'Gone';
176 /*************************************************************/
177
NOTE: The infile '/folders/myfolders/exploremath.data.txt' is:
Filename=/folders/myfolders/exploremath.data.txt,
Owner Name=root,Group Name=vboxsf,
Access Permission=-rwxrwx---,
Last Modified=18Jan2016:18:34:49,
File Size (bytes)=44583
NOTE: 579 records were read from the infile '/folders/myfolders/exploremath.data.txt'.
The minimum record length was 75.
The maximum record length was 75.
NOTE: Missing values were generated as a result of performing an operation on missing values.
Each place is given by: (Number of times) at (Line):(Column).
99 at 80:24 99 at 117:13 142 at 169:15 2 at 169:22
NOTE: The data set WORK.MATHEX has 579 observations and 37 variables.
NOTE: DATA statement used (Total process time):
real time 0.01 seconds
cpu time 0.01 seconds
178 proc freq;
179 tables outcome*passed / norow nocol nopercent missing;
180
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE FREQ used (Total process time):
real time 0.05 seconds
cpu time 0.05 seconds
181 proc freq;
182 title3 'One at a time cat IVs with proc freq';
183 tables (course2 sex ethnic tongue hsmiss) * outcome
184 / nocol nopercent chisq;
185
186 /* Multinomial Logit model is
187
188 ln(pi1/pi3) = beta01 + beta11 x     (Fail vs. Pass)
189 ln(pi2/pi3) = beta02 + beta12 x     (Gone vs. Pass)
190 */
191
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE FREQ used (Total process time):
real time 0.18 seconds
cpu time 0.18 seconds
192 proc logistic data=mathex outest=ParmNames;
193 title3 'Multinomial logit model with proc logistic';
194 model outcome (ref='Pass') = hsmiss / link = glogit;
195 contrast 'HS Missing method 1' hsmiss 1;
196
197 /* Find out the parameter names. */
198
NOTE: PROC LOGISTIC is fitting the generalized logit model. The logits modeled contrast each response category against the
reference category (outcome='Pass').
NOTE: Convergence criterion (GCONV=1E-8) satisfied.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: The data set WORK.PARMNAMES has 1 observations and 10 variables.
NOTE: PROCEDURE LOGISTIC used (Total process time):
real time 0.10 seconds
cpu time 0.09 seconds
199 proc transpose data=ParmNames;
200 run;
NOTE: There were 1 observations read from the data set WORK.PARMNAMES.
NOTE: The data set WORK.DATA6 has 5 observations and 3 variables.
NOTE: PROCEDURE TRANSPOSE used (Total process time):
real time 0.01 seconds
cpu time 0.01 seconds
201 proc print noobs;
202 run;
NOTE: There were 5 observations read from the data set WORK.DATA6.
NOTE: PROCEDURE PRINT used (Total process time):
real time 0.01 seconds
cpu time 0.02 seconds
203
204 proc logistic data = mathex;
205 title3 'Multinomial logit model with proc logistic';
206 model outcome (ref='Pass') = hsmiss / link = glogit;
207 contrast 'HS Missing method 1' hsmiss 1;
208 HS_MissingMethod2: test hsmiss_Fail = hsmiss_Gone = 0;
209
NOTE: PROC LOGISTIC is fitting the generalized logit model. The logits modeled contrast each response category against the
reference category (outcome='Pass').
NOTE: Convergence criterion (GCONV=1E-8) satisfied.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE LOGISTIC used (Total process time):
real time 0.09 seconds
cpu time 0.08 seconds
210 proc iml;
NOTE: IML Ready
211 title3 'Estimate Probabilities using output from proc catmod';
212 b01 = -1.3594;
212 ! b11 = 0.6663;
213 b02 = -0.8306;
213 ! b12 = 1.2360;
214 hsmiss = 0;
215 L1 = b01 + b11*hsmiss;
216 L2 = b02 + b12*hsmiss;
217 denom = 1 + exp(L1) + exp(L2);
218 Fail = exp(L1)/denom;
218 ! Gone = exp(L2)/denom;
218 ! Pass = 1/denom;
219 print "No Missing HS Data:" Fail Gone Pass;
220 hsmiss = 1;
221 L1 = b01 + b11*hsmiss;
222 L2 = b02 + b12*hsmiss;
223 denom = 1 + exp(L1) + exp(L2);
224 Fail = exp(L1)/denom;
224 ! Gone = exp(L2)/denom;
224 ! Pass = 1/denom;
225 print "Yes Missing HS Data:" Fail Gone Pass;
226
NOTE: Exiting IML.
NOTE: PROCEDURE IML used (Total process time):
real time 0.03 seconds
cpu time 0.02 seconds
227 proc freq data = mathex;
228 title3 'Hsmiss by outcome again for comparison';
229 tables hsmiss * outcome / nocol nopercent;
230
231 /* Now seek a good predictive model */
232
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE FREQ used (Total process time):
real time 0.03 seconds
cpu time 0.02 seconds
233 proc logistic data = mathex;
234 title3 'HS variables';
235 model outcome (ref='Pass') = hsgpa hscalc hsengl / link = glogit;
236
237 /* Drop HS English */
238
NOTE: PROC LOGISTIC is fitting the generalized logit model. The logits modeled contrast each response category against the
reference category (outcome='Pass').
NOTE: Convergence criterion (GCONV=1E-8) satisfied.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE LOGISTIC used (Total process time):
real time 0.10 seconds
cpu time 0.10 seconds
239 proc logistic data = mathex;
240 title3 'HS gpa and calc + course2 ';
241 class course2 / param=ref; /* Last category is reference by default. */
242 model outcome (ref='Pass') = hsgpa hscalc course2 / link = glogit;
243
244 /* Forget course2 */
245
NOTE: PROC LOGISTIC is fitting the generalized logit model. The logits modeled contrast each response category against the
reference category (outcome='Pass').
NOTE: Convergence criterion (GCONV=1E-8) satisfied.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE LOGISTIC used (Total process time):
real time 0.12 seconds
cpu time 0.11 seconds
246 proc logistic data = mathex;
247 title3 'HS gpa and calc + diagnostic test';
248 model outcome (ref='Pass') = hsgpa hscalc precalc calc / link = glogit;
249
250 /* Drop calc subtest, keep precalc */
251
NOTE: PROC LOGISTIC is fitting the generalized logit model. The logits modeled contrast each response category against the
reference category (outcome='Pass').
NOTE: Convergence criterion (GCONV=1E-8) satisfied.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE LOGISTIC used (Total process time):
real time 0.10 seconds
cpu time 0.10 seconds
252 proc logistic data = mathex;
253 title3 'Try gender, ethnic and mother tongue controlling for good stuff';
254 class ethnic (param=ref ref='East Indian');
255 /* Specifying a reference category that's not the last value */
256 model outcome (ref='Pass') =
257 hsgpa hscalc precalc ethnic gender mtongue / link = glogit;
258 contrast 'Demographics' ethnic 1 0 0 0 0,
259 ethnic 0 1 0 0 0,
260 ethnic 0 0 1 0 0,
261 ethnic 0 0 0 1 0,
262 ethnic 0 0 0 0 1,
263 gender 1,
264 mtongue 1;
265 contrast 'Ethnic and Gender' ethnic 1 0 0 0 0,
266 ethnic 0 1 0 0 0,
267 ethnic 0 0 1 0 0,
268 ethnic 0 0 0 1 0,
269 ethnic 0 0 0 0 1,
270 gender 1;
271
272 /* Mother tongue is significant. Still true when we drop ethnic and gender? */
273
NOTE: PROC LOGISTIC is fitting the generalized logit model. The logits modeled contrast each response category against the
reference category (outcome='Pass').
NOTE: Convergence criterion (GCONV=1E-8) satisfied.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE LOGISTIC used (Total process time):
real time 0.17 seconds
cpu time 0.16 seconds
274 proc logistic data = mathex;
275 title3 'hsgpa hscalc precalc mtongue';
276 model outcome (ref='Pass') =
277 hsgpa hscalc precalc mtongue / link = glogit;
278
NOTE: PROC LOGISTIC is fitting the generalized logit model. The logits modeled contrast each response category against the
reference category (outcome='Pass').
NOTE: Convergence criterion (GCONV=1E-8) satisfied.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE LOGISTIC used (Total process time):
real time 0.10 seconds
cpu time 0.08 seconds
279 proc logistic data = mathex;
280 title3 'hsgpa hscalc precalc mtongue';
281 model outcome (ref='Pass') =
282 hsgpa hscalc precalc mtongue / link = glogit;
283
284 /* Allowing for academic background, students whose first language is English
285 are more likely to fail the course as opposed to passing, and less likely to
286 disappear as opposed to passing. If this is replicated, it will be very
287 interesting. Now explore in more detail.
288
289 Recall the response categories are 1=Fail 2=Gone 3=Pass.
290
291 We want to know whether failing is different from disappearing in terms
292 of their relationship to the explanatory variables. We are getting
293 advanced here. What is H0?
294
295 Model (using b instead of beta) is
296
297 ln(pi1/pi3) = b01 + b11 hsgpa + b21 hscalc + b31 precalc + b41 mtongue
298 ln(pi2/pi3) = b02 + b12 hsgpa + b22 hscalc + b32 precalc + b42 mtongue
299
300 The null hypothesis is b11=b12, b21=b22, b31=b32, b41=b42
301
302 Parameter names are easy to guess. */
303
NOTE: PROC LOGISTIC is fitting the generalized logit model. The logits modeled contrast each response category against the
reference category (outcome='Pass').
NOTE: Convergence criterion (GCONV=1E-8) satisfied.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE LOGISTIC used (Total process time):
real time 0.11 seconds
cpu time 0.09 seconds
304 proc logistic data = mathex;
305 title3 'Different coefficients for Gone and Fail?';
306 model outcome (ref='Pass') = hsgpa hscalc precalc mtongue / link = glogit;
307 DiffOverall: test hsgpa_Fail = hsgpa_Gone, hscalc_Fail = hscalc_Gone,
308 precalc_Fail = precalc_Gone, mtongue_Fail = mtongue_Gone;
309 Diff_hsgpa: test hsgpa_Fail = hsgpa_Gone;
310 Diff_hscalc: test hscalc_Fail = hscalc_Gone;
311 Diff_precalc: test precalc_Fail = precalc_Gone;
312 Diff_mtongue: test mtongue_Fail = mtongue_Gone;
313 run;
NOTE: PROC LOGISTIC is fitting the generalized logit model. The logits modeled contrast each response category against the
reference category (outcome='Pass').
NOTE: Convergence criterion (GCONV=1E-8) satisfied.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE LOGISTIC used (Total process time):
real time 0.12 seconds
cpu time 0.09 seconds
314
315 /************************** Replication ***********************
316 For interpretation, we want to replicate 8 findings:
317 Gone vs. Pass and Fail vs. Pass for each of the 4 explanatory variables.
318 ***************************************************************/
319
320 %include '/folders/myfolders/441s16/Lecture/readreplic.sas';
NOTE: Format YNFMT is already on the library WORK.FORMATS.
NOTE: Format YNFMT has been output.
NOTE: Format CRSFMT is already on the library WORK.FORMATS.
NOTE: Format CRSFMT has been output.
NOTE: Format NFMT is already on the library WORK.FORMATS.
NOTE: Format NFMT has been output.
NOTE: PROCEDURE FORMAT used (Total process time):
real time 0.00 seconds
cpu time 0.01 seconds
433 if (0<=mark<=49) then outcome = 'Fail';
434 else if (50<=mark<=100) then outcome = 'Pass';
435 else outcome = 'Gone';
436
NOTE: The infile '/folders/myfolders/replicmath2.data.txt' is:
Filename=/folders/myfolders/replicmath2.data.txt,
Owner Name=root,Group Name=vboxsf,
Access Permission=-rwxrwx---,
Last Modified=30Jan2016:16:16:16,
File Size (bytes)=38214
NOTE: 579 records were read from the infile '/folders/myfolders/replicmath2.data.txt'.
The minimum record length was 64.
The maximum record length was 64.
NOTE: Missing values were generated as a result of performing an operation on missing values.
Each place is given by: (Number of times) at (Line):(Column).
81 at 348:24 81 at 384:18
NOTE: The data set WORK.REPLIC has 579 observations and 35 variables.
NOTE: DATA statement used (Total process time):
real time 0.00 seconds
cpu time 0.01 seconds
437 proc logistic data = replic; /* The most recently created data set is the default anyway. */
438 title2 'Replicate hsgpa hscalc precalc calc mtongue 0.05/8 = .00625';
439 model outcome (ref='Pass') = hsgpa hscalc precalc mtongue / link = glogit;
440 Diff_mtongue: test mtongue_Fail = mtongue_Gone;
441
442
443
444 /* Final conclusions:
445
446 Students with higher High School GPA were less likely to fail as
447 opposed to passing and less likely to disappear as opposed to passing.
448
449 Students with higher High School Calculus marks were less likely
450 to disappear as opposed to passing.
451
452 Students with higher scores on the pre-calculus portion of the
453 diagnostic test were less likely to disappear as opposed to passing.
454
455 There was no convincing evidence of a connection between Mother
456 Tongue (English vs. Other) and outcome.
457
458 */
459
460
461
462
463
464
465
466
467 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK;
479