1 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK;
55
56 /* MathLogReg4.sas */
57 %include '/folders/myfolders/441s16/Lecture/readmath2b.sas';
NOTE: Format YNFMT is already on the library WORK.FORMATS.
NOTE: Format YNFMT has been output.
NOTE: Format CRSFMT is already on the library WORK.FORMATS.
NOTE: Format CRSFMT has been output.
NOTE: Format NFMT is already on the library WORK.FORMATS.
NOTE: Format NFMT has been output.
NOTE: PROCEDURE FORMAT used (Total process time):
real time 0.00 seconds
cpu time 0.00 seconds
166 title2 'Logistic regression with more than 2 resp. categories using
167 proc catmod';
168 /*************** Data step continues ************************/
169 hsutil = hsgpa+hscalc+hsengl;
170 if hsutil = . then hsmiss=1; else hsmiss=0;
171 label hsmiss = 'Missing Any High School Data'; format hsmiss ynfmt.;
172
173 if (0<=mark<=49) then outcome = 'Fail';
174 else if (50<=mark<=100) then outcome = 'Pass';
175 else outcome = 'Gone';
176 /*************************************************************/
177
NOTE: The infile '/folders/myfolders/exploremath.data.txt' is:
Filename=/folders/myfolders/exploremath.data.txt,
Owner Name=root,Group Name=vboxsf,
Access Permission=-rwxrwx---,
Last Modified=18Jan2016:17:34:49,
File Size (bytes)=44583
NOTE: 579 records were read from the infile '/folders/myfolders/exploremath.data.txt'.
The minimum record length was 75.
The maximum record length was 75.
NOTE: Missing values were generated as a result of performing an operation on missing values.
Each place is given by: (Number of times) at (Line):(Column).
99 at 80:24 99 at 117:13 142 at 169:15 2 at 169:22
NOTE: The data set WORK.MATHEX has 579 observations and 37 variables.
NOTE: DATA statement used (Total process time):
real time 0.01 seconds
cpu time 0.02 seconds
178 proc freq;
179 tables outcome*passed / norow nocol nopercent missing;
180
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE FREQ used (Total process time):
real time 0.05 seconds
cpu time 0.06 seconds
181 proc freq;
182 title3 'One at a time cat IVs with proc freq';
183 tables (course2 sex ethnic tongue hsmiss) * outcome
184 / nocol nopercent chisq;
185
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE FREQ used (Total process time):
real time 0.22 seconds
cpu time 0.22 seconds
186 proc logistic descending order=internal;
187 title3 'Simple logistic regression: Reproduce this';
188 model passed = hsgpa;
189
NOTE: PROC LOGISTIC is modeling the probability that passed='Yes'.
NOTE: Convergence criterion (GCONV=1E-8) satisfied.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE LOGISTIC used (Total process time):
real time 0.12 seconds
cpu time 0.12 seconds
190 proc catmod;
191 title3 'Hsgpa Reproduce b1 = 0.2089, Wald Chisq = 76.5326';
192 direct hsgpa; /* Direct means no dummy vars please */
193 model passed = hsgpa / noprofile;
194 /* Always suppress the profile when there are quantitative
195 explanatory variables. */
196
197 /* The last response category (Y=1) is the reference
198 (denominator) category so the sign of the regression
199 coefficient is reversed, but we can live with this. */
200
NOTE: The default estimation method for this model is maximum-likelihood.
NOTE: Maximum likelihood computations converged.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE CATMOD used (Total process time):
real time 0.06 seconds
cpu time 0.06 seconds
201 proc catmod;
202 title3 'Hsmiss by outcome';
203 direct hsmiss; /* It's already a dummy variable. */
204 model outcome = hsmiss;
205 contrast 'HS Missing method 1' hsmiss 1;
206 contrast 'HS Missing method 2' all_parms 0 0 1 0,
207 all_parms 0 0 0 1;
208 /* Model is
209
210 ln(pi1/pi3) = beta01 + beta11 x Fail vs. Pass
211 ln(pi2/pi3) = beta02 + beta12 x Gone vs. Pass
212
213 all_parms reads down the columns
214 beta01 beta02 beta11 beta12
215 all_parms 0 0 1 0,
216 0 0 0 1
217
218 Specifying 2 linear combinations of the betas equal
219 to zero, so there is no relationship with x.
220 */
221
NOTE: The default estimation method for this model is maximum-likelihood.
NOTE: Maximum likelihood computations converged.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE CATMOD used (Total process time):
real time 0.10 seconds
cpu time 0.11 seconds
222 proc iml;
NOTE: IML Ready
223 title3 'Estimate Probabilities using output from proc catmod';
224 b01 = -1.3594;
224 ! b11 = 0.6663;
225 b02 = -0.8306;
225 ! b12 = 1.2360;
226 hsmiss = 0;
227 L1 = b01 + b11*hsmiss;
228 L2 = b02 + b12*hsmiss;
229 denom = 1 + exp(L1) + exp(L2);
230 Fail = exp(L1)/denom;
230 ! Gone = exp(L2)/denom;
230 ! Pass = 1/denom;
231 print "No Missing HS Data:" Fail Gone Pass;
232 hsmiss = 1;
233 L1 = b01 + b11*hsmiss;
234 L2 = b02 + b12*hsmiss;
235 denom = 1 + exp(L1) + exp(L2);
236 Fail = exp(L1)/denom;
236 ! Gone = exp(L2)/denom;
236 ! Pass = 1/denom;
237 print "Yes Missing HS Data:" Fail Gone Pass;
238
NOTE: Exiting IML.
NOTE: PROCEDURE IML used (Total process time):
real time 0.04 seconds
cpu time 0.05 seconds
239 proc freq;
240 title3 'Hsmiss by outcome again for comparison';
241 tables hsmiss * outcome / nocol nopercent;
242
243 /* Now seek a good predictive model */
244
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE FREQ used (Total process time):
real time 0.03 seconds
cpu time 0.03 seconds
245 proc catmod;
246 title3 'HS variables';
247 direct hsgpa hscalc hsengl;
248 model outcome = hsgpa hscalc hsengl / noprofile;
249
250 /* Drop HS English */
251
NOTE: The default estimation method for this model is maximum-likelihood.
NOTE: Maximum likelihood computations converged.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE CATMOD used (Total process time):
real time 0.09 seconds
cpu time 0.08 seconds
252 proc catmod;
253 title3 'HS gpa and calc + course2 ';
254 direct hsgpa hscalc;
255 model outcome = hsgpa hscalc course2 / noprofile;
256 /* Dummy vars for course2 use effect coding */
257
258 /* Forget course2 */
259
NOTE: The default estimation method for this model is maximum-likelihood.
NOTE: Maximum likelihood computations converged.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE CATMOD used (Total process time):
real time 0.08 seconds
cpu time 0.09 seconds
260 proc catmod;
261 title3 'HS gpa and calc + diagnostic test';
262 direct hsgpa hscalc precalc calc;
263 model outcome = hsgpa hscalc precalc calc / noprofile;
264
265 /* Drop calc subtest, keep precalc */
266
NOTE: The default estimation method for this model is maximum-likelihood.
NOTE: Maximum likelihood computations converged.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE CATMOD used (Total process time):
real time 0.09 seconds
cpu time 0.09 seconds
267 proc catmod;
268 title3 'Try gender, ethnic and mother tongue controlling for good stuff';
269 direct hsgpa hscalc precalc calc gender mtongue;
270 model outcome = hsgpa hscalc precalc ethnic gender mtongue / noprofile;
271 contrast 'Demographics' ethnic 1 0 0 0 0,
272 ethnic 0 1 0 0 0,
273 ethnic 0 0 1 0 0,
274 ethnic 0 0 0 1 0,
275 ethnic 0 0 0 0 1,
276 gender 1,
277 mtongue 1;
278 contrast 'Ethnic and Gender' ethnic 1 0 0 0 0,
279 ethnic 0 1 0 0 0,
280 ethnic 0 0 1 0 0,
281 ethnic 0 0 0 1 0,
282 ethnic 0 0 0 0 1,
283 gender 1;
284
285 /* Got this in the log file: "WARNING: The formatted values of one or more
286 variables are truncated to 16 characters." */
287
288
289 /* Mother tongue is significant. Still true when we drop ethnic and gender? */
290
WARNING: The formatted values of one or more variables are truncated to 16 characters.
NOTE: The default estimation method for this model is maximum-likelihood.
NOTE: Maximum likelihood computations converged.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE CATMOD used (Total process time):
real time 0.12 seconds
cpu time 0.12 seconds
291 proc catmod;
292 title3 'hsgpa hscalc precalc calc mtongue';
293 direct hsgpa hscalc precalc calc mtongue;
294 model outcome = hsgpa hscalc precalc mtongue / noprofile;
295
296 /* Allowing for academic background, students whose first language is English
297 are more likely to fail the course as opposed to passing, and less likely to
298 disappear as opposed to passing. If this is replicated, it will be very
299 interesting. Now explore in more detail.
300
301 Recall the response categories are 1=Fail 2=Gone 3=Pass.
302
303 We want to know whether failing is different from disappearing in terms
304 of their relationship to the explanatory variables. We are getting
305 advanced here. What is H0?
306
307 Model (using b instead of beta) is
308
309 ln(pi1/pi3) = b01 + b11 hsgpa + b21 hscalc + b31 precalc + b41 mtongue
310 ln(pi2/pi3) = b02 + b12 hsgpa + b22 hscalc + b32 precalc + b42 mtongue
311
312 The null hypothesis is b11=b12, b21=b22, b31=b32, b41=b42
313
314 all_parms reads down the columns, so
315
316 beta = b01 b02 b11 b12 b21 b22 b31 b32 b41 b42
317
318 And H0 says that 4 linear combinations of the betas equal zero:
319
320 b01 b02 b11 b12 b21 b22 b31 b32 b41 b42
321 0 0 1 -1 0 0 0 0 0 0
322 0 0 0 0 1 -1 0 0 0 0
323 0 0 0 0 0 0 1 -1 0 0
324 0 0 0 0 0 0 0 0 1 -1
325
326 */
327
NOTE: The default estimation method for this model is maximum-likelihood.
NOTE: Maximum likelihood computations converged.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE CATMOD used (Total process time):
real time 0.09 seconds
cpu time 0.09 seconds
328 proc catmod;
329 title3 'Different coefficients for Gone and Fail?';
330 direct hsgpa hscalc precalc calc mtongue;
331 model outcome = hsgpa hscalc precalc mtongue / noprofile;
332 contrast 'Diff Relationships Overall'
333 all_parms 0 0 1 -1 0 0 0 0 0 0,
334 all_parms 0 0 0 0 1 -1 0 0 0 0,
335 all_parms 0 0 0 0 0 0 1 -1 0 0,
336 all_parms 0 0 0 0 0 0 0 0 1 -1;
337 contrast 'Diff Relationships for hsgpa'
338 all_parms 0 0 1 -1 0 0 0 0 0 0;
339 contrast 'Diff Relationships for hscalc'
340 all_parms 0 0 0 0 1 -1 0 0 0 0;
341 contrast 'Diff Relationships for precalc'
342 all_parms 0 0 0 0 0 0 1 -1 0 0;
343 contrast 'Diff Relationships for mtongue'
344 all_parms 0 0 0 0 0 0 0 0 1 -1;
345
346 /************************** Replication ***********************
347 For interpretation, want to replicate 8 findings:
348 Gone vs. Pass and Fail vs. Pass for each explanatory variable.
349 ***************************************************************/
350
351 %include '/folders/myfolders/441s16/Lecture/readreplic.sas';
NOTE: The default estimation method for this model is maximum-likelihood.
NOTE: Maximum likelihood computations converged.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE CATMOD used (Total process time):
real time 0.11 seconds
cpu time 0.10 seconds
NOTE: Format YNFMT is already on the library WORK.FORMATS.
NOTE: Format YNFMT has been output.
NOTE: Format CRSFMT is already on the library WORK.FORMATS.
NOTE: Format CRSFMT has been output.
NOTE: Format NFMT is already on the library WORK.FORMATS.
NOTE: Format NFMT has been output.
NOTE: PROCEDURE FORMAT used (Total process time):
real time 0.00 seconds
cpu time 0.00 seconds
464 if (0<=mark<=49) then outcome = 'Fail';
465 else if (50<=mark<=100) then outcome = 'Pass';
466 else outcome = 'Gone';
467
468
NOTE: The infile '/folders/myfolders/replicmath2.data.txt' is:
Filename=/folders/myfolders/replicmath2.data.txt,
Owner Name=root,Group Name=vboxsf,
Access Permission=-rwxrwx---,
Last Modified=30Jan2016:15:16:16,
File Size (bytes)=38214
NOTE: 579 records were read from the infile '/folders/myfolders/replicmath2.data.txt'.
The minimum record length was 64.
The maximum record length was 64.
NOTE: Missing values were generated as a result of performing an operation on missing values.
Each place is given by: (Number of times) at (Line):(Column).
81 at 379:24 81 at 415:18
NOTE: The data set WORK.REPLIC has 579 observations and 35 variables.
NOTE: DATA statement used (Total process time):
real time 0.03 seconds
cpu time 0.03 seconds
469 proc catmod data=replic; /* That's the default anyway */
470 title3 'Replicate hsgpa hscalc precalc calc mtongue 0.05/8 = .00625';
471 direct hsgpa hscalc precalc calc mtongue;
472 model outcome = hsgpa hscalc precalc mtongue / noprofile;
473 contrast 'Diff Relationships for mtongue'
474 all_parms 0 0 0 0 0 0 0 0 1 -1;
475
476 /* Final conclusions:
477
478 Students with higher High School GPA were more likely to pass as
479 opposed to failing and more likely to pass as opposed to disappearing.
480
481 Students with higher High School Calculus marks were more likely
482 to pass as opposed to disappearing.
483
484 Students with higher scores on the pre-calculus portion of the
485 diagnostic test were more likely to pass as opposed to disappearing.
486
487 There was no convincing evidence of a connection between Mother
488 Tongue (English vs. Other) and outcome.
489
490 */
491
492
493
494
495
496
497
498
499 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK;
511