1 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK;
NOTE: ODS statements in the SAS Studio environment may disable some output features.
62
63 /* mathlogreg2.sas */
64 %include '/folders/myfolders/441s18/Lecture/mathread1.sas';
NOTE: Format YNFMT is already on the library WORK.FORMATS.
NOTE: Format YNFMT has been output.
NOTE: Format CRSFMT is already on the library WORK.FORMATS.
NOTE: Format CRSFMT has been output.
NOTE: Format NFMT is already on the library WORK.FORMATS.
NOTE: Format NFMT has been output.
NOTE: PROCEDURE FORMAT used (Total process time):
real time 0.00 seconds
cpu time 0.00 seconds
178 title2 'Predict Passing the course (Y-N) with Logistic Regression';
179
180 /* We know course is useful:
181 c1 = 'Catch-up' c2 = 'Mainstream' c3 = 'Elite' */
182
NOTE: The infile '/folders/myfolders/441s18/Lecture/exploremath.data.txt' is:
Filename=/folders/myfolders/441s18/Lecture/exploremath.data.txt,
Owner Name=root,Group Name=vboxsf,
Access Permission=-rwxrwx---,
Last Modified=18Jan2016:17:34:49,
File Size (bytes)=44583
NOTE: 579 records were read from the infile '/folders/myfolders/441s18/Lecture/exploremath.data.txt'.
The minimum record length was 75.
The maximum record length was 75.
NOTE: Missing values were generated as a result of performing an operation on missing values.
Each place is given by: (Number of times) at (Line):(Column).
99 at 101:24 99 at 102:18
NOTE: The data set WORK.MATHEX has 579 observations and 25 variables.
NOTE: DATA statement used (Total process time):
real time 0.01 seconds
cpu time 0.00 seconds
183 proc logistic data = mathex;
184 title3 'Course and HS variables';
185 model passed (event='Yes') = c1 c3 hsgpa hscalc hsengl;
186 course: test c1=c3=0;
187 run;
NOTE: PROC LOGISTIC is modeling the probability that passed='Yes'.
NOTE: Convergence criterion (GCONV=1E-8) satisfied.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE LOGISTIC used (Total process time):
real time 0.12 seconds
cpu time 0.12 seconds
188
189
190 /* Decision: Drop course */
191
192
193 ods select ParameterEstimates; /* Limit the output */
194 proc logistic data = mathex;
195 title3 'Just HS variables';
196 model passed (event='Yes') = hsgpa hscalc hsengl;
197 run;
NOTE: PROC LOGISTIC is modeling the probability that passed='Yes'.
NOTE: Convergence criterion (GCONV=1E-8) satisfied.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE LOGISTIC used (Total process time):
real time 0.03 seconds
cpu time 0.04 seconds
198
199
200 /* Decision: Drop HS English.
201 Does the diagnostic test add anything? */
202
203
204 ods select ParameterEstimates;
205 proc logistic data = mathex;
206 title3 'HS GPA, HS Calculus and Diagnostic Test';
207 model passed (event='Yes') = hsgpa hscalc calc precalc;
208 run;
NOTE: PROC LOGISTIC is modeling the probability that passed='Yes'.
NOTE: Convergence criterion (GCONV=1E-8) satisfied.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE LOGISTIC used (Total process time):
real time 0.04 seconds
cpu time 0.03 seconds
209
210
211 /* Decision: Drop the calc subscale, but which is better,
212 precalc or total score? */
213
214
215 ods select ParameterEstimates TestStmts; /* I ran a trace to find out the name */
216 proc logistic data = mathex;
217 title3 'HS GPA, HS Calculus and Diagnostic Test';
218 model passed (event='Yes') = hsgpa hscalc precalc totscore;
219 precalc_n_totscore: test precalc = totscore = 0;
220 run;
NOTE: PROC LOGISTIC is modeling the probability that passed='Yes'.
NOTE: Convergence criterion (GCONV=1E-8) satisfied.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE LOGISTIC used (Total process time):
real time 0.04 seconds
cpu time 0.03 seconds
221
222
223 /* Decision: Keep precalc rather than totscore. Confirm by refitting with precalc only. */
224
225
226 ods select ParameterEstimates;
227 proc logistic data = mathex;
228 title3 'HS GPA, HS Calculus and Pre-calculus test';
229 model passed (event='Yes') = hsgpa hscalc precalc;
230 run;
NOTE: PROC LOGISTIC is modeling the probability that passed='Yes'.
NOTE: Convergence criterion (GCONV=1E-8) satisfied.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE LOGISTIC used (Total process time):
real time 0.04 seconds
cpu time 0.03 seconds
231
232 proc logistic data = mathex;
233 title3 'Try gender, ethnic and mother tongue controlling for good stuff';
234 class ethnic (param=ref ref='East Indian');
235 /* Specifying a reference category that's not the last value */
236 model passed (event='Yes') = hsgpa hscalc precalc ethnic gender mtongue;
237 contrast 'Demographics' ethnic 1 0 0 0 0,
238 ethnic 0 1 0 0 0,
239 ethnic 0 0 1 0 0,
240 ethnic 0 0 0 1 0,
241 ethnic 0 0 0 0 1,
242 gender 1,
243 mtongue 1 / e;
244 /* The e option displays the L matrix (the contrast coefficients) */
245 run;
NOTE: PROC LOGISTIC is modeling the probability that passed='Yes'.
NOTE: Convergence criterion (GCONV=1E-8) satisfied.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE LOGISTIC used (Total process time):
real time 0.17 seconds
cpu time 0.15 seconds
246
247
248 /* Decision: Forget about ethnicity. */
249
250
251 ods select ParameterEstimates;
252 proc logistic data = mathex;
253 title3 'HS GPA, HS Calculus, Pre-calculus test, Gender and Mother tongue';
254 model passed (event='Yes') = hsgpa hscalc precalc gender mtongue;
255 run;
NOTE: PROC LOGISTIC is modeling the probability that passed='Yes'.
NOTE: Convergence criterion (GCONV=1E-8) satisfied.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE LOGISTIC used (Total process time):
real time 0.04 seconds
cpu time 0.02 seconds
256
257
258 /* Decision: Drop Gender and Mother tongue too.
259 My model now has just HS GPA, HS Calculus and Pre-calculus test. */
260
261
262 proc logistic data = mathex;
263 title3 'Try automatic (stepwise) selection';
264 model passed (event='Yes') =
265 gender mtongue e1-e6
266 hsgpa hscalc hsengl
267 c1-c3 precalc calc totscore
268 / selection = stepwise slentry = 0.05 slstay = 0.05 ;
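269 /* In proc logistic the default is slentry = slstay = 0.05, so these options just make it explicit (0.15 is the proc reg stepwise default) */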
270 run;
NOTE: PROC LOGISTIC is modeling the probability that passed='Yes'.
NOTE: Convergence criterion (GCONV=1E-8) satisfied in Step 0.
NOTE: Convergence criterion (GCONV=1E-8) satisfied in Step 1.
NOTE: Convergence criterion (GCONV=1E-8) satisfied in Step 2.
NOTE: Convergence criterion (GCONV=1E-8) satisfied in Step 3.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE LOGISTIC used (Total process time):
real time 0.18 seconds
cpu time 0.17 seconds
271
272
273 /* Note: 211 observations were lost to missingness for stepwise, compared to 204
274 for the earlier model with hsgpa, hscalc and precalc. */
275
276 /* Perhaps missingness on the variables we dropped could be useful. */
277
278 proc freq;
279 title2 'Explore missingness on omitted variables';
280 tables gender mtongue ethnic;
281 tables gender*mtongue / norow nocol nopercent missing;
282 tables gender*course / norow nocol nopercent missing;
283 run;
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE FREQ used (Total process time):
real time 0.08 seconds
cpu time 0.09 seconds
284
285
286 data mathex2;
287 set mathex;
288 if gender = . then sexmiss = 1; else sexmiss=0; /* Includes mtongue */
289 if course = . then coursemiss = 1; else coursemiss=0;
290 format sexmiss coursemiss ynfmt.;
291 label sexmiss = 'Gender and mother tongue missing'
292 coursemiss = 'Course missing';
293
294 /* Checks are commented out
295 proc freq;
296 tables gender*sexmiss / norow nocol nopercent missing;
297 tables course*coursemiss / norow nocol nopercent missing;
298 tables sexmiss*coursemiss / norow nocol nopercent missing chisq;
299 */
300
301
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: The data set WORK.MATHEX2 has 579 observations and 27 variables.
NOTE: DATA statement used (Total process time):
real time 0.00 seconds
cpu time 0.00 seconds
302 proc logistic data = mathex2;
303 title3 'Try adding missingness on gender/mtongue and course';
304 model passed (event='Yes') = hsgpa hscalc precalc sexmiss coursemiss;
305 run;
NOTE: PROC LOGISTIC is modeling the probability that passed='Yes'.
NOTE: Convergence criterion (GCONV=1E-8) satisfied.
NOTE: There were 579 observations read from the data set WORK.MATHEX2.
NOTE: PROCEDURE LOGISTIC used (Total process time):
real time 0.09 seconds
cpu time 0.08 seconds
306
307
308 /* All the cases with course missing were deleted because of
309 missingness on other variables. */
310
311 proc logistic data = mathex2;
312 title3 'Try adding missingness on gender/mtongue and course';
313 model passed (event='Yes') = hsgpa hscalc precalc sexmiss;
314 run;
NOTE: PROC LOGISTIC is modeling the probability that passed='Yes'.
NOTE: Convergence criterion (GCONV=1E-8) satisfied.
NOTE: There were 579 observations read from the data set WORK.MATHEX2.
NOTE: PROCEDURE LOGISTIC used (Total process time):
real time 0.09 seconds
cpu time 0.08 seconds
315
316 /* Here's the current model. */
317
318 proc logistic data = mathex;
319 title3 'HS GPA, HS Calculus and Pre-calculus test';
320 model passed (event='Yes') = hsgpa hscalc precalc;
321 run;
NOTE: PROC LOGISTIC is modeling the probability that passed='Yes'.
NOTE: Convergence criterion (GCONV=1E-8) satisfied.
NOTE: There were 579 observations read from the data set WORK.MATHEX.
NOTE: PROCEDURE LOGISTIC used (Total process time):
real time 0.08 seconds
cpu time 0.08 seconds
322
323
324 /* Goal: Develop a prediction model that uses all the data and makes a
325 prediction for every case. */
326
327
328
329
330 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK;
343