/* mathreg1.sas */
/*
   Regression on the math data, part one.
   Steps:
     1. %include the reader program and build data set DUMMY from EXPLORE,
        adding indicator (dummy) variables for ethnic, sex, tongue and class.
     2. Fit a sequence of regression models (A through I) for GRADE.
     3. After selected models, use proc iml as a calculator for the
        proportion of remaining variation explained by the tested variables.
     4. Finish with an automatic stepwise selection for comparison.
   Every modelling step names data=dummy explicitly so that inserting a step
   that creates another data set cannot silently change what is analysed.
*/
title2 'Regression on the math data: Part One';

%include '/home/u1407221/441s24/SAS06/ReadLabelMath.sas';
/* Variables: id sex tongue rater1 rater2 ethnic hsgpa hscalc hsengl havecalc
              class precalc calc totscore grade passed outcome
   NOTE(review): presumably the included file creates data set EXPLORE,
   which is read below -- confirm. */

data dummy;
    /* Define dummy variables here, for now. Later move them to
       ReadLabelMath.sas. */
    set explore;

    /* Dummy variables for ethnic background. There is no e5: ethnic = 5 is
       the omitted reference category (the labels call it East Ind.).
       A missing ethnic gives a missing indicator rather than 0. */
    if ethnic = . then e1 = .;
    else if ethnic = 1 then e1 = 1;
    else e1 = 0;

    if ethnic = . then e2 = .;
    else if ethnic = 2 then e2 = 1;
    else e2 = 0;

    if ethnic = . then e3 = .;
    else if ethnic = 3 then e3 = 1;
    else e3 = 0;

    if ethnic = . then e4 = .;
    else if ethnic = 4 then e4 = 1;
    else e4 = 0;

    if ethnic = . then e6 = .;
    else if ethnic = 6 then e6 = 1;
    else e6 = 0;

    label e1 = 'Asian vs East Ind.'
          e2 = 'East Eur. vs East Ind.'
          e3 = 'Other Eur. vs East Ind.'
          e4 = 'Mid. East & Pak. vs East Ind.'
          e6 = 'Other/DK vs East Ind.';

    /* Any value of sex other than 'Female' or 'Male' leaves gender missing. */
    if sex = 'Female' then gender = 1;
    else if sex = 'Male' then gender = 0;

    /* Any value of tongue other than 'English' or 'Other' leaves mtongue
       missing. */
    if tongue = 'English' then mtongue = 1;
    else if tongue = 'Other' then mtongue = 0;
    label mtongue = 'English vs. Other';

    /* Indicators for class.
       Only use 2 of these if the model has an intercept! */
    if class = . then c1 = .;
    else if class = 1 then c1 = 1;
    else c1 = 0;

    if class = . then c2 = .;
    else if class = 2 then c2 = 1;
    else c2 = 0;

    if class = . then c3 = .;
    else if class = 3 then c3 = 1;
    else c3 = 0;

    label c1 = 'Catch-up'
          c2 = 'Mainstream'
          c3 = 'Elite';
run;

/* Commented out
proc freq;
     title3 'Check dummy variables';
     tables sex*gender / norow nocol nopercent missing;
     tables tongue*mtongue / norow nocol nopercent missing;
     tables (e1-e4 e6) * ethnic / norow nocol nopercent missing;
     tables (c1-c3) * class / norow nocol nopercent missing;
*/

proc reg plots=none corr data=dummy; /* Suppress diagnostic plots for now */
    title3 'Model A: Predict University Calculus Grade from HS Information';
    model grade = hsgpa hscalc hsengl;
run;

/* It is very interesting to know what proportion of the remaining
   variation is explained by each variable, controlling for the other two.
   F = t-squared, and a = sF/(n-p + sF)
   where s is the number of variables being tested and p counts the
   regression coefficients including the intercept (p = 4 for the three
   predictors of Model A).
   NOTE(review): the n, t and F values in the proc iml steps of this file
   are hard-coded, presumably copied by hand from the proc reg output --
   re-check them whenever the data or the models change. */
proc iml;
    title3 'Proportion of remaining variation for HS information';
    n = 323; p = 4; s = 1;

    print "hsgpa controlling for hscalc and hsengl";
    t = 8.00; F = t**2; a = s*F/(n-p + s*F);
    print a;

    print "hscalc controlling for hsgpa and hsengl";
    t = 3.14; F = t**2; a = s*F/(n-p + s*F);
    print a;

    print "hsengl controlling for hsgpa and hscalc";
    t = -3.26; F = t**2; a = s*F/(n-p + s*F);
    print a;
quit;

proc reg plots=none data=dummy;
    title3 'Model B: Predict University Calculus Grade from Diagnostic Test';
    model grade = precalc calc;
run;

/* totscore is entered together with precalc and calc. */
proc reg plots=none data=dummy;
    title3 'Making a mistake on purpose';
    model grade = totscore precalc calc;
run;

proc reg plots=none data=dummy;
    title3 'Model C: Do the diagnostic test and HS info both contribute?';
    model grade = hsgpa hscalc hsengl precalc calc;
    Diagnostic_Test: test precalc=calc=0;
    HS_Information: test hsgpa=hscalc=hsengl=0;
run;

proc iml;
    title3 'Proportion of remaining variation explained by diagnostic test';
    print "Precalc and calc controlling for hsgpa hscalc hsengl";
    n = 289; p = 6; s = 2; F = 8.28; a = s*F/(n-p + s*F);
    print a;
quit;

proc iml;
    title3 'Proportion of remaining variation explained by HS info';
    print "Hsgpa hscalc hsengl controlling for precalc and calc";
    n = 289; p = 6; s = 3; F = 46.97; a = s*F/(n-p + s*F);
    print a;
quit;

/* c2 is left out of the model because the model has an intercept. */
proc reg plots=none data=dummy;
    title3 'Model D: See if class makes a contribution';
    model grade = hsgpa hscalc hsengl precalc calc c1 c3;
    class: test c1=c3=0;
    Diagnostic_Test: test precalc=calc=0;
run;

/* Same model, letting proc glm build the indicators for class. */
proc glm data=dummy;
    title3 'Model D again with proc glm';
    class class;
    model grade = hsgpa hscalc hsengl precalc calc class;
    contrast 'Replicate Test of Class' class 1 -1 0,
                                       class 0 1 -1;
    contrast 'Diagnostic Test F = 9.06' precalc 1, calc 1;
run;

proc reg plots=none data=dummy;
    title3 'Model E: Include Language, Sex and Ethnic Background';
    model grade = hsgpa hscalc hsengl precalc calc mtongue gender e1-e4 e6;
    TroubleVars: test mtongue=gender=e1=e2=e3=e4=e6=0;
    Nationality: test e1=e2=e3=e4=e6=0;
run;

proc reg plots=none data=dummy;
    title3 'Model F: Discarding Gender and Nationality';
    model grade = hsgpa hscalc hsengl precalc calc mtongue;
    EnglishTongue: test hsengl=mtongue=0;
run;

proc iml;
    title3 'Proportion of remaining variation explained by mother tongue';
    print "Mtongue controlling for hsgpa hscalc hsengl precalc calc";
    n = 287; p = 7; s = 1; t = -2.23; F = t**2; a = s*F/(n-p + s*F);
    print a;
quit;

/* NOTE(review): title4 refers to "Model 3", but the models in this file are
   lettered A-I. It probably means Model C (the third model) -- confirm
   against the output before renaming. */
proc reg plots=none data=dummy;
    title3 'Model G: Drop mtongue and calc';
    title4 'Compare R-Square = 0.4556, Adj R-Sq = 0.4460 From Model 3';
    model grade = hsgpa hscalc hsengl precalc;
run;

proc iml;
    title3 'Proportion of remaining variation explained by Pre-calculus';
    print "precalc controlling for hsgpa hscalc hsengl";
    n = 289; p = 5; s = 1; t = 3.63; F = t**2; a = s*F/(n-p + s*F);
    print a;
quit;

proc reg plots=none data=dummy;
    title3 'Model H: Combine precalc and calc instead of dropping calc';
    title4 'Compare R-Square = 0.4492 from Model G';
    model grade = hsgpa hscalc hsengl totscore;
run;

/* The title names totscore, the variable actually tested in this step. */
proc iml;
    title3 'Proportion of remaining variation explained by totscore';
    print "totscore controlling for hsgpa hscalc hsengl";
    n = 289; p = 5; s = 1; t = 3.92; F = t**2; a = s*F/(n-p + s*F);
    print a;
    print "For prediction, I am happy with Model H: hsgpa hscalc hsengl totscore";
quit;

proc reg plots=none data=dummy;
    title3 'Model I: Same as Model H but including Mother Tongue';
    model grade = hsgpa hscalc hsengl totscore mtongue;
    /* A slightly better model except for the lawsuit. */
run;

/* For stepwise selection, note Mallows' Cp is
       Cp = SSE_p/MSE_F - n + 2(p+1)
   Small is good. */
proc reg plots=none data=dummy;
    title3 'Try automatic (stepwise) selection';
    model grade = hsgpa hscalc hsengl precalc calc totscore
                  mtongue gender e1-e4 e6
                  / selection = stepwise slentry = 0.05 slstay = 0.05;
    /* Default slentry = slstay = 0.15 */
run;
quit;