/* readexplor.sas   Just read the exploratory math data and do basic 
                    transformations. This version is based on 
                    readmath2c.sas.  */

title 'Prediction of Performance in First-year Calculus';

/* User-defined display formats for the coded numeric variables.
   Values not listed in a format are printed unformatted. */
proc format;

     /* 0/1 indicator */
     value ynfmt
                0 = 'No'
                1 = 'Yes';

     /* Course codes; code 4 marks a non-response */
     value crsfmt
                1 = 'Catch-up'
                2 = 'Mainstrm'
                3 = 'Elite'
                4 = 'No Resp';

     /* Nationality codes */
     value nfmt
                1 = 'Asian'
                2 = 'Eastern European'
                3 = 'European not Eastern'
                4 = 'Middle-Eastern  and Pakistani'
                5 = 'East Indian'
                6 = 'Other   and DK';

data explore;
     infile '/folders/myfolders/exploremath.data.txt';
     input id course precalc calc gpa calculus english mark
           lang $ sex $ nation1 nation2 sample;

     /* -----------------------------------------------------------------
        Part 1: computed variables.
        Variables are created in the order totscore, passed, outcome,
        grade, hsgpa, hscalc, hsengl, tongue, ethnic, diff, course2,
        gsplit, e1-e4, e6, gender, mtongue, c1-c3.  A variable that is
        not assigned on a given observation is left missing.
        ----------------------------------------------------------------- */

     /* Total score on the diagnostic test */
     totscore = precalc + calc;

     /* A comparison evaluates to 1 (true) or 0 (false), so passed is 1
        for a mark from 50 to 100 and 0 otherwise (including missing) */
     passed = (50 <= mark <= 100);

     /* Three-way outcome.  The first literal fixes the length of
        outcome, hence the trailing blanks. */
     select;
        when (0 < mark < 50)     outcome = 'Failed     ';
        when (50 <= mark <= 100) outcome = 'Passed     ';
        otherwise                outcome = 'Disappeared';
     end;

     /* Some missing final marks were zero, and 998=SDF and 999=WDR */
     if mark = 0 or mark > 100 then grade = .;
     else grade = mark;

     /* Missing HS marks were zeros; out-of-range values stay missing */
     if 65 <= gpa <= 100     then hsgpa  = gpa;
     if 0 < calculus < 101   then hscalc = calculus;
     if 0 < english  < 101   then hsengl = english;

     /* There were just a few French speakers: fold them into Other.
        The padded literal comes first so that it sets the length of
        tongue. */
     if lang = 'French' then tongue = 'Other  ';
     else tongue = lang;

     /* Rater 1 knows Middle Eastern names -- otherwise believe Rater 2 */
     if nation1 = 4 then ethnic = nation1;
     else ethnic = nation2;

     /* Percentage correct on precalculus items minus percentage correct
        on calculus items */
     diff = (100 * precalc/9) - (100 * calc/11);

     /* And a couple more useful variables */

     /* course2 is course with code 4 ('No Resp') left missing */
     if course ne 4 then course2 = course;

     /* Got median=60 from proc univariate */
     if 0 <= grade <= 60 then gsplit = '60orLess';
     else if 60 < grade <= 100 then gsplit = 'Over60';

     /* Dummy variables for ethnic background.  There is no dummy for
        code 5 (East Indian), the category the labels compare against.
        All are left missing when ethnic is missing. */
     if ethnic ne . then do;
        e1 = (ethnic = 1);
        e2 = (ethnic = 2);
        e3 = (ethnic = 3);
        e4 = (ethnic = 4);
        e6 = (ethnic = 6);
     end;

     /* gender: 1 for Female, 0 for Male, missing for anything else */
     if sex = 'Female' then gender = 1;
     else if sex = 'Male' then gender = 0;

     /* mtongue: 1 for English, 0 for Other, missing for anything else */
     if tongue = 'English' then mtongue = 1;
     else if tongue = 'Other' then mtongue = 0;

     /* Course dummies, missing when course2 is missing.
        Only use 2 of these if the model has an intercept! */
     if course2 ne . then do;
        c1 = (course2 = 1);
        c2 = (course2 = 2);
        c3 = (course2 = 3);
     end;

     /* -----------------------------------------------------------------
        Part 2: variable attributes (labels and formats).
        ----------------------------------------------------------------- */

     label
           precalc  = 'Number precalculus correct'
           calc     = 'Number calculus correct'
           totscore = 'Total # right on diagnostic test'
           passed   = 'Passed the course'
           grade    = 'Final mark (if any)'
           hsgpa    = 'High School GPA'
           hscalc   = 'HS Calculus'
           hsengl   = 'HS English'
           lang     = 'Mother Tongue'
           nation1  = 'Nationality of name acc to rater1'
           nation2  = 'Nationality of name acc to rater2'
           tongue   = 'Mother Tongue (Eng or Other)'
           ethnic   = 'Judged Nationality of name'
           diff     = 'Percentage correct: Precalc minus calc'
           gsplit   = 'Median split on final grade'
           e1       = 'Asian vs East Ind.'
           e2       = 'East Eur. vs East Ind.'
           e3       = 'Other Eur. vs East Ind.'
           e4       = 'Mid. East & Pak. vs East Ind.'
           e6       = 'Other/DK vs East Ind.'
           mtongue  = 'English vs. Other'
           c1       = 'Catch-up'
           c2       = 'Mainstream'
           c3       = 'Elite';

     format course course2          crsfmt.
            passed                  ynfmt.
            nation1 nation2 ethnic  nfmt.;

/*
proc freq;
     title2 'Check outcome';
     tables outcome*passed / norow nocol nopercent missing;
     tables (course course2) * outcome / nocol nopercent chisq;
*/