STA429/1007 F 2004 Handout 2: The Math Data
appsrv01.srv> ls *.dat
diag.dat english.dat final.dat gender.dat hs.dat rater1.dat rater2.dat
appsrv01.srv> ls math*.sas
mathclean1.sas mathclean3.sas mathdescribe2.sas mathclean2.sas mathdescribe1.sas mathread.sas

/* mathread.sas : Read and merge Math performance data */

options linesize=79 noovp formdlim='_';
title 'Gender, Ethnicity and Math performance';

/* Both librefs point at the same directory:
     math    - permanent SAS data sets and formats are stored here
     library - SAS will search for permanently stored formats ONLY
               in a place called "library."                          */
libname math 'mathlib';
libname library 'mathlib';

/* Value formats, written to the math library so they are saved permanently */
proc format library=math;
    value rwfmt
        0 = 'Wrong'
        1 = 'Right';
    value crsfmt
        4 = 'No Resp';
    value langfmt
        1 = 'English'
        2 = 'French'
        3 = 'Other';
    value sexfmt
        0 = 'Male'
        1 = 'Female';
    value ynfmt
        0 = 'No'
        1 = 'Yes';
    value natfmt
         1 = 'Chinese'
         2 = 'Japanese'
         3 = 'Korean'
         4 = 'Vietnamese'
         5 = 'Other Asian'
         6 = 'Eastern European'
         7 = 'Hispanic'
         8 = 'English-speaking'
         9 = 'French'
        10 = 'Italian'
        11 = 'Greek'
        12 = 'Germanic'
        13 = 'Other European'
        14 = 'Middle-Eastern'
        15 = 'Pakistani'
        16 = 'East Indian'
        17 = 'Sub-Saharan'
        18 = 'OTHER'
        ;

/* Diagnostic test: student number, course code and 20 item scores,
   all read from fixed columns of diag.dat. */
data dtest;
    /* missover causes blanks to be missing, even at the end of a line */
    infile 'diag.dat' missover;
    input student $ 1-9
          course1 13
          q1  17  q2  18  q3  19  q4  20
          q5  21  q6  22  q7  23  q8  24  q9  25
          q10 26  q11 27  q12 28  q13 29  q14 30
          q15 31  q16 32  q17 33  q18 34  q19 35  q20 36;

    /* Subscale totals over consecutive runs of items */
    precalc1 = sum(of q1-q4);
    precalc2 = sum(of q5-q9);
    calcone  = sum(of q10-q14);
    calctwo  = sum(of q15-q20);

    /* Section totals and the overall score */
    precalc  = precalc1 + precalc2;
    calc     = calcone + calctwo;
    totscore = precalc+calc;

    /* Marks every record as coming from the dtest data set */
    indtest = 1;

    label student  = 'Student number';
    label precalc1 = 'Precalculus 1 (bc1) subscale'
          precalc2 = 'Precalculus 2 (bc2) subscale'
          precalc  = 'Number precalculus correct';
    label calcone  = 'Calculus 1 (c1) subscale'
          calctwo  = 'Calculus 2 (c2) subscale'
          calc     = 'Number calculus correct';
    label totscore = 'Total # right on diagnostic test'
          indtest  = 'In dtest data set';

    /* The format name for q1-q20 follows on the next line */
    format q1-q20
rwfmt.;   /* completes the FORMAT q1-q20 statement begun on the preceding line */
    format course1 crsfmt.;
    format indtest ynfmt.;

/*************************************************************************/

/* hs.dat: student number followed by six numeric fields (list input). */
data hs;
    infile 'hs.dat';
    /* Default length of string variable is 8 chars, so $9. is vital */
    input student $9. hsgpa fin_mat alg_geo rel_fun mathema calculus;
    inhs = 1;                      /* marks records from this file */
    label inhs = 'In high school data set';
    format inhs ynfmt.;

/*************************************************************************/

/* HS English is in a separate file */
data anglo;
    infile 'english.dat';
    input student $9. gpa2 eng $ english;
    ineng = 1;                     /* marks records from this file */
    label ineng = 'In High School English data set';
    format ineng ynfmt.;

/*************************************************************************/

/* final.dat: student number, final mark and course code. */
data final;
    infile 'final.dat';
    /* For mark, 998=SDF and 998=WDR */
    /* NOTE(review): both special codes above are given as 998; one of them
       is presumably a different value - confirm against the data. */
    input student $9. mark course2;
    infinal = 1;                   /* marks records from this file */
    /* The label belongs on infinal, the indicator created here; it was
       previously attached to havmark, which this step never creates. */
    label infinal = 'In final data set';
    format course2 crsfmt.;
    format infinal ynfmt.;

/*************************************************************************/

/* gender.dat: student number, mother tongue code and sex code. */
data gender;
    infile 'gender.dat';
    input student $9. lang sex;
    label lang = 'Mother Tongue';
    ingender = 1;                  /* marks records from this file */
    label ingender = 'In gender data set';
    format lang langfmt.;
    format sex sexfmt.;
    format ingender ynfmt.;

/*************************************************************************/

/* rater1.dat: student number and the first rater's nationality code. */
data rater1;
    infile 'rater1.dat';
    input student $9. nation1;
    inrat1 = 1;                    /* marks records from this file */
    label nation1 = 'Nationality of name acc to rater1';
    /* Label and format the indicator like the other in-data-set flags */
    label inrat1 = 'In rater1 data set';
    format nation1 natfmt.;
    format inrat1 ynfmt.;

/*************************************************************************/

/* rater2.dat: student number and the second rater's nationality code. */
data rater2;
    infile 'rater2.dat';
    input student $9.   /* INPUT statement continues on the next line */
nation2;   /* completes the INPUT statement begun on the preceding line */
    inrat2 = 1;                    /* marks records from this file */
    label nation2 = 'Nationality of name acc to rater2';
    /* Label and format the indicator like the other in-data-set flags */
    label inrat2 = 'In rater2 data set';
    /* Format nation2, the variable this step reads.  Naming nation1 here
       left nation2 unformatted and can introduce a nation1 variable into
       rater2 that collides with rater1's nation1 in the merge below. */
    format nation2 natfmt.;
    format inrat2 ynfmt.;

/*************************************************************************/

/* All data sets must be sorted by id to match */
proc sort data=dtest;  by student;
proc sort data=hs;     by student;
proc sort data=anglo;  by student;
proc sort data=final;  by student;
proc sort data=gender; by student;
proc sort data=rater1; by student;
proc sort data=rater2; by student;

/* Match-merge the seven sorted data sets on student number. */
data together;
    merge dtest hs anglo final gender rater1 rater2;
    by student;

    /* Change missing to no for in_data_set variables */
    if indtest = . then indtest=0;
    if inhs = . then inhs=0;
    if ineng = . then ineng = 0;
    if infinal = . then infinal=0;
    if ingender = . then ingender=0;
    if inrat1 = . then inrat1=0;
    if inrat2 = . then inrat2=0;

    /* Product of 0/1 indicators: 1 only when every factor is 1.
       NOTE(review): ineng is not one of the factors, although numberin
       below counts all seven files - confirm this is intentional. */
    inall = indtest*inhs*infinal*ingender*inrat1*inrat2;
    label inall = 'In all data sets';
    format inall ynfmt.;

    /* The following variables were created after looking at the data and at
       summary stats, but not at the CONNECTION between IVs and DVs. Never
       modify existing variables. Make new variables and keep the original
       unmodified ones. Here are the new variables, in order:
       numberin passed grade course gpa finmat alggeo relfun hscalc havcalc
       tongue */

    numberin = indtest+inhs+ineng+infinal+ingender+inrat1+inrat2;
    label numberin = 'Number of data files (7) in which student appears';

    /* Any mark outside 50-100, including a missing mark, gives passed=0 */
    if (50<=mark<=100) then passed=1;
    else passed=0;
    label passed = 'Passed the course';
    format passed ynfmt.;

    /* Marks of 0 or above 100 are set to missing; otherwise copy mark */
    if mark=0 then grade=.;
    else if mark > 100 then grade=.;
    else grade=mark;
    label grade = 'Final mark (if any)';

    /* Combine the two course codes; code 4 when both are missing */
    if (course1=3 and course2=2) then course=2;
    else if (course1=. and course2=.) then course=4;
    else if (course1=. and course2 ne .) then course=course2;
    else course=course1;
    format course crsfmt.;

    /* Keep hsgpa only when it lies in 65-100 */
    if 65 le hsgpa le 100 then gpa = hsgpa; /* Else missing is automatic */
    label gpa = 'High School GPA';

    if 0