# readmath2.txt: Read and clean math replication data
# Result is data frames: test, test2 and replic
#
#   test   - the replication sample as read, with column names applied and
#            the six score columns (precalc ... mark) converted to numeric
#   test2  - cleaned copy of test plus the derived variables nation, diagtest
#   replic - just the variables used in model3
#
# The data frame is never attach()ed: every column is referenced explicitly
# as test$..., and all temporaries live inside local(), so after this script
# a bare name such as gpa cannot silently resolve to an uncleaned column.

# rm(list=ls())   # Left disabled: would wipe the caller's workspace
options(scipen = 999)

# ---- Helpers (removed again at the end of the script) ----------------------

# Convert a column in which "." marks a missing value to numeric.
# The "." codes are mapped to NA first, so any coercion warning that still
# appears points at a genuinely unexpected value rather than at a known code.
dotted_to_numeric <- function(x) {
  x <- as.character(x) # Also covers factor input
  x[x %in% "."] <- NA
  as.numeric(x)
}

# Replace every element of x that equals one of `codes` with NA.
# %in% never yields NA, so values that are already missing are left alone.
codes_to_na <- function(x, codes) {
  x[x %in% codes] <- NA
  x
}

# Labels for the six nationality categories, in category order 1..6.
nation_labels <- c("Asian", "Eastern European", "European not Eastern",
                   "Middle-Eastern and Pakistani", "East Indian",
                   "Other and DK")

# Collapse a rater's raw nationality code into the six categories above:
#   1-5 -> 1, 6 -> 2, 7-13 -> 3, 14-15 -> 4, 16 -> 5, anything else
#   (including "." / missing) -> 6 = Other.
# levels = 1:6 is given explicitly so the six labels always line up, even if
# some category does not occur in the data.
recode_nation <- function(rating) {
  code <- dotted_to_numeric(rating)
  group <- rep(6L, length(code)) # Default is 6 = Other
  group[code %in% 1:5] <- 1L
  group[code %in% 6] <- 2L
  group[code %in% 7:13] <- 3L
  group[code %in% 14:15] <- 4L
  group[code %in% 16] <- 5L
  factor(group, levels = 1:6, labels = nation_labels)
}

# ---- Read the replication sample (the "test" data) --------------------------

test <- read.table("http://www.utstat.toronto.edu/~brunner/data/legal/replicmath.data.txt")
# test <- read.table("replicmath.data.txt") # Local version
colnames(test) <- c("id", "course", "precalc", "calc", "gpa", "calculus",
                    "english", "mark", "lang", "sex", "nation1", "nation2",
                    "sample")

# Variables 3 through 8 are numeric; this also makes . = NA
score_cols <- c("precalc", "calc", "gpa", "calculus", "english", "mark")
test[score_cols] <- lapply(test[score_cols], dotted_to_numeric)

# ---- Data cleaning -----------------------------------------------------------
# Variables needing attention are:
#   course
#   gpa      : Min is zero
#   calculus : Min is zero, Max is 999
#   english  : Min is zero
#   mark     : Min is zero, Max is 999
#   lang
#   sex

test2 <- local({
  # course: "." and "4" become missing; 1/2/3 get descriptive names.
  course <- codes_to_na(as.character(test$course), c(".", "4"))
  course[course %in% "1"] <- "Catch-up"
  course[course %in% "2"] <- "Mainstream"
  course[course %in% "3"] <- "Elite"

  # lang: "." is missing, and French is folded into Other.
  lang <- codes_to_na(as.character(test$lang), ".")
  lang[lang %in% "French"] <- "Other"

  # Nationality ratings. This includes a recode of nation1 and nation2.
  # It's not necessary for the exploratory data, which are already recoded.
  nation1 <- recode_nation(test$nation1)
  nation2 <- recode_nation(test$nation2)

  # nation follows rater two, except where rater one said
  # Middle-Eastern and Pakistani: use rater one's opinion in that case.
  nation <- nation2
  rater1_me <- nation1 %in% "Middle-Eastern and Pakistani"
  nation[rater1_me] <- nation1[rater1_me]

  data.frame(
    id       = seq_len(nrow(test)), # Renumbered, because ids start at 580
    course   = factor(course),
    precalc  = test$precalc,
    calc     = test$calc,
    gpa      = codes_to_na(test$gpa, 0),
    calculus = codes_to_na(test$calculus, c(0, 999)),
    english  = codes_to_na(test$english, 0),
    mark     = codes_to_na(test$mark, c(0, 998, 999)),
    lang     = factor(lang),
    sex      = factor(codes_to_na(test$sex, ".")),
    nation1  = nation1,
    nation2  = nation2,
    sample   = test$sample,
    nation   = nation,
    diagtest = test$precalc + test$calc # Diagnostic test score
  )
})

# Data frame replic has just the vars used in model3
replic <- test2[c("mark", "diagtest", "gpa", "calculus", "english", "lang")]

# Clean up, leaving only data frames test, test2 and replic
rm(dotted_to_numeric, codes_to_na, recode_nation, nation_labels, score_cols)