# readmath2.txt: Read and clean math replication data
# Result is data frames: test, test2 and replic

# rm(list=ls())
# A large scipen penalty biases printing toward fixed rather than
# scientific notation; the option stays set after this script finishes.
options(scipen = 999)
# Read the replication sample (the "test" data) from the course web site
test <- read.table(
  "http://www.utstat.toronto.edu/~brunner/data/legal/replicmath.data.txt"
)
# test <- read.table("replicmath.data.txt") # Local version
names(test) <- c(
  'id', 'course', 'precalc', 'calc', 'gpa', 'calculus', 'english', 'mark',
  'lang', 'sex', 'nation1', 'nation2', 'sample'
)
# Columns 3 through 8 (precalc, calc, gpa, calculus, english, mark) are numeric.
# Converting through as.character() turns a "." entry into NA; if a column
# was read as a factor it also converts the labels, not the internal codes.
for (j in 3:8) {
  test[[j]] <- as.numeric(as.character(test[[j]]))
}
# Data Cleaning
# Variables needing attention are:
    # course
    # gpa : Min is zero
    # calculus: Min is zero, Max is 999
    # english: Min is zero
    # mark:  Min is zero, Max is 999 (998 is also recoded to NA below)
    # lang
    # sex
# Fix up the variables outside the data frame to save typing.
# attach() puts the columns of test on the search path. Each assignment to a
# column name below creates a modified copy in the workspace; test itself is
# left as it was read.
attach(test)
# Row positions 1..n, used to index the recodes below (the ids in the file
# start at 580). seq_len() gives an empty index, not c(1, 0), for zero rows.
id <- seq_len(nrow(test))
# Fix up course
# Work on a character copy so the codes can be overwritten with labels.
course <- as.character(course)
# "." and code 4 are both set to missing
is.na(course) <- course %in% c('.', '4')
# Remaining codes get descriptive labels
course[course %in% '1'] <- 'Catch-up'
course[course %in% '2'] <- 'Mainstream'
course[course %in% '3'] <- 'Elite'
# factor() takes its levels from the sorted distinct labels that remain
course <- factor(course)
# In each step `bad` holds the row positions whose value is a missing-data
# code; comparisons that are NA are never selected.
# Fix up gpa: 0 becomes NA
bad <- which(gpa == 0)
gpa[bad] <- NA
# Fix up calculus: 0 and 999 become NA
bad <- which(calculus %in% c(0, 999))
calculus[bad] <- NA
# Fix up english: 0 becomes NA
bad <- which(english == 0)
english[bad] <- NA
# Fix up mark: 0 and 999 become NA, then 998 as well
bad <- which(mark %in% c(0, 999))
mark[bad] <- NA
bad <- which(mark == 998)
mark[bad] <- NA
# Fix up lang
lang <- as.character(lang) # Character copy, so values can be edited
# "." is missing
dot <- which(lang == '.')
lang[dot] <- NA
# 'French' is folded into 'Other'
franc <- which(lang == 'French')
lang[franc] <- 'Other'
lang <- factor(lang)
# Fix up sex: "." is missing; factor() then keeps only the values still present
dot <- which(sex == '.')
sex[dot] <- NA
sex <- factor(sex)
# Fix up nationality ratings. This includes a recode of nation1 and nation2.
# It's not necessary for the exploratory data, which are already recoded.
# Raw codes are collapsed into six groups:
#   1-5 -> 1, 6 -> 2, 7-13 -> 3, 14-15 -> 4, 16 -> 5,
#   anything else (including a missing ".") -> 6.

# First rating
nation1 <- as.character(nation1)
dot <- which(nation1 == '.')
nation1[dot] <- NA
n1 <- as.numeric(nation1)
cat1 <- rep(6, length(nation1)) # Default is 6 = Other
one <- which(n1 %in% 1:5)
cat1[one] <- 1
two <- which(n1 == 6)
cat1[two] <- 2
three <- which(n1 %in% 7:13)
cat1[three] <- 3
four <- which(n1 %in% 14:15)
cat1[four] <- 4
five <- which(n1 == 16)
cat1[five] <- 5
nation1 <- cat1

# Second rating, recoded the same way
nation2 <- as.character(nation2)
dot <- which(nation2 == '.')
nation2[dot] <- NA
n2 <- as.numeric(nation2)
cat2 <- rep(6, length(nation2)) # Default is 6 = Other
one <- which(n2 %in% 1:5)
cat2[one] <- 1
two <- which(n2 == 6)
cat2[two] <- 2
three <- which(n2 %in% 7:13)
cat2[three] <- 3
four <- which(n2 %in% 14:15)
cat2[four] <- 4
five <- which(n2 == 16)
cat2[five] <- 5
nation2 <- cat2

# Attach the group labels. levels = 1:6 is given explicitly so that the six
# labels always line up with codes 1..6; without it factor() takes the levels
# from the values present and stops with an "invalid 'labels'" error whenever
# one of the six groups does not occur in the data.
nation1 <- factor(
  nation1,
  levels = 1:6,
  labels = c('Asian', 'Eastern European', 'European not Eastern',
             'Middle-Eastern and Pakistani', 'East Indian', 'Other and DK')
)
nation2 <- factor(
  nation2,
  levels = 1:6,
  labels = c('Asian', 'Eastern European', 'European not Eastern',
             'Middle-Eastern and Pakistani', 'East Indian', 'Other and DK')
)
# Create nation and diagtest
# nation starts as a copy of nation2 and is overridden by nation1 wherever
# nation1 is 'Middle-Eastern and Pakistani'.
nation <- nation2
sub1 <- which(nation1 == 'Middle-Eastern and Pakistani')
nation[sub1] <- nation1[sub1] # Use rater one's opinion in this case
# Diagnostic test score is the sum of the two parts
diagtest <- precalc + calc
# test2 collects the cleaned copies plus the two derived variables; id here is
# the 1..n row index, not the id read from the file.
test2 <- data.frame(
  id = id, course = course, precalc = precalc, calc = calc, gpa = gpa,
  calculus = calculus, english = english, mark = mark, lang = lang,
  sex = sex, nation1 = nation1, nation2 = nation2, sample = sample,
  nation = nation, diagtest = diagtest
)
# Data frame replic has just the vars used in model3
replic <- data.frame(mark, diagtest, gpa, calculus, english, lang)
# Clean up, leaving only data frames test, test2 and replic
rm(bad, calculus, cat1, cat2, course, diagtest, dot, english, five, four,
   franc, gpa, id, j, lang, mark, n1, n2, nation, nation1, nation2, one, sex,
   sub1, three, two)
# Undo the attach(test) from the top of the cleaning section. Otherwise the
# uncleaned columns of test (gpa, mark, ... and one named sample) would stay
# visible by bare name after the working copies above are removed.
detach(test)