/********************* cars2.sas ***************************/
title 'Regression on the Metric Cars Data';

/* Read data directly from Excel spreadsheet */
proc import datafile="/home/u1407221/441s24/data/mcars4.xlsx" 
            out=cars dbms=xlsx replace;
            getnames=yes;
/* Input data file is mcars4.xlsx
   Ouput data set is called cars
   dbms=xlsx      The input file is an Excel spreadsheet.
                  Necessary to read an Excel spreadsheet directly under unix/linux
                  Works in PC environment too except for Excel 4.0 spreadsheets
                  If there are multiple sheets, use sheet="sheet1" or something.
   replace	      If the data set cars already exists, replace it.  
   getnames=yes   Use column names as variable names.                  */

proc print  data=cars(obs=10);
     title2 'Look at first 10 lines of input data set';

proc contents;
     title2 'Contents of the default data set';

data auto;
     set cars;
     mpg = 100/lper100k * 0.6214/0.2642;
     Country = Cntry; /* I just like the spelling more */
     label Country  = 'Location of Head Office'
           lper100k = 'Litres per 100 kilometers'
           mpg      = 'Miles per Gallon'
           weight   = 'Weight in kg'
           length   = 'Length in meters';
/* Indicator dummy vars: Ref category is Japanese */
     if country = 'US' then c1=1;  else c1=0;
     if country = 'Europ' then c2=1;  else c2=0;
     if country = 'Japan' then c3=1;  else c3=0;
     label c1 = 'US = 1' 
           c2 = 'Europe = 1'
           c3 = 'Japan';
     /* Interaction Terms */
     cw1 = c1*weight; cw2 = c2*weight; cw3 = c3*weight;
	 cL1 = c1*length; cL2 = c2*length; cL3 = c3*length;	 

/* This way of creating dummy variables is safe only because 
   Country is never missing. If it could be missing, better is
     if country = ' ' then c1 = .;
        else if country = 'US' then c1=1;  
        else c1=0;
     if country =  ' ' then c2 = .;
        else if country = 'Europ' then c2=1;  
        else c2=0;
     if country =  ' ' then c3 = .;
        else if country = 'Japan' then c3=1;  
        else c3=0;
Note that a blank space is the missing value code for character variables, 
while a period is missing for numeric variables.  */

proc freq;
     title2 'Check dummy variables';
     tables (c1 c2 c3)*country / norow nocol nopercent;

proc means;
     title2 'Means of quantitative variables';
     var weight length lper100k mpg;


/* First an analysis with country only. */

/* Questions for every significance test:
     * What is E(y|x) for the model SAS is using? 
     * Give the null hypothesis in symbols.
     * Do you reject H0 at alpha = 0.05? Answer Yes or No.
     * In plain, non-statistical language, what do you conclude? */

     
proc means;
     title2 'Litres per 100 k Broken Down by Country';
     class Country;
     var lper100k;

proc reg plots=none; /* Suppress diagnostic plots for now*/
     title2 'Regression with Just Country';
     model lper100k = c1 c2;
     USvsEURO: test c1=c2; 

proc glm;
     title2 'Compare Oneway with proc glm';
     class country;
     model lper100k = country;

proc reg simple plots=none data=auto; /* simple gives descriptive stats */
     title2 'Country, Weight and Length';
     model lper100k = c1 c2 weight length;
     country: test c1 = c2 = 0;     /* Country controlling for wgt, length */
     USvsEURO: test c1=c2;          /* US vs. Europe controlling for wgt, length */
     wgt_len: test weight=length=0; /* wgt, length controlling for Country */

/* Proportions of remaining variation, using a = sF/(n-p+sF) */

proc iml;
     title2 'Proportion of remaining variation';
     print "Country controlling for Weight and Length";
     n = 100; p = 5; s = 2;
     F = 6.90;  a = s*F/(n-p + s*F);
     print a;
     
     print "Weight and Length controlling for Country";
     F = 115.16;  a = s*F/(n-p + s*F);
     print a;

proc glm data=auto plots=none;
     title2 'Country, weight and length with proc glm';
     class country;
     model lper100k = weight length country;
     lsmeans country / pdiff tdiff adjust = bon;

/* Reproduce Bonferroni p-values from proc reg output */
proc iml;
     title2 "Reproduce Bonferroni p-values from proc reg output";
     USvsJap = 0.0010; EURvsJap = 0.4448; USvsEUR = 0.0113;
     print "Uncorrected" USvsJap EURvsJap USvsEUR;
     BonUSvsJap = 3*USvsJap; BonEURvsJap = 3*EURvsJap; BonUSvsEUR = 3*USvsEUR;
     print "Bonferroni " BonUSvsJap BonEURvsJap BonUSvsEUR;

/* Reproduce LS means from proc reg output
   Hold weight and length fixed at sample mean values. */
proc iml;
     title2 "Reproduce LS means from proc reg output";
/*         yhat = b0 + b1*c1 + b2*c2 + b3*xbar1      + b4*xbar2 */
     EuropLSM = -5.28270 - 0.50652 + 0.00546*1413.21 + 2.34597*4.8492;
     JapanLSM = -5.28270           + 0.00546*1413.21 + 2.34597*4.8492;
     US_LSM   = -5.28270  -1.99424 + 0.00546*1413.21 + 2.34597*4.8492;
     print EuropLSM JapanLSM US_LSM;
run;
quit; /* Try quitting iml. Needed for ods below. */

/* Reproduce LS means a more sophisticated way. */
     
/*   
ods trace on;
proc reg simple data=auto plots=none; 
     model lper100k = c1 c2 weight length;
run;
ods trace off;
*/


options replace=yes;
ods output SimpleStatistics=simplestats;
ods output ParameterEstimates=parest;
proc reg simple data=auto plots=none; 
     model lper100k = c1 c2 weight length;
run; 

proc print data=simplestats;
     title2 "Simple Statistics";
proc print data=parest;
     title2 "Parameter Estimates";

proc iml;
     title2 "Least squares means";
     use simplestats;
     read point 4 var {Mean} into xbar1;
     read point 5 var {Mean} into xbar2;
     close simplestats; 
     use parest;
     read all var {Estimate} into b; 
     close parest;
     /* Calculate y-hat values: U.S. first */
     /* Order is   Intercept c1 c2 weight length */
     x =                  {1  0  0   0       0}; /* Row vector */
     x[2] = 1; x[4] = xbar1; x[5] = xbar2;
     US = x * b; 
     /* Set dummy variables for Europe */
     x[2] = 0; x[3] = 1; 
     Europe = x * b;
     /* Now Japan */
     x[3]=0;
     Japan = x * b;
     print US Europe Japan;

proc iml;
     title2 "Repeat LS means from earlier";
/*         yhat = b0 + b1*c1 + b2*c2 + b3*xbar1      + b4*xbar2 */
     EuropLSM = -5.28270 - 0.50652 + 0.00546*1413.21 + 2.34597*4.8492;
     JapanLSM = -5.28270           + 0.00546*1413.21 + 2.34597*4.8492;
     US_LSM   = -5.28270  -1.99424 + 0.00546*1413.21 + 2.34597*4.8492;
     print US_LSM EuropLSM JapanLSM ;
run; quit;

proc reg data=auto plots = none; 
     title2 'Country, Weight and Length with Interactions';
     model lper100k = c1 c2 weight length cw1 cw2 cL1 cL2;
     country: test c1 = c2 = 0;     /* Is it really still country?  */
     Interactions: test cw1 = cw2 = cL1 = cL2 = 0;

/* Centering an explanatory variable by subtracting off the mean affects the
intercept, but not the relationships among variables. I want to create a new 
data set with weight and length centered. It's not an issue for these data,
but it's important to sbtract off the mean of the cases used in the regression.
Also, to avoid confusion I will make sure the centered variables are nicely 
labelled. */

data auto2;
     set auto;
     utility = lper100k + c1 + c2 + weight + length;
     if utility = . then delete;
     drop utility;

proc standard mean=0 data=auto2 out=cntrd;
     var weight length;

/* In the new data set "cntrd," weight and length are adjusted to have mean
zero (the sample means have been subtracted from each observation). If I had
said mean=0 std=1, they would have been converted to z-scores.  All the other
variables (including the product terms) are as they were before, and the
labels are the same as before too. */

data centered; 
     set cntrd;  /* Now centered has everything in cntrd */
    /* Re-create Interaction Terms and re-label explanatory vars*/
     cw1 = c1*weight; cw2 = c2*weight;
     cL1 = c1*length; cL2 = c2*length;
     label weight = 'Weight in kg (Centered)'
           length = 'Length in cm (Centered)';

/* By default, SAS procedures use the most recently created data set,
   but specify it anyway. */

proc reg plots=none simple data=centered;
     title2 'Weight and length are now centered: Mean=0';
     model lper100k = c1 c2 weight length cw1 cw2 cL1 cL2;
     country: test c1 = c2 = 0;     /* Does this make better sense? */
     Interactions: test cw1 = cw2 = cL1 = cL2 = 0;

proc sgplot data=centered;
     title2 'Look at the regression lines';
     reg x=weight y=lper100k / group = country;

proc reg plots=none data=centered;
     title2 'Drop Length: Weight is centered';
     model lper100k = c1 c2 weight cw1 cw2;
     Interactions: test cw1 = cw2 = 0;
     USvsEurSlope: test cw1 = cw2;

proc reg plots=none data=auto;
     title2 'Cell means coding with interactions. Weight is uncentered.';
     title3 'Compare F = 4.36 for interaction';
     model lper100k = c1 c2 c3 cw1 cw2 cw3 / noint ;
     Interactions:  test cw1 = cw2 = cw3;
     USvsEurSlope:  test cw1 = cw2;
     USvsJapSlope:  test cw1 = cw3;
     EurvsJapSlope: test cw2 = cw3;

quit;