/********************* cars2.sas ***************************/
title 'Regression on Metric Cars Data';

/* Import the Excel workbook straight into the data set "cars";
   spreadsheet column names become the SAS variable names. */
proc import
     datafile = "/home/brunner0/441s20/mcars4.xlsx"
     dbms     = xlsx
     out      = cars
     replace;
     getnames = yes;
run;
/* Input data file is mcars4.xlsx
   Output data set is called cars
   dbms=xlsx      The input file is an Excel spreadsheet.
                  Necessary to read an Excel spreadsheet directly under unix/linux
                  Works in PC environment too except for Excel 4.0 spreadsheets
                  If there are multiple sheets, use sheet="sheet1" or something.
   replace	      If the data set cars already exists, replace it.  
   getnames=yes   Use column names as variable names.                  */

/* List the freshly imported data. The data set is named explicitly so this
   step does not depend on which data set happened to be created last. */
proc print data=cars;
     title2 'Look at input data set';
run;

/* Build the analysis data set "auto" from "cars": add miles per gallon,
   a friendlier country variable, indicator dummy variables for country,
   and country-by-weight and country-by-length product terms. */
data auto;
     set cars;
     /* (km per litre) * 0.6214 miles per km / 0.2642 gallons per litre */
     mpg = 100/lper100k * 0.6214/0.2642;
     Country = Cntry; /* I just like the spelling more */
     label Country  = 'Location of Head Office'
           lper100k = 'Litres per 100 kilometers'
           mpg      = 'Miles per Gallon'
           weight   = 'Weight in kg'
           length   = 'Length in meters';
/* Indicator dummy vars: Ref category is Japanese.
   Coding with a plain "if ... then 1; else 0" is safe only when Country is
   never missing, because a missing Country would silently be coded 0 and
   land in the reference category. Testing for missing first makes the
   dummy variables missing too, so such a case drops out of the regressions
   instead of being misclassified.
   Note that a blank space is the missing value code for character variables,
   while a period is missing for numeric variables.  */
     if country = ' ' then c1 = .;
        else if country = 'US' then c1=1;
        else c1=0;
     if country = ' ' then c2 = .;
        else if country = 'Europ' then c2=1;
        else c2=0;
     label c1 = 'US = 1'
           c2 = 'Europe = 1';
     /* Interaction Terms (missing whenever the dummy variable is missing) */
     cw1 = c1*weight; cw2 = c2*weight;
     cL1 = c1*length; cL2 = c2*length;
run;

/* Cross-tabulate each dummy variable against country to verify the coding.
   Only cell counts are shown: row, column and overall percentages are
   suppressed. The data set is named explicitly rather than relying on the
   most recently created one. */
proc freq data=auto;
     title2 'Check dummy variables';
     tables (c1 c2)*country / norow nocol nopercent;
run;

/* First an analysis with country only. */

/* Questions for every significance test:
     * What is E(y|x) for the model SAS is using? 
     * Give the null hypothesis in symbols.
     * Do you reject H0 at alpha = 0.05? Answer Yes or No.
     * In plain, non-statistical language, what do you conclude? */

     
/* Descriptive statistics for fuel consumption within each country.
   The data set is named explicitly rather than relying on the most
   recently created one. */
proc means data=auto;
     title2 'Litres per 100 k Broken Down by Country';
     class Country;
     var lper100k;
run;

/* Regress fuel consumption on the two country dummy variables only.
   The data set is named explicitly rather than relying on the most
   recently created one. */
proc reg data=auto plots = none; /* Suppress diagnostic plots for now*/
     title2 'Regression with Just Country';
     model lper100k = c1 c2;
     /* Equal coefficients for c1 and c2: US versus Europe */
     USvsEURO: test c1=c2;
run;

/* Same one-factor analysis with proc glm, letting the class statement
   build the dummy variables. The data set is named explicitly rather
   than relying on the most recently created one. */
proc glm data=auto;
     title2 'Compare Oneway with proc glm';
     class country;
     model lper100k = country;
run;

/* Fuel consumption on the country dummies plus weight and length,
   followed by three tests on the coefficients of that model. */
proc reg data=auto plots=none;
     title2 'Country, Weight and Length';
     model lper100k = c1 c2 weight length;

     /* Country controlling for wgt, length */
     country: test c1 = c2 = 0;

     /* US vs. Europe controlling for wgt, length */
     USvsEURO: test c1=c2;

     /* wgt, length controlling for Country */
     wgt_len: test weight=length=0;
run;

/* Proportions of remaining variation, using a = sF/(n-p+sF) */

proc iml;
     title2 'Proportion of remaining variation';

     /* a = sF/(n-p+sF), wrapped in a function module so the formula
        is written only once. */
     start PropRemaining(fstat, numtested, nobs, nbetas);
          return( numtested*fstat/(nobs-nbetas + numtested*fstat) );
     finish PropRemaining;

     /* Sample size, number of regression coefficients, and number of
        coefficients tested; the same values serve both tests. */
     n = 100; p = 5; s = 2;

     print "Country controlling for Weight and Length";
     F = 6.90;
     a = PropRemaining(F, s, n, p);
     print a;

     print "Weight and Length controlling for Country";
     F = 115.16;
     a = PropRemaining(F, s, n, p);
     print a;
quit;

/* Same model as the regression with country, weight and length, but with
   country as a class variable so that least-squares means are available. */
proc glm plots=none data=auto;
     title2 'Country, weight and length with proc glm';
     class country;
     model lper100k = weight length country;
     /* Pairwise differences among country means (t statistics and
        p-values), Bonferroni-adjusted */
     lsmeans country / adjust=bon pdiff tdiff;
run;

/* Add the country-by-weight and country-by-length product terms.
   The data set is named explicitly: several steps have run since "auto"
   was created, so relying on the most recently created data set is fragile. */
proc reg data=auto plots = none;
     title2 'Country, Weight and Length with Interactions';
     model lper100k = c1 c2 weight length cw1 cw2 cL1 cL2;
     country: test c1 = c2 = 0;     /* Is it really still country?  */
     /* All four product terms at once */
     Interactions: test cw1 = cw2 = cL1 = cL2 = 0;
run;

/* Centering an explanatory variable by subtracting off the mean affects the
intercept, but not the relationships among variables. I want to create a new 
data set with weight and length centered, and to avoid confusion
I will make sure the variables are nicely labelled. */

/* Write a copy of "auto" called "cntrd" in which weight and length
   have been shifted to have mean zero. */
proc standard data=auto out=cntrd mean=0;
     var weight length;
run;

/* In the new data set "cntrd," weight and length are adjusted to have mean
zero (the sample means have been subtracted from each observation). If I had
said mean=0 std=1, they would have been converted to z-scores.  All the other
variables (including the product terms) are as they were before, and the
labels are the same as before too. */

/* Rebuild the product terms from the centered weight and length. The copies
   carried over from "auto" were computed from the uncentered values. */
data centered;
     set cntrd;  /* Now centered has everything in cntrd */
    /* Re-create Interaction Terms and re-label explanatory vars*/
     cw1 = c1*weight; cw2 = c2*weight;
     cL1 = c1*length; cL2 = c2*length;
     /* Centering subtracts the mean but leaves the units alone, so length
        keeps the unit given in its original label (meters). */
     label weight = 'Weight in kg (Centered)'
           length = 'Length in meters (Centered)';
run;

/* By default, SAS procedures use the most recently created data set,
   but specify it anyway. */

/* Interaction model again, this time on the centered data. The "simple"
   option adds descriptive statistics for the variables in the model. */
proc reg data=centered simple plots=none;
     title2 'Weight and length are now centered: Mean=0';
     model lper100k = c1 c2 weight length cw1 cw2 cL1 cL2;

     /* Does this make better sense? */
     country: test c1 = c2 = 0;

     Interactions: test cw1 = cw2 = cL1 = cL2 = 0;
run;

quit;