/********************* cars2.sas ***************************/
/* Regression on the metric cars data.
   Reads the spreadsheet, builds indicator (dummy) variables and
   product terms for Country, then fits a sequence of regression
   models for fuel consumption (lper100k), ending with a model in
   which weight and length are centered at their sample means.
   Every step names its input data set and is closed explicitly
   with run; (and quit; for the interactive procedures). */

title 'Regression on Metric Cars Data';

/* Read data directly from Excel spreadsheet */
/* Input data file is mcars4.xlsx
   Output data set is called cars
   dbms=xlsx     The input file is an Excel spreadsheet.
                 Necessary to read an Excel spreadsheet directly
                 under unix/linux. Works in PC environment too,
                 except for Excel 4.0 spreadsheets.
                 If there are multiple sheets, use sheet="sheet1"
                 or something.
   replace       If the data set cars already exists, replace it.
   getnames=yes  Use column names as variable names. */
proc import datafile="/home/brunner0/441s20/mcars4.xlsx"
            out=cars dbms=xlsx replace;
    getnames=yes;
run;

proc print data=cars;
    title2 'Look at input data set';
run;

data auto;
    set cars;

    /* Litres per 100 km -> miles per (US) gallon:
       100/lper100k is km per litre; 0.6214 miles per km;
       0.2642 gallons per litre. */
    mpg = 100/lper100k * 0.6214/0.2642;

    Country = Cntry; /* I just like the spelling more */

    label Country  = 'Location of Head Office'
          lper100k = 'Litres per 100 kilometers'
          mpg      = 'Miles per Gallon'
          weight   = 'Weight in kg'
          length   = 'Length in meters';

    /* Indicator dummy vars: Ref category is Japanese.
       A missing Country yields missing dummy variables instead of
       being silently assigned to the reference category. With no
       missing values of Country, the result is the same as the
       simple "if country = 'US' then c1=1; else c1=0;" form.
       Note that a blank space is the missing value code for
       character variables, while a period is missing for numeric
       variables. */
    if country = ' ' then c1 = .;
    else if country = 'US' then c1 = 1;
    else c1 = 0;

    if country = ' ' then c2 = .;
    else if country = 'Europ' then c2 = 1;
    else c2 = 0;

    label c1 = 'US = 1'
          c2 = 'Europe = 1';

    /* Interaction Terms */
    cw1 = c1*weight;
    cw2 = c2*weight;
    cL1 = c1*length;
    cL2 = c2*length;
run;

proc freq data=auto;
    title2 'Check dummy variables';
    tables (c1 c2)*country / norow nocol nopercent;
run;

/* First an analysis with country only. */

/* Questions for every significance test:
   * What is E(y|x) for the model SAS is using?
   * Give the null hypothesis in symbols.
   * Do you reject H0 at alpha = 0.05? Answer Yes or No.
   * In plain, non-statistical language, what do you conclude? */

proc means data=auto;
    title2 'Litres per 100 k Broken Down by Country';
    class Country;
    var lper100k;
run;

proc reg data=auto plots=none; /* Suppress diagnostic plots for now */
    title2 'Regression with Just Country';
    model lper100k = c1 c2;
    USvsEURO: test c1=c2;
run;
quit;

proc glm data=auto;
    title2 'Compare Oneway with proc glm';
    class country;
    model lper100k = country;
run;
quit;

proc reg data=auto plots=none;
    title2 'Country, Weight and Length';
    model lper100k = c1 c2 weight length;
    country:  test c1 = c2 = 0;      /* Country controlling for wgt, length */
    USvsEURO: test c1=c2;            /* US vs. Europe controlling for wgt, length */
    wgt_len:  test weight=length=0;  /* wgt, length controlling for Country */
run;
quit;

/* Proportions of remaining variation, using a = sF/(n-p+sF).
   p = 5 counts the intercept plus c1, c2, weight and length;
   s = 2 is the number of equalities in each test.
   NOTE(review): n and the two F values are typed in by hand,
   presumably from the data and the preceding proc reg output;
   re-check them if the data change. */
proc iml;
    title2 'Proportion of remaining variation';
    n = 100; p = 5; s = 2;

    print "Country controlling for Weight and Length";
    F = 6.90;
    a = s*F/(n-p + s*F);
    print a;

    print "Weight and Length controlling for Country";
    F = 115.16;
    a = s*F/(n-p + s*F);
    print a;
quit;

proc glm data=auto plots=none;
    title2 'Country, weight and length with proc glm';
    class country;
    model lper100k = weight length country;
    lsmeans country / pdiff tdiff adjust = bon;
run;
quit;

proc reg data=auto plots=none;
    title2 'Country, Weight and Length with Interactions';
    model lper100k = c1 c2 weight length cw1 cw2 cL1 cL2;
    country:      test c1 = c2 = 0; /* Is it really still country? */
    Interactions: test cw1 = cw2 = cL1 = cL2 = 0;
run;
quit;

/* Centering an explanatory variable by subtracting off the mean
   affects the intercept, but not the relationships among variables.
   I want to create a new data set with weight and length centered,
   and to avoid confusion I will make sure the variables are nicely
   labelled. */

proc standard mean=0 data=auto out=cntrd;
    var weight length;
run;

/* In the new data set "cntrd," weight and length are adjusted to
   have mean zero (the sample means have been subtracted from each
   observation). If I had said mean=0 std=1, they would have been
   converted to z-scores. All the other variables (including the
   product terms) are as they were before, and the labels are the
   same as before too. */

data centered;
    set cntrd; /* Now centered has everything in cntrd */

    /* Re-create Interaction Terms and re-label explanatory vars */
    cw1 = c1*weight;
    cw2 = c2*weight;
    cL1 = c1*length;
    cL2 = c2*length;

    /* Centering only shifts the values, so the units are unchanged:
       length stays in meters, matching its label in data set auto. */
    label weight = 'Weight in kg (Centered)'
          length = 'Length in meters (Centered)';
run;

/* By default, SAS procedures use the most recently created
   data set, but specify it anyway. */
proc reg plots=none simple data=centered;
    title2 'Weight and length are now centered: Mean=0';
    model lper100k = c1 c2 weight length cw1 cw2 cL1 cL2;
    country:      test c1 = c2 = 0; /* Does this make better sense? */
    Interactions: test cw1 = cw2 = cL1 = cL2 = 0;
run;
quit;