/********************* 2101f17senic1.sas ***************************/
/* Reads the Open SENIC hospital data, builds indicator (dummy)     */
/* variables for medical school affiliation and region, verifies    */
/* them, and then runs descriptive statistics, elementary tests and */
/* a sequence of regression models for infpercent.                  */
/*                                                                  */
/* Every step names its input data set explicitly (data=senic) and  */
/* is closed with run; (plus quit; for the interactive procedures   */
/* reg and glm), so no step depends on the most recently created    */
/* data set or on whatever code happens to follow it.               */

title 'Open SENIC Data';

/* I reluctantly changed NA to . with a text editor. */

data senic;
     infile '/folders/myfolders/2101f17/openSENIC.data.txt'
            firstobs=2;          /* Skip the first line that R uses */
     length region $ 12;         /* Otherwise names are truncated to 8 */
     input id region $ mdschl $ census nbeds nurses lngstay age
           xratio culratio infpercent;
     label region     = 'Region of U.S.A.'
           mdschl     = 'Medical school affiliation'
           census     = 'Aver # patients in hospital per day'
           nbeds      = 'Average # beds during study period'
           nurses     = 'Aver # nurses during study period'
           lngstay    = 'Av length of hospital stay, in days'
           age        = 'Average patient age'
           xratio     = '# x-rays / # no signs of pneumonia'
           culratio   = '# cultures / # no hosp acq infect'
           infpercent = 'Percent acquiring infection in hospital';

     /* Make dummy variables. Proc reg needs them, proc glm does not. */

     /* Medical school affiliation: any value other than 'No' or 'Yes'
        leaves mschool unassigned, i.e. missing. */
     if mdschl = 'No' then mschool = 0;
        else if mdschl = 'Yes' then mschool = 1;

     /* Region: one indicator per region. A blank region makes every
        indicator missing rather than 0. */
     if region = 'NorthCentral' then nc = 1;
        else if region = ' ' then nc = .;
        else nc = 0;
     if region = 'Northeast' then ne = 1;
        else if region = ' ' then ne = .;
        else ne = 0;
     if region = 'South' then s = 1;
        else if region = ' ' then s = .;
        else s = 0;
     if region = 'West' then w = 1;
        else if region = ' ' then w = .;
        else w = 0;
run;

/* First priority is to check the dummy variables. */

proc freq data=senic;
     tables mschool*mdschl / norow nocol nopercent missing;
run;

proc freq data=senic;
     tables (nc ne s w) * region / norow nocol nopercent missing;
run;

proc freq data=senic;
     title2 'Frequency distributions of categorical variables';
     tables region mdschl;
run;

proc means data=senic;
     title2 'Table of means and standard deviations';
     var census nbeds nurses lngstay age xratio culratio infpercent;
run;

proc univariate data=senic plot;
     title2 'Detailed descriptive statistics with boxplots and histograms';
     var census -- infpercent;   /* All variables from census through
                                    infpercent, in data set order */
     histogram;
run;

/* Bar charts for categorical variables */
proc sgplot data=senic;
     vbar region;
run;

proc sgplot data=senic;
     vbar mdschl;
run;

/* Boxplots for different values of a grouping variable */
/* Need to sort by the grouping variable first */
/* Note: with no out= option, the sort replaces senic in place. */
proc sort data=senic;
     by mdschl;
run;

proc boxplot data=senic;
     plot infpercent*mdschl;
run;

proc freq data=senic;
     title2 'Relationship between region and medical school affiliation';
     tables mdschl*region / norow nopercent chisq;
run;

proc ttest data=senic;
     title2 'Less risk at Hospitals with Med School Affiliation?';
     title3 'Compare t = -2.542';
     class mdschl;
     var infpercent;
run;

proc glm data=senic;
     title2 'Regional differences in average infection risk?';
     title3 'Compare F = 2.674, p = 0.0519';
     class region;
     model infpercent = region;
     means region;
run;
quit;

/* Could get the means from proc means, with no extra boxplots */
proc means data=senic;
     class region;
     var infpercent;
run;

proc corr data=senic;
     title2 'Correlation matrix of quantitative variables';
     var census nbeds nurses lngstay age xratio culratio infpercent;
run;
/* The nomiss option (not specified above) gives casewise deletion of
   missing values. */

proc reg data=senic plots=none;  /* Suppress diagnostic plots for now */
     title2 'Simple regression (One explanatory variable)';
     model infpercent = nurses;
run;
quit;

proc reg data=senic plots=none;
     title2 'Just the hospital size variables';
     model infpercent = census nbeds nurses;
run;
quit;

/* Fit the model with all quantitative variables, and test the size
   variables (census nbeds nurses) simultaneously */
proc reg data=senic plots=none;
     title2 'All the quantitative explanatory variables';
     title3 'Compare F-test for size: F = 2.7662';
     model infpercent = census nbeds nurses lngstay age xratio culratio;
     size: test census = nbeds = nurses = 0;
run;
quit;

proc reg data=senic plots=none;
     title2 'Full model including categorical variables';
     title3 'Reference category for region is South';
     /* The indicator s is left out of the model, so South is the
        reference category. */
     model infpercent = census nbeds nurses lngstay age xratio culratio
                        mschool nc ne w;
     region: test nc = ne = w = 0;
     size: test census=nbeds=nurses = 0;
run;
quit;

/* Proc glm will make dummy variables for you. */
proc glm data=senic;
     title2 'Full model with proc glm';
     title3 'Compare tests of region (F=4.0769) and size (F=3.5856)';
     class region mdschl;
     model infpercent = census nbeds nurses lngstay age xratio culratio
                        mdschl region / solution;
     /* The solution option requests the regression coefficients. */
     /* Test region and size */
     contrast 'Test of Region' region 1 -1 0 0,
                               region 0 1 -1 0,
                               region 0 0 1 -1;
     contrast 'Test of size' census 1, nbeds 1, nurses 1;
     lsmeans region;
run;
quit;

proc glm data=senic;
     title2 'Try lsmeans another way';
     class region;
     /* mschool is the numeric indicator here (not in the class
        statement), so lsmeans can fix it with the at option. */
     model infpercent = census nbeds nurses lngstay age xratio culratio
                        mschool region;
     lsmeans region / at mschool=0;
run;
quit;

proc reg data=senic plots=none;
     title2 'Just hospitals with no medical school affiliation';
     where mdschl = 'No';        /* Restrict this step to the 'No' rows */
     model infpercent = census nbeds nurses lngstay age xratio culratio
                        nc ne w;
run;
quit;