## Transformations to achieve homoscedasticity
##
## Worked examples of variance-stabilizing transformations for two-group
## comparisons. Each section checks how the group standard deviation relates
## to the group mean, applies the matching transformation, and then compares
## the groups on the transformed scale. Lines starting with `#>` record the
## output of the original console session.

################
# 1. proportions
################

## table 3.7
y1 <- c(.05, .15, .35, .25, .20, .05, .10, .05, .30, .05, .25)
y2 <- c(0, .15, 0, .05, 0, 0, .05, .10)

mean(y1)
#> [1] 0.1636364
sd(y1)
#> [1] 0.1120065
mean(y2)
#> [1] 0.04375
sd(y2)
#> [1] 0.05629958

## arcsine square-root transformation for proportions
y1new <- asin(sqrt(y1))
y2new <- asin(sqrt(y2))

sd(y1new)
#> [1] 0.1577415
sd(y2new)
#> [1] 0.1656799
## the asin(sqrt(y)) transformation works well: the two SDs are now similar

## Group labels are derived from the data so they always match its length.
## grp is numeric; with only two groups this gives the same 1-df F test as
## a factor would.
grp <- rep(c(1, 2), times = c(length(y1new), length(y2new)))
fit1 <- lm(c(y1new, y2new) ~ grp)
anova(fit1)
#> Response: c(y1new, y2new)
#>           Df  Sum Sq Mean Sq F value   Pr(>F)
#> grp        1 0.28646 0.28646  11.043 0.004024 **
#> Residuals 17 0.44097 0.02594

tt_asin <- t.test(y1new, y2new, var.equal = TRUE)
tt_asin
#>         Two Sample t-test
#> data:  y1new and y2new
#> t = 3.3231, df = 17, p-value = 0.004024
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#>  0.09080162 0.40658652
#> sample estimates:
#> mean of x mean of y
#> 0.3950037 0.1463096

## t^2 = F: the pooled two-sample t statistic squared equals the ANOVA F.
## Computed from the test object rather than a rounded, hand-copied value.
unname(tt_asin$statistic)^2
#> about 11.043

###########
# 2. counts
###########

## table 3.8
y3 <- c(7925, 15643, 17462, 10805, 9300, 7538, 6297)
y4 <- c(3158, 3669, 5930, 5697, 8331, 11822)

mean(y3)
#> [1] 10710
sd(y3)
#> [1] 4266.409
mean(y4)
#> [1] 6434.5
sd(y4)
#> [1] 3218.812

## check whether s / sqrt(mean) is a constant
sd(y3) / sqrt(mean(y3))
#> [1] 41.22568
sd(y4) / sqrt(mean(y4))
#> [1] 40.12713

## square-root transformation for counts
y3new <- sqrt(y3)
y4new <- sqrt(y4)

sd(y3new)
#> [1] 19.94644
sd(y4new)
#> [1] 19.52792
## equal variances

grp2 <- rep(c(1, 2), times = c(length(y3new), length(y4new)))
fit2 <- lm(c(y3new, y4new) ~ grp2)
anova(fit2)
#> Analysis of Variance Table
#> Response: c(y3new, y4new)
#>           Df Sum Sq Mean Sq F value  Pr(>F)
#> grp2       1 1802.3  1802.3   4.617 0.05477 .
#> Residuals 11 4293.9   390.4

#######################
# 3. time to event data
#######################

## If s / mean^2 is a constant, use the 1/y transformation.
## Or, take a survival analysis approach.

###########################
# 4. log-norm distributed y
###########################

y7 <- c(.2, .3, .4, 1.1, 2, 2.1, 3.3, 3.8, 4.5, 4.8, 4.9, 5, 5.3, 7.5, 9.8,
        10.4, 10.9, 11.3, 12.4, 16.2, 17.6, 18.9, 20.7, 24, 25.4, 40, 42.2,
        50, 60)
y8 <- c(.2, .3, .4, .7, 1.2, 1.5, 1.5, 1.9, 2, 2.4, 2.5, 2.8, 3.6, 4.8, 4.8,
        5.4, 5.7, 5.8, 7.5, 8.7, 8.8, 9.1, 10.3, 15.6, 16.1, 16.5, 16.7, 20,
        20.7, 33)

mean(y7)
#> [1] 14.31034
sd(y7)
#> [1] 15.74058
mean(y8)
#> [1] 7.683333
sd(y8)
#> [1] 7.849844

## check whether s / mean is a constant
sd(y7) / mean(y7)
#> [1] 1.099944
sd(y8) / mean(y8)
#> [1] 1.021672

hist(y7)
hist(y8)
## skewed distributions with long right tails

## log transformation
y7new <- log(y7)
y8new <- log(y8)

var(y7new)
#> [1] 2.19442
var(y8new)
#> [1] 1.732304
## variances are still not equal, but better than before

hist(y7new)
hist(y8new)
## distributions are more normal looking

tt_log <- t.test(y7new, y8new, var.equal = TRUE)
tt_log
#>         Two Sample t-test
#> data:  y7new and y8new
#> t = 1.4088, df = 57, p-value = 0.1643
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#>  -0.2163980  1.2434674      ### 95% CI for log(mu1) - log(mu2)
#> sample estimates:
#> mean of x mean of y
#>  1.921094  1.407559

## 95% CI for mu1 / mu2: back-transform the interval endpoints taken from
## the test object (not retyped, rounded numbers).
## NOTE(review): exp() of a difference in log-scale means is a ratio of
## geometric means; it equals the ratio of means only if the log-scale
## variances are equal, as var.equal = TRUE assumes here.
exp(tt_log$conf.int)
#> about 0.805 and 3.468
## 1 is included. The two means are not significantly different.