rm(list=ls(all=TRUE)) # REMOVE ALL PREVIOUS OBJECTS options(warn=-1) ny<-read.table("A:/cs73.dat",header=T); dim(ny) # 916 11 ny2<-na.omit(ny); dim(ny2) # 914 11 attach(ny2) ## transformations lexpen<-log(expen) lwealth<-log(wealth) lpop<-log(pop) ldens<-log(dens) lincome<-log(income) pint2<-pint**2 pint3<-pint**3 lpop2<-lpop**2 lpop3<-lpop**3 ldens2<-ldens**2 ldens3<-ldens**3 lgrowr<-ifelse(growr>0, log(growr+1), -log(-growr+1)) ## check that the transformations work par(mfrow=c(1,2)) hist(expen) hist(lexpen) ## figure 7.2 hist(wealth) hist(lwealth) hist(pop) hist(lpop) hist(pint) hist(log(pint)) hist(dens) hist(ldens) hist(income) hist(lincome) hist(growr) hist(lgrowr) ## linear or nonlinear relationships par(mfrow=c(1,1)) plot(lwealth,lexpen) ## linear plot(lpop,lexpen, xlab="Log Population",ylab="Log Expenditures", main="Log(Expenditures) versus Log(Population) with LOWESS") lines(lowess(lpop,lexpen)) #nonlinear; two linear segments will do lines(c(8.3,8.3),c(0,6)) plot(lpop,lexpen, xlab="Log Population",ylab="Log Expenditures", main="Figure 7.3: Log(Expenditures) versus Log(Population) with a Fitted Smooth Curve") lines(smooth.spline(lpop,lexpen)) plot(pint,lexpen) lines(lowess(pint,lexpen)) plot(log(pint),lexpen) lines(lowess(log(pint),lexpen)) #nonlinear? plot(ldens,lexpen) lines(lowess(ldens,lexpen)) #nonlinear, two linear segments lines(c(4.5,4.5),c(0,6)) plot(income,lexpen) plot(lincome,lexpen) ## better lines(lowess(lincome,lexpen)) ## linear plot(lgrowr,lexpen) lines(lowess(lgrowr,lexpen)) ## very linear ## regression model fit1<-lm(lexpen~lwealth+lpop+log(pint)+ldens++lincome+lgrowr) summary(fit1) # lincome not sig ## df for error: n-p-1 ## df for overall F test: (p, n-p-1) ## split data into two parts #(lpop cut=8.3, ldens cut=4.5) set2<-(lpop<=7.8 & ldens<=4) plot(lpop[set2],lexpen[set2]) lines(lowess(lpop[set2],lexpen[set2])) plot(ldens[set2],lexpen[set2]) lines(lowess(ldens[set2],lexpen[set2])) plot(lwealth[set2],lexpen[set2]) lines(lowess(lwealth[set2],lexpen[set2])) plot(log(pint)[set2],lexpen[set2]) plot(lincome[set2],lexpen[set2]) plot(lgrowr[set2],lexpen[set2]) lines(smooth.spline(lgrowr[set2],lexpen[set2])) fit2<-lm(lexpen~lwealth+lpop+lpop^2+log(pint)+ldens++lincome+lgrowr, subset=set2) summary(fit2) fit3<-lm(lexpen~lwealth+log(pint)+ldens+lincome+lgrowr, subset=set2) summary(fit3) fit4<-lm(lexpen~lwealth+log(pint)+ldens+lgrowr,subset=set2) summary(fit4) #model diagnostics # residuals vs. y plot(lexpen[set2],fit4$resid) plot(lexpen[set2],fit2$resid) # qq plot qqnorm(fit2$resid) #qqline(fit2$resid) ### try the model suggested by the author fit11<-lm(lexpen~wealth+lpop+lpop2+lpop3+pint+pint2+pint3+ldens +ldens2+ldens3+income+lgrowr) plot(lexpen,fit11$resid) # same problem qqnorm(fit11$resid) ## perdiction in the original scale #year 2005, Warwick sum(fit11$coef[2:13] * c(85000, log(20442), log(20442)^2, log(20442)^3, 24.7, log(214), log(214)^2, log(214)^3, 19500, log(35+1))) # -2338.024 # sigma > sd(fit11$resid) [1] 0.40059 exp(-2338.024+0.40059^2/2) # not a meaningful answer due to problem with data # The error term mostly measures the peculiarities of each particular town # that are not explained by the model.