The book Data by Andrews and Herzberg [1985] contains many data sets that have been used for various purposes in statistics. One famous data set records the annual number of deaths by horse kicks in the Prussian Army from 1875 to 1894 for each of 14 corps. The data are included below (they come from Table 4.1 of Andrews and Herzberg [1985]).
Let Yij be the number of deaths in year i and corps j, for i = 1875, …, 1894 and j = 1, …, 14. The Yij are in columns 5–18 of the table.
(a) What are the first four columns of the table?
Bookkeeping columns: table identification information, military unit information, and the year.
(b) What is the last column of the table?
The total number of deaths across all 14 corps for each year.
(c) What is a good model for the data? Explain.
A Poisson model: the deaths are counts of rare, independent events occurring at a low, roughly constant rate per corps per year, which is exactly the setting the Poisson distribution describes.
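For intuition (a small addition, using the rate λ̂ ≈ 0.7 estimated in part (d) below), the Poisson model implies that roughly half of all corps-years should have no deaths at all:
dpois(0,lambda=0.7)  # P(zero deaths in a corps in a given year) = exp(-0.7), about 0.50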
(d) Suppose you model the data as i.i.d. Poi(λ).
i. Plot the likelihood function for λ.
myData<-read.table("http://lib.stat.cmu.edu/datasets/Andrews/T04.1",header=F)
mean(as.vector(as.matrix(myData[,5:18])),na.rm=T)
## [1] 0.7
countsByCorps<-apply(myData[,5:18],2,sum)
countsByCorps
## V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18
## 16 16 12 12 8 11 17 12 7 13 15 25 24 8
mean(countsByCorps)
## [1] 14
countsByYear<-apply(myData[,5:18],1,sum)
countsByYear
## [1] 3 5 7 9 10 18 6 14 11 9 5 11 15 6 11 17 12 15 8 4
# Negative log-likelihood for i.i.d. Poisson observations
nllPois<-function(parVec,data){
  nllik<- -sum(dpois(x=data,lambda=parVec,log=TRUE))
  return(nllik)
}
# Likelihood (product of Poisson probabilities) for i.i.d. Poisson observations
lPois<-function(parVec,data){
  lik<- prod(dpois(x=data,lambda=parVec,log=FALSE))
  return(lik)
}
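As a quick consistency check (not in the original solution), the two functions describe the same surface, since the likelihood equals exp(−negative log-likelihood) at any candidate λ:
# The two values below should be identical
lPois(14,data=countsByCorps)
exp(-nllPois(14,data=countsByCorps))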
xSeq<-seq(from=1,to=40,len=1000)
yNll<-vapply(xSeq,nllPois,FUN.VALUE=1,data=countsByCorps)  # negative log-likelihood at each lambda
plot(xSeq,yNll,type='l',xlab=expression(lambda),ylab="negative log-likelihood")
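The plot above shows the negative log-likelihood; the likelihood itself, which part i asks for, can be drawn with the lPois function defined earlier (a minimal sketch, not in the original output):
# Likelihood of lambda for the 14 corps totals; it peaks at the MLE (the sample mean, 14)
yL<-vapply(xSeq,lPois,FUN.VALUE=1,data=countsByCorps)
plot(xSeq,yL,type='l',xlab=expression(lambda),ylab="likelihood")
abline(v=mean(countsByCorps),lty=2)  # MLE = 14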
ii. Find λ̂ (i.e., the maximum likelihood estimate of λ) by corps and by year.
For i.i.d. Poisson data the log-likelihood is ℓ(λ) = (Σ yi) log λ − nλ + constant, so setting its derivative to zero gives λ̂ = ȳ, the sample mean; the numerical optimizers below should therefore reproduce the means computed above.
### By Corps
parVec<-c(1) # Initial parameter values
outPois<-optim(par=parVec,fn=nllPois,method="L-BFGS-B",lower=c(0),upper=c(Inf),
data=countsByCorps)
outPois$par # 14
## [1] 14
### Using another numerical minimizer
library(Rvmmin)
parVec<-c(1.0)
outPoisR<-Rvmmin(par=parVec,fn=nllPois,
lower=c(0.01),upper=c(Inf),
data=countsByCorps)
outPoisR$par # 14
## [1] 14
mean(countsByCorps) # 14; both numerical estimates match the sample mean, which is the analytical MLE for a Poisson rate
## [1] 14
### By Year
library(Rvmmin)
parVec<-c(1.0)
outPoisR<-Rvmmin(par=parVec,fn=nllPois,
lower=c(0.01),upper=c(Inf),
data=countsByYear)
outPoisR$par # 9.8
## [1] 9.8
mean(countsByYear) # 9.8; the MLE above again equals the sample mean
## [1] 9.8
### All data (every corps-year count)
dat<-as.vector(as.matrix(myData[,5:18]))
mean(dat) # 0.7
## [1] 0.7
outPois<-optim(par=parVec,fn=nllPois,method="L-BFGS-B",
lower=c(0.01),upper=c(Inf),
data=dat)
outPois$par
## [1] 0.7000005
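As a quick check on the Poisson choice in part (c) (not part of the original solution), the observed frequencies of 0, 1, 2, … deaths per corps-year can be compared with the expected frequencies under the fitted Poisson; dat is the vector of individual corps-year counts built above:
# Observed vs expected frequencies of per-corps-year death counts under Poi(lambda-hat)
dat2<-na.omit(dat)   # drop any missing cells
k<-0:max(dat2)
cbind(deaths=k,
      observed=as.vector(table(factor(dat2,levels=k))),
      expected=round(length(dat2)*dpois(k,lambda=mean(dat2)),1))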
iii. What can you say about the rate of death by horse kick in the Prussian cavalry at the end of the 19th century?
The estimated rate is about 0.7 deaths per corps per year, i.e., roughly 9.8 deaths per year across all 14 corps, or 14 deaths per corps over the 20 years. The per-corps negative log-likelihood curves below (each based on one corps' 20-year total) suggest the rate is broadly similar across corps, though a couple of corps (V16 and V17) have noticeably higher totals.
xSeq<-seq(from=1,to=40,len=1000)
# Overlay the negative log-likelihood curve implied by each corps' 20-year total
for(i in seq_along(countsByCorps)){
  yNll<-vapply(xSeq,nllPois,FUN.VALUE=1,data=countsByCorps[i])
  if(i==1){
    plot(xSeq,yNll,type='l',col=i,xlab=expression(lambda),ylab="negative log-likelihood")
  }else{
    lines(xSeq,yNll,col=i)
  }
}
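The per-corps annual rates themselves are easy to compare directly (a small addition, not in the original solution):
# Each corps' estimated annual rate: its 20-year total divided by 20 years
round(countsByCorps/20,2)  # most corps are near 0.7; V16 and V17 are noticeably higher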
wtData<-read.table("http://www.uvm.edu/~bbeckage/Teaching/DataAnalysis/Data/wts.txt",header=TRUE)
head(wtData)
## gender height_in weight_lbs
## 1 m 72 165
## 2 m 67 160
## 3 m 68 160
## 4 f 64 114
## 5 f 66 135
## 6 m 69 148
out<-lm(weight_lbs~height_in,data=wtData)
summary(out)
##
## Call:
## lm(formula = weight_lbs ~ height_in, data = wtData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.341 -11.363 -1.274 11.170 35.659
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -255.179 110.894 -2.301 0.03858 *
## height_in 6.022 1.637 3.678 0.00278 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.89 on 13 degrees of freedom
## Multiple R-squared: 0.5099, Adjusted R-squared: 0.4722
## F-statistic: 13.53 on 1 and 13 DF, p-value: 0.002785
Note: lm reports the residual standard error using n − 2 = 13 degrees of freedom, while the MLE of σ divides the residual sum of squares by n = 15, so 17.89 × sqrt(13/15) ≈ 16.65, matching the likelihood-based estimate found below.
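A quick check of that conversion (not in the original):
# Rescale lm's residual SE (13 df) to the n = 15 divisor used by the MLE of sigma
17.89*sqrt(13/15)  # about 16.65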
require(ggplot2)
## Loading required package: ggplot2
ggplot(wtData,aes(height_in,weight_lbs))+
geom_smooth(method='lm') +
geom_point()
# Negative log-likelihood for the normal regression weight ~ height
nllNormReg<-function(parVec,data){
  wt<-data$weight_lbs
  ht<-data$height_in
  b0<-parVec[1]    # intercept
  b1<-parVec[2]    # slope on height
  mySd<-parVec[3]  # residual standard deviation
  wtPred<-b0+b1*ht
  nllik<- -sum(dnorm(x=wt,mean=wtPred,sd=mySd,log=TRUE))
  return(nllik)
}
library(Rvmmin)
parVec<-c(1.0,10.0,5.0) # Rvmmin does not converge
parVec<-c(-200.0,6.0,10.0) # Rvmmin does converge
outLM<-Rvmmin(par=parVec,fn=nllNormReg,
lower=c(-Inf,-Inf,0.0001),upper=c(Inf,Inf,Inf),
data=wtData)
outLM$par # -255.17835 6.02234 16.65187
## [1] -255.17835 6.02234 16.65187
parVec<-c(1.0,10.0,5.0) # optim does converge
outLM<-optim(par=parVec,fn=nllNormReg,method="L-BFGS-B",
lower=c(-Inf,-Inf,0.0001),upper=c(Inf,Inf,Inf),
data=wtData)
outLM$par # -255.176994 6.022321 16.651855
## [1] -255.176994 6.022321 16.651855
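As a sanity check (not in the original output), the likelihood-based intercept and slope should match the least-squares fit from lm, since OLS and maximum likelihood coincide for the coefficients of a normal regression:
coef(out)       # lm: intercept about -255.18, slope about 6.02
outLM$par[1:2]  # the same values from direct likelihood maximization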
Difference in the two estimates of the residual sd: lm divides the residual sum of squares by n − 2 = 13 degrees of freedom, while the MLE divides by n = 15:
sqrt(sum(out$residuals^2)/13)
## [1] 17.88698
sqrt(sum(out$residuals^2)/15)
## [1] 16.65187
wtData<-read.table("http://www.uvm.edu/~bbeckage/Teaching/DataAnalysis/Data/wts.txt",header=TRUE)
head(wtData)
## gender height_in weight_lbs
## 1 m 72 165
## 2 m 67 160
## 3 m 68 160
## 4 f 64 114
## 5 f 66 135
## 6 m 69 148
out2<-lm(weight_lbs~height_in + gender,data=wtData)
summary(out2)
##
## Call:
## lm(formula = weight_lbs ~ height_in + gender, data = wtData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.146 -9.104 -1.518 7.276 23.482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -29.406 121.433 -0.242 0.8127
## height_in 2.457 1.850 1.328 0.2088
## genderm 29.008 10.460 2.773 0.0169 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.53 on 12 degrees of freedom
## Multiple R-squared: 0.7014, Adjusted R-squared: 0.6516
## F-statistic: 14.09 on 2 and 12 DF, p-value: 0.0007095
plot(wtData[wtData$gender=='f',]$height_in,
wtData[wtData$gender=='f',]$weight_lbs,
xlim=c(50,90),ylim=c(100,250), xlab="Height",ylab="Weight")
points(wtData[wtData$gender=='m',]$height_in,
wtData[wtData$gender=='m',]$weight_lbs,col='red')
abline(a=-29.4,b=2.457) # female
abline(a=-29.4+29.01,b=2.457,col='red') # male
require(ggplot2)
ggplot(wtData,aes(x=height_in,y=weight_lbs,colour=gender))+
geom_smooth(method='lm') +
geom_point()
# Negative log-likelihood for the normal regression weight ~ height + gender
nllNormRegG<-function(parVec,data){
  wt<-data$weight_lbs
  ht<-data$height_in
  male<-as.numeric(data$gender=="m")  # indicator: 1 = male, 0 = female
  b0<-parVec[1]    # intercept
  b1<-parVec[2]    # slope on height
  b2<-parVec[3]    # male offset
  mySd<-parVec[4]  # residual standard deviation
  wtPred<-b0+b1*ht + male*b2
  nllik<- -sum(dnorm(x=wt,mean=wtPred,sd=mySd,log=TRUE))
  return(nllik)
}
# The male indicator; model.matrix(~gender, data=wtData) builds the same column as a design matrix
as.numeric(wtData$gender=="m")
## [1] 1 1 1 0 0 1 0 0 0 0 1 1 1 0 1
parVec<-c(-10.0,10.0,20,15.0) # optim does converge
outLM2<-optim(par=parVec,fn=nllNormRegG,method="L-BFGS-B",
lower=c(-Inf,-Inf,-Inf,0.0001),upper=c(Inf,Inf,Inf,Inf),
control=list(maxit=4000),data=wtData)
outLM2$par # compare with lm: -29.41, 2.457, 29.01 (sigma MLE about 13.0); the intercept and slope are strongly correlated, so optim stops a little short of the exact lm values
## [1] -10.792825 2.173845 30.087430 13.011999
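Because the log-likelihood surface is nearly flat along the correlated intercept-slope direction, optim stalls short of the exact MLE here. Restarting from the lm coefficients (a sketch, not in the original solution) should stay essentially at the lm solution, since least squares gives the exact maximum likelihood estimates of b0, b1, and b2:
# Restart optim from the lm fit; it should remain near (-29.41, 2.457, 29.01, 13.0)
parVec<-c(-29.406,2.457,29.008,13.0)
outLM2b<-optim(par=parVec,fn=nllNormRegG,method="L-BFGS-B",
  lower=c(-Inf,-Inf,-Inf,0.0001),upper=c(Inf,Inf,Inf,Inf),
  control=list(maxit=4000),data=wtData)
outLM2b$par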
# Log-likelihood (not negated) for weight ~ height + gender, written for GA maximization
llNormRegGA<-function(b0,b1,b2,mySd,data){
  wt<-data$weight_lbs
  ht<-data$height_in
  male<-as.numeric(data$gender=="m")  # indicator: 1 = male, 0 = female
  wtPred<-b0+b1*ht + male*b2
  llik<- sum(dnorm(x=wt,mean=wtPred,sd=mySd,log=TRUE))
  return(llik)
}
library(GA)
## Loading required package: foreach
## Loading required package: iterators
## Package 'GA' version 3.0.2
## Type 'citation("GA")' for citing this R package in publications.
parVec<-c(-10.0,10.0,20,15.0)
outGA <- ga(type='real-valued', min=c(-100,-100, -100, 0.00001),
max=c(100, 100, 100,100), popSize=500, maxiter=2000, names=c('b0', 'b1', 'b2', 'sd'),
keepBest=T, fitness = function(b) llNormRegGA(b[1],b[2],b[3],b[4], wtData))
summary(outGA)
## +-----------------------------------+
## | Genetic Algorithm |
## +-----------------------------------+
##
## GA settings:
## Type = real-valued
## Population size = 500
## Number of generations = 2000
## Elitism = 25
## Crossover probability = 0.8
## Mutation probability = 0.1
## Search domain =
## b0 b1 b2 sd
## Min -100 -100 -100 1e-05
## Max 100 100 100 1e+02
##
## GA results:
## Iterations = 2000
## Fitness function value = -59.77267
## Solution =
## b0 b1 b2 sd
## [1,] -10.46631 2.169621 30.0138 13.01419
# Solution from an earlier GA run (the GA is stochastic and the log-likelihood is nearly flat
# in b0, so repeated runs give different intercepts at essentially the same likelihood):
# b0 b1 b2 sd
# [1,] -19.47306 2.309252 29.77016 13.0651
llNormRegGA(-19.47306,2.309252,29.77016,13.0651,wtData) # -59.7669 log-likelihood
## [1] -59.7669
llNormRegGA(-29.406,2.457,29.008,14.53,wtData) # -59.93025 at the lm estimates (using lm's df-adjusted sd rather than the MLE of sigma)
## [1] -59.93025
sqrt(sum(out2$residuals^2)/12)
## [1] 14.53355
sqrt(sum(out2$residuals^2)/15)
## [1] 12.99921
AIC = 2k − 2 log(likelihood), where k is the number of fitted parameters; lower AIC is better.
# 3-parameter model (intercept, slope, sd); nllNormReg returns the negative log-likelihood
2*3 + 2*nllNormReg(c(-255.176994,6.022321,16.651855),wtData) # 132.9438
## [1] 132.9438
# 4-parameter model (adds the gender effect)
2*4 - 2*llNormRegGA(-19.47306,2.309252,29.77016,13.0651,wtData) # 127.5338
## [1] 127.5338
So the 4-parameter model with gender is favored: its AIC (127.5) is lower than that of the 3-parameter model (132.9).
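As a cross-check (not in the original solution), R's built-in AIC() on the two lm fits orders the models the same way; AIC() also counts the residual variance as a parameter, so it treats the models as having 3 and 4 parameters:
AIC(out)   # weight ~ height (3 parameters)
AIC(out2)  # weight ~ height + gender (4 parameters); should be the smaller of the two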