library(emdbook) library(bbmle) prep.bar <- function(x) { out <- {} for ( i in 0:max(x) ) { out[i+1] <- length(x[x==i]) } out } # dice example # empirically compute probability of getting a 7 with two dice throws <- 100000 p <- replicate( throws , sum( round(runif( 2 , min=1 , max=6 )) ) ) out <- {} for ( i in 2:max(p) ) { out[i-1] <- length(p[p==i]) } barplot(out,names.arg=2:max(p),space=0,xlab="dice total",ylab="frequency",main=paste(throws,"throws") ) # probability of a 7 length( p[p==7] ) / throws # empirical likelihood principles k <- rbinom( 10000 , size=5 , prob=0.4 ) barplot(prep.bar(k),names.arg=0:max(k),space=0) barplot(prep.bar(k),names.arg=0:max(k),space=0,col=c(0,1,0,0,0,0)) length( k[k==1] )/10000 # empirical likelihood estimation # instead of writing a formula for the likelihood of observing x, # we simulate sampling and count frequency with which x occurs # note that because there will be variation among the simulations, # optimization will be less precise # using A LOT of replicates will help # as will using more tolerant optimization methods (SANN --- see ?optim) # sample from binomial -- easy problem # this function empirically estimates density of binomial dsimbinom <- function( x , prob , size , log=TRUE , R=99 ) { e <- rbinom( R , prob=prob , size=size ) p <- log( sapply( x , function(y) length(e[e==y])/R ) ) p } # sample 100 "fake" data k <- rbinom( 100 , size=5 , prob=0.4 ) prep.bar <- function(x) { out <- {} for ( i in 0:max(x) ) { out[i+1] <- length(x[x==i]) } out } barplot(prep.bar(k),names.arg=0:max(k),space=0) # estimate, both ways m.prob <- mle2( k ~ dbinom( prob=1/(1+exp(z)) , size=5 ) , start=list(z=0) ) m.sim <- mle2( k ~ dsimbinom( prob=1/(1+exp(z)) , size=5 , R=999 ) , start=list(z=0) , trace=FALSE , method="SANN" ) # output estimates, as well as mean occurrence (should be same as MLE from m.prob) 1/(1+exp(coef(m.prob))) 1/(1+exp(coef(m.sim))) sum(k)/500 # draw likelihood surface for this example reps <- 99 x <- seq(0.2,0.6,0.01) y <- sapply( x , function(z) -sum(dsimbinom(k,prob=z,size=5,R=reps,log=TRUE)) ) plot( x , y , type="l" , xlab="prob" , ylab="-logLik" ,main=paste(reps,"replicates") ) y2 <- sapply( x , function(z) -sum(dbinom(k,prob=z,size=5,log=TRUE)) ) lines( x , y2 , col="red" ) reps <- 999 x <- seq(0.2,0.6,0.01) y <- sapply( x , function(z) -sum(dsimbinom(k,prob=z,size=5,R=reps,log=TRUE)) ) plot( x , y , type="l" , xlab="prob" , ylab="-logLik" ,main=paste(reps,"replicates") ) y2 <- sapply( x , function(z) -sum(dbinom(k,prob=z,size=5,log=TRUE)) ) lines( x , y2 , col="red" ) reps <- 9999 x <- seq(0.2,0.6,0.01) y <- sapply( x , function(z) -sum(dsimbinom(k,prob=z,size=5,R=reps,log=TRUE)) ) plot( x , y , type="l" , xlab="prob" , ylab="-logLik" ,main=paste(reps,"replicates") ) y2 <- sapply( x , function(z) -sum(dbinom(k,prob=z,size=5,log=TRUE)) ) lines( x , y2 , col="red" ) # empirical beta-binomial example # generate some fake beta-binomial data, using the analytical function k <- rbetabinom( 100 , size=5 , shape1=2 , shape2=2 ) barplot(prep.bar(k),names.arg=0:max(k),space=0,xlab="dead tadpoles",ylab="frequency") # write the density function, that uses simulation to approximate likelihoods # should produce similar results to dbetabinom() dsimbetabinom <- function( x , shape1 , shape2 , size , log=TRUE , R=99 ) { # sample R probabilities from betabinom p <- rbeta( R , shape1=shape1 , shape2=shape2 ) # sample each event from p e <- rbinom( R , size=size , prob=p ) # observe log-freq of each x in distribution of e log( sapply( x , function(y) length(e[e==y])/R ) ) } # estimate mle, using analytical function m.prob <- mle2( k ~ dbetabinom( size=5 , shape1=exp(s1) , shape2=exp(s2) ) , start=list( s1=1,s2=1 ) ) exp(coef(m.prob)) # now estimate using simulation function m.sim <- mle2( k ~ dsimbetabinom( shape1=exp(s1) , shape2=exp(s2) , size=5 , R=999 ) , start=list( s1=1 , s2=1 ) , trace=FALSE , method="SANN" ) # confidence intervals, from resampling # can't always estimate shape of likelihood surface well # hessian (quadratic) estimation sometimes doesn't work # alternative is to RESAMPLE data from data itself, and re-estimate # more common use of bootstrap is when we have no good idea how the population is distributed # so we just resample from sample itself to estimate variability of descriptive estimates, like mean, variance # binomial example of bootstrap MLE confint estimation logit <- function(x) 1/(1+exp(x)) k <- rbinom( 100 , size=5 , prob=0.3 ) # estimate m <- mle2( k ~ dbinom( prob=logit(z) , size=5 ) , start=list(z=0) ) # replicate and return list of model fits plist <- replicate( 999 , coef(mle2( sample(k,100,TRUE) ~ dbinom( prob=logit(z) , size=5 ) , start=list(z=coef(m)[1]) , method="Nelder-Mead" ) ) ) # find cut-offs that include 95% of all values ci.sim <- logit( quantile( plist , probs=c(0.025,0.975) ) ) hist( logit(plist) ) lines( c( ci.sim[1] , ci.sim[1] ) , c(0,1000) , col="red" ) lines( c( ci.sim[2] , ci.sim[2] ) , c(0,1000) , col="red" ) ci.sim # now theoretical confidence limits logit(confint(m)) # log-normal poisson distribution example hist( rpois( 1000 , lambda=rlnorm( 1000 , 2 , 1 ) ) ) # boot library example library(MASS) data(Animals) d <- Animals plot( log(d$brain) ~ log(d$body) , xlab="log body mass" , ylab="log brain mass" ) m.orig <- lm( log(d$brain) ~ log(d$body) ) abline( lm( log(d$brain) ~ log(d$body) ) , col="red" ) f.coef <- function( d , i ) { # make a new data frame that contains the resampled rows in i nd <- d[i,] # fit our model to the resampled data m <- lm( log(brain) ~ log(body) , data=nd ) # return coefficients coef(m) } library(boot) boot.animals <- boot( d , f.coef , R=9999 ) plot( boot.animals , index=2 ) # plot histogram of resampled beta coefficients (the second parameter) hist( boot.animals$t[,2] ) lines( c( coef(m.orig)[2] , coef(m.orig)[2] ) , c(0,2000) , col="red" , lwd=2 ) boot.ci( boot.animals , type="perc" , index=2 )