# the parameter problem # generate some data, from a linear true model # then fit various models to the data # fit of more complex models ALWAYS better # generate data gen.dat <- function(n=30,a=4,b=1.3,mx=7,sx=3,se=5) { x <- rnorm( n , mx , sx ) e <- rnorm( n , 0 , se ) y <- a + b*x + e data.frame(x=x,y=y) } d <- gen.dat() # fit to a line library(bbmle) m1 <- mle2( d$y ~ dnorm( mean=a + b*d$x , sd=s ) , start=list(a=4,b=2,s=5) ) # fit to a parabola m2 <- mle2( d$y ~ dnorm( mean=a + b*d$x + b2*(d$x^2) , sd=s ) , start=list(a=4,b=2,b2=0,s=5) ) # draw predictions against data plot(d) curve( coef(m1)[1] + coef(m1)[2]*x , add=TRUE ) curve( coef(m2)[1] + coef(m2)[2]*x + coef(m2)[3]*x^2 , add=TRUE , col="red" ) # compare fits anova(m1,m2) # example with 5th order polynomial for to data generated from a line d <- gen.dat(n=5,se=2) x <- d$x m5th <- mle2( d$y ~ dnorm( mean=a + b*x + b2*x^2 + b3*x^3 + b4*x^4 + b5*x^5 , sd=exp(s) ) , start=list(a=4,b=2,b2=0,b3=0,b4=0,b5=0,s=2) ) k <- coef(m5th) curve( k[1]+k[2]*x+k[3]*x^2+k[4]*x^3+k[5]*x^4+k[6]*x^5 , from=min(d$x) , to=max(d$x) , col="red" , xlab="x" , ylab="y" ) points( d$x , d$y ) # now put above in a function, so we can do it a bunch of times comparefits <- function() { dat <- gen.dat() assign( "dat" , dat , envir=.GlobalEnv ) # this is necessary for mle2 to find the data object m1 <- mle2( dat$y ~ dnorm( mean=a + b*dat$x , sd=s ) , start=list(a=4,b=2,s=5) ) m2 <- mle2( dat$y ~ dnorm( mean=a + b*dat$x + b2*(dat$x^2) , sd=s ) , start=list(a=4,b=2,b2=0,s=5) ) ll1 <- logLik(m1)[1] ll2 <- logLik(m2)[1] ll1 - ll2 # return difference between fits } # now loop 99 times and build table of differences in logLik diffs <- rep(0,99) for ( i in 1:99 ) { diffs[i] <- comparefits() } hist(diffs) # but if we CROSS-VALIDATE, linear model is better # to cross-validate, fit on one sample, evaluate goodness-of-fit by predicting y values on another sample # step 1: get two samples, A and B # step 2: fit models to sample A # step 3: predict sample B, using estimates from step 2 # step 4: compare goodness of predictions to sample B, across models # this procedure directly evaluates ability of models to predict new data # mantra: fitting is easy, prediction is hard # illustrate with one cross-validation d <- gen.dat() d2 <- gen.dat() m1 <- mle2( d$y ~ dnorm( mean=a + b*d$x , sd=s ) , start=list(a=4,b=2,s=5) ) m2 <- mle2( d$y ~ dnorm( mean=a + b*d$x + b2*(d$x^2) , sd=s ) , start=list(a=4,b=2,b2=0,s=5) ) m1cv <- mle2( d2$y ~ dnorm( mean=a + b*d$x , sd=s ) , start=list(a=4,b=2,s=5) , fixed=list( a=coef(m1)[1] , b=coef(m1)[2] , s=coef(m1)[3] ) ) m2cv <- mle2( d2$y ~ dnorm( mean=a + b*d$x + b2*(d$x^2) , sd=s ) , start=list(a=4,b=2,b2=0,s=5) , fixed=list( a=coef(m2)[1] , b=coef(m2)[2] , b2=coef(m2)[3] , s=coef(m2)[4] ) ) par(mfrow=c(1,2)) # two plots in one row plot(d) curve( coef(m1)[1] + coef(m1)[2]*x , add=TRUE ) curve( coef(m2)[1] + coef(m2)[2]*x + coef(m2)[3]*x^2 , add=TRUE , col="red" ) plot(d2) curve( coef(m1)[1] + coef(m1)[2]*x , add=TRUE ) curve( coef(m2)[1] + coef(m2)[2]*x + coef(m2)[3]*x^2 , add=TRUE , col="red" ) crossvalidate <- function() { d <- gen.dat() d2 <- gen.dat() assign( "d" , d , envir=.GlobalEnv ) assign( "d2" , d2 , envir=.GlobalEnv ) # fit m1 <- mle2( d$y ~ dnorm( mean=a + b*d$x , sd=s ) , start=list(a=4,b=2,s=5) ) m2 <- mle2( d$y ~ dnorm( mean=a + b*d$x + b2*(d$x^2) , sd=s ) , start=list(a=4,b=2,b2=0,s=5) ) # predict m1cv <- mle2( d2$y ~ dnorm( mean=a + b*d$x , sd=s ) , start=list(a=4,b=2,s=5) , fixed=list( a=coef(m1)[1] , b=coef(m1)[2] , s=coef(m1)[3] ) ) m2cv <- mle2( d2$y ~ dnorm( mean=a + b*d$x + b2*(d$x^2) , sd=s ) , start=list(a=4,b=2,b2=0,s=5) , fixed=list( a=coef(m2)[1] , b=coef(m2)[2] , b2=coef(m2)[3] , s=coef(m2)[4] ) ) ll1 <- logLik(m1)[1] ll2 <- logLik(m2)[1] ll1cv <- logLik(m1cv)[1] ll2cv <- logLik(m2cv)[1] c(ll1 - ll2 , ll1cv - ll2cv ) } diffs <- rep(0,99) diffscv <- rep(0,99) for ( i in 1:99 ) { d <- crossvalidate() diffs[i] <- d[1] diffscv[i] <- d[2] } par(mfrow=c(1,2)) hist(diffs) hist(diffscv) # K-L distance examples kldist <- function( pi1 , p1=0.5 ) { p2 <- 1 - p1 pi2 <- 1 - pi1 p1 * log( p1 / pi1 ) + p2 * log( p2 / pi2 ) } curve( kldist(x) , from=0, to=1 , xlab="pi_1" , ylab="K-L Dist, I(f,g)" , main="p1 = 0.5" ) curve( kldist(x,p1=0.25) , from=0, to=1 , xlab="pi_1" , ylab="K-L Dist, I(f,g)" , main="p1 = 0.25" ) # AIC examples