# statistical thinking week 1 # topics: estimation and confidence # show that likelihood of model justifies using the frequency of water in sample as estimate of proportion of earth covered by water # first a blank plot showing the sample mean with a dashed line curve( dbinom( 1 , size=2 , prob=x , log=FALSE ) , from=0 , to=1 , xlab="Proportion water" , ylab="Likelihood" ,col="white" ) lines( c(0.5,0.5) , c(0,0.5) , lty=2 ) # now plot some hypotheses # p = 0.25 lines( c(0.25,0.25) , c( 0 , dbinom(1,2,0.25,FALSE) ) ) points( 0.25 , dbinom(1,2,0.25,FALSE) ) text( x=0.25 , y=dbinom(1,2,0.25,FALSE) , labels="p=0.25" , pos=3 ) # p = 0.66 lines( c(2/3,2/3) , c( 0 , dbinom(1,2,2/3,FALSE) ) ) points( 2/3 , dbinom(1,2,2/3,FALSE) ) text( x=2/3 , y=dbinom(1,2,2/3,FALSE) , labels="p=2/3" , pos=3 ) # now add a random likelihood to the plot p <- runif(1) lines( c(p,p) , c( 0 , dbinom(1,2,p,FALSE) ) ) points( p , dbinom(1,2,p,FALSE) ) # plot the function itself curve( dbinom( 1 , size=2 , prob=x , log=FALSE ) , from=0 , to=1 , xlab="Proportion water" , ylab="Likelihood" , col="red" ) lines( c(0.5,0.5) , c(0,0.5) , lty=2 ) # now an example with a different sample, with L,L,W curve( dbinom( 1 , size=3 , prob=x , log=FALSE ) , from=0 , to=1 , xlab="Proportion water" , ylab="Likelihood" , col="red" ) lines( c(1/3,1/3) , c(0, dbinom( 1 , size=3 , prob=1/3 , log=FALSE ) ) , lty=2 ) # now an example with a different sample, with L,W,W,W,W curve( dbinom( 4 , size=5 , prob=x , log=FALSE ) , from=0 , to=1 , xlab="Proportion water" , ylab="Likelihood" , col="red" ) lines( c(4/5,4/5) , c(0, dbinom( 4 , size=5 , prob=4/5 , log=FALSE ) ) , lty=2 ) # generating confidence for the land/water globe sampling # meant to convince student that {L,W} and {L,L,W,W} produce same estimates but different amounts of confidence # first draw likelihood function for a small sample of two, {L,W} curve( dbinom( 1 , size=2 , prob=x , log=FALSE ) , from=0 , to=1 , xlab="Proportion water" , ylab="Likelihood" , col="blue" , lwd=2 ) text( x=0.5 , y=dbinom(1,2,0.5,FALSE) , labels=2 , pos=1 ) # new draw likelihood function for a larger sample of four, {L,L,W,W} curve( dbinom( 2 , size=4 , prob=x , log=FALSE ) , from=0 , to=1 , add=TRUE , col="red" , lwd=2 ) text( x=0.5 , y=dbinom(2,4,0.5,FALSE) , labels=4 , pos=3 ) # now a bunch of large samples with same proportion of water for ( i in 3:6 ) { curve( dbinom( 2^(i-1) , size=2^i , prob=x , log=FALSE ) , from=0 , to=1 , add=TRUE ) if ( 2^i < 128 ) text( x=0.5 , y=dbinom(2^(i-1),2^i,0.5,FALSE) , labels=2^i , pos=3 ) } # now draw the above, normalizing the height of the likelihood function to highlight changes in shape nx <- dbinom( 1 , size=2 , prob=0.5 , log=FALSE ) curve( dbinom( 1 , size=2 , prob=x , log=FALSE )/nx , from=0 , to=1 , xlab="Proportion water" , ylab="Normalized Likelihood" ) for ( i in 2:6 ) { nx <- dbinom( 2^(i-1) , size=2^i , prob=0.5 , log=FALSE ) curve( dbinom( 2^(i-1) , size=2^i , prob=x , log=FALSE )/nx , from=0 , to=1 , add=TRUE ) } ################# # generate a list of 20 correlated height and weight values # height in inches h <- rnorm( n=20 , mean=69 , sd=3 ) # weight as function of height w <- rnorm( n=20 , mean=80+h*1.5 , sd=6 ) # plot weight against height plot( w ~ h , xlab="height" , ylab="weight" ) # compute the mean height of this population mean( h ) # now sub-sample this population, to illustrate how our estimate of the mean varies with sample size sample.sizes <- rep( 5:20 , each=10 ) sample.means <- sapply( sample.sizes , function(x) mean( sample( h , size=x ) ) ) # plot results plot( sample.means ~ sample.sizes ) lines( c(5,20) , c( mean(h) , mean(h) ) , col="red" ) # now we'll focus on samples of 10 heights and simulate the "sampling distribution" of the sample mean n <- 10 sample.means <- sapply( rep(n,10000) , function(x) mean( sample( h , size=x ) ) ) hist( sample.means , breaks=50 ) lines( c( mean(h) , mean(h) ) , c(0,5000) , col="red" ) # now generate a bunch of 95% confidence intervals for n=5 samples and plot them and the true mean. true mean should be inside 95% of the intervals! sample.means <- {} sample.ci.hi <- {} sample.ci.lo <- {} n <- 100 for ( i in 1:n ) { s <- sample( h , size=5 ) sample.means[i] <- mean(s) sample.ci <- confint.default( lm( s ~ 1 ) , level=0.95 ) sample.ci.hi[i] <- sample.ci[2] sample.ci.lo[i] <- sample.ci[1] } # count instances of true mean being inside intervals is.inside <- function( true.mean , lo , hi ) { ifelse( true.mean>lo & true.mean