# week 3 # linear models, just the basics # load the height/weight data from the class h <- c( 5*12+11 , 69 , 5*12+8 , 62,5*12+11,72,5*12+8,69,75,71,61,61,62,5*12+11 , 68,70,74,5*12+2,67,65.75 , 76 , 69 , 65 , 67 , 5*12+4 , 5*12+8 , 68 , 75 ) w <- c( 135,165,165,120,220,170,150,165,245,160,110,136,112,195,150,155,200,137,160,150,225,140,128,150,130,140,145,210 ) d <- data.frame( height=h , weight=w ) hist(d$height) # we want to estimate the population mean, using this sample. # find maximum likelihood estimate! # first, just plot the likelihood of each hypothetical mean likelihood.of.sample <- function( hypothetical.mean , sample=d$height ) { prod( dnorm( sample , mean=hypothetical.mean , sd=sd(sample) ) ) } height.hypotheses <- seq( from=60 , to=75 , by=0.1 ) likelihoods <- sapply( height.hypotheses , function(z) likelihood.of.sample(z) ) plot( height.hypotheses , likelihoods , type="l" , xlab="mean height (alpha)" , ylab="likelihood, Pr(data|alpha)" ) lines( c( mean(d$height) , mean(d$height) ) , c(0,1) , col="red" , lty=2 ) # fit the sd, assuming we know mean likelihood.of.sample.sd <- function( hypothetical.sigma , sample=d$height ) { prod( dnorm( sample , mean=mean(sample) , sd=hypothetical.sigma ) ) } sd.hypotheses <- seq( from=2 , to=6 , by=0.1 ) likelihoods <- sapply( sd.hypotheses , function(z) likelihood.of.sample.sd(z) ) plot( sd.hypotheses , likelihoods , type="l" , xlab="standard deviation (sigma)" , ylab="likelihood, Pr(data|sigma)" ) lines( c( sd(d$height) , sd(d$height) ) , c(0,1) , col="red" , lty=2 ) # fit both alpha and sigma at same time likelihood.of.sample.alpha.sd <- function( alpha, sigma ) { prod( dnorm( d$height , mean=alpha , sd=sigma ) ) } liks <- matrix( 0 , nrow=length(height.hypotheses) , ncol=length(sd.hypotheses) ) for ( i in 1:length(height.hypotheses) ) { for ( j in 1:length(sd.hypotheses) ) { alpha <- height.hypotheses[i] sigma <- sd.hypotheses[j] liks[i,j] <- likelihood.of.sample.alpha.sd( alpha , sigma ) } } contour( height.hypotheses , sd.hypotheses , liks , xlab="alpha" , ylab="sigma" ) lines( c( mean(d$height) , mean(d$height) ) , c(0,10) , col="red" , lty=2 ) lines( c( 0 , 100 ) , c( sd(d$height) , sd(d$height) ) , col="red" , lty=2 ) # now find these estimates the easy way summary( lm( d$height ~ 1 ) ) # use weight as a predictor of height plot( d$height ~ d$weight ) our.model <- lm( d$height ~ d$weight ) abline( our.model ,col="red" ) summary( our.model ) # now run the model forward and generate predictions alpha.estimate <- 51.32838 beta.estimate <- 0.10514 sigma.estimate <- 2.317 # make a bunch of random weights for a fictional population range.of.weights <- rnorm( 50 , mean=180 , sd=50 ) # use those to predict heights simulated.heights <- rnorm( length(range.of.weights) , mean=alpha.estimate + beta.estimate * range.of.weights , sd=sigma.estimate ) plot( simulated.heights ~ range.of.weights ) # extreme weights example # make a bunch of random weights for a fictional population range.of.weights <- rnorm( 50 , mean=300 , sd=150 ) # use those to predict heights simulated.heights <- rnorm( length(range.of.weights) , mean=alpha.estimate + beta.estimate * range.of.weights , sd=sigma.estimate ) plot( simulated.heights ~ range.of.weights ) #################### # multivariate examples # leg example lleg <- c( 83.45907 , 86.08410 , 84.80643 , 87.13789 , 86.42901 , 85.09396 , 85.22466 , 83.12974 , 89.52345 , 85.52122 , 84.42796 , 86.92371 , 87.81347 , 85.19558 , 85.39040 , 87.50774 , 84.99560 , 85.68324 , 84.62204 , 88.34727 , 88.19220 , 85.33634 , 86.80957 , 85.38315 , 84.84974 , 83.81123 , 83.79694 , 82.03360 , 84.52455 , 85.80234 ) rleg <- c( 83.60661 , 86.24163 , 84.73308 , 87.01831 , 86.25176 , 85.33211 , 85.50472 , 83.07236 , 89.57016 , 85.42602 , 84.49321 , 87.01678 , 88.08828 , 85.07632 , 85.13303 , 87.43865 , 84.91936 , 85.65192 , 84.60824 , 87.90713 , 88.22550 , 85.37465 , 86.73603 , 85.34718 , 85.08006 , 83.77995 , 83.88948 , 82.37259 , 84.55983 , 85.91773 ) height <- c( 165.8875 , 174.4843 , 169.2630 , 172.5638 , 173.0247 , 171.1726 , 168.6353 , 169.2743 , 177.1377 , 171.8969 , 169.8022 , 174.0514 , 173.2186 , 169.0965 , 174.6001 , 174.0821 , 170.4566 , 170.3515 , 173.0082 , 174.4888 , 179.8436 , 171.6773 , 172.4103 , 171.8616 , 171.9519 , 169.2527 , 167.5621 , 163.8267 , 165.1103 , 172.0778 ) plot( height ~ rleg ) plot( height ~ lleg ) plot( lleg ~ rleg ) summary( lm( height ~ lleg + rleg ) ) summary( lm( height ~ rleg ) ) summary( lm( height ~ lleg ) ) ## plant growth example wt <- c( rep(1,10) , rep(2,10) , rep(3,10) , rep(4,10) ) fh <- c(4.81, 2.16, 4.85, 2.04, 4.29, 5.35, 6.73, 5.19, 2.87, 8.04, 6.76, 4.13, 6.84, 6.72, 4.66, 7.22, 3.67, 7.27, 4.84, 4.82, 6.86, 5.1, 5.12, 4.46, 4.3, 6.6, 8.08, 5.79, 5.15, 5.99, 6.14, 6.35, 6.49, 4.72, 6.62, 6.35, 6.03, 7.57, 3.41, 8.07 ) ih <- c(4.64, 2.15, 4.68, 2.03, 4.16, 5.16, 6.46, 5, 2.82, 7.68, 6.26, 3.85, 6.34, 6.22, 4.34, 6.69, 3.43, 6.72, 4.5, 4.49, 6.06, 4.63, 4.65, 4.11, 3.98, 5.85, 7.05, 5.19, 4.67, 5.35, 5.89, 6.08, 6.22, 4.57, 6.33, 6.08, 5.78, 7.22, 3.35, 7.68 ) boxplot( fh~wt , xlab="water treatment" , ylab="final height" ) summary( lm( fh ~ as.factor(wt) ) ) boxplot( (fh-ih) ~ wt , xlab="water treatment" , ylab="final height - initial height" ) summary( lm( fh ~ as.factor(wt) + ih ) ) plot( fh , ylab="plant height" ) points( 1:40 , ih , pch=16 ) for ( i in 1:40 ) { lines( c(i,i) , c( ih[i] , fh[i] ) ) } plot( (fh-ih) , ylab="growth" ) ## math ability example m <- c(10, 11, 10, 12, 12, 11, 12, 13, 12, 14, 14, 14, 15, 15, 15, 16, 16, 16, 18, 18, 18, 19, 19, 19, 19, 19, 21, 21, 22, 21, 22, 23 ) y <- c(5, 5.2, 5.4, 5.6, 5.8, 6, 6.2, 6.4, 6.6, 6.8, 7, 7.2, 7.4, 7.6, 7.8, 8, 8.2, 8.4, 8.6, 8.8, 9, 9.2, 9.4, 9.6, 9.8, 10, 10.2, 10.4, 10.6, 10.8, 11, 11.2 ) h <- c(129.44, 129.46, 131.81, 130.54, 130.73, 134.31, 134.07, 135.27, 135.24, 137.21, 137.66, 137.88, 141.74, 142.28, 141.71, 144.73, 145.88, 145.5, 144.3, 145.23, 145.66, 146.06, 148.19, 151.27, 149.4, 150.62, 153.26, 156.84, 155.97, 155, 157.13, 158.04 ) plot( m ~ h , xlab="height" , ylab="math score" ) plot( m ~ y , xlab="years of age" , ylab="math score" ) summary( lm( m ~ h ) ) summary( lm( m ~ h + y ) ) resid_h <- residuals( lm( m ~ h ) ) resid_y <- residuals( lm( m ~ y ) ) plot( resid_h ~ y ) abline( lm( resid_h ~ y ) ) plot( resid_y ~ h ) abline( lm( resid_y ~ h ) ) # seasons and dummy variables seasons <- data.frame( season=c( rep("winter",5) , rep("spring",4) , rep("summer",6) , rep("fall",2) ) ) seasons$temp <- rnorm( 17 , 100 , 20 ) # convert to dummy variables seasons$winter <- ifelse( seasons$season == "winter" , 1 , 0 ) seasons$spring <- ifelse( seasons$season == "spring" , 1 , 0 ) seasons$summer <- ifelse( seasons$season == "summer" , 1 , 0 ) # now here's the internal R function that does the same thing: model.matrix( seasons$temp ~ seasons$season )