2020-11-18
If the population distribution is known, we can sample from it many times to get an idea of the sampling distribution.
Example: \(X_i \sim N(\mu, \sigma^2), i = 1, \ldots, n\).
Sampling distribution of \(\bar{X}\) is \(N(\mu, \sigma^2/n)\)
Also, we can draw from normal many times, and plot the histogram of all the means.
set.seed(123) x = rnorm(100,2,3) xbar.smpls = rep(NA, 1000) for (i in 1:1000) xbar.smpls[i] = mean(rnorm(100,2,3)) hist(xbar.smpls, freq = FALSE) curve(dnorm(x,mean=2,sd=3/10),add=T)
mean(xbar.smpls)
## [1] 2.00255
sd(xbar.smpls)
## [1] 0.2851543
A major component of bootstrapping is being able to resample a given data set and in R the function which does this is the sample function.
sample(x, size, replace, prob)
sample(10)
## [1] 4 1 5 10 3 9 8 6 7 2
sample(10, replace=T)
## [1] 1 9 5 8 1 10 7 5 5 6
sample(10, replace=T)
## [1] 5 10 10 1 7 5 6 8 8 10
prob1 <- c(rep(.15, 5), rep(.05, 5)) prob1
## [1] 0.15 0.15 0.15 0.15 0.15 0.05 0.05 0.05 0.05 0.05
sample(10, replace=T, prob=prob1)
## [1] 4 8 4 3 7 5 5 3 1 7
B = 5000 theta.boot = rep(0, B) n = length(x) for (b in 1:B) theta.boot[b] = mean(sample(x,replace=TRUE))
mean(theta.boot)
## [1] 2.271241
sd(theta.boot)
## [1] 0.2682298
data <- round(rnorm(100, 5, 3)) data[1:10]
## [1] 6 4 9 4 1 4 2 11 9 8
resamples <- lapply(1:20, function(i) sample(data, replace = T)) resamples[1]
## [[1]] ## [1] 4 4 8 4 1 -2 5 5 4 7 4 6 6 11 1 5 6 5 9 4 10 -1 4 4 4 ## [26] 10 5 7 3 4 4 7 6 3 9 5 6 0 4 4 6 3 1 11 7 4 9 7 7 11 ## [51] 9 12 4 4 3 7 2 4 7 4 5 8 4 0 7 6 4 6 9 6 6 7 5 6 12 ## [76] 4 4 5 7 10 1 7 7 10 4 3 1 5 4 4 0 8 1 8 4 2 5 6 11 11
r.median <- sapply(resamples, median) r.median
## [1] 5.0 5.0 5.0 6.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 4.5 5.0 5.0 5.0 6.0 5.0 5.0 5.0 ## [20] 6.0
sqrt(var(r.median))
## [1] 0.3931988
hist(r.median, freq = FALSE, col = rgb(0,1,0,0.5))
# Bootstrap 95% CI for R-Squared library(boot) # function to obtain R-Squared from the data rsq <- function(formula, data, indices) { d <- data[indices,] # allows boot to select sample fit <- lm(formula, data=d) return(summary(fit)$r.square) } # bootstrapping with 1000 replications results <- boot(data=mtcars, statistic=rsq, R=1000, formula=mpg~wt+disp)
# view results results
## ## ORDINARY NONPARAMETRIC BOOTSTRAP ## ## ## Call: ## boot(data = mtcars, statistic = rsq, R = 1000, formula = mpg ~ ## wt + disp) ## ## ## Bootstrap Statistics : ## original bias std. error ## t1* 0.7809306 0.01138209 0.04917415
You can also get them separately:
(bs.median = results$t0)
## [1] 0.7809306
(bs.stderr = sqrt(var(results$t)))
## [,1] ## [1,] 0.04917415
plot(results)
df <- data.frame(x = results$t); x<- df$x library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.3
ggplot(df, aes(x)) + geom_histogram(aes(y = ..density..), fill = "blue", alpha = 0.5)+geom_density()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
data(law, package = "bootstrap") library(ggplot2) qplot(LSAT, GPA, data = law)
with(law, cor(LSAT, GPA))
## [1] 0.7763745
sample(nrow(law), replace = TRUE)
## [1] 3 1 10 1 14 2 13 14 3 5 3 3 1 8 5
sample(nrow(law), replace = TRUE)
## [1] 7 6 3 12 8 12 5 7 9 2 3 4 4 4 3
with(law[sample(nrow(law), replace = TRUE), ], cor(LSAT, GPA))
## [1] 0.6162869
with(law[sample(nrow(law), replace = TRUE), ], cor(LSAT, GPA))
## [1] 0.7647155
with(law[sample(nrow(law), replace = TRUE), ], cor(LSAT, GPA))
## [1] 0.8670833
ptm <- proc.time() B <- 5000 cor.boot <- rep(0,B) for (b in 1:B){ law.boot <- law[sample(nrow(law), replace = TRUE), ] cor.boot[b] = cor(law.boot$LSAT, law.boot$GPA) } summary(cor.boot)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 0.00217 0.69120 0.78835 0.76975 0.87130 0.99465
proc.time()-ptm
## user system elapsed ## 0.73 0.00 0.73
ptm <- proc.time() B <- 5000 cors <- replicate(B, {with(law[sample(nrow(law), replace = TRUE), ], cor(LSAT, GPA))}) summary(cors)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 0.1565 0.6890 0.7890 0.7707 0.8721 0.9917
proc.time()-ptm
## user system elapsed ## 0.69 0.00 0.69
df <- data.frame(x = cors); x<- df$x library(ggplot2) ggplot(df, aes(x)) + geom_histogram(aes(y = ..density..), fill = "blue", alpha = 0.5)+geom_density()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Bs <- rep(100 * (1:50), 3) Bs
## [1] 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 ## [15] 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 ## [29] 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 ## [43] 4300 4400 4500 4600 4700 4800 4900 5000 100 200 300 400 500 600 ## [57] 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 ## [71] 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 ## [85] 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 ## [99] 4900 5000 100 200 300 400 500 600 700 800 900 1000 1100 1200 ## [113] 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 ## [127] 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 ## [141] 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000
ptm <- proc.time() corsize2 = matrix(0,length(Bs),2) for (i in 1:length(Bs)){ b = Bs[i] for (b in 1:B){ law.boot <- law[sample(nrow(law), replace = TRUE), ] cor.boot[b] = cor(law.boot$LSAT, law.boot$GPA) } corsize2[i,] = c(b, mean(cor.boot)) } colnames(corsize2) = c("B","cor") proc.time()-ptm
## user system elapsed ## 109.58 0.33 121.28
head(corsize2, n = 3)
## B cor ## [1,] 5000 0.7674854 ## [2,] 5000 0.7736561 ## [3,] 5000 0.7734934
ldply
that can avoid for loops altogether.ptm <- proc.time() library(plyr) corsize <- ldply(Bs, function(B){ cors <- replicate(B,{with(law[sample(nrow(law), replace = TRUE),] ,cor(LSAT, GPA))}) c(B = B, cor = mean(cors))}) head(corsize,n=3)
## B cor ## 1 100 0.7676786 ## 2 200 0.7787451 ## 3 300 0.7638906
proc.time()-ptm
## user system elapsed ## 52.80 0.08 54.36
qplot(B, cor, data = corsize)