This is an R Markdown presentation. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.
I will show here the R codes for ANOVA. The two examples are taken from the following websites.
We first read the InsectSprays dataset that has data on pain levels for three types of drugs. We create a data frame in R to store the data.
pain = c(4, 5, 4, 3, 2, 4, 3, 4, 4, 6, 8, 4, 5, 4, 6, 5,
8, 6, 6, 7, 6, 6, 7, 5, 6, 5, 5)
drug = c(rep("A",9), rep("B",9), rep("C",9))
migraine = data.frame(pain,drug)
str(migraine)
## 'data.frame': 27 obs. of 2 variables:
## $ pain: num 4 5 4 3 2 4 3 4 4 6 ...
## $ drug: Factor w/ 3 levels "A","B","C": 1 1 1 1 1 1 1 1 1 2 ...
Instead of doing
mean(pain[drug=="A"])
## [1] 3.666667
mean(pain[drug=="B"])
## [1] 5.777778
We can simply call
tapply(pain, drug, mean)
## A B C
## 3.666667 5.777778 5.888889
tapply(pain, drug, var)
## A B C
## 0.7500000 2.1944444 0.6111111
tapply(pain, drug, length)
## A B C
## 9 9 9
We can look at the boxplot for the three groups:
plot(pain ~ drug, data=migraine)
R package ‘mosaic’ and ‘gmodel’ can create better looking plots and summary:
require(mosaic)
require(gmodels)
options(digits=3)
favstats(pain ~ drug, data=migraine)
## drug min Q1 median Q3 max mean sd n missing
## 1 A 2 3 4 4 5 3.67 0.866 9 0
## 2 B 4 5 6 6 8 5.78 1.481 9 0
## 3 C 5 5 6 6 7 5.89 0.782 9 0
trellis.par.set(theme=col.mosaic()) # better color scheme for lattice
histogram(~ pain| drug, fit="normal", layout=c(1, 3), data=migraine)
trellis.par.set(theme=col.mosaic()) # better color scheme for lattice
densityplot(~ pain, groups=drug, auto.key=TRUE, data=migraine)
trellis.par.set(theme=col.mosaic()) # better color scheme for lattice
bwplot(pain ~ drug, data=migraine)
n_j = tapply(pain,drug,length)
SST_j = n_j * (tapply(pain,drug,mean)-mean(pain))^2
SST = sum(SST_j)
cat("SST = ", SST, '\n')
## SST = 28.2
Variation within the treatments = Sum of squares for Error (SSE): \[ SSE = (n_1 -1)s_1^2 + (n_2 - 1)s_2^2 + \cdots + (n_K -1)s_K^2 \]
SSE_j = (n_j-1)*tapply(pain,drug,var)
SSE = sum(SSE_j)
cat("SSE =", SSE, "\n")
## SSE = 28.4
p.value = 1-pf(11.9,2,24);cat("P value =", p.value, "\n")
## P value = 0.000257
Reject the null.
Of course, we can skip the step-by-step calculation and use a pre-canned package in R:
results <- aov(pain~drug, data=migraine)
summary(results)
## Df Sum Sq Mean Sq F value Pr(>F)
## drug 2 28.2 14.11 11.9 0.00026 ***
## Residuals 24 28.4 1.19
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pairwise.t.test(pain, drug, p.adjust="bonferroni")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: pain and drug
##
## A B
## B 0.001 -
## C 7e-04 1.000
##
## P value adjustment method: bonferroni
trellis.par.set(theme=col.mosaic()) # better color scheme for lattice
bwplot(pain ~ drug, data=migraine)
A is different from both B and C, but B & C are similar
Another multiple comparisons procedure is Tukey‟s method (a.k.a. Tukey’s Honest Significance Test). The function TukeyHSD() creates a set of confidence intervals on the differences between means with the specified family-wise probability of coverage.
results = aov(pain ~ drug, data=migraine)
TukeyHSD(results, conf.level = 0.95)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = pain ~ drug, data = migraine)
##
## $drug
## diff lwr upr p adj
## B-A 2.111 0.830 3.39 0.001
## C-A 2.222 0.941 3.50 0.001
## C-B 0.111 -1.170 1.39 0.975
We are going to see another data-set InsectSprays. 6 different insect sprays (1 Categorical Variable with 6 levels) were tested to see if there was a difference in the number of insects found in the field after each spraying (Dependent Variable).
attach(InsectSprays)
data(InsectSprays)
str(InsectSprays)
## 'data.frame': 72 obs. of 2 variables:
## $ count: num 10 7 20 14 14 12 10 23 17 20 ...
## $ spray: Factor w/ 6 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
tapply(count, spray, mean)
## A B C D E F
## 14.50 15.33 2.08 4.92 3.50 16.67
tapply(count, spray, var)
## A B C D E F
## 22.27 18.24 3.90 6.27 3.00 38.61
tapply(count, spray, length)
## A B C D E F
## 12 12 12 12 12 12
R package ‘mosaic’ and ‘gmodel’ can create better looking plots and summary:
require(mosaic)
require(gmodels)
options(digits=3)
favstats(count ~ spray, data=InsectSprays)
## spray min Q1 median Q3 max mean sd n missing
## 1 A 7 11.50 14.0 17.8 23 14.50 4.72 12 0
## 2 B 7 12.50 16.5 17.5 21 15.33 4.27 12 0
## 3 C 0 1.00 1.5 3.0 7 2.08 1.98 12 0
## 4 D 2 3.75 5.0 5.0 12 4.92 2.50 12 0
## 5 E 1 2.75 3.0 5.0 6 3.50 1.73 12 0
## 6 F 9 12.50 15.0 22.5 26 16.67 6.21 12 0
trellis.par.set(theme=col.mosaic()) # better color scheme for lattice
histogram(~ count| spray, fit="normal", layout=c(2, 3), data=InsectSprays)
trellis.par.set(theme=col.mosaic()) # better color scheme for lattice
densityplot(~ count, groups=spray, auto.key=TRUE, data=InsectSprays)
trellis.par.set(theme=col.mosaic()) # better color scheme for lattice
bwplot(count ~ spray, data=InsectSprays)
n_j = tapply(count,spray,length)
SST_j = n_j * (tapply(count,spray,mean)-mean(count))^2
SST = sum(SST_j)
cat("SST = ", SST, '\n')
## SST = 2669
Variation within the treatments = Sum of squares for Error (SSE): \[ SSE = (n_1 -1)s_1^2 + (n_2 - 1)s_2^2 + \cdots + (n_K -1)s_K^2 \]
SSE_j = (n_j-1)*tapply(count,spray,var)
SSE = sum(SSE_j)
cat("SSE =", SSE, "\n")
## SSE = 1015
p.value = 1-pf(34.7,5,66);cat("P value =", p.value, "\n")
## P value = 0
Reject the null.
Of course, we can skip the step-by-step calculation and use a pre-canned package in R:
results <- aov(count~spray, data=InsectSprays)
summary(results)
## Df Sum Sq Mean Sq F value Pr(>F)
## spray 5 2669 534 34.7 <2e-16 ***
## Residuals 66 1015 15
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pairwise.t.test(count, spray, p.adjust="bonferroni")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: count and spray
##
## A B C D E
## B 1 - - - -
## C 1e-09 1e-10 - - -
## D 1e-06 2e-07 1 - -
## E 4e-08 5e-09 1 1 -
## F 1 1 4e-12 6e-09 2e-10
##
## P value adjustment method: bonferroni
trellis.par.set(theme=col.mosaic()) # better color scheme for lattice
bwplot(count ~ spray, data=InsectSprays)
A, B and F are similar, C, D and E are similar. But each of A, B & F are different from each of C, D and E !
results = aov(count ~ spray, data=InsectSprays)
TukeyHSD(results, conf.level = 0.95)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = count ~ spray, data = InsectSprays)
##
## $spray
## diff lwr upr p adj
## B-A 0.833 -3.87 5.53 0.995
## C-A -12.417 -17.12 -7.72 0.000
## D-A -9.583 -14.28 -4.88 0.000
## E-A -11.000 -15.70 -6.30 0.000
## F-A 2.167 -2.53 6.87 0.754
## C-B -13.250 -17.95 -8.55 0.000
## D-B -10.417 -15.12 -5.72 0.000
## E-B -11.833 -16.53 -7.13 0.000
## F-B 1.333 -3.37 6.03 0.960
## D-C 2.833 -1.87 7.53 0.492
## E-C 1.417 -3.28 6.12 0.949
## F-C 14.583 9.88 19.28 0.000
## E-D -1.417 -6.12 3.28 0.949
## F-D 11.750 7.05 16.45 0.000
## F-E 13.167 8.47 17.87 0.000
bartlett.test(count ~ spray, data=InsectSprays)
##
## Bartlett test of homogeneity of variances
##
## data: count by spray
## Bartlett's K-squared = 30, df = 5, p-value = 9e-05
results = aov(count ~ spray, data=InsectSprays)
opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
plot(results, las = 1) # Residuals, Fitted, ...
par(opar)
# Kruskal Wallis Test One Way Anova by Ranks
kruskal.test(count ~ spray, data=InsectSprays)
##
## Kruskal-Wallis rank sum test
##
## data: count by spray
## Kruskal-Wallis chi-squared = 50, df = 5, p-value = 2e-10
Reject the null hypothesis!
bartlett.test(pain ~ drug, data=migraine)
##
## Bartlett test of homogeneity of variances
##
## data: pain by drug
## Bartlett's K-squared = 4, df = 2, p-value = 0.1
results = aov(pain ~ drug, data=migraine)
opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
plot(results, las = 1) # Residuals, Fitted, ...
par(opar)
# Kruskal Wallis Test One Way Anova by Ranks
kruskal.test(pain ~ drug, data=migraine)
##
## Kruskal-Wallis rank sum test
##
## data: pain by drug
## Kruskal-Wallis chi-squared = 10, df = 2, p-value = 7e-04
Reject the null hypothesis!