This R markdown evaluates ggplot options for random data reflectig a block design with six treatment groups in each block. In this markdown I also build a function to plot resuls from a tukey honest significant difference post hoc test.
# Define Functions------------------------------------
#######################################################
#FUNCTION: AOVtest
# function to conduct Analysis of Variance, produce a vector of outputs and plot a graph
#inputs:
#y - continuous response variable
#x - categorical x variable
#alpha - significance level e.g. 0.05 or 0.01 or 0.001
#postHoc - T/F if TRUE function runs TukeyHSD
#outputs: returns a list with p value and TukeyHSD results
AOVtest <- function(x=rep(letters[1:3],each=10),
y=runif(30)*rep(1:3,each=10),
alpha=0.05,
postHoc = TRUE){
if (length(y)!=length(x)){
stop('x and y vectors are not the same lenght')
}
# make sure treatment var, x, a factor
if (!is.factor(x)){x<-factor(x)}
df <- data.frame(ID=1:length(y),y=y,x=x)
test <- aov(y~x, data=df) #analysis of variance function
pvalue <- as.numeric(unlist(summary(test))[9])
if (pvalue <= alpha && postHoc == TRUE){
tuk <- TukeyHSD(test, ordered = FALSE, conf.level = (1 - alpha))
results <- c('pvalue'= pvalue, #p-value for h0 means are not different
'tukey.table'=tuk) #results of tukey table
return(results)
}
results <- c('pvalue'=pvalue, test)
return(results)
}
#AOVtest()
#######################################################
# FUNCTION: plotAOV
# produces a boxplot labeled with analysis of variance results
# Inputs:
# treatment - catagorical x independant variable
# response - continuous y dependant variable
# statsOnPlot - T/F if TRUE graph p value and post hoc test on plot
# postHoc - T/F if TRUE graph significant difference labels on plot
#Outputs:
#p - ggplot object with boxplot and results
#Dependancies:
# tukeylabel_df()
# ggplot2
#-----------------------------------------------------------
plotAOV <- function(treatment=rep(1:3,each=10),
response=runif(30),
statsOnPlot=FALSE,
postHoc=FALSE,
alpha=0.05){
#create data frame for analysis
if(is.integer(treatment)){x <- factor(treatment)
}else{stop('function failed: treatment must be an integer')}
df <- data.frame(ID = 1:length(treatment),
x=x, #converting to numeric factor
y=response)
if(statsOnPlot==TRUE){
#generate significant diference labels for postHOC tests
if (postHoc == TRUE){
stats <- tukeyLabels(treatment=treatment,response=response)
p <- ggplot(data=df,aes(x=x,y=y))+geom_boxplot()+
geom_text(data=stats,
aes(x=plot.labels,
y=V1,
label=labels,
hjust=c(2,2,2),
fontface='bold'))
} else {
t <- unlist(summary(aov(y~x,data=df)))
pvalue <- toString(t[9])
p <- ggplot(data=df,aes(x=x,y=y,fill=x))+
geom_boxplot()+
geom_text(aes(x=Inf,y=Inf,hjust=1.1,
vjust=1.5,label=paste("p =",pvalue)))
} #end if postHoc
} else {#if stats on plot not equal TRUE
p <- ggplot(data=df,aes(x=x,y=y,fill=x))+geom_boxplot()
} #end if stats on plot
return(p) # return ggplot object
}
#plotAOV(statsOnPlot=TRUE,postHoc=TRUE)
#########################
#FUNCTION: tukeylabel_df
#generates labels indicating significant differences from tukey post-hoc
#for use in ggplot2 boxplots
#inputs:
#x and y - vectors of same length,
#alpha - a numeric value indicating significance level (e.g. 0.05)
#outputs: a dataframe of labels for the tukey test
#------------------------------------------------
tukeyLabels <- function(treatment=rep(1:3,each=10), # treatment levels
response=runif(30), # responses to treatments
alpha=0.05){
# Extract labels and factor levels from Tukey post-hoc
if(is.factor(treatment)){}else{treatment <- factor(treatment)}
df <- data.frame(x=treatment,y=response)
HSD <- TukeyHSD(aov(y~x,data=df),
ordered = FALSE,
conf.level = (1-alpha))
Tukey.levels <- HSD[['x']][,4]
Tukey.labels <- multcompLetters(Tukey.levels)['Letters']
plot.labels <- names(Tukey.labels[['Letters']])
# Get highest quantile for Tukey's 5 number summary and add a bit of space to buffer between
# upper quantile and label placement
boxplot.df <- plyr::ddply(df, 'x', function (z) max(fivenum(z$y)) + 0.2)
# Create a data frame out of the factor levels and Tukey's homogenous group letters
plot.levels <- data.frame(plot.labels,
labels = Tukey.labels[['Letters']],
stringsAsFactors = FALSE)
# Merge it with the labels
labels.df <- merge(plot.levels,
boxplot.df,
by.x = 'plot.labels',
by.y = 'x',
sort = FALSE)
return(labels.df)
}
#tukeyLabels()
#########################################################################
#FUNCTION: creatExperiment
# function to simulate random normal data for a specified balanced experimental setup
#inputs:
# number of factors, blocks, levels, replications in experiment
# sample mean and standard deviation of all samples
# the effect size of blocks, factors, treatments, replications
#outputs:
# a datafram containing random data and treatment factors
#notes on updates:
# - This is a revised version of simulateData, which operated very slowly
# because the function called c() to build vectors within a for loop. This version pre-assigns the mode and length of vectors based on the number of factors, blocks, levels and replications.
# - The response variable model can now be specified in a string using the same notation as lm()
# y~x1+x2 --> y = b0 + b1*x1 + b2*x2 + E
#-------------------------------------------------------------------------
makeXdf <- function(nBlocks = 1, #number of study blocks or sites
nFactors = 1, #number of treatment factors
nLevels = 3, #number of treatment levels for each factor
nReps = 10 #number of replications f(b(l(r))))
){
nTotal <- nFactors*nBlocks*nLevels*nReps
ID <- vector(mode = "numeric", length = nTotal)
fact <- vector(mode = "numeric", length = nTotal)
block <- vector(mode = "numeric", length = nTotal)
level <- vector(mode = "numeric", length = nTotal)
rep <- vector(mode = "numeric", length = nTotal)
response <- vector(mode = "numeric", length = nTotal)
i <- 0
for (b in seq(1,nBlocks)){
for (f in seq(1,nFactors)){
for (l in seq(1,nLevels)){
for (r in seq(1,nReps)){
i <- i + 1
ID[i] <- i
fact[i] <- f
block[i] <- b
level[i] <- l
rep[i] <- r
cat(i,"f",f,"b",b,"l",l,"r",r,"\n")
}# end o loop
} # end t loop
} # end b loop
} # end f loop
df <- data.frame(ID=ID,
Blk=block,
Dgst=fact,
Rcp=level,
Yld=response)
return(df)
}
#make a dataframe
df <- makeXdf(nBlocks = 2, #number of study blocks or sites
nFactors = 2, #number of treatment factors
nLevels = 6, #number of treatment levels for each factor
nReps = 4)
## 1 f 1 b 1 l 1 r 1
## 2 f 1 b 1 l 1 r 2
## 3 f 1 b 1 l 1 r 3
## 4 f 1 b 1 l 1 r 4
## 5 f 1 b 1 l 2 r 1
## 6 f 1 b 1 l 2 r 2
## 7 f 1 b 1 l 2 r 3
## 8 f 1 b 1 l 2 r 4
## 9 f 1 b 1 l 3 r 1
## 10 f 1 b 1 l 3 r 2
## 11 f 1 b 1 l 3 r 3
## 12 f 1 b 1 l 3 r 4
## 13 f 1 b 1 l 4 r 1
## 14 f 1 b 1 l 4 r 2
## 15 f 1 b 1 l 4 r 3
## 16 f 1 b 1 l 4 r 4
## 17 f 1 b 1 l 5 r 1
## 18 f 1 b 1 l 5 r 2
## 19 f 1 b 1 l 5 r 3
## 20 f 1 b 1 l 5 r 4
## 21 f 1 b 1 l 6 r 1
## 22 f 1 b 1 l 6 r 2
## 23 f 1 b 1 l 6 r 3
## 24 f 1 b 1 l 6 r 4
## 25 f 2 b 1 l 1 r 1
## 26 f 2 b 1 l 1 r 2
## 27 f 2 b 1 l 1 r 3
## 28 f 2 b 1 l 1 r 4
## 29 f 2 b 1 l 2 r 1
## 30 f 2 b 1 l 2 r 2
## 31 f 2 b 1 l 2 r 3
## 32 f 2 b 1 l 2 r 4
## 33 f 2 b 1 l 3 r 1
## 34 f 2 b 1 l 3 r 2
## 35 f 2 b 1 l 3 r 3
## 36 f 2 b 1 l 3 r 4
## 37 f 2 b 1 l 4 r 1
## 38 f 2 b 1 l 4 r 2
## 39 f 2 b 1 l 4 r 3
## 40 f 2 b 1 l 4 r 4
## 41 f 2 b 1 l 5 r 1
## 42 f 2 b 1 l 5 r 2
## 43 f 2 b 1 l 5 r 3
## 44 f 2 b 1 l 5 r 4
## 45 f 2 b 1 l 6 r 1
## 46 f 2 b 1 l 6 r 2
## 47 f 2 b 1 l 6 r 3
## 48 f 2 b 1 l 6 r 4
## 49 f 1 b 2 l 1 r 1
## 50 f 1 b 2 l 1 r 2
## 51 f 1 b 2 l 1 r 3
## 52 f 1 b 2 l 1 r 4
## 53 f 1 b 2 l 2 r 1
## 54 f 1 b 2 l 2 r 2
## 55 f 1 b 2 l 2 r 3
## 56 f 1 b 2 l 2 r 4
## 57 f 1 b 2 l 3 r 1
## 58 f 1 b 2 l 3 r 2
## 59 f 1 b 2 l 3 r 3
## 60 f 1 b 2 l 3 r 4
## 61 f 1 b 2 l 4 r 1
## 62 f 1 b 2 l 4 r 2
## 63 f 1 b 2 l 4 r 3
## 64 f 1 b 2 l 4 r 4
## 65 f 1 b 2 l 5 r 1
## 66 f 1 b 2 l 5 r 2
## 67 f 1 b 2 l 5 r 3
## 68 f 1 b 2 l 5 r 4
## 69 f 1 b 2 l 6 r 1
## 70 f 1 b 2 l 6 r 2
## 71 f 1 b 2 l 6 r 3
## 72 f 1 b 2 l 6 r 4
## 73 f 2 b 2 l 1 r 1
## 74 f 2 b 2 l 1 r 2
## 75 f 2 b 2 l 1 r 3
## 76 f 2 b 2 l 1 r 4
## 77 f 2 b 2 l 2 r 1
## 78 f 2 b 2 l 2 r 2
## 79 f 2 b 2 l 2 r 3
## 80 f 2 b 2 l 2 r 4
## 81 f 2 b 2 l 3 r 1
## 82 f 2 b 2 l 3 r 2
## 83 f 2 b 2 l 3 r 3
## 84 f 2 b 2 l 3 r 4
## 85 f 2 b 2 l 4 r 1
## 86 f 2 b 2 l 4 r 2
## 87 f 2 b 2 l 4 r 3
## 88 f 2 b 2 l 4 r 4
## 89 f 2 b 2 l 5 r 1
## 90 f 2 b 2 l 5 r 2
## 91 f 2 b 2 l 5 r 3
## 92 f 2 b 2 l 5 r 4
## 93 f 2 b 2 l 6 r 1
## 94 f 2 b 2 l 6 r 2
## 95 f 2 b 2 l 6 r 3
## 96 f 2 b 2 l 6 r 4
# simulate yield as a function of block(Blk) factor(Dgst) and level(Rcp)onto data
b0 <- 450 # average Yld across all data
b1 <- 0 #
b2 <- 200
b3 <- 30
b4 <- 0
SD <- 100
df$Yld <- with(df, b0 + b1*Blk + b2*Dgst + b3*Rcp + b4*Dgst*Rcp + rnorm(length(df$Yld),0,SD))
df
## ID Blk Dgst Rcp Yld
## 1 1 1 1 1 706.4380
## 2 2 1 1 1 755.0527
## 3 3 1 1 1 830.0211
## 4 4 1 1 1 801.5212
## 5 5 1 1 2 732.8892
## 6 6 1 1 2 602.1277
## 7 7 1 1 2 766.9442
## 8 8 1 1 2 711.6467
## 9 9 1 1 3 552.4465
## 10 10 1 1 3 749.3574
## 11 11 1 1 3 775.7521
## 12 12 1 1 3 642.4774
## 13 13 1 1 4 825.4818
## 14 14 1 1 4 712.5887
## 15 15 1 1 4 825.1914
## 16 16 1 1 4 812.7405
## 17 17 1 1 5 704.9822
## 18 18 1 1 5 840.2974
## 19 19 1 1 5 746.9153
## 20 20 1 1 5 706.5117
## 21 21 1 1 6 908.5762
## 22 22 1 1 6 858.0082
## 23 23 1 1 6 865.1528
## 24 24 1 1 6 734.3575
## 25 25 1 2 1 1130.6021
## 26 26 1 2 1 730.0465
## 27 27 1 2 1 833.9632
## 28 28 1 2 1 914.2118
## 29 29 1 2 2 937.1398
## 30 30 1 2 2 924.8626
## 31 31 1 2 2 808.4236
## 32 32 1 2 2 1045.2867
## 33 33 1 2 3 949.9288
## 34 34 1 2 3 757.5682
## 35 35 1 2 3 906.0843
## 36 36 1 2 3 845.7114
## 37 37 1 2 4 883.1161
## 38 38 1 2 4 1173.6701
## 39 39 1 2 4 1222.3510
## 40 40 1 2 4 1012.2612
## 41 41 1 2 5 1094.7368
## 42 42 1 2 5 1150.0013
## 43 43 1 2 5 910.0105
## 44 44 1 2 5 1156.2022
## 45 45 1 2 6 920.7993
## 46 46 1 2 6 1091.9777
## 47 47 1 2 6 1076.3277
## 48 48 1 2 6 1070.3373
## 49 49 2 1 1 575.9569
## 50 50 2 1 1 572.7247
## 51 51 2 1 1 516.8975
## 52 52 2 1 1 742.2355
## 53 53 2 1 2 546.0629
## 54 54 2 1 2 937.2865
## 55 55 2 1 2 698.3683
## 56 56 2 1 2 577.2317
## 57 57 2 1 3 704.6852
## 58 58 2 1 3 773.6542
## 59 59 2 1 3 845.7383
## 60 60 2 1 3 689.8267
## 61 61 2 1 4 900.4336
## 62 62 2 1 4 771.1799
## 63 63 2 1 4 690.0680
## 64 64 2 1 4 755.4918
## 65 65 2 1 5 705.4354
## 66 66 2 1 5 571.4214
## 67 67 2 1 5 857.3077
## 68 68 2 1 5 854.6633
## 69 69 2 1 6 784.3028
## 70 70 2 1 6 710.7705
## 71 71 2 1 6 630.5571
## 72 72 2 1 6 864.0877
## 73 73 2 2 1 1047.6637
## 74 74 2 2 1 970.0835
## 75 75 2 2 1 915.2007
## 76 76 2 2 1 899.8117
## 77 77 2 2 2 722.0726
## 78 78 2 2 2 1068.2111
## 79 79 2 2 2 1047.2389
## 80 80 2 2 2 944.3038
## 81 81 2 2 3 802.5264
## 82 82 2 2 3 1014.8915
## 83 83 2 2 3 1035.1603
## 84 84 2 2 3 803.0289
## 85 85 2 2 4 853.1991
## 86 86 2 2 4 996.5221
## 87 87 2 2 4 1121.7509
## 88 88 2 2 4 1001.9214
## 89 89 2 2 5 1003.0314
## 90 90 2 2 5 1033.6015
## 91 91 2 2 5 1044.9669
## 92 92 2 2 5 1055.8373
## 93 93 2 2 6 1170.1667
## 94 94 2 2 6 1030.4916
## 95 95 2 2 6 871.4857
## 96 96 2 2 6 1063.2401
dfAOV <- data.frame (ID=df$ID,
Blk=as.factor(df$Blk),
Dgst=as.factor(df$Dgst),
Rcp=as.factor(df$Rcp),
Yld=df$Yld)
str(dfAOV)
## 'data.frame': 96 obs. of 5 variables:
## $ ID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Blk : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ Dgst: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ Rcp : Factor w/ 6 levels "1","2","3","4",..: 1 1 1 1 2 2 2 2 3 3 ...
## $ Yld : num 706 755 830 802 733 ...
model <- lm(Yld~Blk + Dgst*Rcp,data=dfAOV)
summary(model)
##
## Call:
## lm(formula = Yld ~ Blk + Dgst * Rcp, data = dfAOV)
##
## Residuals:
## Min 1Q Median 3Q Max
## -209.738 -74.776 4.358 64.787 250.303
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 697.1925 39.1088 17.827 < 2e-16 ***
## Blk2 -19.1730 21.6936 -0.884 0.3794
## Dgst2 242.5919 53.1383 4.565 1.71e-05 ***
## Rcp2 8.9637 53.1383 0.169 0.8665
## Rcp3 29.1362 53.1383 0.548 0.5850
## Rcp4 99.0410 53.1383 1.864 0.0659 .
## Rcp5 60.8359 53.1383 1.145 0.2556
## Rcp6 106.8706 53.1383 2.011 0.0476 *
## Dgst2:Rcp2 -1.9692 75.1489 -0.026 0.9792
## Dgst2:Rcp3 -69.9717 75.1489 -0.931 0.3545
## Dgst2:Rcp4 3.8601 75.1489 0.051 0.9592
## Dgst2:Rcp5 65.0147 75.1489 0.865 0.3895
## Dgst2:Rcp6 -0.2153 75.1489 -0.003 0.9977
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 106.3 on 83 degrees of freedom
## Multiple R-squared: 0.6436, Adjusted R-squared: 0.592
## F-statistic: 12.49 on 12 and 83 DF, p-value: 4.363e-14
#PLOT DATA
library(ggplot2); theme_set(theme_classic())
myColors <- c('red','green','pink','blue','orange','cyan')
p1 <- ggplot(data=dfAOV, mapping=aes(x=Rcp,y=Yld)) + geom_boxplot(data=dfAOV, mapping=aes(fill=Dgst))
p1 + scale_fill_manual(values=myColors)
p1 + scale_fill_brewer(palette="BuGn")
p2 <- ggplot(data=dfAOV, mapping=aes(x=Dgst,y=Yld)) + geom_boxplot(data=dfAOV, mapping=aes(fill=Rcp))
#color schemes available from color brewer
#http://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3
# the website above helps you pick the right colors
#Diverging color schemes (two or three colors equal saturation)
p2 + scale_fill_brewer(palette="Spectral")
p2 + scale_fill_brewer(palette="RdYlGn")
p2 + scale_fill_brewer(palette="PRGn") #color blind safe
#sequential color schemes (two or three colors equal saturation)
p2 + scale_fill_brewer(palette="YlOrRd")
p2 + scale_fill_brewer(palette="RdPu")
p2 + scale_fill_brewer(palette="BuPu")
p2 + scale_fill_brewer(palette="Greys")
p1 <- ggplot(data=dfAOV, mapping=aes(x=Rcp,y=Yld)) + geom_boxplot(data=dfAOV, mapping=aes(fill=Dgst))
p1 + scale_fill_brewer(palette="Greys")
p2 <- ggplot(data=dfAOV, mapping=aes(x=Dgst,y=Yld)) + geom_boxplot(data=dfAOV, mapping=aes(fill=Rcp))
#color schemes available from color brewer
#http://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3
# the website above helps you pick the right colors
p2 + scale_fill_brewer(palette="BuGn")
# now generate a plot with tukey labels
tuk <- tukeyLabels(treatment=dfAOV$Rcp,
response = dfAOV$Yld,
alpha = 0.05)
print(tuk)
## plot.labels labels V1
## 1 2 a 1068.411
## 2 3 a 1035.360
## 3 4 a 1222.551
## 4 5 a 1156.402
## 5 6 a 1170.367
## 6 1 a 1130.802
str(tuk)
## 'data.frame': 6 obs. of 3 variables:
## $ plot.labels: chr "2" "3" "4" "5" ...
## $ labels : chr "a" "a" "a" "a" ...
## $ V1 : num 1068 1035 1223 1156 1170 ...
tukeyPlot <- ggplot(data=dfAOV, mapping=aes(x=Rcp,y=Yld)) + geom_text(data=tuk, aes(x=plot.labels,
y=V1,
label=labels,
hjust=c(rep(.4)*length(labels)),
fontface='bold')) + scale_fill_brewer(palette="BuGn")
tukeyPlot + geom_boxplot(aes(group=Rcp)) + geom_jitter(width=0.1,height=0)
ggsave(filename="tukeyplot_Rcp-Yld.pdf",
plot=tukeyPlot,
device="pdf")
## Saving 7 x 5 in image