R Markdown

This R markdown evaluates ggplot options for random data reflectig a block design with six treatment groups in each block. In this markdown I also build a function to plot resuls from a tukey honest significant difference post hoc test.

# Define Functions------------------------------------
#######################################################
#FUNCTION: AOVtest
# function to conduct Analysis of Variance, produce a vector of outputs and plot a graph
#inputs:
#y - continuous response variable
#x - categorical x variable
#alpha - significance level e.g. 0.05 or 0.01 or 0.001
#postHoc - T/F if TRUE function runs TukeyHSD
#outputs: returns a list with p value and TukeyHSD results
AOVtest <- function(x=rep(letters[1:3],each=10),
                    y=runif(30)*rep(1:3,each=10),
                    alpha=0.05,
                    postHoc = TRUE){
  if (length(y)!=length(x)){
    stop('x and y vectors are not the same lenght')
  }
  # make sure treatment var, x, a factor
  if (!is.factor(x)){x<-factor(x)}
  df <- data.frame(ID=1:length(y),y=y,x=x)
  test <- aov(y~x, data=df) #analysis of variance function
  pvalue <- as.numeric(unlist(summary(test))[9])
  if (pvalue <= alpha && postHoc == TRUE){
    tuk <- TukeyHSD(test, ordered = FALSE, conf.level = (1 - alpha))
    results <- c('pvalue'= pvalue, #p-value for h0 means are not different
                 'tukey.table'=tuk) #results of tukey table
    return(results)
  }
  results <- c('pvalue'=pvalue, test)
  return(results)
}
#AOVtest()

#######################################################
# FUNCTION: plotAOV
# produces a boxplot labeled with analysis of variance results 
# Inputs: 
# treatment - catagorical x independant variable 
# response - continuous y dependant variable
# statsOnPlot - T/F if TRUE graph p value and post hoc test on plot
# postHoc - T/F if TRUE graph significant difference labels on plot
#Outputs: 
#p - ggplot object with boxplot and results
#Dependancies:
# tukeylabel_df()
# ggplot2
#-----------------------------------------------------------
plotAOV <- function(treatment=rep(1:3,each=10),
                    response=runif(30),
                    statsOnPlot=FALSE,
                    postHoc=FALSE,
                    alpha=0.05){
  #create data frame for analysis
  if(is.integer(treatment)){x <- factor(treatment)
  }else{stop('function failed: treatment must be an integer')}
  df <- data.frame(ID = 1:length(treatment),
                   x=x, #converting to numeric factor
                   y=response)
  
  if(statsOnPlot==TRUE){
    #generate significant diference labels for postHOC tests
    if (postHoc == TRUE){
      stats <- tukeyLabels(treatment=treatment,response=response)
      p <- ggplot(data=df,aes(x=x,y=y))+geom_boxplot()+ 
           geom_text(data=stats, 
                     aes(x=plot.labels,
                         y=V1,
                         label=labels,
                         hjust=c(2,2,2),
                         fontface='bold'))
    } else {
      t <- unlist(summary(aov(y~x,data=df)))
      pvalue <- toString(t[9]) 
      p <- ggplot(data=df,aes(x=x,y=y,fill=x))+
        geom_boxplot()+
        geom_text(aes(x=Inf,y=Inf,hjust=1.1,
                      vjust=1.5,label=paste("p =",pvalue)))
    } #end if postHoc
  } else {#if stats on plot not equal TRUE
  p <- ggplot(data=df,aes(x=x,y=y,fill=x))+geom_boxplot()
  } #end if stats on plot
  return(p) # return ggplot object
}
#plotAOV(statsOnPlot=TRUE,postHoc=TRUE)


#########################
#FUNCTION: tukeylabel_df
#generates labels indicating significant differences from tukey post-hoc
#for use in ggplot2 boxplots
#inputs: 
#x and y - vectors of same length, 
#alpha - a numeric value indicating significance level (e.g. 0.05)
#outputs: a dataframe of labels for the tukey test
#------------------------------------------------
tukeyLabels <- function(treatment=rep(1:3,each=10), # treatment levels
                        response=runif(30), # responses to treatments
                        alpha=0.05){
  # Extract labels and factor levels from Tukey post-hoc
  if(is.factor(treatment)){}else{treatment <- factor(treatment)}
  df <- data.frame(x=treatment,y=response)
  HSD <- TukeyHSD(aov(y~x,data=df), 
                  ordered = FALSE, 
                  conf.level = (1-alpha))
  Tukey.levels <- HSD[['x']][,4]
  Tukey.labels <- multcompLetters(Tukey.levels)['Letters']
  plot.labels <- names(Tukey.labels[['Letters']])
  # Get highest quantile for Tukey's 5 number summary and add a bit of space to buffer between    
  # upper quantile and label placement
  boxplot.df <- plyr::ddply(df, 'x', function (z) max(fivenum(z$y)) + 0.2)
  
  # Create a data frame out of the factor levels and Tukey's homogenous group letters
  plot.levels <- data.frame(plot.labels, 
                            labels = Tukey.labels[['Letters']],
                            stringsAsFactors = FALSE)
  
  # Merge it with the labels
  labels.df <- merge(plot.levels, 
                     boxplot.df, 
                     by.x = 'plot.labels', 
                     by.y = 'x',
                     sort = FALSE)
  
  return(labels.df)
}
#tukeyLabels()

#########################################################################
#FUNCTION: creatExperiment
# function to simulate random normal data for a specified balanced experimental setup
#inputs: 
# number of factors, blocks, levels, replications in experiment
# sample mean and standard deviation of all samples
# the effect size of blocks, factors, treatments, replications

#outputs:
# a datafram containing random data and treatment factors
#notes on updates:
# - This is a revised version of simulateData, which operated very slowly
# because the function called c() to build vectors within a for loop. This version pre-assigns the mode and length of vectors based on the number of factors, blocks, levels and replications. 
# - The response variable model can now be specified in a string using the same notation as lm() 
#   y~x1+x2 --> y = b0 + b1*x1 + b2*x2 + E
    
#-------------------------------------------------------------------------
makeXdf <- function(nBlocks = 1, #number of study blocks or sites
                    nFactors = 1, #number of treatment factors
                    nLevels = 3, #number of treatment levels for each factor
                    nReps = 10 #number of replications f(b(l(r))))
                      ){
  nTotal <- nFactors*nBlocks*nLevels*nReps
  ID <- vector(mode = "numeric", length = nTotal)
  fact <- vector(mode = "numeric", length = nTotal)
  block <- vector(mode = "numeric", length = nTotal)
  level <- vector(mode = "numeric", length = nTotal)
  rep <- vector(mode = "numeric", length = nTotal)
  response <- vector(mode = "numeric", length = nTotal)
  i <- 0
  for (b in seq(1,nBlocks)){
    for (f in seq(1,nFactors)){
      for (l in seq(1,nLevels)){
        for (r in seq(1,nReps)){
          i <- i + 1
          ID[i] <- i
          fact[i] <- f
          block[i] <- b
          level[i] <- l
          rep[i] <- r
          cat(i,"f",f,"b",b,"l",l,"r",r,"\n")
        }# end o loop
      } # end t loop
    } # end b loop
  } # end f loop
  df <- data.frame(ID=ID,
                   Blk=block,
                   Dgst=fact,
                   Rcp=level,
                   Yld=response)
  return(df)
}


#make a dataframe 
df <- makeXdf(nBlocks = 2, #number of study blocks or sites
              nFactors = 2, #number of treatment factors
              nLevels = 6, #number of treatment levels for each factor
              nReps = 4)
## 1 f 1 b 1 l 1 r 1 
## 2 f 1 b 1 l 1 r 2 
## 3 f 1 b 1 l 1 r 3 
## 4 f 1 b 1 l 1 r 4 
## 5 f 1 b 1 l 2 r 1 
## 6 f 1 b 1 l 2 r 2 
## 7 f 1 b 1 l 2 r 3 
## 8 f 1 b 1 l 2 r 4 
## 9 f 1 b 1 l 3 r 1 
## 10 f 1 b 1 l 3 r 2 
## 11 f 1 b 1 l 3 r 3 
## 12 f 1 b 1 l 3 r 4 
## 13 f 1 b 1 l 4 r 1 
## 14 f 1 b 1 l 4 r 2 
## 15 f 1 b 1 l 4 r 3 
## 16 f 1 b 1 l 4 r 4 
## 17 f 1 b 1 l 5 r 1 
## 18 f 1 b 1 l 5 r 2 
## 19 f 1 b 1 l 5 r 3 
## 20 f 1 b 1 l 5 r 4 
## 21 f 1 b 1 l 6 r 1 
## 22 f 1 b 1 l 6 r 2 
## 23 f 1 b 1 l 6 r 3 
## 24 f 1 b 1 l 6 r 4 
## 25 f 2 b 1 l 1 r 1 
## 26 f 2 b 1 l 1 r 2 
## 27 f 2 b 1 l 1 r 3 
## 28 f 2 b 1 l 1 r 4 
## 29 f 2 b 1 l 2 r 1 
## 30 f 2 b 1 l 2 r 2 
## 31 f 2 b 1 l 2 r 3 
## 32 f 2 b 1 l 2 r 4 
## 33 f 2 b 1 l 3 r 1 
## 34 f 2 b 1 l 3 r 2 
## 35 f 2 b 1 l 3 r 3 
## 36 f 2 b 1 l 3 r 4 
## 37 f 2 b 1 l 4 r 1 
## 38 f 2 b 1 l 4 r 2 
## 39 f 2 b 1 l 4 r 3 
## 40 f 2 b 1 l 4 r 4 
## 41 f 2 b 1 l 5 r 1 
## 42 f 2 b 1 l 5 r 2 
## 43 f 2 b 1 l 5 r 3 
## 44 f 2 b 1 l 5 r 4 
## 45 f 2 b 1 l 6 r 1 
## 46 f 2 b 1 l 6 r 2 
## 47 f 2 b 1 l 6 r 3 
## 48 f 2 b 1 l 6 r 4 
## 49 f 1 b 2 l 1 r 1 
## 50 f 1 b 2 l 1 r 2 
## 51 f 1 b 2 l 1 r 3 
## 52 f 1 b 2 l 1 r 4 
## 53 f 1 b 2 l 2 r 1 
## 54 f 1 b 2 l 2 r 2 
## 55 f 1 b 2 l 2 r 3 
## 56 f 1 b 2 l 2 r 4 
## 57 f 1 b 2 l 3 r 1 
## 58 f 1 b 2 l 3 r 2 
## 59 f 1 b 2 l 3 r 3 
## 60 f 1 b 2 l 3 r 4 
## 61 f 1 b 2 l 4 r 1 
## 62 f 1 b 2 l 4 r 2 
## 63 f 1 b 2 l 4 r 3 
## 64 f 1 b 2 l 4 r 4 
## 65 f 1 b 2 l 5 r 1 
## 66 f 1 b 2 l 5 r 2 
## 67 f 1 b 2 l 5 r 3 
## 68 f 1 b 2 l 5 r 4 
## 69 f 1 b 2 l 6 r 1 
## 70 f 1 b 2 l 6 r 2 
## 71 f 1 b 2 l 6 r 3 
## 72 f 1 b 2 l 6 r 4 
## 73 f 2 b 2 l 1 r 1 
## 74 f 2 b 2 l 1 r 2 
## 75 f 2 b 2 l 1 r 3 
## 76 f 2 b 2 l 1 r 4 
## 77 f 2 b 2 l 2 r 1 
## 78 f 2 b 2 l 2 r 2 
## 79 f 2 b 2 l 2 r 3 
## 80 f 2 b 2 l 2 r 4 
## 81 f 2 b 2 l 3 r 1 
## 82 f 2 b 2 l 3 r 2 
## 83 f 2 b 2 l 3 r 3 
## 84 f 2 b 2 l 3 r 4 
## 85 f 2 b 2 l 4 r 1 
## 86 f 2 b 2 l 4 r 2 
## 87 f 2 b 2 l 4 r 3 
## 88 f 2 b 2 l 4 r 4 
## 89 f 2 b 2 l 5 r 1 
## 90 f 2 b 2 l 5 r 2 
## 91 f 2 b 2 l 5 r 3 
## 92 f 2 b 2 l 5 r 4 
## 93 f 2 b 2 l 6 r 1 
## 94 f 2 b 2 l 6 r 2 
## 95 f 2 b 2 l 6 r 3 
## 96 f 2 b 2 l 6 r 4
# simulate yield as a function of block(Blk) factor(Dgst) and level(Rcp)onto data
b0 <- 450 # average Yld across all data
b1 <- 0 #
b2 <- 200
b3 <- 30
b4 <- 0
SD <- 100

df$Yld <- with(df, b0 + b1*Blk + b2*Dgst + b3*Rcp + b4*Dgst*Rcp + rnorm(length(df$Yld),0,SD))
df
##    ID Blk Dgst Rcp       Yld
## 1   1   1    1   1  706.4380
## 2   2   1    1   1  755.0527
## 3   3   1    1   1  830.0211
## 4   4   1    1   1  801.5212
## 5   5   1    1   2  732.8892
## 6   6   1    1   2  602.1277
## 7   7   1    1   2  766.9442
## 8   8   1    1   2  711.6467
## 9   9   1    1   3  552.4465
## 10 10   1    1   3  749.3574
## 11 11   1    1   3  775.7521
## 12 12   1    1   3  642.4774
## 13 13   1    1   4  825.4818
## 14 14   1    1   4  712.5887
## 15 15   1    1   4  825.1914
## 16 16   1    1   4  812.7405
## 17 17   1    1   5  704.9822
## 18 18   1    1   5  840.2974
## 19 19   1    1   5  746.9153
## 20 20   1    1   5  706.5117
## 21 21   1    1   6  908.5762
## 22 22   1    1   6  858.0082
## 23 23   1    1   6  865.1528
## 24 24   1    1   6  734.3575
## 25 25   1    2   1 1130.6021
## 26 26   1    2   1  730.0465
## 27 27   1    2   1  833.9632
## 28 28   1    2   1  914.2118
## 29 29   1    2   2  937.1398
## 30 30   1    2   2  924.8626
## 31 31   1    2   2  808.4236
## 32 32   1    2   2 1045.2867
## 33 33   1    2   3  949.9288
## 34 34   1    2   3  757.5682
## 35 35   1    2   3  906.0843
## 36 36   1    2   3  845.7114
## 37 37   1    2   4  883.1161
## 38 38   1    2   4 1173.6701
## 39 39   1    2   4 1222.3510
## 40 40   1    2   4 1012.2612
## 41 41   1    2   5 1094.7368
## 42 42   1    2   5 1150.0013
## 43 43   1    2   5  910.0105
## 44 44   1    2   5 1156.2022
## 45 45   1    2   6  920.7993
## 46 46   1    2   6 1091.9777
## 47 47   1    2   6 1076.3277
## 48 48   1    2   6 1070.3373
## 49 49   2    1   1  575.9569
## 50 50   2    1   1  572.7247
## 51 51   2    1   1  516.8975
## 52 52   2    1   1  742.2355
## 53 53   2    1   2  546.0629
## 54 54   2    1   2  937.2865
## 55 55   2    1   2  698.3683
## 56 56   2    1   2  577.2317
## 57 57   2    1   3  704.6852
## 58 58   2    1   3  773.6542
## 59 59   2    1   3  845.7383
## 60 60   2    1   3  689.8267
## 61 61   2    1   4  900.4336
## 62 62   2    1   4  771.1799
## 63 63   2    1   4  690.0680
## 64 64   2    1   4  755.4918
## 65 65   2    1   5  705.4354
## 66 66   2    1   5  571.4214
## 67 67   2    1   5  857.3077
## 68 68   2    1   5  854.6633
## 69 69   2    1   6  784.3028
## 70 70   2    1   6  710.7705
## 71 71   2    1   6  630.5571
## 72 72   2    1   6  864.0877
## 73 73   2    2   1 1047.6637
## 74 74   2    2   1  970.0835
## 75 75   2    2   1  915.2007
## 76 76   2    2   1  899.8117
## 77 77   2    2   2  722.0726
## 78 78   2    2   2 1068.2111
## 79 79   2    2   2 1047.2389
## 80 80   2    2   2  944.3038
## 81 81   2    2   3  802.5264
## 82 82   2    2   3 1014.8915
## 83 83   2    2   3 1035.1603
## 84 84   2    2   3  803.0289
## 85 85   2    2   4  853.1991
## 86 86   2    2   4  996.5221
## 87 87   2    2   4 1121.7509
## 88 88   2    2   4 1001.9214
## 89 89   2    2   5 1003.0314
## 90 90   2    2   5 1033.6015
## 91 91   2    2   5 1044.9669
## 92 92   2    2   5 1055.8373
## 93 93   2    2   6 1170.1667
## 94 94   2    2   6 1030.4916
## 95 95   2    2   6  871.4857
## 96 96   2    2   6 1063.2401
dfAOV <- data.frame (ID=df$ID,
                     Blk=as.factor(df$Blk),
                     Dgst=as.factor(df$Dgst),
                     Rcp=as.factor(df$Rcp),
                     Yld=df$Yld)

str(dfAOV)
## 'data.frame':    96 obs. of  5 variables:
##  $ ID  : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Blk : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Dgst: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Rcp : Factor w/ 6 levels "1","2","3","4",..: 1 1 1 1 2 2 2 2 3 3 ...
##  $ Yld : num  706 755 830 802 733 ...
model <- lm(Yld~Blk + Dgst*Rcp,data=dfAOV)
summary(model)
## 
## Call:
## lm(formula = Yld ~ Blk + Dgst * Rcp, data = dfAOV)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -209.738  -74.776    4.358   64.787  250.303 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 697.1925    39.1088  17.827  < 2e-16 ***
## Blk2        -19.1730    21.6936  -0.884   0.3794    
## Dgst2       242.5919    53.1383   4.565 1.71e-05 ***
## Rcp2          8.9637    53.1383   0.169   0.8665    
## Rcp3         29.1362    53.1383   0.548   0.5850    
## Rcp4         99.0410    53.1383   1.864   0.0659 .  
## Rcp5         60.8359    53.1383   1.145   0.2556    
## Rcp6        106.8706    53.1383   2.011   0.0476 *  
## Dgst2:Rcp2   -1.9692    75.1489  -0.026   0.9792    
## Dgst2:Rcp3  -69.9717    75.1489  -0.931   0.3545    
## Dgst2:Rcp4    3.8601    75.1489   0.051   0.9592    
## Dgst2:Rcp5   65.0147    75.1489   0.865   0.3895    
## Dgst2:Rcp6   -0.2153    75.1489  -0.003   0.9977    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 106.3 on 83 degrees of freedom
## Multiple R-squared:  0.6436, Adjusted R-squared:  0.592 
## F-statistic: 12.49 on 12 and 83 DF,  p-value: 4.363e-14
#PLOT DATA
library(ggplot2); theme_set(theme_classic())

myColors <- c('red','green','pink','blue','orange','cyan')
p1 <- ggplot(data=dfAOV, mapping=aes(x=Rcp,y=Yld)) + geom_boxplot(data=dfAOV, mapping=aes(fill=Dgst))
p1 + scale_fill_manual(values=myColors)

p1 + scale_fill_brewer(palette="BuGn")

p2 <- ggplot(data=dfAOV, mapping=aes(x=Dgst,y=Yld)) + geom_boxplot(data=dfAOV, mapping=aes(fill=Rcp))

#color schemes available from color brewer
#http://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3
# the website above helps you pick the right colors

#Diverging color schemes (two or three colors equal saturation)
p2 + scale_fill_brewer(palette="Spectral")

p2 + scale_fill_brewer(palette="RdYlGn") 

p2 + scale_fill_brewer(palette="PRGn") #color blind safe

#sequential color schemes (two or three colors equal saturation)
p2 + scale_fill_brewer(palette="YlOrRd")

p2 + scale_fill_brewer(palette="RdPu")

p2 + scale_fill_brewer(palette="BuPu") 

p2 + scale_fill_brewer(palette="Greys")

p1 <- ggplot(data=dfAOV, mapping=aes(x=Rcp,y=Yld)) + geom_boxplot(data=dfAOV, mapping=aes(fill=Dgst))
p1 + scale_fill_brewer(palette="Greys")

p2 <- ggplot(data=dfAOV, mapping=aes(x=Dgst,y=Yld)) + geom_boxplot(data=dfAOV, mapping=aes(fill=Rcp))

#color schemes available from color brewer
#http://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3
# the website above helps you pick the right colors
p2 + scale_fill_brewer(palette="BuGn")

# now generate a plot with tukey labels
tuk <- tukeyLabels(treatment=dfAOV$Rcp,
                   response = dfAOV$Yld,
                   alpha = 0.05)
print(tuk)
##   plot.labels labels       V1
## 1           2      a 1068.411
## 2           3      a 1035.360
## 3           4      a 1222.551
## 4           5      a 1156.402
## 5           6      a 1170.367
## 6           1      a 1130.802
str(tuk)
## 'data.frame':    6 obs. of  3 variables:
##  $ plot.labels: chr  "2" "3" "4" "5" ...
##  $ labels     : chr  "a" "a" "a" "a" ...
##  $ V1         : num  1068 1035 1223 1156 1170 ...
tukeyPlot <- ggplot(data=dfAOV, mapping=aes(x=Rcp,y=Yld)) + geom_text(data=tuk, aes(x=plot.labels,
                         y=V1,
                         label=labels,
                         hjust=c(rep(.4)*length(labels)),
                         fontface='bold')) +  scale_fill_brewer(palette="BuGn")

tukeyPlot + geom_boxplot(aes(group=Rcp)) + geom_jitter(width=0.1,height=0)

ggsave(filename="tukeyplot_Rcp-Yld.pdf",
       plot=tukeyPlot,
       device="pdf")
## Saving 7 x 5 in image