#########################################
## Multivariate Analysis - R Tutorial  ##
#########################################

# Please read the tutorial's general instructions first, and prepare
# for loading an external datafile, if you will use one.
# The content of this file can be pasted directly into the R console.
# It should produce a flood of errors only at the point where you
# are supposed to have an external dataset available.
# However, it is much handier to use the "display file" command
# within the R menu to look at this file, and to paste it command by
# command into the console, using Ctrl-V. Take your time to experiment
# a bit with the listed commands.

# You will need the "mva", "MASS", and "lattice" libraries.
#! Load them here.
# (Note: in recent versions of R the mva package has been merged into
# the standard stats package, which is loaded automatically; in that
# case library(mva) gives an error that you can safely ignore.)
library(mva)
library(MASS)
library(lattice)

#---------------------------------------------
# .Rmultivariate: Multivariate Analysis in R.
#---------------------------------------------
# Content:
#
# I.  Principal component analysis
# II. Multivariate normal linear models (MANOVA)

# -----------------------------------
# | I. Principal Component Analysis |
# -----------------------------------

# We will carry out a principal component analysis in this section,
# using the iris dataset that often appears in multivariate
# analysis textbooks.
data(iris) # Edgar Anderson's famous iris data
attach(iris)
summary(iris)

# This dataset contains five variables, of which
# four are responses. We group those together into a
# "response matrix" y:
y <- cbind(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width)
# We use the function cbind() for this,
# which binds columns together into a matrix.

# The following creates the principal component object pciris,
# which contains a load of information that we can request.
# Note the option "cor=T", which means that the analysis is based on
# the correlation matrix, i.e., the variables are scaled to unit
# variance before the principal components are calculated.
pciris <- princomp(log(y), cor=T)

# Let's take a look at what it contains:
summary(pciris)
# Among other things, we see the proportion of the total variance
# explained by each principal component (PC).

# Using loadings(), you can inspect the contribution of
# each original variable to each PC.
loadings(pciris)

# The iris dataset contains data from three Iris species.
# We now want to see how the three species become
# separated in the directions determined by the PCs.
# We first construct predicted values (the PC scores) for that:
pcirispred <- predict(pciris)

# Then we look at the distribution of the data over the first two PCs:
eqscplot(pcirispred[,1:2], xlab="Principal component 1",
         ylab="Principal component 2")

# This is not extremely insightful.
# Let's be a bit brave, and run a complicated plot script,
# using the "splom" graphical function from the lattice package:
super.sym <- trellis.par.get("superpose.symbol")
splom(~pcirispred, groups = Species, panel = panel.superpose,
      key = list(title = "Three Varieties of Iris", columns = 3,
                 points = list(pch = super.sym$pch[1:3],
                               col = super.sym$col[1:3]),
                 text = list(c("Setosa", "Versicolor", "Virginica"))))
# Interpret the result.

# Please take a closer look at the functions we used:
help(princomp) # from the mva library
help(predict)
help(eqscplot) # from the MASS library
help(splom)    # from the lattice library
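
# As an optional extra (a minimal sketch that only reuses the pciris
# object created above), two more standard views of a princomp object:
# screeplot() plots the variances of the successive PCs, and biplot()
# shows the scores on the first two PCs together with arrows for the
# loadings of the original variables.
screeplot(pciris, type="lines")
biplot(pciris)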

# -----------------------------------------
# | II. Multivariate normal linear models |
# -----------------------------------------

# Let's look at the relevant help files first:
help(manova)
help(summary.manova)

# MANOVA stands for multivariate analysis of variance.
# It is related to univariate ANOVA, and you can request the
# results of such ANOVAs from MANOVA objects.

# We will use the example from the R help files:
# an example on producing plastic film from Krzanowski (1998, p. 381).
# There are three response variables in this case.
tear <- c(6.5, 6.2, 5.8, 6.5, 6.5, 6.9, 7.2, 6.9, 6.1, 6.3,
          6.7, 6.6, 7.2, 7.1, 6.8, 7.1, 7.0, 7.2, 7.5, 7.6)
gloss <- c(9.5, 9.9, 9.6, 9.6, 9.2, 9.1, 10.0, 9.9, 9.5, 9.4,
           9.1, 9.3, 8.3, 8.4, 8.5, 9.2, 8.8, 9.7, 10.1, 9.2)
opacity <- c(4.4, 6.4, 3.0, 4.1, 0.8, 5.7, 2.0, 3.9, 1.9, 5.7,
             2.8, 4.1, 3.8, 1.6, 3.4, 8.4, 5.2, 6.9, 2.7, 1.9)
resp <- cbind(tear, gloss, opacity)

# ... and two explanatory or predictor variables:
rate <- factor(gl(2, 10), labels=c("Low", "High"))
additive <- factor(gl(2, 5, len=20), labels=c("Low", "High"))

# We fit the MANOVA model.
# The star "*" between rate and additive means that we fit the
# main effect of the factor "rate", the main effect of the factor
# "additive", and their interaction.
fit <- manova(resp ~ rate * additive)

# summary.aov() produces univariate ANOVA tables,
# one per separate response variable.
summary.aov(fit)

# summary() gives multivariate tests, i.e., tests for an effect on
# all response variables seen together as a multivariate normal
# cloud of data points.
summary(fit, test="Pillai")
summary(fit, test="Roy")

# Scheiner and Gurevitch (2001) advise using Pillai's trace
# when assumptions of MANOVA are violated.
# Roy's largest root has the greatest power.
# In this case, they produce exactly the same result.

# Let's list the MANOVA assumptions here for completeness:
# - subjects are independent
# - all random effects/residuals are normally distributed
# - the variances of residuals are equal among groups (homoscedasticity)
# - the covariances among error effects are equal among groups
# You can assess the assumptions by inspecting residuals.
resfit <- residuals(fit) # residuals for all response variables
hist(resfit[,1])
qqnorm(resfit[,1])
qqline(resfit[,1])
# This shows that a violation of the normality assumption is not
# unlikely for the first response variable, but the dataset is small.

# Most of the material in this short tutorial comes from:
# Venables, W. N. and Ripley, B. D. (2002) Modern Applied Statistics
#   with S. Fourth edition. Springer.
# Scheiner, S. M. and Gurevitch, J. (2001) Design and Analysis of
#   Ecological Experiments. Second edition. Oxford University Press.
# Krzanowski, W. J. (1988) Principles of Multivariate Analysis.
#   A User's Perspective. Oxford.

# Tom Van Dooren, version 30/1/2003
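
# -------------------------------------------------------------------
# Appendix: optional extra checks (a minimal sketch, reusing the fit
# and resfit objects created above).
# -------------------------------------------------------------------
# The test argument of summary.manova() also accepts the two remaining
# standard multivariate test statistics, "Wilks" and "Hotelling-Lawley".
# Because every term in this design has a single degree of freedom,
# they should again give the same result here.
summary(fit, test="Wilks")
summary(fit, test="Hotelling-Lawley")

# The normality checks above can be repeated for the other two
# response variables (columns 2 and 3 of resfit):
qqnorm(resfit[,2]); qqline(resfit[,2])
qqnorm(resfit[,3]); qqline(resfit[,3])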