###################################################################################################### ###################################################################################################### #### PMR_MultivariateDataAnalysis #### comes from #### http://little-book-of-r-for-multivariate-analysis.readthedocs.org/en/latest/src/multivariateanalysis.html ###################################################################################################### ###################################################################################################### library("car") library("RColorBrewer") setwd("G:\\PMR") wine <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",sep=",") # Alternativ: wine <- read.table("data\\PMR_tut_MultivariateDataAnalysis_wine.txt",sep=",") names(wine) <- c("cultivar", paste0("c", seq(1:13))) head(wine) # Plot scatterplotMatrix(wine[2:6]) #For example, in the matrix scatterplot above, the cell in the third column of the fourth row down is a scatterplot of V5 (x-axis) against V4 (y-axis). If you look at this scatterplot, it appears that there may be a positive relationship between V5 and V4. plot(wine$c4, wine$c5) # label data text(wine$c4, wine$c5, wine$c1, cex=0.7, pos=4, col="red") #For example, to make a profile plot of the concentrations of the first five chemicals in the wine samples (stored in columns V2, V3, V4, V5, V6 of variable "wine"), we type: names <- c("c2","c3","c4","c5","c6") mylist <- list(wine$c2,wine$c3,wine$c4,wine$c5,wine$c6) makeProfilePlot(mylist,names) #It is clear from the profile plot that the mean and standard deviation for V6 is quite a lot higher than that for the other variables. sapply(wine[2:14],mean) sapply(wine[2:14],sd) #To extract out the data for just cultivar 2, we can type: cultivar2wine <- wine[wine$cultivar=="2",] sapply(cultivar2wine[2:14],mean) sapply(cultivar2wine[2:14], sd) #To use the function "printMeanAndSdByGroup()", you first need to copy and paste it into R. The arguments of the function are the variables that you want to calculate means and standard deviations for, and the variable containing the group of each sample. For example, to calculate the mean and standard deviation for each of the 13 chemical concentrations, for each of the three different wine cultivars, we type: printMeanAndSdByGroup(wine[2:14],wine[1]) # Between-groups Variance and Within-groups Variance for a Variable calcWithinGroupsVariance(wine[2],wine[1]) calcBetweenGroupsVariance(wine[2],wine[1]) calcSeparations(wine[2:14],wine[1]) calcWithinGroupsCovariance(wine[8],wine[11],wine[1]) calcBetweenGroupsCovariance(wine[8],wine[11],wine[1]) cor.test(wine$c2, wine$c3) mosthighlycorrelated(wine[2:14], 10) standardisedconcentrations <- as.data.frame(scale(wine[2:14])) sapply(standardisedconcentrations,mean) sapply(standardisedconcentrations,sd) standardisedconcentrations <- as.data.frame(scale(wine[2:14])) wine.pca <- prcomp(standardisedconcentrations) summary(wine.pca) screeplot(wine.pca, type="lines") calcpc(standardisedconcentrations, wine.pca$rotation[,1]) ###################################################################################################### ###################################################################################################### # Useful Functions ###################################################################################################### ###################################################################################################### makeProfilePlot <- function(mylist,names) { require(RColorBrewer) # find out how many variables we want to include numvariables <- length(mylist) # choose 'numvariables' random colours colours <- brewer.pal(numvariables,"Set1") # find out the minimum and maximum values of the variables: mymin <- 1e+20 mymax <- 1e-20 for (i in 1:numvariables) { vectori <- mylist[[i]] mini <- min(vectori) maxi <- max(vectori) if (mini < mymin) { mymin <- mini } if (maxi > mymax) { mymax <- maxi } } # plot the variables for (i in 1:numvariables) { vectori <- mylist[[i]] namei <- names[i] colouri <- colours[i] if (i == 1) { plot(vectori,col=colouri,type="l",ylim=c(mymin,mymax)) } else { points(vectori, col=colouri,type="l") } lastxval <- length(vectori) lastyval <- vectori[length(vectori)] text((lastxval-10),(lastyval),namei,col="black",cex=0.6) } } printMeanAndSdByGroup <- function(variables,groupvariable) { # find the names of the variables variablenames <- c(names(groupvariable),names(as.data.frame(variables))) # within each group, find the mean of each variable groupvariable <- groupvariable[,1] # ensures groupvariable is not a list means <- aggregate(as.matrix(variables) ~ groupvariable, FUN = mean) names(means) <- variablenames print(paste("Means:")) print(means) # within each group, find the standard deviation of each variable: sds <- aggregate(as.matrix(variables) ~ groupvariable, FUN = sd) names(sds) <- variablenames print(paste("Standard deviations:")) print(sds) # within each group, find the number of samples: samplesizes <- aggregate(as.matrix(variables) ~ groupvariable, FUN = length) names(samplesizes) <- variablenames print(paste("Sample sizes:")) print(samplesizes) } calcWithinGroupsVariance <- function(variable,groupvariable) { # find out how many values the group variable can take groupvariable2 <- as.factor(groupvariable[[1]]) levels <- levels(groupvariable2) numlevels <- length(levels) # get the mean and standard deviation for each group: numtotal <- 0 denomtotal <- 0 for (i in 1:numlevels) { leveli <- levels[i] levelidata <- variable[groupvariable==leveli,] levelilength <- length(levelidata) # get the standard deviation for group i: sdi <- sd(levelidata) numi <- (levelilength - 1)*(sdi * sdi) denomi <- levelilength numtotal <- numtotal + numi denomtotal <- denomtotal + denomi } # calculate the within-groups variance Vw <- numtotal / (denomtotal - numlevels) return(Vw) } calcBetweenGroupsVariance <- function(variable,groupvariable) { # find out how many values the group variable can take groupvariable2 <- as.factor(groupvariable[[1]]) levels <- levels(groupvariable2) numlevels <- length(levels) # calculate the overall grand mean: grandmean <- mean(variable) # get the mean and standard deviation for each group: numtotal <- 0 denomtotal <- 0 for (i in 1:numlevels) { leveli <- levels[i] levelidata <- variable[groupvariable==leveli,] levelilength <- length(levelidata) # get the mean and standard deviation for group i: meani <- mean(levelidata) sdi <- sd(levelidata) numi <- levelilength * ((meani - grandmean)^2) denomi <- levelilength numtotal <- numtotal + numi denomtotal <- denomtotal + denomi } # calculate the between-groups variance Vb <- numtotal / (numlevels - 1) Vb <- Vb[[1]] return(Vb) } calcSeparations <- function(variables,groupvariable) { # find out how many variables we have variables <- as.data.frame(variables) numvariables <- length(variables) # find the variable names variablenames <- colnames(variables) # calculate the separation for each variable for (i in 1:numvariables) { variablei <- variables[i] variablename <- variablenames[i] Vw <- calcWithinGroupsVariance(variablei, groupvariable) Vb <- calcBetweenGroupsVariance(variablei, groupvariable) sep <- Vb/Vw print(paste("variable",variablename,"Vw=",Vw,"Vb=",Vb,"separation=",sep)) } } calcWithinGroupsCovariance <- function(variable1,variable2,groupvariable) { # find out how many values the group variable can take groupvariable2 <- as.factor(groupvariable[[1]]) levels <- levels(groupvariable2) numlevels <- length(levels) # get the covariance of variable 1 and variable 2 for each group: Covw <- 0 for (i in 1:numlevels) { leveli <- levels[i] levelidata1 <- variable1[groupvariable==leveli,] levelidata2 <- variable2[groupvariable==leveli,] mean1 <- mean(levelidata1) mean2 <- mean(levelidata2) levelilength <- length(levelidata1) # get the covariance for this group: term1 <- 0 for (j in 1:levelilength) { term1 <- term1 + ((levelidata1[j] - mean1)*(levelidata2[j] - mean2)) } Cov_groupi <- term1 # covariance for this group Covw <- Covw + Cov_groupi } totallength <- nrow(variable1) Covw <- Covw / (totallength - numlevels) return(Covw) } calcBetweenGroupsCovariance <- function(variable1,variable2,groupvariable) { # find out how many values the group variable can take groupvariable2 <- as.factor(groupvariable[[1]]) levels <- levels(groupvariable2) numlevels <- length(levels) # calculate the grand means variable1mean <- mean(variable1) variable2mean <- mean(variable2) # calculate the between-groups covariance Covb <- 0 for (i in 1:numlevels) { leveli <- levels[i] levelidata1 <- variable1[groupvariable==leveli,] levelidata2 <- variable2[groupvariable==leveli,] mean1 <- mean(levelidata1) mean2 <- mean(levelidata2) levelilength <- length(levelidata1) term1 <- (mean1 - variable1mean)*(mean2 - variable2mean)*(levelilength) Covb <- Covb + term1 } Covb <- Covb / (numlevels - 1) Covb <- Covb[[1]] return(Covb) } mosthighlycorrelated <- function(mydataframe,numtoreport) { # find the correlations cormatrix <- cor(mydataframe) # set the correlations on the diagonal or lower triangle to zero, # so they will not be reported as the highest ones: diag(cormatrix) <- 0 cormatrix[lower.tri(cormatrix)] <- 0 # flatten the matrix into a dataframe for easy sorting fm <- as.data.frame(as.table(cormatrix)) # assign human-friendly names names(fm) <- c("First.Variable", "Second.Variable","Correlation") # sort and print the top n correlations head(fm[order(abs(fm$Correlation),decreasing=T),],n=numtoreport) } calcpc <- function(variables,loadings) { # find the number of samples in the data set as.data.frame(variables) numsamples <- nrow(variables) # make a vector to store the component pc <- numeric(numsamples) # find the number of variables numvariables <- length(variables) # calculate the value of the component for each sample for (i in 1:numsamples) { valuei <- 0 for (j in 1:numvariables) { valueij <- variables[i,j] loadingj <- loadings[j] valuei <- valuei + (valueij * loadingj) } pc[i] <- valuei } return(pc) } calclda <- function(variables,loadings) { # find the number of samples in the data set as.data.frame(variables) numsamples <- nrow(variables) # make a vector to store the discriminant function ld <- numeric(numsamples) # find the number of variables numvariables <- length(variables) # calculate the value of the discriminant function for each sample for (i in 1:numsamples) { valuei <- 0 for (j in 1:numvariables) { valueij <- variables[i,j] loadingj <- loadings[j] valuei <- valuei + (valueij * loadingj) } ld[i] <- valuei } # standardise the discriminant function so that its mean value is 0: ld <- as.data.frame(scale(ld, center=TRUE, scale=FALSE)) ld <- ld[[1]] return(ld) } groupStandardise <- function(variables, groupvariable) { # find out how many variables we have variables <- as.data.frame(variables) numvariables <- length(variables) # find the variable names variablenames <- colnames(variables) # calculate the group-standardised version of each variable for (i in 1:numvariables) { variablei <- variables[i] variablei_name <- variablenames[i] variablei_Vw <- calcWithinGroupsVariance(variablei, groupvariable) variablei_mean <- mean(variablei) variablei_new <- (variablei - variablei_mean)/(sqrt(variablei_Vw)) data_length <- nrow(variablei) if (i == 1) { variables_new <- data.frame(row.names=seq(1,data_length)) } variables_new[`variablei_name`] <- variablei_new } return(variables_new) } calcAllocationRuleAccuracy <- function(ldavalue, groupvariable, cutoffpoints) { # find out how many values the group variable can take groupvariable2 <- as.factor(groupvariable[[1]]) levels <- levels(groupvariable2) numlevels <- length(levels) # calculate the number of true positives and false negatives for each group numlevels <- length(levels) for (i in 1:numlevels) { leveli <- levels[i] levelidata <- ldavalue[groupvariable==leveli] # see how many of the samples from this group are classified in each group for (j in 1:numlevels) { levelj <- levels[j] if (j == 1) { cutoff1 <- cutoffpoints[1] cutoff2 <- "NA" results <- summary(levelidata <= cutoff1) } else if (j == numlevels) { cutoff1 <- cutoffpoints[(numlevels-1)] cutoff2 <- "NA" results <- summary(levelidata > cutoff1) } else { cutoff1 <- cutoffpoints[(j-1)] cutoff2 <- cutoffpoints[(j)] results <- summary(levelidata > cutoff1 & levelidata <= cutoff2) } trues <- results["TRUE"] trues <- trues[[1]] print(paste("Number of samples of group",leveli,"classified as group",levelj," : ", trues,"(cutoffs:",cutoff1,",",cutoff2,")")) } } }