What does PCA do?


PCA

data(iris)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

It is good practice to center and scale the variables prior to PCA.

log.ir <- log(iris[, 1:4])   # log transform 
ir.species <- iris[, 5]
ir.pca <- prcomp(log.ir, center = TRUE, scale. = TRUE) 
plot(ir.pca, type = "b", col = "red", main="Variance associated with PCs", font.main = 1) 


Biplot

biplot(ir.pca, main = "PCA biplot for iris data", font.main = 1, cex = 0.6)

Cosine of the angle between two variables = correlation of these variables

If a variable vectors is short:


Scatterplot

str(ir.pca)
## List of 5
##  $ sdev    : num [1:4] 1.712 0.952 0.365 0.166
##  $ rotation: num [1:4, 1:4] 0.504 -0.302 0.577 0.567 -0.455 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##   .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
##  $ center  : Named num [1:4] 1.755 1.107 1.175 -0.172
##   ..- attr(*, "names")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##  $ scale   : Named num [1:4] 0.141 0.143 0.59 0.983
##   ..- attr(*, "names")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##  $ x       : num [1:150, 1:4] -2.41 -2.22 -2.58 -2.45 -2.54 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
##  - attr(*, "class")= chr "prcomp"
plot(ir.pca$x[,1], ir.pca$x[,2])

# text(ir.pca$x[,1], ir.pca$x[,2], rownames(log.ir), pos = 1)  # to check if this is the correct plot


Better scatterplot

Colouring the points according to species might be a good idea:

class(ir.species)
## [1] "factor"
levels(ir.species) 
## [1] "setosa"     "versicolor" "virginica"
colvec = rep("black", length(ir.species))  # initialize the color vector 
colvec[ir.species == "setosa"] = "blue" 
colvec[ir.species == "versicolor"] = "red"  
colvec[ir.species == "virginica"] = "green"   
maintxt = "PCA for 3 kinds of flowers"
plot(ir.pca$x[,1], ir.pca$x[,2], col = colvec, xlab = "", ylab = "", main = maintxt, font.main = 1)
legend("topleft", c("setosa", "versicolor", "virginica"), col = c("blue", "red", "green"), pch = 1)