What does cluster analysis do?


Clustering using kmeans

Get some data:

# load() restores the saved object and returns its name; get() then fetches
# that object so it can be bound to `clouds`.
clouds <- get(load("clouds.RData"))
dim(clouds)
## [1] 75  2
head(clouds)
##      cloud1_x cloud1_y
## [1,] 1.454594 3.751767
## [2,] 1.072375 4.571505
## [3,] 1.028263 3.808842
## [4,] 1.508426 3.705582
## [5,] 0.889498 4.217310
## [6,] 1.291009 4.051441
# Scatter plot of the raw points: first column on x, second column on y.
plot(
  clouds[, 1], clouds[, 2],
  col = "black",
  xlab = "x", ylab = "y",
  main = "Clouds", font.main = 1
)

Cluster with kmeans:

# kmeans() must be told how many clusters to look for.
nr <- 3
# Given a cluster count (rather than explicit centres), kmeans() starts from
# randomly chosen rows, so the cluster numbering can differ between runs.
clres <- kmeans(clouds, nr)
str(clres)
## List of 9
##  $ cluster     : int [1:75] 3 3 3 3 3 3 3 3 3 3 ...
##  $ centers     : num [1:3, 1:2] 1.97 3.07 1.07 2.95 2.01 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:2] "cloud1_x" "cloud1_y"
##  $ totss       : num 114
##  $ withinss    : num [1:3] 4.92 3.66 6.13
##  $ tot.withinss: num 14.7
##  $ betweenss   : num 99.5
##  $ size        : int [1:3] 26 25 24
##  $ iter        : int 2
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
# Cluster number assigned to each of the 75 points.
clres$cluster
##  [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1

Plot, color according to identified clusters:

# Start with every point black, then recolour one cluster at a time.
colvec <- rep("black", nrow(clouds))
# Logical mask: TRUE for the points that belong to cluster 1.
clres$cluster == 1
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
## [56]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [67]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
# which() turns the mask into the positions of the TRUE entries; either form
# selects the same elements when used as an index.
which(clres$cluster == 1)
##  [1] 18 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## [24] 73 74 75
colvec[clres$cluster == 1] <- "blue"
colvec[clres$cluster == 2] <- "red"
colvec[clres$cluster == 3] <- "green"
colvec
##  [1] "green" "green" "green" "green" "green" "green" "green" "green"
##  [9] "green" "green" "green" "green" "green" "green" "green" "green"
## [17] "green" "blue"  "green" "green" "green" "green" "green" "green"
## [25] "green" "red"   "red"   "red"   "red"   "red"   "red"   "red"  
## [33] "red"   "red"   "red"   "red"   "red"   "red"   "red"   "red"  
## [41] "red"   "red"   "red"   "red"   "red"   "red"   "red"   "red"  
## [49] "red"   "red"   "blue"  "blue"  "blue"  "blue"  "blue"  "blue" 
## [57] "blue"  "blue"  "blue"  "blue"  "blue"  "blue"  "blue"  "blue" 
## [65] "blue"  "blue"  "blue"  "blue"  "blue"  "blue"  "blue"  "blue" 
## [73] "blue"  "blue"  "blue"
# Same scatter plot as before, now coloured by cluster membership.
plot(
  clouds[, 1], clouds[, 2],
  col = colvec,
  xlab = "x", ylab = "y",
  main = "Clusters found", font.main = 1
)

Add the cluster centers:

# Fitted centre coordinates: one row per cluster, one column per dimension.
clres$centers
##   cloud1_x cloud1_y
## 1 1.967705 2.952207
## 2 3.071303 2.014523
## 3 1.068808 4.037214
# Redraw the coloured scatter plot, then overlay the centres as large filled
# squares. The colours are listed in cluster order 1, 2, 3.
plot(
  clouds[, 1], clouds[, 2],
  col = colvec,
  xlab = "x", ylab = "y",
  main = "Clusters found", font.main = 1
)
points(
  clres$centers[, 1], clres$centers[, 2],
  col = c("blue", "red", "green"),
  pch = 15, cex = 1.8
)


More dimensions (e.g. time lines):

Some data:

# load() restores the saved object and returns its name; get() then fetches
# that object so it can be bound to `timedata`.
timedata <- get(load("timedata.RData"))
# matplot() draws one series per column, so the data is transposed to get
# one line per row of timedata across the 10 x positions.
matplot(
  1:10, t(timedata),
  pch = 1:6, type = "b", col = "black"
)

Find clusters (assume that there are two):

# Partition the rows of timedata into two clusters.
cres <- kmeans(timedata, 2)
cres$cluster
## gene 1 gene 2 gene 3 gene 4 gene 5 gene 6 
##      1      1      1      2      2      2
# One colour per row of timedata. The length is taken from the data rather
# than a hard-coded 6, so a data set with a different number of rows does not
# end up with missing colours.
colvec <- rep("black", nrow(timedata))
colvec[which(cres$cluster == 1)] <- "blue"
colvec[which(cres$cluster == 2)] <- "red"
# Side-by-side check of cluster number and assigned colour.
cbind(cres$cluster, colvec)
##            colvec
## gene 1 "1" "blue"
## gene 2 "1" "blue"
## gene 3 "1" "blue"
## gene 4 "2" "red" 
## gene 5 "2" "red" 
## gene 6 "2" "red"
# Same plot as for the raw data, with each line coloured by its cluster.
matplot(
  1:10, t(timedata),
  pch = 1:6, type = "b", col = colvec,
  main = "Clustered timelines"
)


Clustering using cmeans

library(e1071)
# cmeans() must be told how many clusters to look for.
nr <- 3
res <- cmeans(clouds, nr)
# Besides a cluster number per point, the result carries a membership matrix
# with one column per cluster.
str(res)
## List of 7
##  $ centers    : num [1:3, 1:2] 1.98 3.08 1.07 2.96 2.02 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:2] "cloud1_x" "cloud1_y"
##  $ size       : int [1:3] 26 25 24
##  $ cluster    : int [1:75] 3 3 3 3 3 3 3 3 3 3 ...
##  $ membership : num [1:75, 1:3] 0.1869 0.0819 0.0247 0.258 0.0258 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:3] "1" "2" "3"
##  $ iter       : num 9
##  $ withinerror: num 0.155
##  $ call       : language cmeans(x = clouds, centers = nr)
##  - attr(*, "class")= chr "fclust"
# Cluster number assigned to each of the 75 points.
res$cluster
##  [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1

Plot, color according to identified clusters:

# Start with every point black, then recolour one cluster at a time.
colvec <- rep("black", nrow(clouds))
# Logical mask: TRUE for the points that belong to cluster 1.
res$cluster == 1
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
## [56]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [67]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
# which() turns the mask into the positions of the TRUE entries; either form
# selects the same elements when used as an index.
which(res$cluster == 1)
##  [1] 18 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## [24] 73 74 75
colvec[res$cluster == 1] <- "blue"
colvec[res$cluster == 2] <- "red"
colvec[res$cluster == 3] <- "green"
colvec
##  [1] "green" "green" "green" "green" "green" "green" "green" "green"
##  [9] "green" "green" "green" "green" "green" "green" "green" "green"
## [17] "green" "blue"  "green" "green" "green" "green" "green" "green"
## [25] "green" "red"   "red"   "red"   "red"   "red"   "red"   "red"  
## [33] "red"   "red"   "red"   "red"   "red"   "red"   "red"   "red"  
## [41] "red"   "red"   "red"   "red"   "red"   "red"   "red"   "red"  
## [49] "red"   "red"   "blue"  "blue"  "blue"  "blue"  "blue"  "blue" 
## [57] "blue"  "blue"  "blue"  "blue"  "blue"  "blue"  "blue"  "blue" 
## [65] "blue"  "blue"  "blue"  "blue"  "blue"  "blue"  "blue"  "blue" 
## [73] "blue"  "blue"  "blue"
# Scatter plot of the points, coloured by the cluster cmeans() assigned.
plot(
  clouds[, 1], clouds[, 2],
  col = colvec,
  xlab = "x", ylab = "y",
  main = "Clusters found", font.main = 1
)

Clustering with fanny

# The cluster package provides fanny().
library(cluster)
# Uncomment the next line to open the help page for fanny().
# ?fanny

The cmeans function does not accept a distance matrix as input, but the fanny function does
(cmeans supports only the Euclidean and Manhattan distances).