mu1 = c(5, 5) # x and y component of mean value vector for cluster 1
mu2 = c(9, 9) # x and y component of mean value vector for cluster 2
mu3 = c(8, 5) # x and y component of mean value vector for cluster 3
sigma1 = c(1.0, 1.1) # x and y component of standard deviation vector for cluster 1
sigma2 = c(1.0, 1.0) # x and y component of standard deviation vector for cluster 1
sigma3 = c(1.0, 1.1) # x and y component of standard deviation vector for cluster 1
rhoxy1 = 0.8 # correlation between x and y, cluster 1
rhoxy2 = 0.9 # correlation between x and y, cluster 2
rhoxy3 = 0.6 # correlation between x and y, cluster 3
alpha = c(0.3, 0.3, 0.4) # cluster weights
sum(alpha) == 1 # must be TRUE
## [1] TRUE
\[ \pmb{\Sigma_k} = \begin{pmatrix} \sigma_x^2 & \rho \sigma_x \sigma_y \\ \rho \sigma_x \sigma_y & \sigma_y^2 \end{pmatrix} \]
mat = c(sigma1[1]^2, rhoxy1*sigma1[1]*sigma1[2], rhoxy1*sigma1[1]*sigma1[2], sigma1[2]^2)
Sigma1 = matrix(mat, nrow = 2, byrow = TRUE) # Covariance matrix for cluster 1
mat = c(sigma2[1]^2, rhoxy2*sigma2[1]*sigma2[2], rhoxy2*sigma2[1]*sigma2[2], sigma2[2]^2)
Sigma2 = matrix(mat, nrow = 2, byrow = TRUE) # Covariance matrix for cluster 2
mat = c(sigma3[1]^2, rhoxy3*sigma3[1]*sigma3[2], rhoxy3*sigma3[1]*sigma3[2], sigma3[2]^2)
Sigma3 = matrix(mat, nrow = 2, byrow = TRUE) # Covariance matrix for cluster 3
rnorm()
rmultinorm()
combines these steps:rmultinorm <- function(mu, Sigma){
d = length(mu) # d = dimension
A = t(chol(Sigma)) # determine a matrix A so that A %*% t(A) = Sigma, "Cholesky decomposition"
Z = rnorm(d) # generate N(0,1) sample point with dimension d
w = A %*% Z + mu # sample point of the d-dimensional Gaussian
return(w[,1]) # returns sample point of a multivariate Gaussian with mean mu and covariance matrix Sigma
}
mu = list()
mu[[1]] = mu1
mu[[2]] = mu2
mu[[3]] = mu3
Sigma = list()
Sigma[[1]] = Sigma1
Sigma[[2]] = Sigma2
Sigma[[3]] = Sigma3
rmultinorm()
function, we conduct two-step experiment to generate sample points from a 2-D Gaussian mixture (with 3 clusters)
N = 1000 # number of observations to generate
x = matrix(NA, ncol = 2, nrow = N) # matrix for 1000 2-dim observations, one row = one observation
cols = rep(NA, N) # color recalls which Gaussian was chosen
color_palette = c("red", "darkgreen", "blue") # different color for each cluster
for (i in 1:N) {
component = sample(c(1,2,3), size = 1, prob = alpha) # draw a cluster with probability alpha
x[i,] = rmultinorm(mu[[component]], Sigma[[component]]) # generate a 2-dim sample from the chosen Gaussian
cols[i] = color_palette[component] # assign a color to the sample point
}
dim(x) # the random sample points are stored in the array x
## [1] 1000 2
plot(x[,1], x[,2], col = cols, main = "Observations from Gaussian mixture", font.main = 1)
plot(x[,1], x[,2], col = "grey", main = "Observations from Gaussian mixture", font.main = 1)
K = 3 # we have to know (or to guess) the number of Gaussians
max_iter = 50 # maximum number of iterations
mu1_t = c(6, 6) # x and y component of mean value vector for cluster 1
mu2_t = c(8, 8) # x and y component of mean value vector for cluster 2
mu3_t = c(7, 6) # x and y component of mean value vector for cluster 3
sigma1_t = c(1.2, 1.0) # x and y component of standard deviation vector for cluster 1
sigma2_t = c(1.2, 1.2) # x and y component of standard deviation vector for cluster 1
sigma3_t = c(1.4, 1.5) # x and y component of standard deviation vector for cluster 1
rhoxy1_t = 0.7 # correlation between x and y, cluster 1
rhoxy2_t = 0.8 # correlation between x and y, cluster 2
rhoxy3_t = 0.7 # correlation between x and y, cluster 3
alpha_t = c(0.3, 0.3, 0.4) # cluster weights
sum(alpha_t) == 1 # must be TRUE
## [1] TRUE
mat = c(sigma1_t[1]^2, rhoxy1_t*sigma1_t[1]*sigma1_t[2], rhoxy1_t*sigma1_t[1]*sigma1_t[2], sigma1_t[2]^2)
Sigma1_t = matrix(mat, nrow = 2, byrow = TRUE) # Covariance matrix for cluster 1
mat = c(sigma2_t[1]^2, rhoxy2_t*sigma2_t[1]*sigma2_t[2], rhoxy2_t*sigma2_t[1]*sigma2_t[2], sigma2_t[2]^2)
Sigma2_t = matrix(mat, nrow = 2, byrow = TRUE) # Covariance matrix for cluster 2
mat = c(sigma3_t[1]^2, rhoxy3_t*sigma3_t[1]*sigma3_t[2], rhoxy3_t*sigma3_t[1]*sigma3_t[2], sigma3_t[2]^2)
Sigma3_t = matrix(mat, nrow = 2, byrow = TRUE) # Covariance matrix for cluster 3
mu_t = list()
mu_t[[1]] = mu1_t
mu_t[[2]] = mu2_t
mu_t[[3]] = mu3_t
Sigma_t = list()
Sigma_t[[1]] = Sigma1_t
Sigma_t[[2]] = Sigma2_t
Sigma_t[[3]] = Sigma3_t
xmin = 0 ; xmax = 12 # define the plot range by looking at the sample point plot
ymin = 0 ; ymax = 12
x1 = seq(xmin, xmax, length = 200)
x2 = seq(ymin, ymax, length = 200)
fx_Gauss()
can do that:fx_Gauss <- function(x, y, mu, Sigma, alpha) { # Density function for a mixture of 2-D Gaussians
if(length(x) != length(y)) stop("\n 'x' and 'y' must have equal length \n\n")
N = nrow(x)
if(!is.list(mu)) stop("\n 'mu' must be a list with K components.\n\n")
if(!is.list(Sigma)) stop("\n 'Sigma' must be a list with K components.\n\n")
K = length(mu) # number of clusters
density = 0
for (k in 1:K) density = density + alpha[k]*dmvnorm(cbind(x, y), mean = mu[[k]], sigma = Sigma[[k]])
return(density)
}
fx_Gauss()
uses the function dmvnorm()
from the R-package mvtnorm
, load the library:library(mvtnorm)
outer
, we define a 2-dimensional grid for the contour plotfx_Gauss()
to determine the function value for each grid pointfx_t = outer(x1, x2, FUN = function(x, y) fx_Gauss(x, y, mu = mu_t, Sigma = Sigma_t, alpha = alpha_t))
plot(x[,1], x[,2], col = "grey", main = "Observations from Gaussian mixture", font.main = 1) # sample points
contour(x1, x2, fx_t, col = "red", lty = 2, add = TRUE) # contour-plot representing the 1st guess
plot(x[,1], x[,2], col = "grey", main = "Observations from Gaussian mixture", font.main = 1) # sample points
contour(x1, x2, fx_t, col = "red", lty = 2, add = TRUE) # contour-plot representing the 1st guess
fx_sol = outer(x1, x2, FUN = function(x, y) fx_Gauss(x, y, mu = mu, Sigma = Sigma, alpha = alpha))
contour(x1, x2, fx_sol, col = "green", lty = 2, add = TRUE) # contour-plot representing the true prob. distribution
logL = 0
for (k in 1:K) logL = logL + alpha_t[k] * dmvnorm(x, mu_t[[k]], Sigma_t[[k]]) # k-sum
logL = log(logL)
logL = sum(logL) # i-sum
cat(paste(" Log likelihood after 1st guess:", signif(logL,4), "\n"))
## Log likelihood after 1st guess: -4255
Lvector = logL # store the first value, append logL-values during the EM loop below
mu = list()
mu[[1]] = rep(NA, 2) # components x and y
mu[[2]] = rep(NA, 2)
mu[[3]] = rep(NA, 2)
Sigma = list()
Sigma[[1]] = matrix(NA, nrow = 2, ncol = 2)
Sigma[[2]] = matrix(NA, nrow = 2, ncol = 2)
Sigma[[3]] = matrix(NA, nrow = 2, ncol = 2)
alpha = rep(NA, K) # K = number of clusters
e_mu_max = 0.001 # means
e_Sigma_max = 0.001 # covariance matrices
e_alpha_max = 0.001 # mixture weights
\[ \omega_{ik} = \; \frac{ \alpha_k^t \cdot f_k \left( \pmb{x_i} \mid \pmb{\mu_k^t}, \pmb{\Sigma_k^t} \right) }{\sum_{k} \alpha_k^t \cdot f_k \left( \pmb{x_i} \mid \pmb{\mu_k^t}, \pmb{\Sigma_k^t} \right)} \]
\[ \pmb{\mu_m} = \frac{\sum_{i=1}^N \omega_{im} \cdot \pmb{x_i}}{\sum_{i=1}^N \omega_{im}} \]
\[ \pmb{\Sigma_m} = \frac{ \sum_{i=1}^N \omega_{im} \cdot \left(\pmb{x_i} - \pmb{\mu_m} \right) \cdot \left(\pmb{x_i} - \pmb{\mu_m} \right)^T }{ \sum_{i=1}^N \omega_{im}} \]
\[ \alpha_m = \frac{1}{N} \cdot \sum_{i=1}^N \omega_{im} \]
for (iter in 1:max_iter) {
# --- E-step
omega = matrix(NA, nrow = N, ncol = K)
for (k in 1:K) omega[,k] = alpha_t[k] * dmvnorm(x, mu_t[[k]], Sigma_t[[k]]) # numerator for each k
denominator = 0; for (k in 1:K) denominator = denominator + omega[,k] # denominator
for (k in 1:K) omega[,k] = omega[,k] / denominator
if(!all(round(rowSums(omega),4) == 1)) stop("\n All rowsums of omega must be 1! \n\n")
if(round(sum(rowSums(omega)), 3) != N)
stop("\n\n Sum over all elements of omega must be equal to the number of observations! \n\n")
if(any(omega < 0)) stop("\n No element of omega can be negative! \n\n")
# --- M-step
# update means:
for (k in 1:K) mu[[k]] = as.vector((omega[,k] %*% x) / sum(omega[,k])) # a vector with 2 components
# update covariance matrices:
for (k in 1:K) {
Sigma[[k]] = matrix(rep(0,4), nrow = 2) # init
for (i in 1:N) Sigma[[k]] = Sigma[[k]] + omega[i,k] * ((x[i,] - mu_t[[k]]) %*% t(x[i,] - mu_t[[k]]))
Sigma[[k]] = Sigma[[k]] / sum(omega[,k]) # weigthed covariance matrix
}
# update cluster membership
for (k in 1:K) alpha[k] = sum(omega[,k]) / N
# new value for log-likelihood:
logL = rep(0 , N)
for (k in 1:K) logL = logL + alpha[k] * dmvnorm(x, mu[[k]], Sigma[[k]]) # k-sum
logL = log(logL)
logL = sum(logL) # i-sum
cat(paste("Log likelihood after iteration", iter, ":", signif(logL,4), "\n"))
Lvector = c(Lvector, logL) # append the new value
# calculate relative errors:
e_mu = max(abs((unlist(mu) - unlist(mu_t))/(unlist(mu) + 1e-50)))
e_Sigma = max(abs((unlist(Sigma) - unlist(Sigma_t))/(unlist(Sigma) + 1e-50)))
e_alpha = max(abs((alpha - alpha_t) / (alpha_t + 1e-50)))
cat(paste("Loop", iter, ": e_mu =", signif(e_mu,4), "e_Sigma =", signif(e_Sigma,4), "e_alpha =", signif(e_alpha,5), "\n"))
# need another loop?:
cond1 = e_mu <= e_mu_max # condition 1, for mu
cond2 = e_Sigma <= e_Sigma_max # condition 2, for sigma
cond3 = e_alpha <= e_alpha_max # condition 3, for alpha
if(cond1 & cond2 & cond3) break # converged!
# otherwise, continue to next loop:
mu_t = mu
Sigma_t = Sigma
alpha_t = alpha
} # EM-loop
## Log likelihood after iteration 1 : -3633
## Loop 1 : e_mu = 0.1521 e_Sigma = 0.4903 e_alpha = 0.375
## Log likelihood after iteration 2 : -3509
## Loop 2 : e_mu = 0.06336 e_Sigma = 0.6493 e_alpha = 0.058626
## Log likelihood after iteration 3 : -3461
## Loop 3 : e_mu = 0.0401 e_Sigma = 0.4534 e_alpha = 0.081443
## Log likelihood after iteration 4 : -3438
## Loop 4 : e_mu = 0.01227 e_Sigma = 0.2887 e_alpha = 0.074074
## Log likelihood after iteration 5 : -3427
## Loop 5 : e_mu = 0.009132 e_Sigma = 0.1364 e_alpha = 0.053024
## Log likelihood after iteration 6 : -3420
## Loop 6 : e_mu = 0.006481 e_Sigma = 0.08775 e_alpha = 0.038344
## Log likelihood after iteration 7 : -3416
## Loop 7 : e_mu = 0.004695 e_Sigma = 0.05418 e_alpha = 0.028112
## Log likelihood after iteration 8 : -3414
## Loop 8 : e_mu = 0.004269 e_Sigma = 0.03442 e_alpha = 0.020268
## Log likelihood after iteration 9 : -3413
## Loop 9 : e_mu = 0.003457 e_Sigma = 0.02189 e_alpha = 0.014391
## Log likelihood after iteration 10 : -3412
## Loop 10 : e_mu = 0.002669 e_Sigma = 0.01588 e_alpha = 0.010161
## Log likelihood after iteration 11 : -3412
## Loop 11 : e_mu = 0.002013 e_Sigma = 0.01293 e_alpha = 0.007187
## Log likelihood after iteration 12 : -3412
## Loop 12 : e_mu = 0.001502 e_Sigma = 0.01015 e_alpha = 0.0051119
## Log likelihood after iteration 13 : -3412
## Loop 13 : e_mu = 0.001114 e_Sigma = 0.007804 e_alpha = 0.0036597
## Log likelihood after iteration 14 : -3412
## Loop 14 : e_mu = 0.0008249 e_Sigma = 0.00593 e_alpha = 0.0026365
## Log likelihood after iteration 15 : -3412
## Loop 15 : e_mu = 0.0006105 e_Sigma = 0.004475 e_alpha = 0.00191
## Log likelihood after iteration 16 : -3412
## Loop 16 : e_mu = 0.0004521 e_Sigma = 0.003365 e_alpha = 0.0013903
## Log likelihood after iteration 17 : -3412
## Loop 17 : e_mu = 0.0003352 e_Sigma = 0.002525 e_alpha = 0.0010161
## Log likelihood after iteration 18 : -3412
## Loop 18 : e_mu = 0.0002488 e_Sigma = 0.001893 e_alpha = 0.00074533
## Log likelihood after iteration 19 : -3412
## Loop 19 : e_mu = 0.0001849 e_Sigma = 0.001419 e_alpha = 0.00054837
## Log likelihood after iteration 20 : -3412
## Loop 20 : e_mu = 0.0001376 e_Sigma = 0.001063 e_alpha = 0.00040454
## Log likelihood after iteration 21 : -3412
## Loop 21 : e_mu = 0.0001025 e_Sigma = 0.0007965 e_alpha = 0.00029913
if(iter == max_iter) {
warning("Not converged!\n\n")
} else {
cat("Converged after", iter, "iterations.\n\n")
}
## Converged after 21 iterations.
## List results:
cat(paste("Mean values of the", K, "clusters:\n"))
## Mean values of the 3 clusters:
for (k in 1:K) {
mux = signif(mu[[k]][1],4)
muy = signif(mu[[k]][2],4)
cat(paste(" Cluster", k, " mux =", mux, " muy =", muy, "\n"))
}
## Cluster 1 mux = 4.975 muy = 4.933
## Cluster 2 mux = 8.947 muy = 8.937
## Cluster 3 mux = 7.967 muy = 5.002
cat(paste("Standard deviations and correlation of the", K, "clusters:\n"))
## Standard deviations and correlation of the 3 clusters:
for (k in 1:K) {
sigmax = signif(sqrt(Sigma[[k]][1,1]),4)
sigmay = signif(sqrt(Sigma[[k]][2,2]),4)
rhoxy = signif(Sigma[[k]][1,2]/sigmax/sigmay, 4)
cat(paste(" Cluster", k, " sdx =", sigmax, " sdy =", sigmay, " rhoxy =", rhoxy, "\n"))
}
## Cluster 1 sdx = 0.9608 sdy = 1.067 rhoxy = 0.8153
## Cluster 2 sdx = 1.056 sdy = 1.053 rhoxy = 0.9087
## Cluster 3 sdx = 1.018 sdy = 1.117 rhoxy = 0.6334
cat(paste("Cluster weigths of the", K, "clusters:\n"))
## Cluster weigths of the 3 clusters:
for (k in 1:K) cat(paste(" Cluster", k, " alpha =", signif(alpha[k], 4), "\n"))
## Cluster 1 alpha = 0.2658
## Cluster 2 alpha = 0.3149
## Cluster 3 alpha = 0.4193
plot(Lvector, ylab = "log(L)", main = "Log-likelihood", font.main = 1, xlab = "iter", col = "blue", lwd = 1.5 )
mtext(side = 3, "First value based on first guess", col = "blue", cex = 0.9)
plot(x[,1], x[,2], xlab = "x1", ylab = "x2", col = "grey", main = "Sample points and EM solution", font.main = 1)
# define grid for the contour plot (same as above):
xmin = 0 ; xmax = 12
ymin = 0 ; ymax = 12
x1 = seq(xmin, xmax, length = 200)
x2 = seq(ymin, ymax, length = 200)
fx_sol = outer(x1, x2, FUN = function(x, y) fx_Gauss(x, y, mu = mu, Sigma = Sigma, alpha = alpha))
contour(x1, x2, fx_sol, col = "green", lty = 2, add = TRUE) # contour-plot showing the calculated solution
image(x1, x2, fx_sol, col = heat.colors(12), xlab = "x", ylab = "y", main = "Solution by EM", font.main = 1)
x1 = seq(xmin, xmax, length = 100)
x2 = seq(ymin, ymax, length = 100)
fx_sol = outer(x1, x2, FUN = function(x, y) fx_Gauss(x, y, mu = mu, Sigma = Sigma, alpha = alpha))
persp(x1, x2, fx_sol, col='green', theta = 30, phi = 30, xlab = "x1", ylab = "x2", zlab = "f(x)")