Skip to content

Instantly share code, notes, and snippets.

@bistaumanga
Last active September 26, 2024 10:00
Show Gist options
  • Select an option

  • Save bistaumanga/6023716 to your computer and use it in GitHub Desktop.

Select an option

Save bistaumanga/6023716 to your computer and use it in GitHub Desktop.
Gaussian Mixture Model using Expectation Maximization algorithm in python
# -*- coding: utf-8 -*-
"""
Created on Sat May 3 10:21:21 2014
@author: umb
"""
import numpy as np
class GMM:
    """Gaussian Mixture Model fitted with the Expectation-Maximization algorithm.

    Parameters
    ----------
    k : int
        Number of mixture components (clusters).
    eps : float
        Convergence threshold: stop when the log-likelihood improvement
        between consecutive iterations falls below this value.
    """

    def __init__(self, k=3, eps=0.0001):
        self.k = k      # number of clusters
        self.eps = eps  # threshold to stop, `epsilon`

    def fit_EM(self, X, max_iters=1000):
        """Fit the mixture to data X via EM.

        Parameters
        ----------
        X : ndarray of shape (n, d)
            Data points, one per row.
        max_iters : int
            Maximum number of EM iterations.

        Returns
        -------
        namedtuple with fields mu, Sigma, w, log_likelihoods, num_iters
            (also stored on self.params).
        """
        from collections import namedtuple

        # n = number of data points, d = dimension of data points
        n, d = X.shape

        # Randomly choose k of the data points as starting means.
        # Cast to float so in-place mean updates don't truncate when X is int.
        mu = X[np.random.choice(n, self.k, False), :].astype(float)
        # Initialize each component's covariance to the identity and
        # the mixing weights to the uniform distribution.
        Sigma = [np.eye(d)] * self.k
        w = [1. / self.k] * self.k
        # Responsibility matrix: one row per point, one column per component.
        R = np.zeros((n, self.k))
        log_likelihoods = []

        # Multivariate normal density evaluated at every row of X.
        # NOTE: the original expression was `det(s) ** -.5 ** (2*pi) ** ...`;
        # `**` is right-associative so that computed a wrong normalization
        # constant. The correct constant is det(s)^(-1/2) * (2*pi)^(-d/2).
        P = lambda mu, s: np.linalg.det(s) ** -.5 * (2 * np.pi) ** (-d / 2.) \
            * np.exp(-.5 * np.einsum('ij, ij -> i',
                                     X - mu, np.dot(np.linalg.inv(s), (X - mu).T).T))

        # Iterate until convergence or max_iters iterations.
        while len(log_likelihoods) < max_iters:
            # E-step: vectorized computation of the (unnormalized)
            # membership of each point in each of the k gaussians.
            for k in range(self.k):
                R[:, k] = w[k] * P(mu[k], Sigma[k])

            # Likelihood computation (before normalizing R, so the row sums
            # are the per-point mixture densities).
            log_likelihood = np.sum(np.log(np.sum(R, axis=1)))
            log_likelihoods.append(log_likelihood)

            # Normalize so the responsibility matrix is row stochastic.
            R = (R.T / np.sum(R, axis=1)).T
            # Effective number of points assigned to each gaussian.
            N_ks = np.sum(R, axis=0)

            # M-step: re-estimate mean, covariance and weight of each
            # gaussian from the new responsibilities.
            for k in range(self.k):
                mu[k] = 1. / N_ks[k] * np.sum(R[:, k] * X.T, axis=1).T
                x_mu = X - mu[k]
                # Weighted outer-product covariance; plain ndarray math
                # replaces the deprecated np.matrix of the original.
                Sigma[k] = np.dot(x_mu.T * R[:, k], x_mu) / N_ks[k]
                w[k] = 1. / n * N_ks[k]

            # Check for convergence.
            if len(log_likelihoods) < 2:
                continue
            if np.abs(log_likelihood - log_likelihoods[-2]) < self.eps:
                break

        # Bind all results together. The original set attributes on the
        # namedtuple *class*; instantiating it is the intended usage and
        # keeps the same `params.mu`, `params.Sigma`, ... access pattern.
        Params = namedtuple('Params', ['mu', 'Sigma', 'w', 'log_likelihoods', 'num_iters'])
        self.params = Params(mu=mu, Sigma=Sigma, w=w,
                             log_likelihoods=log_likelihoods,
                             num_iters=len(log_likelihoods))
        return self.params

    def plot_log_likelihood(self):
        """Plot the stored per-iteration log-likelihood curve."""
        import pylab as plt
        plt.plot(self.params.log_likelihoods)
        plt.title('Log Likelihood vs iteration plot')
        plt.xlabel('Iterations')
        plt.ylabel('log likelihood')
        plt.show()

    def predict(self, x):
        """Return the posterior component probabilities for a single point x.

        Parameters
        ----------
        x : ndarray of shape (d,)

        Returns
        -------
        ndarray of shape (k,) summing to 1.
        """
        # Single-point gaussian density; float division in the exponent so
        # odd dimensions are handled correctly (the original's `/2` floored
        # under Python 2).
        p = lambda mu, s: np.linalg.det(s) ** -0.5 * (2 * np.pi) ** (-len(x) / 2.) \
            * np.exp(-0.5 * np.dot(x - mu,
                                   np.dot(np.linalg.inv(s), x - mu)))
        probs = np.array([w * p(mu, s) for mu, s, w in
                          zip(self.params.mu, self.params.Sigma, self.params.w)])
        return probs / np.sum(probs)
def demo_2d():
    """Demo: sample three 2-D gaussian clusters, fit a 3-component GMM,
    and plot the fitted covariance ellipses plus the log-likelihood curve."""
    # Generate random data from three gaussians.
    np.random.seed(3)
    m1, cov1 = [9, 8], [[.5, 1], [.25, 1]]          # first gaussian
    data1 = np.random.multivariate_normal(m1, cov1, 90)
    m2, cov2 = [6, 13], [[.5, -.5], [-.5, .1]]      # second gaussian
    data2 = np.random.multivariate_normal(m2, cov2, 45)
    m3, cov3 = [4, 7], [[0.25, 0.5], [-0.1, 0.5]]   # third gaussian
    data3 = np.random.multivariate_normal(m3, cov3, 65)
    X = np.vstack((data1, np.vstack((data2, data3))))
    np.random.shuffle(X)

    gmm = GMM(3, 0.000001)
    params = gmm.fit_EM(X, max_iters=100)
    # `print(...)` calls replace the Python-2-only `print ...` statements
    # so the demo runs under Python 3 as well.
    print(params.log_likelihoods)

    import pylab as plt
    from matplotlib.patches import Ellipse

    def plot_ellipse(pos, cov, nstd=2, ax=None, **kwargs):
        """Draw an nstd-sigma ellipse for covariance `cov` centred at `pos`."""
        def eigsorted(cov):
            vals, vecs = np.linalg.eigh(cov)
            order = vals.argsort()[::-1]
            return vals[order], vecs[:, order]
        if ax is None:
            ax = plt.gca()
        vals, vecs = eigsorted(cov)
        theta = np.degrees(np.arctan2(*vecs[:, 0][::-1]))
        # Width and height are "full" widths, not radii.
        width, height = 2 * nstd * np.sqrt(abs(vals))
        ellip = Ellipse(xy=pos, width=width, height=height, angle=theta, **kwargs)
        ax.add_artist(ellip)
        return ellip

    def show(X, mu, cov):
        """Scatter the data and overlay one ellipse per fitted component."""
        plt.cla()
        K = len(mu)  # number of clusters
        colors = ['b', 'k', 'g', 'c', 'm', 'y', 'r']
        plt.plot(X.T[0], X.T[1], 'm*')
        for k in range(K):
            plot_ellipse(mu[k], cov[k], alpha=0.6, color=colors[k % len(colors)])

    fig = plt.figure(figsize=(13, 6))
    fig.add_subplot(121)
    show(X, params.mu, params.Sigma)
    fig.add_subplot(122)
    plt.plot(np.array(params.log_likelihoods))
    plt.title('Log Likelihood vs iteration plot')
    plt.xlabel('Iterations')
    plt.ylabel('log likelihood')
    plt.show()
    print(gmm.predict(np.array([1, 2])))
if __name__ == "__main__":
    demo_2d()
    # Optional CLI path: fit a GMM on a CSV file of data points.
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("-f", "--file", dest="filepath", help="File path for data")
    parser.add_option("-k", "--clusters", dest="clusters", help="No. of gaussians")
    parser.add_option("-e", "--eps", dest="epsilon", help="Epsilon to stop")
    parser.add_option("-m", "--maxiters", dest="max_iters", help="Maximum no. of iteration")
    options, args = parser.parse_args()
    if not options.filepath:
        # Original did `raise('File not provided')`: raising a plain string
        # is invalid in Python 3 — raise a proper exception type.
        raise ValueError('File not provided')
    if not options.clusters:
        print("Used default number of clusters = 3")
        k = 3
    else:
        k = int(options.clusters)
    if not options.epsilon:
        print("Used default eps = 0.0001")
        eps = 0.0001
    else:
        eps = float(options.epsilon)
    if not options.max_iters:
        print("Used default maxiters = 1000")
        max_iters = 1000
    else:
        # Original bug: assigned `eps = int(options.maxiters)` — wrong target
        # variable AND wrong attribute name (dest is "max_iters"), so -m
        # either crashed or silently clobbered eps.
        max_iters = int(options.max_iters)
    X = np.genfromtxt(options.filepath, delimiter=',')
    gmm = GMM(k, eps)
    params = gmm.fit_EM(X, max_iters)
    print(params.log_likelihoods)
    gmm.plot_log_likelihood()
    print(gmm.predict(np.array([1, 2])))
@TheMLGuy
Copy link
Copy Markdown

@raghughanapuram For 1D data, just set d=1 in fit_EM().

@freako7c5
Copy link
Copy Markdown

Can this be done for images in CIELab spaces?

@XingxingLi2017
Copy link
Copy Markdown

Good implementation. Thank you.

@csantoso
Copy link
Copy Markdown

In your probability calculation on line 41, after the -.5, shouldn't there only be one '*' instead of '**'?

@romaresccoa
Copy link
Copy Markdown

romaresccoa commented Jan 18, 2018

@yashk2810
Copy link
Copy Markdown

yashk2810 commented Feb 11, 2018

I am getting these warnings:

/home/yash/NCSU/Sem 2/CV/Project 1/gmm.py:48: RuntimeWarning: divide by zero encountered in log
log_likelihood = np.sum(np.log(np.sum(R, axis = 1)))
/home/yash/NCSU/Sem 2/CV/Project 1/gmm.py:53: RuntimeWarning: invalid value encountered in divide
R = (R.T / np.sum(R, axis = 1)).T
/home/yash/anaconda2/lib/python2.7/site-packages/numpy/linalg/linalg.py:1804: RuntimeWarning: invalid value encountered in det
r = _umath_linalg.det(a, signature=signature)

The dataset contains images(RGB) and the shape of train array is (1000, 10880). The original shape is (1000, 60, 60, 3) (1000 60x60 rgb images). I have flattened this array to (1000, 10880).

The log likelihood array that I get after running the program is
[-inf, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]

What's going wrong?

@nsitlokeshjain
Copy link
Copy Markdown

how to import new data set?

@nsitlokeshjain
Copy link
Copy Markdown

any suggestion

@hanumantha03
Copy link
Copy Markdown

Could you please tell me how to use ENSEMBLE Technique on this?

@gustavovelascoh
Copy link
Copy Markdown

Good stuff. Any license for using the code?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment