Week 3: K-means
Colab Link: Click here!
Construction of a dataset with implicit clustering
import matplotlib.pyplot as plt
import numpy as np

plt.rcParams['figure.dpi'] = 150

N = 200
np.random.seed(42)

c1 = np.random.randn(N//3, 2) + np.array([8, 8])
c2 = np.random.randn(N//3, 2) + np.array([8, -1])
c3 = np.random.randn(N//3, 2) + np.array([-1, -1])

X = np.concatenate((c1, c2, c3))
plt.scatter(X[:, 0], X[:, 1], s=10);
plt.axis('equal')
plt.show()
Is PCA Enough?
# Perform PCA on X
plt.figure(figsize=(12, 5))

X_mean = X.mean(axis=0)
X_centered = X - X_mean
cov_X = np.cov(X_centered.T)
eig_val, eig_vec = np.linalg.eigh(cov_X)
pc_vec = eig_vec[:, -1] / eig_vec[:, -1][0]
x_range = np.array([-7, 8])

plt.subplot(1, 2, 1)
plt.scatter(X[:, 0], X[:, 1], s=10);
plt.axis('equal')
plt.plot(x_range + X_mean[0], x_range * pc_vec[1] + X_mean[1], color='black')

plt.subplot(1, 2, 2)
plt.scatter(X[:, 0], X[:, 1], s=10);
X_proj = np.array([i * eig_vec[:, -1] + X_mean for i in X_centered @ eig_vec[:, -1]])
plt.scatter(X_proj[:, 0], X_proj[:, 1], s=20, color='red');
plt.axis('equal')
plt.plot(x_range + X_mean[0], x_range * pc_vec[1] + X_mean[1], color='black')

plt.show()
We notice that the dataset generated above has some implicit “clustering”. When we run PCA, however, this structure is not captured in the representations it produces: the projections simply spread the points along the first principal component. Hence, a different problem formulation is required, one that can capture this kind of similarity in datasets that we postulate to follow this pattern.
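As a quick preview of where this is heading, a clustering formulation does recover the three groups. The sketch below uses scikit-learn's KMeans purely as an illustration; the notebook implements the same algorithm from scratch in the sections that follow.
from sklearn.cluster import KMeans

# Fit K-means with 3 clusters and colour the points by the assigned cluster
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=10)
plt.axis('equal')
plt.show()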
K-means Clustering - Lloyd’s Algorithm
- It is an unsupervised learning algorithm.
- The algorithm tries to capture “similarity” between datapoints and assigns a cluster indicator to each datapoint.
Problem Definition
In this context, the objective becomes the following:
\underset{z \in S}{\text{min }} \sum_{i=1}^{n}||x_i - \mu_{z_i}||^2_2
Where:
- x_i denotes the i’th datapoint
- z_i denotes the cluster indicator of x_i
- \mu_{z_i} denotes the mean of the cluster with indicator z_i
- S denotes the set of all possible cluster assignments. Note that S is finite.
def obj(X, cluster_centers):
    return sum([np.min([np.linalg.norm(x_i - cluster_center)**2 for cluster_center in cluster_centers]) for x_i in X])
Algorithm Strategy
- Initialization Step : Assign random datapoints from the dataset as the cluster centers
- Cluster Assignment Step : For every datapoint x_i, assign a cluster indicator z_i = \underset{j \in \{1, 2, ..., k\}}{\text{argmin }} ||x_i - \mu_j||^2_2, where k is the number of clusters
- Recompute cluster centers : For every cluster indicator j \in \{1, 2, ..., k\}, recompute \mu_j = \frac{\sum_{i=1}^{n}x_i \cdot \mathbf{1}(z_i=j)}{\sum_{i=1}^{n} \mathbf{1}(z_i=j)}
- Repeat steps 2 and 3 until convergence.
Convergence with respect to the objective is guaranteed, since the following can be shown:
- The set of all possible cluster assignments S is finite.
- The objective function value strictly decreases after every iteration of Lloyd’s (until convergence).
The initialization for K-Means can be done in smarter ways than random initialization, which improves the chance of Lloyd’s converging to a good cluster assignment, and in fewer iterations.
It is important to note that the final assignment need not be the global optimum of the objective function, but it is usually good enough in practice.
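To make the convergence argument concrete, here is a minimal self-contained sketch, assuming only numpy, the dataset X and the obj function defined above: it runs a few Lloyd's iterations and checks that the recorded objective values never increase.
rng = np.random.default_rng(0)
k = 3
centers = X[rng.choice(len(X), k, replace=False)]                        # random initialization
z = np.argmin(np.linalg.norm(X[:, None] - centers, axis=2), axis=1)      # assignment step
history = [obj(X, centers)]
for _ in range(20):
    # recompute step: mean of the points in each cluster (keep the old center if a cluster is empty)
    centers = np.array([X[z == j].mean(axis=0) if np.any(z == j) else centers[j] for j in range(k)])
    history.append(obj(X, centers))
    new_z = np.argmin(np.linalg.norm(X[:, None] - centers, axis=2), axis=1)
    if np.array_equal(new_z, z):                                          # assignments unchanged -> converged
        break
    z = new_z
print(all(a >= b - 1e-9 for a, b in zip(history, history[1:])))           # True: objective never increases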
Initialization Step
n points from the dataset are randomly chosen as cluster centers, where n is the number of clusters - a hyperparameter
n = 3
# cluster_centers = X[np.random.choice(len(X), 3)]
cluster_centers = X[[70, 85, 80]]
Cluster Assignment Step
Every datapoint is assigned the cluster indicator of the cluster center closest to it.
from scipy.spatial import Voronoi, voronoi_plot_2d

def cluster_assignment(X, cluster_centers):
    z = np.zeros(X.shape[0])
    for i in range(X.shape[0]):
        z[i] = np.argmin([np.linalg.norm(X[i] - cluster_center) for cluster_center in cluster_centers])
    return z

z = cluster_assignment(X, cluster_centers)

fig, (ax) = plt.subplots(1, 1)
fig.set_size_inches(5, 5)

ax.scatter(X[:, 0], X[:, 1], c=z, s=10);
ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1], marker = 'x', s = 100, color = 'red', linewidth=1)

vor = Voronoi(cluster_centers)
voronoi_plot_2d(vor, ax=ax, show_points=False, show_vertices=False);

ax.axis('equal');
For every cluster, there is a corresponding intersection of half-spaces, called a Voronoi region. Equivalently, the K-Means algorithm tries to find the Voronoi partition of the space that minimizes the objective function.
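As a quick sanity check of this half-space view, the sketch below (assuming the X, cluster_centers and cluster_assignment defined above; the helper name in_voronoi_region is ours) verifies that a point assigned to center \mu_a by the argmin rule satisfies (\mu_b - \mu_a) \cdot x \le (||\mu_b||^2 - ||\mu_a||^2)/2 for every other center \mu_b, i.e. it lies in the intersection of the corresponding half-spaces.
def in_voronoi_region(x, a, centers):
    # x lies in the Voronoi region of centers[a] iff it satisfies the half-space inequality
    # (mu_b - mu_a) . x <= (||mu_b||^2 - ||mu_a||^2) / 2 for every other center mu_b
    mu_a = centers[a]
    return all((mu_b - mu_a) @ x <= (mu_b @ mu_b - mu_a @ mu_a) / 2 + 1e-9   # small tolerance for ties
               for b, mu_b in enumerate(centers) if b != a)

z_check = cluster_assignment(X, cluster_centers)
print(all(in_voronoi_region(X[i], int(z_check[i]), cluster_centers) for i in range(len(X))))  # True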
Recompute Means/Cluster Centers
For every cluster, the cluster center is updated to the mean of the points in the cluster.
def recompute_clusters(X, z):
    cluster_centers = np.array([np.mean(X[z == i], axis = 0) for i in range(n)])
    return cluster_centers
Iteration of K-means
- Assign clusters based on new cluster centers.
- Recompute cluster centers based on new clusters.
fig, ax = plt.subplots(1, 3)
fig.set_size_inches(15, 3.75)

for i in range(3):
    z = cluster_assignment(X, cluster_centers)  # cluster_centers -> NEW cluster assignment

    ax[i].scatter(X[:, 0], X[:, 1], c=z, s=10);
    ax[i].scatter(cluster_centers[:, 0], cluster_centers[:, 1], marker = 'x', s = 100, color = 'red', linewidth=1)
    vor = Voronoi(cluster_centers)
    voronoi_plot_2d(vor, ax=ax[i], show_points=False, show_vertices=False)
    ax[i].axis('equal');

    cluster_centers = recompute_clusters(X, z)  # cluster assignment -> NEW cluster centers
Generate a complex dataset with 6 clusters
We make use of the convenient make_blobs data generator from the scikit-learn library.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs

cen = [(-3, 3), (-2, 1), (0, 0), (1, 2), (0, -2), (3, -1)]
X, ideal_z = make_blobs(n_samples=1000, centers=cen, n_features=2, cluster_std=0.3, random_state=13, center_box=(-3, 3))
n = len(cen)  # the number of clusters is now 6; the helper functions above read this global n

fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], s=10, c=ideal_z)
vor = Voronoi(cen)
voronoi_plot_2d(vor, ax=ax, show_points=False, show_vertices=False);
fig.set_size_inches(5, 4.5)
ax.axis([-5, 5, -4, 5])
# the make_blobs generator returns the optimal/good cluster assignment along with the dataset
# this function checks whether the result from lloyd's is equivalent to the optimal assignment
def ideal_check(ideal, obtained):
    mapping = dict([(i, -1) for i in range(n)])

    for i in range(len(ideal)):
        if mapping[ideal[i]] == -1:
            mapping[ideal[i]] = obtained[i]
        elif mapping[ideal[i]] != obtained[i]:
            return False
    return True
Lloyd’s Algorithm Definition
from IPython.display import HTML
from matplotlib import animation

def lloyds(cluster_centers, X, z, artists = [], animate=True, fig=None, ax=None, ax1=None, n_iter=0, n = len(cen)):
    loss = []
    if fig is None and animate:
        fig, (ax, ax1) = plt.subplots(1, 2, figsize=(10, 4.15))
        title = ax.set_title('')
        artists = []

    if animate:
        # frame 0: the initial assignment, its Voronoi partition and the starting objective value
        frame = []
        frame.append(ax.scatter(X[:, 0], X[:, 1], c=z, s=10))
        frame.append(ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1], marker = 'x', s = 100, color = 'red', linewidth=1))
        frame.append(ax.text(0.5, 1.05, f'Iteration {n_iter} | Cluster Assignment', transform=ax.transAxes, ha="center"))

        vor = Voronoi(cluster_centers)
        d = voronoi_plot_2d(vor, ax=ax, show_points=False, show_vertices=False);
        frame += list(d.axes[0].lines[-1:] + d.axes[0].collections[-2:])
        ax.axis([-5, 5, -4, 5])

        loss.append(obj(X, cluster_centers))
        frame.append(ax1.scatter([0], [loss[0]], color='red', marker='x', s=30))
        frame.append(ax1.text(0.5, 1.05, 'Objective Function', transform=ax1.transAxes, ha="center"))
        frame.append(ax1.text(0.5, -0.1, 'Iterations', transform=ax1.transAxes, ha="center"))

        artists.append(frame)

    converged = False
    while not converged:
        # Recompute cluster centers (empty clusters keep their previous center)
        # cluster_centers = recompute_clusters(X, z)
        for i in range(n):
            cluster_points = X[z==i]
            if len(cluster_points) > 0:
                cluster_centers[i] = np.mean(cluster_points, axis=0)

        n_iter += 1

        # Modified cluster assignment step
        converged = True

        if animate:
            frame = []
            frame.append(ax.scatter(X[:, 0], X[:, 1], c=z, s=10))
            frame.append(ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1], marker = 'x', s = 100, color = 'red', linewidth=1))
            frame.append(ax.text(0.5, 1.05, f'Iteration {n_iter} | Cluster Recomputation', transform=ax.transAxes, ha="center"))

            vor = Voronoi(cluster_centers)
            d = voronoi_plot_2d(vor, ax=ax, show_points=False, show_vertices=False);
            frame += list(d.axes[0].lines[-1:] + d.axes[0].collections[-2:])
            ax.axis([-5, 5, -4, 5])

            for i in range(0, len(loss)-1, 2):
                frame.append(list(ax1.plot([i/2, (i+2)/2], [loss[i], loss[i+1]], color='red', marker='x', markersize=5))[0])
            loss.append(obj(X, cluster_centers))
            frame.append(list(ax1.plot([(len(loss)-2)/2, (len(loss)-1)/2], [loss[-2], loss[-1]], color='red', linestyle=':'))[0])
            # lines = list(ax1.plot(np.arange(len(loss))/2, loss, color='red', marker='xo', markersize=6))
            # frame += lines
            frame.append(ax1.text(0.5, 1.05, 'Objective Function', transform=ax1.transAxes, ha="center"))
            frame.append(ax1.text(0.5, -0.1, 'Iterations', transform=ax1.transAxes, ha="center"))

            artists.append(frame)

        # Re-assign every datapoint to its nearest center; if nothing changes we have converged
        for i in range(len(X)):
            z_i = np.argmin([np.linalg.norm(X[i] - cluster_center) for cluster_center in cluster_centers])

            if z_i != z[i]:
                z[i] = z_i
                converged = False

        if animate and not converged:
            frame = []
            frame.append(ax.scatter(X[:, 0], X[:, 1], c=z, s=10))
            frame.append(ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1], marker = 'x', s = 100, color = 'red', linewidth=1))
            frame.append(ax.text(0.5, 1.05, f'Iteration {n_iter} | Cluster Re-assignment', transform=ax.transAxes, ha="center"))

            vor = Voronoi(cluster_centers)
            d = voronoi_plot_2d(vor, ax=ax, show_points=False, show_vertices=False);
            frame += list(d.axes[0].lines[-1:] + d.axes[0].collections[-2:])
            ax.axis([-5, 5, -4, 5])

            loss.append(obj(X, cluster_centers))
            for i in range(0, len(loss)-1, 2):
                frame.append(list(ax1.plot([i/2, (i+2)/2], [loss[i], loss[i+1]], color='red', marker='x', markersize=5))[0])

            frame.append(ax1.text(0.5, 1.05, 'Objective Function', transform=ax1.transAxes, ha="center"))
            frame.append(ax1.text(0.5, -0.1, 'Iterations', transform=ax1.transAxes, ha="center"))

            artists.append(frame)

    if animate:
        plt.close()
        return fig, (ax, ax1), cluster_centers, artists
    else:
        return cluster_centers, n_iter
Random Initialization
We now run Lloyd’s algorithm on the dataset with random initialization. We use the ideal_check function to check whether the clustering obtained from Lloyd’s is optimal; if not, we re-run it. An animation built with Matplotlib’s ArtistAnimation illustrates how this strategy works.
cluster_centers = X[np.random.choice(len(X), n)]
z = cluster_assignment(X, cluster_centers)

fig, ax, final_clusters, artists = lloyds(cluster_centers, X, z)

while not ideal_check(cluster_assignment(X, final_clusters), ideal_z):
    cluster_centers = X[np.random.choice(len(X), n)]
    z = cluster_assignment(X, cluster_centers)
    fig, ax, final_clusters, artists = lloyds(cluster_centers, X, z)

anim = animation.ArtistAnimation(fig, artists, interval=500, repeat=False, blit=False);
HTML(anim.to_jshtml())
Some initializations do not give a good clustering
cluster_centers = X[np.random.choice(len(X), n)]
z = cluster_assignment(X, cluster_centers)
fig, ax, final_clusters, artists = lloyds(cluster_centers, X, z)

while ideal_check(cluster_assignment(X, final_clusters), ideal_z):
    cluster_centers = X[np.random.choice(len(X), n)]
    z = cluster_assignment(X, cluster_centers)
    fig, ax, final_clusters, artists = lloyds(cluster_centers, X, z)

anim = animation.ArtistAnimation(fig, artists, interval=500, repeat=False, blit=False);
HTML(anim.to_jshtml())
Smart Initialization | k-means++
k-means++ is an initialization step which tries to improve the chance that Lloyd’s algorithm converges to a good clustering.
The exact algorithm is as follows:
- Choose one center uniformly at random among the data points.
- For each data point x not chosen yet, compute D(x)^2, the squared distance between x and the nearest center that has already been chosen.
- Choose one new data point at random as a new center, using a weighted probability distribution where a point x is chosen with probability proportional to D(x)^2.
- Repeat Steps 2 and 3 until k centers have been chosen.
After choosing k cluster centers, we proceed as usual with Lloyd’s algorithm.
The intuition behind this approach is that spreading out the k initial cluster centers is a good thing: the first cluster center is chosen uniformly at random from the data points that are being clustered, after which each subsequent cluster center is chosen from the remaining data points with probability proportional to its squared distance from the point’s closest existing cluster center.
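Stripped of the animation bookkeeping, the initialization itself takes only a few lines. The sketch below is ours (the function name kmeanspp_init is not part of the notebook, whose own version is the plusplus function that follows) and assumes numpy and the dataset X from above.
def kmeanspp_init(X, k, seed=0):
    rng = np.random.default_rng(seed)
    centers = [X[rng.integers(len(X))]]                                   # first center: uniform over the data
    for _ in range(1, k):
        # D(x)^2: squared distance from each point to its nearest already-chosen center
        d2 = np.min(np.linalg.norm(X[:, None] - np.array(centers), axis=2)**2, axis=1)
        centers.append(X[rng.choice(len(X), p=d2 / d2.sum())])            # sample proportional to D(x)^2
    return np.array(centers)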
def plusplus(animate=True, n = len(cen)):
    initial_clusters = np.array(X[np.random.choice(len(X), 1)])

    if animate:
        artists = []
        fig, (ax, ax1) = plt.subplots(1, 2, figsize=(10, 4.15), dpi=150)

        a = ax.scatter(X[:, 0], X[:, 1], color="pink", s=10);
        b = ax.scatter(initial_clusters[:, 0], initial_clusters[:, 1], marker = 'x', s = 200, color = 'red', linewidth=1);
        c = ax.text(0.5, 1.05, f'K-Means ++', transform=ax.transAxes, ha="center")

        artists.append([a, c])

        c = ax.text(0.5, 1.05, f'K-Means ++ | Initialization 1', transform=ax.transAxes, ha="center")

        artists.append([a, b, c])

    for i in range(1, n):
        # Rescore based on selected clusters
        scores = np.array([min([np.linalg.norm(datapoint-cluster)**2 for cluster in initial_clusters]) for datapoint in X])

        # Normalize scores to probability
        probabilities = scores/scores.sum()

        initial_clusters = np.append(initial_clusters, X[np.random.choice(len(X), 1, p=probabilities)], axis=0)
        if animate:
            a = ax.scatter(X[:, 0], X[:, 1], color="pink", s=10);
            b = ax.scatter(initial_clusters[:, 0], initial_clusters[:, 1], marker = 'x', s = 200, color = 'red', linewidth=1);
            c = ax.text(0.5, 1.05, f'K-Means ++ | Initialization {i+1}', transform=ax.transAxes, ha="center")

            artists.append([a, b, c])

    if animate:
        return fig, (ax, ax1), initial_clusters, artists
    return initial_clusters
We now run Lloyd’s with the K-means++ initialization strategy. The plusplus function does this smart initialization for us, and is configured to plot a frame for each point’s initialization, which we use in the animation.
# cluster_centers = X[np.random.choice(len(X), n)]
fig, (ax, ax1), cluster_centers, artists = plusplus()
z = cluster_assignment(X, cluster_centers)
fig, (ax, ax1), final_clusters, artists = lloyds(cluster_centers, X, z, artists=artists, fig=fig, ax=ax, ax1=ax1)

while not ideal_check(cluster_assignment(X, final_clusters), ideal_z):
    fig, (ax, ax1), cluster_centers, artists = plusplus()
    z = cluster_assignment(X, cluster_centers)
    fig, (ax, ax1), final_clusters, artists = lloyds(cluster_centers, X, z, artists=artists, fig=fig, ax=ax, ax1=ax1)

anim = animation.ArtistAnimation(fig, artists, interval=500, repeat=False, blit=False);
HTML(anim.to_jshtml())
Choice of K - Elbow Method
To pick the number of clusters, we run K-Means (with k-means++ initialization) for a range of values of K and add a penalty that grows linearly with K to the objective; the K at which the penalized objective is smallest (the “elbow” of the objective curve) is chosen.
K = list(range(1, 16))
obj_vals = []
for k in K:
    obj_vals_k = []
    for _ in range(3):
        cluster_centers = plusplus(animate=False, n=k)
        cluster_centers, n_iter = lloyds(cluster_centers, X, cluster_assignment(X, cluster_centers), animate=False, n=k)

        obj_vals_k.append(obj(X, cluster_centers))
    obj_vals.append(min(obj_vals_k))  # best of 3 runs for each K

penalties = [100*i for i in K]  # penalty growing linearly with K

plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(K, obj_vals, label='Obj. Value')
plt.plot(K, penalties, linestyle='--', label='Penalty')
plt.xlabel('Choice of K')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(K, [penalties[i] + obj_vals[i] for i in range(15)])
plt.xlabel('Choice of K')
plt.ylabel('Objective Function + Penalty(K)');
plt.axvline(x=min(zip(K, [penalties[i] + obj_vals[i] for i in range(15)]), key=lambda i: i[1])[0], color='red', label='Minimum Value')
plt.legend()
Comparison of Initialization Strategies - Random vs. K-Means++
For each initialization strategy, we conduct 1000 experiments and record the following:
- Number of Iterations to converge
- SSE (objective value) for final cluster centers
- Number of convergences that are “ideal” (global optimum)
- Time taken from initialization to convergence
Experiments
Random Initialization
from timeit import default_timer as timer

noof_trials = 1000

random = {'ideals' : 0,
          'SSE' : [],
          'iters' : [],
          'time' : []}

exp_start = timer()
for _ in range(noof_trials):
    start = timer()
    cluster_centers = X[np.random.choice(len(X), n)]
    final_clusters, n_iter = lloyds(cluster_centers, X, cluster_assignment(X, cluster_centers), animate=False)

    random['time'].append(timer()-start)
    random['ideals'] += int(ideal_check(ideal_z, cluster_assignment(X, final_clusters)))
    random['SSE'].append(obj(X, final_clusters))
    random['iters'].append(n_iter)

print(f'Experiment done in {timer()-exp_start:.2f} seconds')
Experiment done in 670.24 seconds
K-Means++ Initialization
kmplusplus = {'ideals' : 0,
              'SSE' : [],
              'iters' : [],
              'time' : []}

exp_start = timer()
for _ in range(noof_trials):
    start = timer()
    cluster_centers = plusplus(animate=False)
    final_clusters, n_iter = lloyds(cluster_centers, X, cluster_assignment(X, cluster_centers), animate=False)

    kmplusplus['time'].append(timer()-start)
    kmplusplus['ideals'] += int(ideal_check(ideal_z, cluster_assignment(X, final_clusters)))
    kmplusplus['SSE'].append(obj(X, final_clusters))
    kmplusplus['iters'].append(n_iter)

print(f'Experiment done in {timer()-exp_start:.2f} seconds')
Experiment done in 539.66 seconds
Results
SSE Comparison
plt.style.use('seaborn-v0_8')

fig = plt.figure(figsize=(12, 5))
fig.suptitle('Initialization Comparison | SSE')

ax1 = plt.subplot(121)
ax1.hist(random['SSE'], bins=30)
ax1.set_title('Random')
ax1.set_ylabel('Frequency')
ax1.set_xlabel('SSE')

ax2 = plt.subplot(122, sharex=ax1, sharey=ax1)
ax2.hist(kmplusplus['SSE'], bins=30)
ax2.set_title('K-Means++')
ax2.set_ylabel('Frequency')
ax2.set_xlabel('SSE')

plt.show()

plt.figure(figsize=(12, 2))
plt.title('Fraction of Best Convergences')
plt.barh(['Random', 'K-Means++'], [random['ideals']/noof_trials, kmplusplus['ideals']/noof_trials], height=0.4);
plt.xlim([0, 1]);
The SSE for the optimal cluster assignment for this particular dataset is around 175. We observe from the above charts that K-Means++ does indeed have a higher chance (0.8) of converging to this as compared to Vanilla K-Means (0.5).
Number of Iterations Comparison
fig = plt.figure(figsize=(12, 5))
fig.suptitle('Initialization Comparison | Number of Iterations')

ax1 = plt.subplot(121)
ax1.hist(random['iters'], bins=max(random['iters'])-min(random['iters']))
ax1.set_title('Random')
ax1.set_ylabel('Frequency')
ax1.set_xlabel('Number of Iterations')

ax2 = plt.subplot(122, sharex=ax1, sharey=ax1)
ax2.hist(kmplusplus['iters'], bins=max(kmplusplus['iters'])-min(kmplusplus['iters']))
ax2.set_title('K-Means++')
ax2.set_ylabel('Frequency')
ax2.set_xlabel('Number of Iterations')

plt.show()
For Vanilla K-Means, we observe that the number of iterations has quite a bit of variation, with an average of 8.75 iterations to convergence.
K-Means++ has a relatively smaller spread, with an average of 4.2 iterations to convergence (post-initialization).
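The averages quoted above can be reproduced directly from the recorded iteration counts, assuming the random and kmplusplus dictionaries from the experiments above:
# mean number of iterations for random initialization vs. K-Means++
print(np.mean(random['iters']), np.mean(kmplusplus['iters']))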
Time Taken Comparison
fig = plt.figure(figsize=(12, 5))
fig.suptitle('Initialization Comparison | Time taken to converge')

ax1 = plt.subplot(121)
ax1.hist(random['time'], bins=20)
ax1.set_title('Random')
ax1.set_ylabel('Frequency')
ax1.set_xlabel('Time taken')

ax2 = plt.subplot(122, sharex=ax1, sharey=ax1)
ax2.hist(kmplusplus['time'], bins=20)
ax2.set_title('K-Means++')
ax2.set_ylabel('Frequency')
ax2.set_xlabel('Time taken (s)')

plt.show()

np.array(kmplusplus['time']).mean()
0.4248704458820016
Similar to the results for number of iterations, the time taken till convergence also has a wide spread for Random Initialization, with an average of 0.55s. K-Means++ has an average of 0.42s.
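The corresponding figure for random initialization can be read off the same way, assuming the random dictionary recorded above:
# mean time (in seconds) per run with random initialization
print(np.array(random['time']).mean())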
Worked-out Examples
K-Means
K-Means with k = 3 for X = [-15, -10, 0, 5, 15, 20, 25] with initial cluster means μ = [-15, 0, 5]
plt.figure(figsize=(15, 7.5))

# From the problem
x = np.expand_dims(np.array([-15, -10, 0, 5, 15, 20, 25]), axis=1)
clusters = np.expand_dims(np.array([-15, 0, 5]), axis=1)
n = 3  # number of clusters for this example (recompute_clusters reads the global n)
plt.subplot(2, 3, 1)
plt.scatter(x[:,0], np.zeros(x.shape[0]));
plt.scatter(clusters[:, 0], np.zeros(3), marker = 'x', s = 100, color = 'red', linewidth=1);
plt.title('From the problem')

# Initial Cluster Assignment
z = cluster_assignment(x, clusters)
plt.subplot(2, 3, 2)
plt.scatter(x[:,0], np.zeros(x.shape[0]), c=z);
plt.scatter(clusters[:, 0], np.zeros(3), marker = 'x', s = 100, color = 'red', linewidth=1);
plt.title('Initial Cluster Assignment')

# Recompute Cluster Centers
clusters = recompute_clusters(x, z)
plt.subplot(2, 3, 3)
plt.scatter(x[:,0], np.zeros(x.shape[0]), c=z);
plt.scatter(clusters[:, 0], np.zeros(3), marker = 'x', s = 100, color = 'red', linewidth=1);
plt.title('Recompute Cluster Centers')

# Next Cluster Assignment
z = cluster_assignment(x, clusters)
plt.subplot(2, 3, 4)
plt.scatter(x[:,0], np.zeros(x.shape[0]), c=z);
plt.scatter(clusters[:, 0], np.zeros(3), marker = 'x', s = 100, color = 'red', linewidth=1);
plt.title('Next Cluster Assignment')

# Again Recompute Cluster Centers
clusters = recompute_clusters(x, z)
plt.subplot(2, 3, 5)
plt.scatter(x[:,0], np.zeros(x.shape[0]), c=z);
plt.scatter(clusters[:, 0], np.zeros(3), marker = 'x', s = 100, color = 'red', linewidth=1);
plt.title('Again Recompute Cluster Centers')

# Cluster Assignment - No Change
# Algorithm has converged
z = cluster_assignment(x, clusters)
plt.subplot(2, 3, 6)
plt.scatter(x[:,0], np.zeros(x.shape[0]), c=z);
plt.scatter(clusters[:, 0], np.zeros(3), marker = 'x', s = 100, color = 'red', linewidth=1);
plt.title('Algorithm has converged');
K-Means++
For the dataset below, k-means++ algorithm is run with k=2. Find the probability that \mathbf{x}_2, \mathbf{x}_1 are chosen as the initial clusters, in that order.
\left\{\mathbf{x_1} = \begin{bmatrix} 0\\ 2 \end{bmatrix},\ \mathbf{x_2} = \begin{bmatrix} 2\\ 0 \end{bmatrix},\ \mathbf{x_3} = \begin{bmatrix} 0\\ 0 \end{bmatrix},\ \mathbf{x_4} = \begin{bmatrix} 0\\ -2 \end{bmatrix},\ \mathbf{x_5} = \begin{bmatrix} -2\\ 0 \end{bmatrix}\right\}
dataset = np.array([[0, 2], [2, 0], [0, 0], [0, -2], [-2, 0]])
probabilities = np.array([1/len(dataset) for _ in range(len(dataset))])
print('Initial Probabilities:', probabilities)

clusters = []
answer = 1

# First we select x2 = [2,0]
clusters.append(dataset[1])
answer *= probabilities[1]

# Rescore based on selected clusters
scores = np.array([min([np.linalg.norm(datapoint-cluster)**2 for cluster in clusters]) for datapoint in dataset])

# Normalize scores to probability
probabilities = scores/scores.sum()
print('Probabilities after selecting x2: ', [round(i, 3) for i in probabilities])

# Now we select x1 = [0,2]
clusters.append(dataset[0])
answer *= probabilities[0]

print('Probability of selecting [x2 x1]:', round(answer, 3))
Initial Probabilities: [0.2 0.2 0.2 0.2 0.2]
Probabilities after selecting x2: [0.222, 0.0, 0.111, 0.222, 0.444]
Probability of selecting [x2 x1]: 0.044
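The same value can be obtained by hand: the first center is chosen uniformly, so P(\mathbf{x}_2 \text{ first}) = \frac{1}{5}. After choosing \mathbf{x}_2, the squared distances to it are D(\mathbf{x}_1)^2 = 8, D(\mathbf{x}_3)^2 = 4, D(\mathbf{x}_4)^2 = 8, D(\mathbf{x}_5)^2 = 16, so P(\mathbf{x}_1 \text{ second}) = \frac{8}{8+0+4+8+16} = \frac{8}{36}. Hence P = \frac{1}{5} \cdot \frac{8}{36} = \frac{2}{45} \approx 0.044, matching the output above.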