import numpy as np
import matplotlib.pyplot as plt
Week 7: Classification - k-NN, Decision Trees
KNN
Generating the dataset
# 100 points drawn uniformly in [-10, 10]^2, labelled 1 if they lie above the line x2 = x1
rng = np.random.default_rng(seed = 1001)
X = rng.uniform(-10, 10, (100, 2))
y = np.int32(np.zeros(X.shape[0]))
y[X[:, 1] > X[:, 0]] = 1
# add 10 Gaussian points centred at (-5, 5) labelled 0, so the classes overlap
X = np.concatenate((X,
                    rng.multivariate_normal([-5, 5], np.eye(2), 10)),
                   axis = 0)
y = np.concatenate((y, np.int32(np.zeros(10))))
Visualize the dataset
c = np.array(['red', 'green'])
plt.scatter(X[:, 0], X[:, 1], c = c[y]);
Predict the class for a test point
def predict(X, y, x_test, k = 3):
    # distances from the test point to every training point
    dist = np.linalg.norm(X - x_test.reshape(1, 2), axis = 1)
    # indices of the k nearest neighbours
    nearest_k = np.argsort(dist)[: k]
    # majority vote over the neighbours' labels
    voter = y[nearest_k]
    if sum(voter) > len(voter) / 2:
        return 1
    else:
        return 0

c[predict(X, y, np.array([-3, -2]), 10)]
'green'
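As a quick sanity check, the same prediction can be reproduced with scikit-learn's KNeighborsClassifier (a minimal sketch, assuming scikit-learn is installed; its tie-breaking differs slightly from the majority-vote rule above).
# compare the hand-written k-NN with scikit-learn, using the same k = 10
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 10)
knn.fit(X, y)
c[knn.predict(np.array([[-3, -2]]))[0]]   # expected to agree: 'green'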
Decision boundary for various values of k
def boundary(k):
    # evaluate predict on a 100 x 100 grid and colour each grid point by its predicted class
    x = np.linspace(-10, 10, 100)
    floor = [ ]
    color = [ ]
    for i in range(x.shape[0]):
        for j in range(x.shape[0]):
            floor.append([x[i], x[j]])
            color.append(c[predict(X, y, np.array([x[i], x[j]]), k)])
    floor = np.array(floor)
    plt.scatter(floor[:, 0], floor[:, 1], c = color);
    plt.title(k)

plt.figure(figsize=(10, 7))
for ind, k in enumerate([1, 3, 7, 15, 31, 63]):
    plt.subplot(2, 3, ind + 1)
    boundary(k)
Real World Dataset: Wine Dataset
In this section, we will explore the Wine dataset, which is a real-world dataset used for classification tasks. The dataset contains various attributes related to wine samples, and our goal is to classify these samples into two categories: Class 0 and Class 1.
Dataset Description
Features: The dataset includes several features, but for this analysis, we will focus on two specific attributes: ‘proline’ and ‘hue.’ These attributes represent different characteristics of the wine samples.
Labels: The target variable, denoted as ‘y,’ assigns each sample to one of two classes, Class 0 or Class 1.
from sklearn.datasets import load_wine
X, y = load_wine(return_X_y=True, as_frame=True)
X, y = X[y < 2], y[y < 2]
X = X[['proline', 'hue']]
X = X.to_numpy()
Data Preprocessing
Before using the dataset, we perform some data preprocessing steps to standardize the features. Standardization is a common practice in machine learning to ensure that all features have the same scale. This can improve the performance of our models.
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
X = std_scaler.fit_transform(X)
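Standardization maps each feature to z = (x - mean) / std. As a minimal check (a sketch using only the fitted scaler's attributes, nothing new), the transform can be reproduced by hand:
# reproduce StandardScaler from its learned mean_ and var_
X_manual = (std_scaler.inverse_transform(X) - std_scaler.mean_) / np.sqrt(std_scaler.var_)
np.allclose(X, X_manual)   # expected: True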
c = np.array(['red', 'green'])
plt.scatter(X[:, 0], X[:, 1], c = c[y]);
Predict the class for a test point
c[predict(X, y, np.array([-0.2, 0]), 5)]
'green'
Decision boundary for various values of k
plt.figure(figsize=(10, 7))
for ind, k in enumerate([1, 5, 10, 30, 50, 100]):
    plt.subplot(2, 3, ind + 1)
    boundary(k)
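To compare these values of k numerically rather than by eye, one rough option (a sketch; a proper train/test split would be more honest, and k = 1 is trivially perfect here because each point is its own nearest neighbour) is to score each k on the training points:
# rough training accuracy for each k on the standardized wine features
y_np = np.asarray(y)
for k in [1, 5, 10, 30, 50, 100]:
    preds = np.array([predict(X, y_np, X[i], k) for i in range(X.shape[0])])
    print(k, (preds == y_np).mean())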
Decision Trees
Consider a dataset X with 28 points that lie in a 2D space. The label for each point is given by the vector y.
X = np.array([[6, 1], [7, 1], [8, 1], [6, 2], [7, 2],
              [8, 2], [1, 6], [1, 7], [2, 6], [2, 8],
              [4, 6], [4, 8], [5, 6], [5, 7], [1, 1],
              [2, 1], [1, 2], [2, 2], [3, 3], [5, 3],
              [4, 4], [3, 5], [5, 5], [6, 6], [8, 6],
              [7, 7], [6, 8], [8, 8]]).T
y = np.array([1, 1, 1, 1, 1, 1, 1,
              1, 1, 1, 1, 1, 1, 1,
              -1, -1, -1, -1, -1, -1, -1,
              -1, -1, -1, -1, -1, -1, -1])
Visualize the dataset
plt.scatter(X[0, y==1], X[1, y==1], c="green")
plt.scatter(X[0, y==-1], X[1, y==-1], c="red")
plt.show()
The entropy of a node is given by E = -p \log p - (1 - p) \log (1 - p), where p is the fraction of positive points in the node.
def entropy(p):
if p == 0 or p == 1:
return 0
return -p * np.log2(p) - (1 - p) * np.log2(1 - p)
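A quick check of the endpoints (using only the function above): entropy is maximal for a perfectly mixed node and zero for a pure one.
entropy(0.5), entropy(0.9), entropy(1.0)   # (1.0, ~0.469, 0)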
The information gain of a split is given by IG = E - [\gamma E_l + (1 - \gamma) E_r], where \gamma is the fraction of points sent to the left child and E_l, E_r are the child entropies.
def IG(E, El, Er, gamma):
return E - gamma * El - (1 - gamma) * Er
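As a toy example (not taken from the dataset above): a parent node with half its points positive, split into a pure left child holding 40% of the points and a right child with one positive point in six, gains about 0.61 bits.
IG(entropy(0.5), entropy(1.0), entropy(1/6), 0.4)   # ~0.61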
def best_split(X, y):
    # candidate thresholds: 10 evenly spaced values over the data range
    min_val, max_val = X.min(), X.max()
    vals = np.linspace(min_val, max_val, 10)
    # entropy of the parent node
    p = X[y == 1].shape[0] / X.shape[0]
    E = entropy(p)
    ig_best, value_best, feat_best = 0, 0, 0
    for val in vals:
        for feat in [0, 1]:
            # split the labels by the question X[:, feat] < val
            left = y[X[:, feat] < val]
            right = y[X[:, feat] >= val]
            gamma = left.shape[0] / X.shape[0]
            # fraction of positive points in each child
            q = r = 0
            if left.shape[0] != 0:
                q = left[left == 1].shape[0] / left.shape[0]
            if right.shape[0] != 0:
                r = right[right == 1].shape[0] / right.shape[0]
            El = entropy(q)
            Er = entropy(r)
            ig = E - gamma * El - (1 - gamma) * Er
            assert ig >= 0
            if ig > ig_best:
                ig_best = ig
                value_best = val
                feat_best = feat
    return feat_best, value_best, ig_best
X = X.T      # back to one point per row for best_split and grow_tree

tree = dict()
def grow_tree(X, y, key):
    # stop splitting once the node is (nearly) pure: entropy at most 0.2
    p = X[y == 1].shape[0] / X.shape[0]
    E = entropy(p)
    if E <= 0.2:
        label = 0
        if y[y == 1].shape[0] / y.shape[0] > 0.5:
            label = 1
        tree[key] = {'state': 'leaf', 'label': label}
        return
    # otherwise ask the best question and recurse on the two children
    feat_best, val_best, _ = best_split(X, y)
    tree[key] = {'state': 'internal', 'question': (feat_best, val_best)}
    left_ind = X[:, feat_best] < val_best
    right_ind = X[:, feat_best] >= val_best
    left_X = X[left_ind]
    left_y = y[left_ind]
    right_X = X[right_ind]
    right_y = y[right_ind]
    # children of node key are stored at 2 * key + 1 and 2 * key + 2
    grow_tree(left_X, left_y, 2 * key + 1)
    grow_tree(right_X, right_y, 2 * key + 2)

grow_tree(X, y, 0)
tree
{0: {'state': 'internal', 'question': (1, 5.666666666666667)},
1: {'state': 'internal', 'question': (0, 5.666666666666667)},
3: {'state': 'leaf', 'label': 0},
4: {'state': 'leaf', 'label': 1},
2: {'state': 'internal', 'question': (0, 5.666666666666667)},
5: {'state': 'leaf', 'label': 1},
6: {'state': 'leaf', 'label': 0}}
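The dictionary can be read more easily with a small recursive printer (a sketch that relies only on the layout above, where the children of node ind sit at 2 * ind + 1 and 2 * ind + 2):
def print_tree(tree, ind = 0, depth = 0):
    # print the question at each internal node and the label at each leaf
    node = tree[ind]
    pad = '    ' * depth
    if node['state'] == 'leaf':
        print(pad + 'predict', node['label'])
        return
    feat, val = node['question']
    print(pad + f'x[{feat}] < {val:.2f}?')
    print_tree(tree, 2 * ind + 1, depth + 1)
    print_tree(tree, 2 * ind + 2, depth + 1)

print_tree(tree)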
Predict label for a test point
# note: this redefines the earlier k-NN predict; it walks the tree from node ind down to a leaf
def predict(tree, x, ind):
    if tree[ind]['state'] == 'leaf':
        return tree[ind]['label']
    feat, val = tree[ind]['question']
    if x[feat] < val:
        ind = 2 * ind + 1
        return predict(tree, x, ind)
    else:
        ind = 2 * ind + 2
        return predict(tree, x, ind)

c[predict(tree, [3, 4], 0)]
'red'
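For comparison, the same point can be classified with scikit-learn's DecisionTreeClassifier (a sketch, assuming scikit-learn; its chosen thresholds need not match the hand-built tree exactly).
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(criterion = 'entropy', max_depth = 2)
clf.fit(X, y)            # X is the 28 x 2 matrix, y the +/-1 labels
clf.predict([[3, 4]])    # expected to land in the -1 ('red') region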
Visualize the decision boundary
x = np.linspace(0, 8, 100)
floor = [ ]
color = [ ]
for i in range(x.shape[0]):
    for j in range(x.shape[0]):
        floor.append([x[i], x[j]])
        color.append(c[predict(tree, [x[i], x[j]], 0)])
floor = np.array(floor)
plt.scatter(floor[:, 0], floor[:, 1], c = color);