import numpy as np
import matplotlib.pyplot as plt
d = 8

plt.rcParams.update({'figure.figsize': (7, 7),
                     'scatter.marker': 'x'})
np.random.seed(1)

X_ = np.random.rand(100, 2)
X_ = (X_-np.mean(X_))*10

X, y = [], []

perp = lambda i: -1*i[0]/i[1]     # slope of the direction perpendicular to vector i
sign = lambda i: 2*int(i >= 0)-1  # maps a real number to +1/-1

w = np.array([1, 1])/np.sqrt(2)
gamma = 0.5
# keep only the points that lie at least gamma away from the boundary through the origin with normal w
for p in X_:
    d = w@p.T
    if abs(d) >= gamma:
        X.append(p)
        y.append(sign(d))

X = np.array(X)
y = np.array(y)
n = X.shape[0]
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(16, 7)
fig.suptitle(f'A Linearly separable dataset with γ={gamma} margin')

ax1.scatter(X[:, 0][y==1], X[:, 1][y==1], color='green', label='Positive')
ax1.scatter(X[:, 0][y!=1], X[:, 1][y!=1], color='red', label='Negative')

ax2.scatter(X[:, 0][y==1], X[:, 1][y==1], color='green', label='Positive')
ax2.scatter(X[:, 0][y!=1], X[:, 1][y!=1], color='red', label='Negative')
ax2.legend(loc='lower right')

ax2.axvline(x=0, c='black')
ax2.axhline(y=0, c='black')

ax2.arrow(0, 0, w[0], w[1], length_includes_head=True, head_width=0.1)
ax2.axline([0, 0], slope=perp(w), c='black')
ax2.axline(w*gamma, slope=perp(w), c='black', linestyle="--")
ax2.axline(w*-1*gamma, slope=perp(w), c='black', linestyle="--")
plt.show()
Week 9: Discriminative Models - Perceptron; Logistic Regression
Colab Link: Click here!
Dataset Generation
We create a linearly separable dataset with margin γ.
max([np.linalg.norm(i) for i in X_])**2/0.25  # R^2 / gamma^2 with R = max ||x||: the perceptron mistake bound
170.99461095236117
Perceptron Algorithm:
- Initialize \mathbf{w}^0 = 0
- In step t, if ∃ some \mathbf{x}_i such that \text{sign}(\mathbf{w}^{t}\mathbf{x}_i^T) \neq y_i (misclassified):
  - Update \mathbf{w}^{t+1} = \mathbf{w}^{t} + y_i\mathbf{x}_i
- Else, stop (converged).
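Stripped of the animation code below, the update loop itself is tiny. Here is a minimal sketch that runs on the X, y, n and sign defined above (w_plain and n_updates are illustrative names; unlike the animated version, this variant keeps scanning within a pass instead of restarting after every update). Since the data has margin γ, the mistake bound R²/γ² ≈ 171 computed earlier guarantees termination.

w_plain = np.zeros(2)
n_updates = 0
while True:
    made_mistake = False
    for i in range(n):
        if sign(w_plain @ X[i].T) != y[i]:    # misclassified point
            w_plain = w_plain + y[i] * X[i]   # perceptron update: w <- w + y_i * x_i
            n_updates += 1
            made_mistake = True
    if not made_mistake:                      # a full clean pass => converged
        break
print(w_plain, n_updates)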
from IPython.display import HTML, Markdown, display
from matplotlib import animation
# Initialize w to zero
w = np.array([0, 0])

fig, (ax, ax1) = plt.subplots(1, 2, figsize=(16*2/3, 7*2/3), dpi=150)

artists = []
frame = []

converged = False
n_iter = 0

# each pass: visualize the current classification on ax1, then fix one arbitrary mistake on ax
while not converged and n_iter <= 10:
    frame1 = []

    y_pred = np.array(list(map(sign, w@X.T)))
    true_negatives = np.logical_and(y_pred==-1, y==-1)
    true_positives = np.logical_and(y_pred==1, y==1)
    mistakes = np.logical_or(np.logical_and(y_pred==1, y==-1), np.logical_and(y_pred==-1, y==1))

    frame1.append(ax1.text(0.5, 1.05, f'({n_iter}) Check - w = [{w[0]:.2f} {w[1]:.2f}]', transform=ax1.transAxes, ha="center"))

    if np.linalg.norm(w):
        frame1.append(ax1.axline([0, 0], slope=perp(w), c='black'))
        frame1.append(ax1.scatter(X[:, 0][true_positives], X[:, 1][true_positives], color='green'))
        frame1.append(ax1.scatter(X[:, 0][true_negatives], X[:, 1][true_negatives], color='red'))
        frame1.append(ax1.scatter(X[:, 0][mistakes], X[:, 1][mistakes], color='blue'))
    else:
        frame1.append(ax1.scatter(X[:, 0][true_positives], X[:, 1][true_positives], color='green', label='Positive'))
        frame1.append(ax1.scatter(X[:, 0][true_negatives], X[:, 1][true_negatives], color='red', label='Negative'))
        frame1.append(ax1.scatter(X[:, 0][mistakes], X[:, 1][mistakes], color='blue', label='Mistakes'))
        frame1.append(ax.scatter([], [], color='cornflowerblue', s=[50], label='Update; False Negative'))
        frame1.append(ax.scatter([], [], color='lightcoral', s=[50], label='Update; False Positive'))
        ax1.legend(loc='lower right')
        ax.legend(loc='lower right')

    frame1.append(ax1.axvline(x=0, c='black'))
    frame1.append(ax1.axhline(y=0, c='black'))

    frame1.append(ax1.arrow(0, 0, w[0], w[1], length_includes_head=True, head_width=0.1))
    artists.append(frame1+frame)

    n_iter += 1

    for i in range(n):
        if sign(w@X[i].T) != y[i]:  # if mistake
            # frame showing the misclassified point that triggers the update
            frame = []
            frame.append(ax.text(0.5, 1.05, f'({n_iter}) Arbitrary mistake on [{X[i][0]:.2f} {X[i][1]:.2f}] ' + ['(False Positive)', '(False Negative)'][int(y[i]==1)], transform=ax.transAxes, ha="center"))
            frame.append(ax.axvline(x=0, c='black'))
            frame.append(ax.axhline(y=0, c='black'))
            frame.append(ax.arrow(0, 0, w[0], w[1], length_includes_head=True, head_width=0.1))
            if y[i] == 1:
                frame.append(ax.scatter([X[i][0]], [X[i][1]], color='cornflowerblue', s=[50]))
            else:
                frame.append(ax.scatter([X[i][0]], [X[i][1]], color='lightcoral', s=[50]))
            if np.linalg.norm(w):
                frame.append(ax.axline([0, 0], slope=perp(w), c='black'))
            artists.append(frame+frame1)

            # frame showing the update w <- w + y_i * x_i as vector addition
            frame = []
            frame.append(ax.text(0.5, 1.05, f'({n_iter}) Update - w = [{w[0]:.2f} {w[1]:.2f}] + ({y[i]}) * [{X[i][0]:.2f} {X[i][1]:.2f}]', transform=ax.transAxes, ha="center"))
            frame.append(ax.axvline(x=0, c='black'))
            frame.append(ax.axhline(y=0, c='black'))
            frame.append(ax.arrow(0, 0, w[0], w[1], length_includes_head=True, head_width=0.1))
            if y[i] == 1:
                frame.append(ax.arrow(w[0], w[1], y[i]*X[i][0], y[i]*X[i][1], length_includes_head=True, head_width=0.1, color='cornflowerblue'))
                frame.append(ax.arrow(0, 0, y[i]*X[i][0], y[i]*X[i][1], length_includes_head=True, head_width=0.1, color='cornflowerblue', linestyle='--'))
                frame.append(ax.scatter([X[i][0]], [X[i][1]], color='cornflowerblue', s=[50]))
            else:
                frame.append(ax.arrow(w[0], w[1], y[i]*X[i][0], y[i]*X[i][1], length_includes_head=True, head_width=0.1, color='lightcoral'))
                frame.append(ax.arrow(0, 0, y[i]*X[i][0], y[i]*X[i][1], length_includes_head=True, head_width=0.1, color='lightcoral', linestyle='--'))
                frame.append(ax.scatter([X[i][0]], [X[i][1]], color='lightcoral', s=[50]))
            if np.linalg.norm(w):
                frame.append(ax.axline([0, 0], slope=perp(w), c='black'))
            artists.append(frame+frame1)

            # update w
            w = w + X[i]*y[i]

            # frame showing the updated w
            frame = []
            frame.append(ax.text(0.5, 1.05, f'({n_iter}) Updated w = [{w[0]:.2f} {w[1]:.2f}]', transform=ax.transAxes, ha="center"))
            frame.append(ax.axvline(x=0, c='black'))
            frame.append(ax.axhline(y=0, c='black'))
            frame.append(ax.arrow(0, 0, w[0], w[1], length_includes_head=True, head_width=0.1))
            if y[i] == 1:
                frame.append(ax.scatter([X[i][0]], [X[i][1]], color='cornflowerblue', s=[50]))
            else:
                frame.append(ax.scatter([X[i][0]], [X[i][1]], color='lightcoral', s=[50]))
            if np.linalg.norm(w):
                frame.append(ax.axline([0, 0], slope=perp(w), c='black'))
            artists.append(frame+frame1)

            break
    else:
        # if no mistakes, perceptron has converged
        converged = True

plt.close()

anim = animation.ArtistAnimation(fig, artists, interval=500, repeat=False, blit=False);

display(HTML(anim.to_jshtml()))

Markdown(r"Perceptron converges to $$\mathbf{w} =\begin{pmatrix}%.2f\\%.2f\end{pmatrix}$$ after %i updates."%(w[0], w[1], n_iter))
Perceptron converges to \mathbf{w} =\begin{pmatrix}7.06\\10.34\end{pmatrix} after 9 updates.
Logistic Regression
We model the prediction probability as an increasing function of the signed distance of a point from the decision boundary. With this formulation, we use MLE to learn the best decision boundary for a given dataset.
P(y=1|\mathbf{x}) = σ(\mathbf{w}^T\mathbf{x}) = \frac{1}{1+e^{-\mathbf{w}^T\mathbf{x}}}
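For instance, a point exactly on the boundary has \mathbf{w}^T\mathbf{x} = 0 and therefore P(y=1|\mathbf{x}) = σ(0) = 0.5, while for \mathbf{w} = [1, 1]/\sqrt{2} a point at \mathbf{x} = [1, 1] has \mathbf{w}^T\mathbf{x} = \sqrt{2} \approx 1.41 and P(y=1|\mathbf{x}) = σ(1.41) \approx 0.80.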
w = np.array([1, 1])/np.sqrt(2)
y = np.array([int(w@x.T >= 0) for x in X])  # relabel y as 0/1 for logistic regression

sigmoid = lambda w, x: 1/(1+np.exp(-1*w@x.T))
y_proba = np.array([sigmoid(w, x) for x in X])

plt.scatter(X[:, 0], X[:, 1], c=y_proba, cmap='bwr')
plt.arrow(0, 0, w[0], w[1], length_includes_head=True, head_width=0.1)
plt.axline([0, 0], slope=perp(w), c='black')

plt.axvline(x=0, c='black')
plt.axhline(y=0, c='black')
plt.show()
Goal: Minimize the error (negative log-likelihood) function below:
\min_{\mathbf{w}} -\ln L(\mathbf{w}) = \min_{\mathbf{w}} -\sum_{i=1}^{n} \left[ y_i \ln (σ(\mathbf{w}^T\mathbf{x}_i)) + (1-y_i) \ln (1-σ(\mathbf{w}^T\mathbf{x}_i)) \right]
The gradient of the negative log-likelihood is given as follows:
\nabla \left(-\ln L(\mathbf{w})\right) = \sum_{i=1}^{n} \left(σ(\mathbf{w}^T\mathbf{x}_i) - y_i \right) \mathbf{x}_i
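This follows from the chain rule together with σ'(z) = σ(z)(1-σ(z)); for a single example,
\frac{\partial}{\partial \mathbf{w}} \left[ y_i \ln σ(\mathbf{w}^T\mathbf{x}_i) + (1-y_i) \ln (1-σ(\mathbf{w}^T\mathbf{x}_i)) \right] = \left( y_i(1-σ(\mathbf{w}^T\mathbf{x}_i)) - (1-y_i)σ(\mathbf{w}^T\mathbf{x}_i) \right)\mathbf{x}_i = \left( y_i - σ(\mathbf{w}^T\mathbf{x}_i) \right)\mathbf{x}_i,
and summing over i and negating gives the expression above.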
We perform gradient descent (till convergence) as follows: \mathbf{w}^{t+1} = \mathbf{w}^t - \eta \nabla \left(-\ln L(\mathbf{w}^t)\right)
w = np.array([0, 0])
step = 0.001

def grad(w):
    y_proba = np.array([sigmoid(w, x) for x in X])
    return X.T@(y_proba-y)

def error(w):
    l = 0
    for i in range(n):
        l += y[i]*np.log(sigmoid(w, X[i])) + (1-y[i])*np.log(1-sigmoid(w, X[i]))
    return -l

w_next = w - grad(w) * step

n_iter = 0
results = r"Iteration Number $i$ | $\mathbf{w}$ | Error"+"\n--|--|--\n"

while np.linalg.norm(w-w_next)>0.001 and n_iter<41:
    if not n_iter%10 or n_iter <= 3:
        results += str(n_iter)+r" | $$\begin{pmatrix}{" + str(round(w[0], 2)) + r",\quad}{" + str(round(w[1], 2)) + r"}\end{pmatrix}$$ | " + f"{error(w):.2f}\n"

    w = w_next
    y_proba = np.array([sigmoid(w, x) for x in X])
    grad_step = grad(w_next) * step

    w_next = w_next - grad_step
    n_iter += 1
Markdown(results)
Iteration Number i | \mathbf{w} | Error |
---|---|---|
0 | \begin{pmatrix}{0,\quad}{0}\end{pmatrix} | 59.61 |
1 | \begin{pmatrix}{0.09,\quad}{0.09}\end{pmatrix} | 44.56 |
2 | \begin{pmatrix}{0.16,\quad}{0.16}\end{pmatrix} | 35.93 |
3 | \begin{pmatrix}{0.21,\quad}{0.21}\end{pmatrix} | 30.52 |
10 | \begin{pmatrix}{0.42,\quad}{0.42}\end{pmatrix} | 17.04 |
20 | \begin{pmatrix}{0.57,\quad}{0.57}\end{pmatrix} | 11.83 |
30 | \begin{pmatrix}{0.67,\quad}{0.67}\end{pmatrix} | 9.49 |
40 | \begin{pmatrix}{0.74,\quad}{0.74}\end{pmatrix} | 8.09 |
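As an aside, the per-example loop in error(w) can also be written in vectorized form; a minimal sketch (error_vec is an illustrative name, reusing the sigmoid, X and y defined above):

def error_vec(w):
    p = sigmoid(w, X)  # w@X.T broadcasts, giving all n probabilities at once
    return -np.sum(y*np.log(p) + (1-y)*np.log(1-p))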
It's worth noting that logistic regression can exhibit severe over-fitting on datasets that are linearly separable. This happens because \mathbf{w} can be made arbitrarily large along the separating direction, driving the error towards zero. This phenomenon can be observed above, since the dataset in question is linearly separable.
perceptron_w = np.array([7.06, 10.34])

error(perceptron_w/3)
# Scaling along this direction also yields -inf error
1.3071990315434519
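To see the degeneracy concretely, we can evaluate the unregularized error at increasing lengths along this same separating direction (u and c below are illustrative names; the scales are kept modest because a very large ||\mathbf{w}|| saturates the sigmoid numerically, which is the "-inf" noted in the comment above):

u = perceptron_w / np.linalg.norm(perceptron_w)  # unit vector along the perceptron's direction
for c in [1, 2, 4]:                              # growing ||w|| along a fixed separating direction
    print(f'||w|| = {c}: error = {error(c*u):.4f}')

The error keeps decreasing as ||\mathbf{w}|| grows, so the unregularized MLE never settles on a finite \mathbf{w}.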
One can get around this degeneracy by introducing a regularization term (a quantity that grows with ||\mathbf{w}||) into the error function.
We illustrate this on our example again, using the following modified loss and gradient:
\begin{aligned} \min_{\mathbf{w}} -\ln L(\mathbf{w}) &= \min_{\mathbf{w}} -\sum_{i=1}^{n} \left[ y_i \ln (σ(\mathbf{w}^T\mathbf{x}_i)) + (1-y_i) \ln (1-σ(\mathbf{w}^T\mathbf{x}_i)) \right] + \frac{λ}{2}||\mathbf{w}||^2_2 \\ \nabla\left(-\ln L(\mathbf{w})\right) &= \sum_{i=1}^{n} \left(σ(\mathbf{w}^T\mathbf{x}_i)- y_i \right) \mathbf{x}_i + \lambda \mathbf{w} \end{aligned}
w = np.array([0, 0])
step = 0.001
lmda = 200

def reg_grad(w):
    y_proba = np.array([sigmoid(w, x) for x in X])
    return X.T@(y_proba-y) + lmda*w

def reg_error(w):
    l = 0
    for i in range(n):
        l += y[i]*np.log(sigmoid(w, X[i])) + (1-y[i])*np.log(1-sigmoid(w, X[i]))
    return -l + (lmda/2)*w@w.T

w_next = w - reg_grad(w) * step

n_iter = 0
results = r"Iteration Number $i$ | $\mathbf{w}$ | Regularized Error"+"\n--|--|--\n"

while np.linalg.norm(w-w_next)>0.001 and n_iter<101:
    if not n_iter%10 or n_iter <= 3:
        results += str(n_iter)+r" | $$\begin{pmatrix}{" + str(round(w[0], 2)) + r",\quad}{" + str(round(w[1], 2)) + r"}\end{pmatrix}$$ | " + f"{reg_error(w):.2f}\n"

    w = w_next
    y_proba = np.array([sigmoid(w, x) for x in X])
    reg_grad_step = reg_grad(w_next) * step

    w_next = w_next - reg_grad_step
    n_iter += 1
else:
    # append the final iterate once the loop exits
    results += str(n_iter)+r" | $$\begin{pmatrix}{" + str(round(w[0], 2)) + r",\quad}{" + str(round(w[1], 2)) + r"}\end{pmatrix}$$ | " + f"{reg_error(w):.2f}\n"
Markdown(results)
Iteration Number i | \mathbf{w} | Regularized Error |
---|---|---|
0 | \begin{pmatrix}{0,\quad}{0}\end{pmatrix} | 59.61 |
1 | \begin{pmatrix}{0.09,\quad}{0.09}\end{pmatrix} | 46.28 |
2 | \begin{pmatrix}{0.14,\quad}{0.14}\end{pmatrix} | 42.17 |
3 | \begin{pmatrix}{0.17,\quad}{0.17}\end{pmatrix} | 40.77 |
10 | \begin{pmatrix}{0.21,\quad}{0.21}\end{pmatrix} | 39.94 |