Support Vector Machine Implementation via Sequential Minimal Optimization
Kernel Function Definitions
The foundation of non-linear classification relies on mapping input data into higher-dimensional spaces through kernel transformations. The following module defines three common kernels using a functional factory pattern.
import numpy as np
def build_linear_kernel():
    """Factory for the standard linear kernel: K(a, b) = <a, b>."""
    return lambda vec_a, vec_b: np.dot(vec_a, vec_b)
def build_polynomial_kernel(degree=3, intercept=1.0):
    """Factory for a polynomial kernel: K(a, b) = (<a, b> + intercept) ** degree."""
    def compute(vec_a, vec_b):
        shifted = np.dot(vec_a, vec_b) + intercept
        return shifted ** degree
    return compute
def build_gaussian_kernel(sigma=1.0):
    """Factory for an RBF kernel: K(a, b) = exp(-||a - b||^2 / (2 * sigma^2))."""
    # Hoist the denominator out of the closure; it never changes per kernel.
    two_sigma_sq = 2.0 * sigma ** 2

    def compute(vec_a, vec_b):
        delta = np.asarray(vec_a) - np.asarray(vec_b)
        # A 2-D first argument broadcasts against a single second vector,
        # yielding one squared distance per row.
        if delta.ndim > 1:
            sq_dist = np.sum(delta ** 2, axis=1)
        else:
            sq_dist = np.dot(delta, delta)
        return np.exp(-sq_dist / two_sigma_sq)
    return compute
SMO Optimization Engine
The core solver applies the Sequential Minimal Optimization strategy to solve the dual quadratic programming problem. It iteratively identifies pairs of Lagrange multipliers that violate the Karush-Kuhn-Tucker conditions and updates them analytically.
import copy
import random
import numpy as np
import matplotlib.pyplot as plt
from kernels import build_linear_kernel, build_polynomial_kernel, build_gaussian_kernel
class BinarySVM:
    """Soft-margin binary SVM trained with a simplified SMO solver.

    Labels must be exactly {0, 1}; they are mapped internally to {-1, +1}.
    Kernel choice is controlled by ``k_type`` ('linear', 'poly', 'rbf');
    any other value (including None) falls back to the linear kernel.
    """

    def __init__(self, penalty=1.0, sigma=1.0, poly_deg=3, poly_c=1.0,
                 k_type=None, eps=1e-3, alpha_tol=1e-5, iterations=500):
        self.penalty = penalty          # regularization strength C
        self.eps = eps                  # KKT violation tolerance
        self.alpha_tol = alpha_tol      # minimum alpha to count as a support vector
        self.max_iters = iterations
        kernel_name = (k_type or 'linear').lower()
        if kernel_name == 'poly':
            self.kernel = build_polynomial_kernel(poly_deg, poly_c)
        elif kernel_name == 'rbf':
            self.kernel = build_gaussian_kernel(sigma)
        else:
            # 'linear' or any unrecognized name -> dot-product kernel.
            self.kernel = build_linear_kernel()
        # BUG FIX: plot_boundary used to test `self.kernel == build_linear_kernel()`,
        # which is always False (each factory call returns a new closure).
        # Record the kernel type explicitly instead.
        self.is_linear = kernel_name not in ('poly', 'rbf')
        self.dual_vars = None           # Lagrange multipliers alpha_i
        self.error_cache = None         # cached f(x_i) - y_i for partner selection
        self.weights = None             # primal w, meaningful for the linear kernel only
        self.intercept = 0.0
        self.support_idx = []
        self.loss_history = []

    def _compute_outputs(self, X):
        """Decision values f(x) = sum_i alpha_i * y_i * K(x, x_i) + b over support vectors."""
        X = np.asarray(X)
        # BUG FIX: use len() — it works for both the initial Python list and the
        # post-training ndarray. `not self.support_idx` raises ValueError
        # ("truth value of an array is ambiguous") once support_idx is an
        # ndarray with more than one element.
        if len(self.support_idx) == 0:
            # No support vectors yet: the decision value is just the intercept.
            return np.zeros(X.shape[0] if X.ndim > 1 else 1) + self.intercept
        scores = np.zeros(len(X)) if X.ndim > 1 else 0.0
        for idx in range(len(self.support_idx)):
            scores += (self.support_alphas[idx] * self.support_labels[idx]
                       * self.kernel(X, self.support_X[idx]))
        return scores + self.intercept

    def _check_violation(self, x_sample, y_label, alpha_val, weight):
        """Return True when the sample satisfies the KKT conditions within eps."""
        margin = y_label * self._compute_outputs(x_sample)
        if alpha_val < self.penalty * weight:
            # alpha below its box bound: the sample must lie on/outside the margin.
            return margin >= 1 - self.eps
        # alpha at the upper bound: the sample must lie on/inside the margin.
        return margin <= 1 + self.eps

    def _select_partner(self, primary_idx):
        """Choose the second multiplier by maximizing |E_i - E_j| over active
        alphas, falling back to a random distinct index when none are active."""
        active = [j for j in range(len(self.dual_vars))
                  if self.dual_vars[j] > self.alpha_tol and j != primary_idx]
        if active:
            diffs = np.abs(self.error_cache[primary_idx] - self.error_cache[active])
            return active[np.argmax(diffs)]
        indices = list(range(len(self.dual_vars)))
        indices.remove(primary_idx)
        return random.choice(indices)

    def _bound_alpha(self, yi, yj, unc_j, old_i, old_j, weight_j):
        """Clip the unconstrained alpha_j onto the feasible segment of its box,
        preserving the equality constraint with alpha_i."""
        C_val = self.penalty * weight_j
        if yi == yj:
            low = max(0, old_i + old_j - C_val)
            high = min(C_val, old_i + old_j)
        else:
            low = max(0, old_j - old_i)
            high = min(C_val, C_val + old_j - old_i)
        return low if unc_j < low else high if unc_j > high else unc_j

    def _update_losses(self, X, true_y, mapped_y):
        """Refresh the error cache and record the mean cross-entropy loss."""
        preds = self._compute_outputs(X)
        self.error_cache = preds - mapped_y
        # Squash decision values into (0, 1); clipping guards exp() overflow.
        sig = lambda v: 1 / (1 + np.exp(-np.clip(v, -50, 30)))
        prob = sig(preds)
        entropy = -(true_y.T @ np.log(prob + 1e-12) + (1 - true_y).T @ np.log(1 - prob + 1e-12))
        self.loss_history.append(entropy / len(mapped_y))

    def train(self, X, y, sample_w=None):
        """Fit the model on X (n_samples, n_features) with labels y in {0, 1}.

        sample_w optionally scales each sample's box constraint C.
        Raises ValueError when the labels are not exactly {0, 1}.
        """
        X, y = np.asarray(X), np.asarray(y)
        if sample_w is None:
            sample_w = np.ones(len(X))
        unique = np.unique(y)
        if len(unique) != 2 or sorted(unique) != [0, 1]:
            raise ValueError("Target labels must be exactly 0 or 1.")
        mapped_y = y.copy().astype(float)
        mapped_y[mapped_y == 0] = -1.0
        self.weights = np.zeros(X.shape[1])
        self.intercept = 0.0
        self.dual_vars = np.zeros(len(X))
        self.support_X = np.empty((0, X.shape[1]))
        self.support_labels = np.empty(0)
        self.support_alphas = np.empty(0)
        self.support_idx = []
        # BUG FIX: the original called _compute_outputs(X) and discarded the
        # result, leaving error_cache as None; the first epoch then crashed on
        # self.error_cache[i]. Initialize E_i = f(x_i) - y_i explicitly.
        self.error_cache = self._compute_outputs(X) - mapped_y
        for epoch in range(self.max_iters):
            converged = True
            for i in range(len(X)):
                x_i, y_i = X[i], mapped_y[i]
                a_i_old, err_i = self.dual_vars[i], self.error_cache[i]
                if not self._check_violation(x_i, y_i, a_i_old, sample_w[i]):
                    converged = False
                    j = self._select_partner(i)
                    x_j, y_j = X[j], mapped_y[j]
                    a_j_old, err_j = self.dual_vars[j], self.error_cache[j]
                    K_ii = self.kernel(x_i, x_i)
                    K_jj = self.kernel(x_j, x_j)
                    K_ij = self.kernel(x_i, x_j)
                    # eta is the second derivative of the objective along the
                    # constraint line; a (near-)zero value gives no usable step.
                    eta = K_ii + K_jj - 2 * K_ij
                    if abs(eta) < 1e-6:
                        continue
                    unc_a_j = a_j_old - y_j * (err_j - err_i) / eta
                    a_j_new = self._bound_alpha(y_i, y_j, unc_a_j, a_i_old, a_j_old, sample_w[j])
                    if abs(a_j_new - a_j_old) < 1e-6:
                        continue  # step too small to matter
                    a_i_new = a_i_old - y_i * y_j * (a_j_new - a_j_old)
                    self.dual_vars[i] = a_i_new
                    self.dual_vars[j] = a_j_new
                    # Recompute the intercept from whichever alpha is strictly
                    # inside its box (both KKT-consistent); average otherwise.
                    b1 = self.intercept - err_i - y_i * K_ii * (a_i_new - a_i_old) - y_j * K_ij * (a_j_new - a_j_old)
                    b2 = self.intercept - err_j - y_i * K_ij * (a_i_new - a_i_old) - y_j * K_jj * (a_j_new - a_j_old)
                    if 0 < a_i_new < self.penalty * sample_w[i]:
                        self.intercept = b1
                    elif 0 < a_j_new < self.penalty * sample_w[j]:
                        self.intercept = b2
                    else:
                        self.intercept = (b1 + b2) / 2.0
            self._update_losses(X, y, mapped_y)
            sv_mask = self.dual_vars > self.alpha_tol
            self.support_idx = np.where(sv_mask)[0]
            self.support_X = X[sv_mask]
            self.support_labels = mapped_y[sv_mask]
            self.support_alphas = self.dual_vars[sv_mask]
            if self.is_linear:
                # Recover the primal weight vector w = sum_i alpha_i y_i x_i so
                # that plot_boundary can draw the hyperplane and margins.
                self.weights = (self.support_alphas * self.support_labels) @ self.support_X
            if converged:
                break

    def predict(self, X_test):
        """Return hard 0/1 class labels for X_test."""
        X_test = np.asarray(X_test)
        raw = self._compute_outputs(X_test)
        probs = 1 / (1 + np.exp(-np.clip(raw, -50, 30)))
        return (probs >= 0.5).astype(int)

    def plot_boundary(self, X, y, show=True, draw_margin=False):
        """Draw the decision region for 2-D inputs; silently no-op otherwise."""
        X, y = np.asarray(X), np.asarray(y)
        if X.shape[1] != 2:
            return
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
        Z = self.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
        if show:
            plt.figure(figsize=(8, 6))
        plt.contourf(xx, yy, Z, alpha=0.2, cmap='winter')
        plt.scatter(X[y == 1][:, 0], X[y == 1][:, 1], marker='*', label='Class 1')
        plt.scatter(X[y == 0][:, 0], X[y == 0][:, 1], marker='x', label='Class 0')
        # BUG FIX: len() instead of .size — support_idx is still a plain list
        # when the model has not been trained.
        if len(self.support_idx) > 0:
            plt.scatter(self.support_X[:, 0], self.support_X[:, 1], s=100,
                        facecolors='none', edgecolors='k', linewidths=1.5,
                        label='Support Vectors')
        # BUG FIX: compare the stored kernel-type flag, not a fresh closure;
        # also guard against a degenerate (near-vertical) hyperplane.
        if draw_margin and self.is_linear and self.weights is not None \
                and abs(self.weights[1]) > 1e-12:
            xi = np.linspace(x_min, x_max, 100)
            boundary = -(self.weights[0] * xi + self.intercept) / self.weights[1]
            margin = 1 / np.linalg.norm(self.weights)
            plt.plot(xi, boundary, 'r-', label='Hyperplane')
            plt.plot(xi, boundary + margin / np.abs(self.weights[1]), 'k--')
            plt.plot(xi, boundary - margin / np.abs(self.weights[1]), 'k--')
        plt.xlim(x_min, x_max)
        plt.ylim(y_min, y_max)
        plt.legend(frameon=False)
        if show:
            plt.show()

    def plot_convergence(self, show=True):
        """Plot the recorded per-epoch cross-entropy loss; no-op before training."""
        if not self.loss_history:
            return
        if show:
            plt.figure(figsize=(8, 4))
        plt.plot(self.loss_history, linewidth=1.2)
        plt.xlabel('Iterations')
        plt.ylabel('Mean Cross-Entropy Loss')
        plt.title('Optimization Convergence')
        plt.grid(alpha=0.3)
        if show:
            plt.show()
Model Evaluation and Visualization
The following scripts demonstrate training scenarios with varying regularization strengths and kernel configurations. The first example contrasts hard-margin and soft-margin behavior on linearly separable data, while the second applies non-linear kernels to complex distributions.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification, make_moons
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from svm_engine import BinarySVM
# Linear separability and regularization comparison.
# BUG FIX: make_classification requires n_informative + n_redundant <= n_features
# and n_classes * n_clusters_per_class <= 2 ** n_informative. With n_features=2
# and n_informative=1 the defaults (n_redundant=2, n_clusters_per_class=2)
# violate both constraints and raise ValueError, so pin them explicitly.
X_lin, y_lin = make_classification(n_samples=250, n_features=2, n_classes=2,
                                   n_informative=1, n_redundant=0,
                                   n_clusters_per_class=1,
                                   class_sep=1.8, random_state=9)
X_tr, X_te, y_tr, y_te = train_test_split(X_lin, y_lin, test_size=0.2, random_state=5)

# A very large penalty C approximates a hard-margin SVM.
model_hard = BinarySVM(penalty=1000.0)
model_hard.train(X_tr, y_tr)
print(classification_report(y_te, model_hard.predict(X_te)))

# A moderate penalty tolerates margin violations (soft margin).
model_soft = BinarySVM(penalty=1.0)
model_soft.train(X_tr, y_tr)
print(classification_report(y_te, model_soft.predict(X_te)))

plt.figure(figsize=(12, 5))
plt.subplot(121)
model_hard.plot_boundary(X_tr, y_tr, show=False, draw_margin=True)
plt.title('Hard Margin SVM')
plt.subplot(122)
model_soft.plot_boundary(X_tr, y_tr, show=False, draw_margin=True)
plt.title('Soft Margin SVM')
plt.tight_layout()
plt.show()
# Non-linear kernel evaluation on complex manifolds.
X_moon, y_moon = make_moons(n_samples=300, noise=0.08, random_state=12)
X_tr_m, X_te_m, y_tr_m, y_te_m = train_test_split(
    X_moon, y_moon, test_size=0.25, random_state=7)

# Fit each non-linear kernel in turn and report held-out metrics.
clf_rbf = BinarySVM(penalty=15.0, sigma=0.6, k_type='rbf')
clf_poly = BinarySVM(penalty=15.0, poly_deg=3, k_type='poly')
for classifier in (clf_rbf, clf_poly):
    classifier.train(X_tr_m, y_tr_m)
    print(classification_report(y_te_m, classifier.predict(X_te_m)))

# Side-by-side decision regions for the two kernels.
plt.figure(figsize=(12, 5))
plt.subplot(121)
clf_rbf.plot_boundary(X_tr_m, y_tr_m, show=False)
plt.title('RBF Kernel Decision Region')
plt.subplot(122)
clf_poly.plot_boundary(X_tr_m, y_tr_m, show=False)
plt.title('Polynomial Kernel Decision Region')
plt.tight_layout()
plt.show()