SSD Object Detection: Single Shot MultiBox Detector
Understanding Object Detection
The primary task of object detection is to locate and classify objects within images. Current mainstream object detection algorithms can be categorized into two approaches:
- Two-stage methods: Such as the RCNN family, which generate region proposals and then classify and refine these regions.
- One-stage methods: Like YOLO and SSD, which directly predict bounding boxes and class probabilities through a single neural network pass.
SSD Model Overview
SSD (Single Shot MultiBox Detector) is a one-stage object detection algorithm that employs convolutional neural networks for feature extraction. It generates detection outputs from multiple feature layers at different scales. For each feature layer used for detection, it applies 3×3 convolutions to transform channel dimensions.
SSD Network Architecture
The SSD architecture consists of five main components:
- Backbone Layer: Typically VGG16, which processes input images resized to 300×300 and extracts initial features.
- Extra Feature Layers: Four additional convolutional layers built upon VGG16 to extract higher-level semantic information.
- Detection Layers: SSD uses six prediction maps. For a feature map of size m×n with p channels, it generates k anchors per pixel. Each anchor corresponds to c classes and 4 bounding box regression offsets. This is achieved using (4+c)k convolutional kernels of size 3×3 with p channels, producing output feature maps of size m×n with (4+c)k channels.
- NMS (Non-Maximum Suppression): Applied to each class to remove overlapping bounding boxes while retaining the most confident ones.
- Anchors (PriorBoxes): Predefined bounding boxes represented by center coordinates and width/height for each pixel location.
Model Implementation
Backbone Layer
from mindspore import nn
def create_conv_block(input_channels, output_channels):
    """Return a Conv-BN-ReLU block, the basic VGG-style unit of the backbone.

    Args:
        input_channels (int): number of channels in the incoming feature map.
        output_channels (int): number of channels produced by the convolution.

    Returns:
        nn.SequentialCell: Conv2d(3x3, 'same' padding) -> BatchNorm2d -> ReLU.
    """
    return nn.SequentialCell([
        # MindSpore's Conv2d uses pad_mode='same'; its `padding` parameter
        # expects an int, so padding='same' (Keras/PyTorch style) is invalid.
        nn.Conv2d(in_channels=input_channels, out_channels=output_channels,
                  kernel_size=3, pad_mode='same'),
        nn.BatchNorm2d(output_channels),
        nn.ReLU()
    ])
class FeatureExtractor(nn.Cell):
    """VGG16-style feature extraction backbone for SSD.

    Produces two feature maps per input image:
      - the output of conv_block4 (taken before pool4), used as the first
        SSD detection layer, and
      - the deepest feature map after conv_block5 and pool5.
    """

    def __init__(self):
        super(FeatureExtractor, self).__init__()
        # Convolutional blocks (each a Conv-BN-ReLU unit).
        self.conv_block1 = create_conv_block(3, 64)
        self.conv_block2 = create_conv_block(64, 128)
        self.conv_block3 = create_conv_block(128, 256)
        self.conv_block4 = create_conv_block(256, 512)
        self.conv_block5 = create_conv_block(512, 512)
        # Pools 1-4 halve the spatial size; pool5 uses stride 1 so the
        # deepest map keeps its resolution (SSD's modification of VGG16).
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2, pad_mode='SAME')
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2, pad_mode='SAME')
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2, pad_mode='SAME')
        self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2, pad_mode='SAME')
        self.pool5 = nn.MaxPool2d(kernel_size=3, stride=1, pad_mode='SAME')

    def construct(self, x):
        # MindSpore's nn.Cell dispatches `cell(x)` to construct(); a method
        # named `forward` is never invoked by the framework.
        x = self.conv_block1(x)
        x = self.pool1(x)
        x = self.conv_block2(x)
        x = self.pool2(x)
        x = self.conv_block3(x)
        x = self.pool3(x)
        x = self.conv_block4(x)
        intermediate_feature = x  # tapped before pool4 for detection
        x = self.pool4(x)
        x = self.conv_block5(x)
        x = self.pool5(x)
        return intermediate_feature, x
SSD Detection Head
import mindspore as ms
import mindspore.nn as nn
import mindspore.ops as ops
def detection_conv(in_filters, out_filters, num_anchors, kernel_size=3):
    """Create one SSD prediction branch: Conv-BN-ReLU6 -> 1x1 conv.

    Args:
        in_filters (int): channels of the incoming feature map.
        out_filters (int): output channels — anchors * 4 for localization
            branches, anchors * num_classes for classification branches.
        num_anchors (int): kept for interface compatibility; unused here
            because `out_filters` already encodes the anchor count.
        kernel_size (int): size of the first convolution. Default: 3.

    Returns:
        nn.SequentialCell producing a 4-D NCHW prediction map.

    The branch must NOT flatten its output: FeatureFusion transposes each
    map NCHW -> NHWC before flattening so per-anchor values stay contiguous;
    a pre-flattened 2-D tensor would make that 4-D transpose fail (and a
    channel-first flatten would scramble the anchor ordering anyway).
    """
    return nn.SequentialCell([
        # MindSpore's Conv2d takes pad_mode='same'; padding expects an int.
        nn.Conv2d(in_filters, out_filters, kernel_size=kernel_size,
                  pad_mode='same'),
        nn.BatchNorm2d(out_filters),
        nn.ReLU6(),
        nn.Conv2d(out_filters, out_filters, kernel_size=1)
    ])
class FeatureFusion(nn.Cell):
    """Flatten and concatenate per-layer predictions into one tensor.

    Expects a sequence of 4-D NCHW prediction maps (one per detection
    layer). Each map is transposed to NHWC and flattened so that the values
    belonging to one spatial location / anchor stay contiguous, then all
    layers are concatenated and reshaped to (N, num_predictions, -1).
    """

    def __init__(self, num_predictions):
        super(FeatureFusion, self).__init__()
        # Total anchor count across all detection layers (8732 for SSD300).
        self.num_predictions = num_predictions

    def construct(self, feature_maps):
        # nn.Cell dispatches calls to construct(), not forward().
        batch_size = ops.shape(feature_maps[0])[0]
        flattened = []
        for fmap in feature_maps:
            # NCHW -> NHWC so per-anchor values are contiguous after reshape.
            nhwc = ops.transpose(fmap, (0, 2, 3, 1))
            flattened.append(ops.reshape(nhwc, (batch_size, -1)))
        merged = ops.concat(flattened, axis=1)
        # (N, total) -> (N, num_anchors, values_per_anchor)
        return ops.reshape(merged, (batch_size, self.num_predictions, -1))
class DetectionHead(nn.Cell):
    """SSD detection head producing box offsets and class scores.

    For each of the six input feature maps it applies a localization branch
    (anchors * 4 output channels) and a classification branch
    (anchors * num_classes output channels), then fuses all layers into
    (N, num_anchors, 4) and (N, num_anchors, num_classes) tensors.
    """

    def __init__(self, num_classes=81, num_anchors=8732):
        super(DetectionHead, self).__init__()
        self.num_classes = num_classes
        self.num_anchors = num_anchors
        # (input channels, anchors per spatial location) per detection layer.
        layer_configs = [
            (512, 4),    # layer1: backbone conv4 feature map
            (1024, 6),   # layer2: extra block 7
            (512, 6),    # layer3: extra block 8
            (256, 6),    # layer4: extra block 9
            (256, 4),    # layer5: extra block 10
            (256, 4),    # layer6: extra block 11
        ]
        # Parallel lists of location / classification prediction branches.
        self.loc_layers = nn.CellList()
        self.cls_layers = nn.CellList()
        for filters, anchors in layer_configs:
            self.loc_layers.append(
                detection_conv(filters, anchors * 4, anchors)
            )
            self.cls_layers.append(
                detection_conv(filters, anchors * num_classes, anchors)
            )
        self.feature_fusion = FeatureFusion(num_anchors)

    def construct(self, feature_maps):
        # nn.Cell dispatches calls to construct(), not forward().
        location_predictions = []
        class_predictions = []
        for i in range(len(feature_maps)):
            location_predictions.append(self.loc_layers[i](feature_maps[i]))
            class_predictions.append(self.cls_layers[i](feature_maps[i]))
        # Fuse per-layer predictions into (N, num_anchors, ...) tensors.
        final_locations = self.feature_fusion(location_predictions)
        final_classes = self.feature_fusion(class_predictions)
        return final_locations, final_classes
class SSDModel(nn.Cell):
    """Complete SSD300 model: backbone, extra feature layers, detection head.

    construct() returns (bbox_preds, class_preds), both float32; sigmoid is
    applied to the class scores only in inference mode (not self.training).
    """

    def __init__(self, num_classes=81):
        super(SSDModel, self).__init__()
        self.backbone = FeatureExtractor()
        self.detection_head = DetectionHead(num_classes)
        # The extra feature layers MUST be created here, not inside the
        # forward pass: a layer instantiated per call gets fresh random
        # weights every time and its parameters are never registered with
        # the optimizer, so it can never train.
        # Block 6: dilated 3x3 conv (explicit padding needs pad_mode='pad').
        self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, pad_mode='pad',
                               padding=6, dilation=6)
        # NOTE(review): confirm whether this MindSpore version reads the
        # Dropout argument as keep_prob (<=1.8) or drop probability p (>=2.0).
        self.dropout6 = nn.Dropout(0.5)
        # Block 7: 1x1 conv.
        self.conv7 = nn.Conv2d(1024, 1024, kernel_size=1)
        self.dropout7 = nn.Dropout(0.5)
        # Block 8: pad-1 1x1 conv, then 3x3 stride-2 valid conv (19x19 -> 10x10).
        self.conv8_1 = nn.Conv2d(1024, 256, kernel_size=1, pad_mode='pad',
                                 padding=1)
        self.conv8_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2,
                                 pad_mode='valid')
        # Block 9: 10x10 -> 5x5.
        self.conv9_1 = nn.Conv2d(512, 128, kernel_size=1, pad_mode='pad',
                                 padding=1)
        self.conv9_2 = nn.Conv2d(128, 256, kernel_size=3, stride=2,
                                 pad_mode='valid')
        # Block 10: 5x5 -> 3x3.
        self.conv10_1 = nn.Conv2d(256, 128, kernel_size=1)
        self.conv10_2 = nn.Conv2d(128, 256, kernel_size=3, pad_mode='valid')
        # Block 11: 3x3 -> 1x1.
        self.conv11_1 = nn.Conv2d(256, 128, kernel_size=1)
        self.conv11_2 = nn.Conv2d(128, 256, kernel_size=3, pad_mode='valid')

    def construct(self, input_tensor):
        # nn.Cell dispatches calls to construct(), not forward().
        intermediate_feature, deep_feature = self.backbone(input_tensor)
        # Blocks 6-7.
        x = self.dropout6(self.conv6(deep_feature))
        x = self.dropout7(self.conv7(x))
        block7_output = x
        # Blocks 8-11: progressively smaller maps for multi-scale detection.
        x = self.conv8_2(self.conv8_1(x))
        block8_output = x
        x = self.conv9_2(self.conv9_1(x))
        block9_output = x
        x = self.conv10_2(self.conv10_1(x))
        block10_output = x
        x = self.conv11_2(self.conv11_1(x))
        block11_output = x
        # Six feature maps feed the detection head.
        all_features = [
            intermediate_feature,
            block7_output,
            block8_output,
            block9_output,
            block10_output,
            block11_output
        ]
        bbox_preds, class_preds = self.detection_head(all_features)
        # Sigmoid on class scores only at inference time; raw logits are
        # kept during training for the loss computation.
        if not self.training:
            class_preds = ops.sigmoid(class_preds)
        return bbox_preds.astype(ms.float32), class_preds.astype(ms.float32)
Loss Functions
The SSD objective function combines two components: confidence loss and localization loss:
L(x,c,l,g) = (1/N)(L_conf(x,c) + α·L_loc(x,l,g))
- N: Number of positive prior boxes
- c: Predicted class confidence
- l: Predicted box location
- g: Ground truth location parameters
- α: Balancing factor (default: 1)
Localization Loss
For all positive samples, Smooth L1 Loss is used:
SmoothL1(x) = { 0.5x² if |x| < 1, |x| - 0.5 otherwise }
Confidence Loss
The confidence loss is a multi-class softmax loss:
L_conf(x,c) = -∑_{i∈Pos} x_ij^p log(ĉ_i^p) - ∑_{i∈Neg} log(ĉ_i^0)
where ĉ_i^p = exp(c_i^p) / ∑_p exp(c_i^p)
def calculate_class_loss(predictions, targets):
    """Compute the element-wise focal loss for classification.

    Args:
        predictions: raw class logits with shape (..., num_classes).
        targets: integer class indices with shape (...,).

    Returns:
        Unreduced focal loss tensor, same shape as `predictions`; callers
        are expected to mask/reduce it (e.g. over positive anchors).
    """
    # Convert targets to one-hot encoding. Bare `Tensor` was undefined in
    # this module; mindspore is imported as `ms`.
    num_classes = ops.shape(predictions)[-1]
    one_hot_targets = ops.one_hot(targets, num_classes,
                                  ms.Tensor(1.0, ms.float32),
                                  ms.Tensor(0.0, ms.float32))
    # Per-element binary cross entropy on the raw logits.
    sigmoid_cross_entropy = ops.binary_cross_entropy_with_logits(
        predictions, one_hot_targets,
        weight=ops.ones_like(predictions),
        pos_weight=ops.ones_like(predictions)
    )
    sigmoid_preds = ops.sigmoid(predictions)
    one_hot_targets = one_hot_targets.astype(ms.float32)
    # p_t is the model's probability for the true class of each element;
    # (1 - p_t)^gamma down-weights easy examples (gamma = 2).
    p_t = one_hot_targets * sigmoid_preds + (1 - one_hot_targets) * (1 - sigmoid_preds)
    modulating_factor = ops.pow(1 - p_t, 2.0)
    # Alpha balancing: 0.75 for positives, 0.25 for negatives.
    alpha_weight = one_hot_targets * 0.75 + (1 - one_hot_targets) * 0.25
    focal_loss = modulating_factor * alpha_weight * sigmoid_cross_entropy
    return focal_loss
Summary
SSD is a single-stage object detection algorithm that uses VGG16 as its backbone for feature extraction, supplemented with four additional convolutional layers to capture higher-level semantic information. Unlike two-stage methods, SSD doesn't generate bounding boxes through the network; instead, it classifies and refines a large set of pre-defined bounding boxes. By performing detection at multiple feature layers, SSD effectively detects objects of various sizes, making it a versatile solution for object detection tasks.