Fading Coder

One Final Commit for the Last Sprint

Home > Tools > Content

Tracking Training Progress: Plotting Loss and Accuracy Curves in PyTorch

Tools May 15 1

Visualizing training and testing metrics through Loss and Accuracy curves provides immediate insight into whether your model is learning effectively.

| Metric | Description |
| --- | --- |
| Loss Curve | Represents the model's error during training and evaluation. Lower values indicate better performance. |
| Accuracy Curve | Represents the proportion of correct predictions. Higher values indicate better performance. |

What Can These Curves Reveal?

1. Is the Model Converging?

  • Is the Loss steadily decreasing?
  • Is the Accuracy steadily increasing?

If Loss remains high throughout training, the model may be failing to learn, indicating issues with network architecture or learning rate.

2. Is the Model Overfitting?

  • If Training Accuracy is high while Testing Accuracy remains low, the model has memorized the training data without developing generalization capabilities (overfitting).

3. Is the Model Underfitting?

  • If both Training and Testing Accuracy are low, the model has not learned the underlying patterns. This may indicate an overly simple architecture or insufficient training epochs.

image_classifier.py

import torch
import torch.nn as nn


class ImageClassifier(nn.Module):
    """Fully connected feed-forward network for image classification.

    Architecture: Flatten -> Linear(input_dim, hidden_dim) -> ReLU
    -> Linear(hidden_dim, 64) -> ReLU -> Linear(64, output_dim).
    The forward pass returns raw class logits (no softmax applied).
    """

    def __init__(self, input_dim: int = 784, hidden_dim: int = 128,
                 output_dim: int = 10) -> None:
        super().__init__()
        # Collapse (N, C, H, W) images into (N, C*H*W) vectors.
        self.flatten = nn.Flatten()
        hidden_blocks = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 64),
            nn.ReLU(),
        ]
        self.feature_layers = nn.Sequential(*hidden_blocks)
        self.output_layer = nn.Linear(64, output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map a batch of images to per-class logits."""
        return self.output_layer(self.feature_layers(self.flatten(x)))

training_monitor.py

import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
from image_classifier import ImageClassifier


# Prepare FashionMNIST dataset: 28x28 grayscale clothing images,
# 60k training samples and 10k test samples, labels in 0-9.
transform = transforms.ToTensor()  # converts PIL images to tensors scaled to [0, 1]

train_dataset = datasets.FashionMNIST(
    root="./data",  # downloaded on first run, cached afterwards
    train=True,
    download=True,
    transform=transform
)

test_dataset = datasets.FashionMNIST(
    root="./data",
    train=False,
    download=True,
    transform=transform
)

# Shuffle only the training data; evaluation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


# Initialize model, loss function, and optimizer
# Prefer GPU when available; all batches are moved to this device later.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
classifier = ImageClassifier().to(device)

# CrossEntropyLoss expects raw logits (no softmax in the model).
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(classifier.parameters(), lr=0.01)


def evaluate_model(data_loader: DataLoader, model: nn.Module,
                   loss_fn: nn.Module, device=None) -> tuple:
    """Evaluate model performance on the given dataset.

    Runs the model in eval mode with gradients disabled and aggregates
    loss and accuracy across all batches.

    Args:
        data_loader: Batches of (inputs, targets) to evaluate on.
        model: The network to evaluate; switched to eval mode.
        loss_fn: Criterion producing a scalar loss per batch.
        device: Optional device for the batches. Defaults to the device
            of the model's own parameters, so the function no longer
            depends on a module-level ``device`` global.

    Returns:
        tuple: (average per-batch loss, accuracy in [0, 1]).
    """
    if device is None:
        # Infer the device from the model instead of a global.
        device = next(model.parameters()).device

    dataset_size = len(data_loader.dataset)
    num_batches = len(data_loader)

    model.eval()
    total_loss, correct = 0.0, 0

    with torch.no_grad():
        for batch_x, batch_y in data_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            predictions = model(batch_x)
            batch_loss = loss_fn(predictions, batch_y)

            total_loss += batch_loss.item()
            correct += (predictions.argmax(dim=1) == batch_y).sum().item()

    # Guard against an empty loader to avoid ZeroDivisionError.
    avg_loss = total_loss / max(num_batches, 1)
    accuracy = correct / max(dataset_size, 1)
    return avg_loss, accuracy


def train_model(data_loader: DataLoader, model: nn.Module,
                loss_fn: nn.Module, optimizer: optim.Optimizer,
                device=None) -> tuple:
    """Train model for one epoch and return metrics.

    Runs one full pass over ``data_loader`` in train mode, performing a
    backward pass and optimizer step per batch.

    Args:
        data_loader: Batches of (inputs, targets) to train on.
        model: The network to train; switched to train mode.
        loss_fn: Criterion producing a scalar loss per batch.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Optional device for the batches. Defaults to the device
            of the model's own parameters, so the function no longer
            depends on a module-level ``device`` global.

    Returns:
        tuple: (average per-batch loss, accuracy in [0, 1]).

    Note:
        Accuracy is computed from predictions made *before* each weight
        update, so it slightly lags the post-epoch model.
    """
    if device is None:
        # Infer the device from the model instead of a global.
        device = next(model.parameters()).device

    dataset_size = len(data_loader.dataset)
    num_batches = len(data_loader)

    model.train()
    total_loss, correct = 0.0, 0

    for batch_x, batch_y in data_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        predictions = model(batch_x)
        loss = loss_fn(predictions, batch_y)

        # Standard step: clear stale gradients, backprop, update weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        correct += (predictions.argmax(dim=1) == batch_y).sum().item()

    # Guard against an empty loader to avoid ZeroDivisionError.
    avg_loss = total_loss / max(num_batches, 1)
    accuracy = correct / max(dataset_size, 1)
    return avg_loss, accuracy


# Collect metrics across training epochs; one entry per epoch,
# consumed by the plotting code below.
training_losses, training_accuracies = [], []
validation_losses, validation_accuracies = [], []


# Training loop
num_epochs = 30
print(f"Starting training for {num_epochs} epochs...\n")

for epoch_idx in range(num_epochs):
    epoch_num = epoch_idx + 1  # 1-based epoch number for display
    
    # Train for one epoch, then measure generalization on the test set.
    # Note: the test set doubles as the validation set in this article.
    train_loss, train_acc = train_model(
        train_loader, classifier, criterion, optimizer
    )
    val_loss, val_acc = evaluate_model(
        test_loader, classifier, criterion
    )
    
    # Store metrics for plotting
    training_losses.append(train_loss)
    training_accuracies.append(train_acc)
    validation_losses.append(val_loss)
    validation_accuracies.append(val_acc)
    
    print(f"Epoch {epoch_num:2d}/{num_epochs}")
    print(f"  Training   - Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}")
    print(f"  Validation - Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}")
    print("-" * 55)


# Generate visualization: loss and accuracy curves side by side.
plt.figure(figsize=(12, 5))

# X axis in 1-based epochs to match the printed log above.
epoch_range = range(1, num_epochs + 1)

# Loss comparison subplot (left panel)
plt.subplot(1, 2, 1)
plt.plot(epoch_range, training_losses, marker='o', label='Training Loss', linewidth=2)
plt.plot(epoch_range, validation_losses, marker='s', label='Validation Loss', linewidth=2)
plt.xlabel('Epoch', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.title('Loss Progression During Training', fontsize=14, fontweight='bold')
plt.legend(fontsize=10)
plt.grid(True, alpha=0.3)

# Accuracy comparison subplot (right panel)
plt.subplot(1, 2, 2)
plt.plot(epoch_range, training_accuracies, marker='o', label='Training Accuracy', linewidth=2)
plt.plot(epoch_range, validation_accuracies, marker='s', label='Validation Accuracy', linewidth=2)
plt.xlabel('Epoch', fontsize=12)
plt.ylabel('Accuracy', fontsize=12)
plt.title('Accuracy Progression During Training', fontsize=14, fontweight='bold')
plt.legend(fontsize=10)
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

After 10 training epochs, the curves show steady improvement. However, extending training to 30 epochs reveals a critical pattern: around epoch 25, validation accuracy plateaus, and beyond epoch 28, validation accuracy begins to decline while training accuracy continues to rise. This divergence is a clear indicator of overfitting—the model is becoming increasingly specialized to the training data at the expense of generalization ability.

To mitigate overfitting, consider implementing techniques such as dropout layers, L2 regularization, early stopping, or collecting more training data. Monitoring these curves during training enables timely intervention before overfitting becomes severe.

Related Articles

Efficient Usage of HTTP Client in IntelliJ IDEA

IntelliJ IDEA incorporates a versatile HTTP client tool, enabling developers to interact with RESTful services and APIs effectively within the editor. This functionality streamlines workflows, replac...

Installing CocoaPods on macOS Catalina (10.15) Using a User-Managed Ruby

System Ruby on macOS 10.15 frequently fails to build native gems required by CocoaPods (for example, ffi), leading to errors like: ERROR: Failed to build gem native extension checking for ffi.h... no...

Resolve PhpStorm "Interpreter is not specified or invalid" on WAMP (Windows)

Symptom PhpStorm displays: "Interpreter is not specified or invalid. Press ‘Fix’ to edit your project configuration." This occurs when the IDE cannot locate a valid PHP CLI executable or when the debu...

Leave a Comment

Anonymous

◎Feel free to join the discussion and share your thoughts.