Implementing a COVID-19 Prediction Model with PyTorch: From Data Loading to Model Evaluation
This guide walks through the implementation of a neural network-based regression model for predicting COVID-19 cases using PyTorch. We cover custom dataset creation, model architecture design, training loops with validation, and inference export.
Prerequisites
First, import the required libraries for data manipulation, neural network construction, and visualization.
import csv
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
Custom Dataset Implementation
We define a custom dataset class to handle CSV data ingestion, automatic train/validation splitting, and feature normalization. The dataset uses a modulo-based indexing strategy to separate training and validation data without shuffling.
class ViralLoadDataset(Dataset):
    """CSV-backed dataset for the COVID regression task.

    Rows are partitioned deterministically: every fifth row (``index % 5 == 0``)
    forms the validation fold, the rest form the training fold, and
    ``split='test'`` keeps all rows and expects a CSV without the target
    (last) column.  Features are Z-score normalized column-wise.
    """

    def __init__(self, filepath, split='train'):
        table = pd.read_csv(filepath).values
        row_ids = np.arange(len(table))
        # Deterministic 80/20 partition via modulo; any non-train/val split
        # name falls through to "keep every row".
        if split == 'train':
            chosen = row_ids % 5 != 0
        elif split == 'val':
            chosen = row_ids % 5 == 0
        else:
            chosen = slice(None)
        if split == 'test':
            # Test CSV has no target column, so drop only the leading id column.
            self.features = torch.from_numpy(table[:, 1:].astype(np.float32))
            self.labels = None
        else:
            self.features = torch.from_numpy(table[chosen, 1:-1].astype(np.float32))
            self.labels = torch.from_numpy(table[chosen, -1].astype(np.float32))
        # Z-score normalization per feature column; epsilon guards a zero std.
        # NOTE(review): statistics come from this split's own rows — the test
        # set is normalized with its own mean/std, not the training stats.
        col_mean = self.features.mean(dim=0, keepdim=True)
        col_std = self.features.std(dim=0, keepdim=True)
        self.features = (self.features - col_mean) / (col_std + 1e-8)
        self.split = split

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        sample = self.features[index].float()
        if self.split == 'test':
            return sample
        return sample, self.labels[index].float()
Neural Network Architecture
We implement a feedforward neural network with a single hidden layer and dropout regularization. The output is squeezed to match the one-dimensional target tensor shape required by the loss function.
class RegressionNet(nn.Module):
    """Feedforward regressor: one hidden layer with ReLU and dropout.

    The final linear layer emits one value per sample; ``forward`` squeezes
    the trailing singleton dimension so the output shape is ``[batch_size]``,
    matching the 1-D target tensors used with ``nn.MSELoss``.
    """

    def __init__(self, input_dim, hidden_dim=100):
        super(RegressionNet, self).__init__()
        # Built as a list first, then unpacked into Sequential.  The attribute
        # name 'layer_stack' is kept so state_dict keys stay compatible.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),  # light regularization against overfitting
            nn.Linear(hidden_dim, 1),
        ]
        self.layer_stack = nn.Sequential(*layers)

    def forward(self, input_batch):
        out = self.layer_stack(input_batch)
        # Drop the trailing [.., 1] dimension to get a 1-D prediction vector.
        return out.squeeze(dim=-1)
Training and Validation Loop
The training function manages both the training phase (with gradient computation) and validation phase (without gradient tracking). It implements early stopping via model checkpointing based on validation loss.
def train_and_validate(model, train_loader, val_loader, device, hyperparams, checkpoint_path):
    """Train `model` with Adam/MSE, checkpointing on best validation loss.

    Args:
        model: nn.Module producing 1-D predictions matching the targets.
        train_loader / val_loader: DataLoaders yielding (inputs, targets).
        device: torch.device to run on.
        hyperparams: dict with 'epochs' and the learning rate under either
            'lr' or 'learning_rate'.
        checkpoint_path: where the best (lowest val loss) weights are saved.

    Returns:
        dict with per-epoch 'train_losses' and 'val_losses'.
    """
    model = model.to(device)
    # Bug fix: this function previously required hyperparams['lr'], but the
    # __main__ config stores the value under 'learning_rate', which raised
    # KeyError at runtime.  Accept either key.
    lr = hyperparams['lr'] if 'lr' in hyperparams else hyperparams['learning_rate']
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()
    # Robustness: ensure the checkpoint directory exists before torch.save.
    ckpt_dir = os.path.dirname(checkpoint_path)
    if ckpt_dir:
        os.makedirs(ckpt_dir, exist_ok=True)
    metrics = {'train_losses': [], 'val_losses': []}
    lowest_val_loss = float('inf')
    for epoch in range(hyperparams['epochs']):
        start_ts = time.time()
        # --- Training phase (gradients enabled) ---
        model.train()
        cumulative_train_loss = 0.0
        for batch_inputs, batch_targets in train_loader:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)
            predictions = model(batch_inputs)
            batch_loss = loss_fn(predictions, batch_targets)
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch average is per-sample even
            # when the last batch is smaller.
            cumulative_train_loss += batch_loss.item() * batch_inputs.size(0)
        avg_train_loss = cumulative_train_loss / len(train_loader.dataset)
        metrics['train_losses'].append(avg_train_loss)
        # --- Validation phase (no gradient tracking) ---
        model.eval()
        cumulative_val_loss = 0.0
        with torch.no_grad():
            for val_inputs, val_targets in val_loader:
                val_inputs = val_inputs.to(device)
                val_targets = val_targets.to(device)
                val_preds = model(val_inputs)
                val_loss = loss_fn(val_preds, val_targets)
                cumulative_val_loss += val_loss.item() * val_inputs.size(0)
        avg_val_loss = cumulative_val_loss / len(val_loader.dataset)
        metrics['val_losses'].append(avg_val_loss)
        # Keep only the weights with the best validation performance.
        if avg_val_loss < lowest_val_loss:
            torch.save(model.state_dict(), checkpoint_path)
            lowest_val_loss = avg_val_loss
        elapsed = time.time() - start_ts
        print(f"[{epoch+1:03d}/{hyperparams['epochs']:03d}] {elapsed:.2f}s - "
              f"Train Loss: {avg_train_loss:.6f}, Val Loss: {avg_val_loss:.6f}")
    # Plot training history (blocks until the window is closed when running
    # with an interactive matplotlib backend).
    plt.figure(figsize=(8, 5))
    plt.plot(metrics['train_losses'], label='Training Loss', linewidth=2)
    plt.plot(metrics['val_losses'], label='Validation Loss', linewidth=2)
    plt.xlabel('Epoch')
    plt.ylabel('Mean Squared Error')
    plt.title('Training Convergence')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    return metrics
Inference and Result Export
After training completes, we load the optimal model weights and generate predictions for the test dataset, exporting the results to a CSV file for downstream analysis.
def predict_and_export(model_class, checkpoint_path, test_loader, device, output_csv, input_dim):
    """Load the best checkpoint, run inference, and export predictions to CSV.

    Args:
        model_class: callable returning a model given `input_dim`.
        checkpoint_path: state_dict file saved during training.
        test_loader: iterable yielding feature tensors (or (features, ...) tuples).
        device: torch.device used for inference.
        output_csv: destination CSV path ('id', 'test_positive' columns).
        input_dim: number of input features for model construction.

    Returns:
        list of float predictions, one per test sample, in loader order.
    """
    # Initialize a fresh model instance and load the best weights.
    # Bug fix: map_location=device lets a checkpoint saved on a CUDA machine
    # load on a CPU-only host (and vice versa) instead of raising.
    inference_model = model_class(input_dim).to(device)
    inference_model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    inference_model.eval()
    results = []
    with torch.no_grad():
        for test_batch in test_loader:
            # Handle both single-tensor and tuple/list loader returns.
            test_features = test_batch[0] if isinstance(test_batch, (tuple, list)) else test_batch
            test_features = test_features.to(device)
            batch_preds = inference_model(test_features)
            results.extend(batch_preds.cpu().numpy().tolist())
    # Robustness: create the output directory if it does not exist yet.
    out_dir = os.path.dirname(output_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Write predictions in submission format.
    with open(output_csv, 'w', newline='') as file_handle:
        csv_writer = csv.writer(file_handle)
        csv_writer.writerow(['id', 'test_positive'])
        for idx, prediction in enumerate(results):
            csv_writer.writerow([str(idx), str(prediction)])
    print(f"Inference complete. Results written to: {output_csv}")
    return results
Execution Pipeline
The main execution block orchestrates the entire workflow: hardware detection, dataset instantiation, model initialization, training, and final prediction generation.
if __name__ == "__main__":
    # Hardware configuration: prefer GPU when available.
    compute_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Hyperparameter configuration.
    # Bug fix: train_and_validate reads the learning rate under the key
    # 'lr'; the previous key name 'learning_rate' raised KeyError at
    # training time.
    config = {
        'lr': 0.001,
        'epochs': 20,
        'batch_size': 16,
        'model_checkpoint': 'artifacts/best_model.pth',
        'predictions_file': 'output/submission.csv'
    }
    # Robustness: create the output directories up front so checkpoint
    # saving and CSV export cannot fail on a missing path.
    for out_path in (config['model_checkpoint'], config['predictions_file']):
        parent = os.path.dirname(out_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
    # Dataset instantiation: train/val share one CSV, split deterministically.
    train_set = ViralLoadDataset('data/covid_train.csv', split='train')
    val_set = ViralLoadDataset('data/covid_train.csv', split='val')
    test_set = ViralLoadDataset('data/covid_test.csv', split='test')
    # DataLoader creation: shuffle only the training data.
    train_batches = DataLoader(train_set, batch_size=config['batch_size'], shuffle=True)
    val_batches = DataLoader(val_set, batch_size=config['batch_size'], shuffle=False)
    test_batches = DataLoader(test_set, batch_size=1, shuffle=False)
    # Model initialization sized from the actual feature count.
    feature_count = train_set.features.shape[1]
    predictor = RegressionNet(feature_count)
    # Training execution.
    training_history = train_and_validate(
        predictor, train_batches, val_batches,
        compute_device, config, config['model_checkpoint']
    )
    # Inference execution with the best checkpointed weights.
    final_predictions = predict_and_export(
        RegressionNet, config['model_checkpoint'],
        test_batches, compute_device,
        config['predictions_file'], feature_count
    )
Key Implementation Details
Data Normalization: The dataset applies Z-score normalization using (x - mean) / std along the feature dimension. This prevents features with larger scales from dominating the learning process.
Train/Validation Split: Rather than random splitting, the implementation uses a deterministic modulo operation (index % 5) to ensure reproducible data partitioning without requiring random seeds.
Dimension Handling: The model's forward method includes a squeeze operation to ensure the output shape matches the target tensor shape, preventing broadcasting errors during loss calculation.
Model Checkpointing: The training loop saves model weights only when validation loss improves, ensuring the final model represents the optimal balance between bias and variance rather than the last epoch's potentially overfitted state.