Architecting a Distributed Web Service Health Monitoring System
A production-grade monitoring platform requires coordination between a stateful configuration layer, a scheduling engine, and an execution runtime capable of handling diverse HTTP endpoints while maintaining audit trails and alerting channels.
System Architecture
The infrastructure stack separates concerns across persistence, caching, and transport layers:
| Layer | Technology | Function |
|---|---|---|
| API Gateway | Nginx | SSL termination, load balancing |
| Frontend | Vue.js | Configuration dashboard |
| Backend | Spring Boot 3.x | Business logic & scheduling |
| Database | PostgreSQL | Monitor definitions & history |
| Cache | Redis | Distributed locks & rate limiting |
| Messaging | RabbitMQ | Asynchronous alert dispatch |
Domain Model
The core entity represents a configurable health probe with temporal scheduling metadata:
@Entity
@Table(name = "service_probes")
@Data
@Builder
public class HealthProbe {
@Id
@GeneratedValue(strategy = GenerationType.UUID)
private UUID probeId;
private String displayName;
private String endpointUrl;
@Enumerated(EnumType.STRING)
private HttpMethod httpMethod;
@Column(columnDefinition = "jsonb")
private String headerJson;
@Column(columnDefinition = "jsonb")
private String requestBody;
@Enumerated(EnumType.STRING)
private BodyEncoding bodyEncoding;
private Integer expectedHttpStatus;
private String responseSubstring;
@Enumerated(EnumType.STRING)
private RecurrenceUnit intervalUnit;
private Integer intervalValue;
private Instant scheduledFor;
private Instant lastCheckedAt;
private Boolean isActive;
private Integer successCount;
private Integer attemptCount;
private ProbeState currentState;
private String statusDetails;
private static final int MINIMUM_INTERVAL_MINUTES = 30;
public void computeNextCheck() {
Instant baseTime = lastCheckedAt != null ? lastCheckedAt : Instant.now();
long minutesToAdd = switch (intervalUnit) {
case MINUTES -> intervalValue;
case HOURS -> intervalValue * 60L;
case DAYS -> intervalValue * 1440L;
};
minutesToAdd = Math.max(minutesToAdd, MINIMUM_INTERVAL_MINUTES);
this.scheduledFor = baseTime.plus(minutesToAdd, ChronoUnit.MINUTES);
}
}
/**
 * Unit in which a probe's recurrence interval value is expressed.
 */
public enum RecurrenceUnit {
    MINUTES,
    HOURS,
    DAYS
}
/**
 * Health classification recorded on a probe.
 */
public enum ProbeState {
    HEALTHY,
    DEGRADED,
    DOWN
}
/**
 * Encoding declared for a probe's request body.
 */
public enum BodyEncoding {
    JSON,
    FORM_URLENCODED
}
Scheduling Engine
The coordinator uses Spring's task scheduling to poll for due probes and distribute execution across worker threads:
@Component
@RequiredArgsConstructor
@Slf4j
public class ProbeScheduler {
private final ProbeRepository probeRepository;
private final ProbeExecutor probeExecutor;
@Scheduled(cron = "0 */5 * * * *")
public void enqueueDueProbes() {
List<HealthProbe> candidates = probeRepository
.findAllByIsActiveTrueAndScheduledForBefore(Instant.now());
log.info("Triggering {} health probes", candidates.size());
candidates.parallelStream().forEach(probeExecutor::runCheck);
}
}
Enable scheduling via @EnableScheduling on the application entry class.
Execution Runtime
The executor handles transport, validation, persistence, and notification logic within a fault-tolerant wrapper:
@Service
@RequiredArgsConstructor
public class ProbeExecutor {
private final RestTemplate httpClient;
private final ProbeRepository probeRepository;
private final CheckRepository checkRepository;
private final AlertService alertService;
public void runCheck(HealthProbe probe) {
long startMillis = System.currentTimeMillis();
Instant checkTime = Instant.now();
ExecutionOutcome outcome;
int httpStatus = 0;
String responseBody = "";
try {
ResponseEntity<String> exchange = executeHttpCall(probe);
httpStatus = exchange.getStatusCodeValue();
responseBody = exchange.getBody();
outcome = evaluateSuccess(probe, httpStatus, responseBody);
} catch (RestClientException rce) {
outcome = ExecutionOutcome.failure("Transport error: " + rce.getMessage());
} catch (Exception ex) {
outcome = ExecutionOutcome.failure("Unexpected: " + ex.getMessage());
}
long latency = System.currentTimeMillis() - startMillis;
updateProbeMetrics(probe, outcome, checkTime);
persistExecutionLog(probe, outcome, httpStatus, responseBody, latency, checkTime);
if (!outcome.isHealthy()) {
alertService.triggerIncident(probe, outcome);
}
}
private ResponseEntity<String> executeHttpCall(HealthProbe probe) {
HttpHeaders headers = new HttpHeaders();
if (probe.getHeaderJson() != null) {
Map<String, String> headerMap = new Gson().fromJson(
probe.getHeaderJson(), new TypeToken<Map<String, String>>(){}.getType()
);
headerMap.forEach(headers::set);
}
HttpEntity<String> entity = new HttpEntity<>(probe.getRequestBody(), headers);
return httpClient.exchange(
probe.getEndpointUrl(),
org.springframework.http.HttpMethod.valueOf(probe.getHttpMethod().name()),
entity,
String.class
);
}
private ExecutionOutcome evaluateSuccess(HealthProbe probe, int status, String body) {
if (probe.getExpectedHttpStatus() != null && status != probe.getExpectedHttpStatus()) {
return ExecutionOutcome.failure("Status mismatch. Expected: " + probe.getExpectedHttpStatus() + ", Got: " + status);
}
if (probe.getResponseSubstring() != null && (body == null || !body.contains(probe.getResponseSubstring()))) {
return ExecutionOutcome.failure("Body validation failed. Missing substring: " + probe.getResponseSubstring());
}
return ExecutionOutcome.success();
}
private void updateProbeMetrics(HealthProbe probe, ExecutionOutcome outcome, Instant checkTime) {
probe.setLastCheckedAt(checkTime);
probe.setCurrentState(outcome.isHealthy() ? ProbeState.HEALTHY : ProbeState.DOWN);
probe.setStatusDetails(outcome.getDescription());
probe.setAttemptCount(probe.getAttemptCount() + 1);
if (outcome.isHealthy()) {
probe.setSuccessCount(probe.getSuccessCount() + 1);
}
probe.computeNextCheck();
probeRepository.save(probe);
}
private void persistExecutionLog(HealthProbe probe, ExecutionOutcome outcome,
int status, String body, long latency, Instant timestamp) {
CheckLog logEntry = CheckLog.builder()
.probeId(probe.getProbeId())
.checkedAt(timestamp)
.responseTimeMs(latency)
.httpStatus(status)
.isHealthy(outcome.isHealthy())
.responseSnippet(truncate(body, 512))
.build();
checkRepository.save(logEntry);
}
private String truncate(String input, int maxLength) {
if (input == null || input.length() <= maxLength) return input;
return input.substring(0, maxLength) + "...";
}
}
/**
 * Result of evaluating one probe run: a pass/fail flag plus a human-readable
 * explanation.
 */
@Data
@AllArgsConstructor
public class ExecutionOutcome {

    /** Description attached to every successful outcome. */
    private static final String SUCCESS_DESCRIPTION = "All assertions passed";

    private boolean healthy;
    private String description;

    /**
     * Creates an unhealthy outcome.
     *
     * @param reason explanation of why the check failed
     * @return an outcome flagged as not healthy, carrying {@code reason}
     */
    public static ExecutionOutcome failure(String reason) {
        return new ExecutionOutcome(false, reason);
    }

    /**
     * Creates a healthy outcome with the standard success description.
     *
     * @return an outcome flagged as healthy
     */
    public static ExecutionOutcome success() {
        return new ExecutionOutcome(true, SUCCESS_DESCRIPTION);
    }
}
Alerting Pipeline
Notifications are decoupled from the execution path by dispatching them through an asynchronous (`@Async`) method, so that mail delivery does not block the check cycle:
/**
 * Sends an e-mail to a probe's owner when a check fails.
 */
@Service
@RequiredArgsConstructor
public class EmailAlertAdapter {

    private final JavaMailSender mailSender;
    private final UserDirectory userDirectory;

    /**
     * Composes and sends a failure notification for the given probe on the
     * {@code alertExecutor} pool.
     *
     * <p>Nothing is sent when the probe has no owner or the owner has no
     * e-mail address.
     *
     * @param probe   the probe whose check failed
     * @param outcome the failed outcome; its description is included in the mail
     */
    @Async("alertExecutor")
    public void triggerIncident(HealthProbe probe, ExecutionOutcome outcome) {
        UserAccount owner = userDirectory.findOwner(probe.getProbeId());
        if (owner == null || owner.getEmailAddress() == null) {
            return;
        }

        // attempts - successes is the lifetime failure total, not a consecutive
        // streak, so the mail labels it accordingly. Missing counters count as zero.
        int attempts = probe.getAttemptCount() != null ? probe.getAttemptCount() : 0;
        int successes = probe.getSuccessCount() != null ? probe.getSuccessCount() : 0;
        int totalFailures = attempts - successes;

        SimpleMailMessage notification = new SimpleMailMessage();
        notification.setTo(owner.getEmailAddress());
        notification.setSubject("[CRITICAL] Service Degradation Detected: " + probe.getDisplayName());
        notification.setText(String.format(
                "Probe: %s\nEndpoint: %s\nTimestamp: %s\nFailure: %s\nTotal Failures: %d",
                probe.getDisplayName(),
                probe.getEndpointUrl(),
                Instant.now(),
                outcome.getDescription(),
                totalFailures
        ));
        mailSender.send(notification);
    }
}
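Configure the mail transport in application.yml. Note that the `@Async("alertExecutor")` method additionally requires `@EnableAsync` on a configuration class and an executor bean named `alertExecutor`, neither of which is defined by the mail settings below: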
# Outbound SMTP settings for the JavaMailSender used by the alert adapter.
spring:
  mail:
    host: smtp.example.com
    port: 587
    # Credentials are resolved from environment variables rather than stored here.
    username: ${MAIL_USER}
    password: ${MAIL_PASS}
    properties:
      mail:
        smtp:
          auth: true
          # Port 587 is the mail submission port; upgrade the session with
          # STARTTLS so credentials are not sent in clear text.
          starttls:
            enable: true