Automating Prometheus Rule Deployment with Python and Flask
Handling dozens of monitoring targets makes manual Prometheus rule updates a bottleneck. An automated pipeline that accepts alert definitions through a REST interface, persists them in a database, and safely pushes generated rule files to Prometheus servers eliminates this pain.
Architecture Overview
The solution consists of:
- A Flask web service that provides endpoints for rule CRUD operations.
- A SQL database to store rule definitions and associated metadata.
- A rule generator that reads the database and produces YAML-based Prometheus alert rules.
- A remote deployer that connects to target Prometheus hosts via SSH, backs up existing rules, uploads new ones, and triggers a configuration reload.
Database Schema and Model
We store each alert rule in a table prom_rules. The schema defines the alert expression, duration, severity, annotations, and a group identifier for rule organization.
-- One row per Prometheus alert rule; rows are rendered into a YAML
-- rule file by the generator and bucketed by group_id.
CREATE TABLE prom_rules (
id INTEGER PRIMARY KEY AUTOINCREMENT,
-- PromQL expression the alert evaluates.
expression VARCHAR(1024) NOT NULL,
-- Prometheus 'for' clause (how long the condition must hold).
duration VARCHAR(16) DEFAULT '1m',
severity VARCHAR(32) DEFAULT 'warning',
-- Rendered into the rule's annotations block.
summary TEXT,
description TEXT,
datasource_id INTEGER,
-- Rules sharing a group_id end up in the same Prometheus rule group.
group_id INTEGER,
-- 0/1 flag; the generator exports only rows where this is set.
-- NOTE(review): the name suggests auto-healing, but generate_rule_yaml
-- uses it as the export filter — confirm intended semantics.
heal_enabled TINYINT DEFAULT 0,
created_at DATETIME,
updated_at DATETIME
);
A corresponding SQLAlchemy model mirrors this table:
from sqlalchemy import Column, Integer, String, Text, DateTime, Boolean
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
class RuleRecord(Base):
    """ORM model mirroring the prom_rules table defined in the DDL."""

    __tablename__ = 'prom_rules'

    id = Column(Integer, primary_key=True)
    # The DDL declares expression NOT NULL; mirror that constraint so
    # the model and schema agree and bad inserts fail early.
    expression = Column(String(1024), nullable=False)
    duration = Column(String(16), default='1m')
    severity = Column(String(32), default='warning')
    summary = Column(Text)
    description = Column(Text)
    datasource_id = Column(Integer)
    group_id = Column(Integer)
    # Stored as TINYINT 0/1 in the DDL; SQLAlchemy maps Boolean onto it.
    heal_enabled = Column(Boolean, default=False)
    created_at = Column(DateTime)
    updated_at = Column(DateTime)
REST API Endpoint
A Flask blueprint provides the /rules endpoint that accepts a JSON payload and inserts a new record. The payload mirrors the model fields:
from flask import Blueprint, request, jsonify
from models import RuleRecord, db
rules_bp = Blueprint('rules', __name__)
@rules_bp.route('/rules', methods=['POST'])
def create_rule():
    """Create a new alert rule from a JSON payload.

    Required field: 'expression'. All other fields are optional and
    fall back to the model defaults. Returns 201 with the new row id
    on success, or 400 for a non-JSON body / missing expression.
    """
    # silent=True returns None on a malformed body instead of raising;
    # without the guard a missing 'expression' key surfaced as a 500.
    data = request.get_json(silent=True)
    if not data or 'expression' not in data:
        return jsonify({'error': "'expression' is required"}), 400
    rule = RuleRecord(
        expression=data['expression'],
        duration=data.get('duration', '1m'),
        severity=data.get('severity', 'warning'),
        summary=data.get('summary', ''),
        description=data.get('description', ''),
        datasource_id=data.get('datasource_id'),
        group_id=data.get('group_id'),
        heal_enabled=data.get('heal_enabled', False)
    )
    db.session.add(rule)
    db.session.commit()
    return jsonify({'status': 'created', 'id': rule.id}), 201
Generating the Rule Configuration
Prometheus expects alert rules inside a groups list. A dedicated function queries all enabled rules, groups them by group_id, and writes a temporary YAML file. We avoid direct string concatenation to prevent injection or formatting errors.
import yaml
import tempfile
import os
def generate_rule_yaml(session):
    """Render enabled alert rules from the database into a YAML file.

    Queries every RuleRecord with heal_enabled set, buckets the rules
    by group_id, and dumps the resulting 'groups' document to a named
    temporary file. Returns the path of that file; the caller owns its
    cleanup.
    """
    grouped = {}
    for record in session.query(RuleRecord).filter_by(heal_enabled=True).all():
        # Create the group bucket on first sight of this group_id.
        bucket = grouped.setdefault(record.group_id, {
            'name': f'group-{record.group_id}',
            'rules': [],
        })
        bucket['rules'].append({
            'alert': f'Alert_{record.id}',
            'expr': record.expression,
            'for': record.duration,
            'labels': {'severity': record.severity},
            'annotations': {
                'summary': record.summary,
                'description': record.description,
            },
        })
    document = {'groups': list(grouped.values())}
    # delete=False: the file must outlive this scope so the deployer
    # can upload it; the orchestrator removes it afterwards.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.yml', delete=False) as handle:
        yaml.safe_dump(document, handle, default_flow_style=False, allow_unicode=True)
        return handle.name
Remote Deployment with SSH
A deployer class wraps Paramiko for file transfer and HTTP for the Prometheus reload API. The workflow:
- Connect to the remote host.
- Download and save a backup of the current rules file.
- Remove the existing rules file from the target directory.
- Upload the newly generated YAML file.
- Call `POST /-/reload` on the Prometheus API.
- If the reload fails, restore the backup file.
import os
import posixpath
import shutil

import paramiko
import requests
class PrometheusDeployer:
    """Deploys rule files to a single Prometheus host.

    Wraps Paramiko SFTP for backup/delete/upload of rule files and the
    Prometheus HTTP API for configuration reloads.
    """

    def __init__(self, host, username, key_path, rules_dir='/etc/prometheus/rules'):
        self.host = host
        self.username = username
        self.key_path = key_path
        self.rules_dir = rules_dir
        self.client = paramiko.SSHClient()
        # NOTE(review): AutoAddPolicy skips host-key verification;
        # acceptable on trusted networks, but a known_hosts file is safer.
        self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())

    def _remote_path(self, remote_filename):
        # Remote hosts are POSIX; posixpath keeps '/' separators even if
        # this tool runs on Windows (os.path.join would emit '\\').
        return posixpath.join(self.rules_dir, remote_filename)

    def connect(self):
        """Open the SSH connection using key-based authentication."""
        self.client.connect(self.host, username=self.username, key_filename=self.key_path)

    def backup_file(self, remote_filename):
        """Download the current rules file to /tmp as a backup.

        Returns the local backup path, or None if the remote file does
        not exist yet (first deployment).
        """
        sftp = self.client.open_sftp()
        try:
            bak_path = f'/tmp/{remote_filename}.bak'
            try:
                sftp.get(self._remote_path(remote_filename), bak_path)
            except FileNotFoundError:
                bak_path = None
            return bak_path
        finally:
            # Release the SFTP channel even when the transfer raises.
            sftp.close()

    def delete_remote_file(self, remote_filename):
        """Remove the rules file on the target; a missing file is not an error."""
        sftp = self.client.open_sftp()
        try:
            try:
                sftp.remove(self._remote_path(remote_filename))
            except FileNotFoundError:
                pass
        finally:
            sftp.close()

    def upload_file(self, local_path, remote_filename):
        """Upload a locally generated rules file into the rules directory."""
        sftp = self.client.open_sftp()
        try:
            sftp.put(local_path, self._remote_path(remote_filename))
        finally:
            sftp.close()

    def reload_prometheus(self):
        """Hit the reload endpoint; True when Prometheus answers 200."""
        resp = requests.post(f'http://{self.host}:9090/-/reload', timeout=10)
        return resp.status_code == 200

    def restore_backup(self, backup_path, remote_filename):
        """Re-upload the backup (if one was taken) and delete the local copy."""
        if backup_path:
            sftp = self.client.open_sftp()
            try:
                sftp.put(backup_path, self._remote_path(remote_filename))
            finally:
                sftp.close()
            os.remove(backup_path)

    def close(self):
        """Close the underlying SSH connection."""
        self.client.close()
Orchestration Logic
A deployment function ties everything together. It receives a database session and a target host IP, generates the rules, deploys them, and handles rollback on failure. Temporary files are cleaned up regardless of the outcome.
def deploy_rules_to_host(session, host_ip):
    """Generate the rule file and push it to one Prometheus host.

    On any failure after a backup was taken, the previous rules file is
    restored. The temporary rule file is always removed and the SSH
    connection always closed.
    """
    deployer = PrometheusDeployer(host_ip, 'prometheus', '/home/user/.ssh/id_rsa')
    temp_file = generate_rule_yaml(session)
    remote_file = 'alert_rules.yml'
    # Initialize before the try block: if connect() raises, the except
    # branch would otherwise hit an UnboundLocalError reading 'backup',
    # masking the real deployment error.
    backup = None
    try:
        deployer.connect()
        backup = deployer.backup_file(remote_file)
        deployer.delete_remote_file(remote_file)
        deployer.upload_file(temp_file, remote_file)
        if deployer.reload_prometheus():
            print('Reload successful')
        else:
            print('Reload failed, restoring backup')
            deployer.restore_backup(backup, remote_file)
    except Exception as e:
        print(f'Deployment error: {e}')
        if backup:
            deployer.restore_backup(backup, remote_file)
    finally:
        os.unlink(temp_file)
        deployer.close()
Triggering Deployment
A separate Flask endpoint can invoke the deployment after rule changes, or a background scheduler can run it periodically. A minimal view:
@rules_bp.route('/deploy', methods=['POST'])
def trigger_deploy():
    """Deploy the current rules to the host named in the JSON body.

    Expects {'host': '<ip-or-hostname>'}; returns 400 when absent.
    NOTE(review): the deployment runs synchronously inside the request,
    so slow SSH targets will block the HTTP response.
    """
    # Guard against a missing/non-JSON body instead of a KeyError 500.
    data = request.get_json(silent=True)
    if not data or 'host' not in data:
        return jsonify({'error': "'host' is required"}), 400
    deploy_rules_to_host(db.session, data['host'])
    return jsonify({'result': 'deployment initiated'})
This architecture separates concerns, keeps the Prometheus configuration reproducible, and allows safe rollback if anything goes wrong during the update.