Server Monitoring Best Practices: Keeping Your WordPress App and MySQL Clusters Alive on OVH

Proactive MySQL Replication Lag Monitoring

For any WordPress deployment relying on a MySQL cluster, especially in a high-availability or read-scaling setup, replication lag is a critical metric. Unchecked lag can lead to stale data being served to users, inconsistent content, and in worst-case scenarios, data corruption if writes are attempted on a replica that has fallen too far behind. OVH’s managed MySQL services, while robust, still require diligent monitoring.

We’ll implement a Nagios-style check that can be run via cron or a dedicated monitoring agent. The script connects to each replica, runs SHOW REPLICA STATUS (or SHOW SLAVE STATUS for older versions), verifies that the replication I/O and SQL threads are running, and reads Seconds_Behind_Source — the replica’s own estimate of how far the event it is currently applying lags behind the source. An alert is triggered if this value exceeds a predefined threshold (e.g., 60 seconds).

MySQL Replication Lag Check Script (PHP)

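This PHP script iterates through a list of replicas and checks each one in turn. The primary’s connection details are kept in the configuration for reference, but the check itself only queries the replicas. It’s designed to be run from a server that has network access to all MySQL instances.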

<?php

// Configuration
// Connection settings for the primary.
// NOTE(review): the replica check below receives this array but never opens a connection with it.
$db_primary = [
    'host' => 'your_mysql_primary_host',
    'port' => 3306,
    'user' => 'monitor_user',
    'password' => 'monitor_password',
    'database' => 'information_schema' // Or any accessible database
];

// One entry per replica to check; each entry uses the same keys as $db_primary.
$db_replicas = [
    [
        'host' => 'your_mysql_replica_1_host',
        'port' => 3306,
        'user' => 'monitor_user',
        'password' => 'monitor_password',
        'database' => 'information_schema'
    ],
    [
        'host' => 'your_mysql_replica_2_host',
        'port' => 3306,
        'user' => 'monitor_user',
        'password' => 'monitor_password',
        'database' => 'information_schema'
    ],
    // Add more replicas as needed
];

// Lag limit, in seconds, passed to every replica check.
$replication_lag_threshold_seconds = 60; // Alert if lag exceeds 60 seconds

// --- Script Logic ---

/**
 * Fetch the replication status row from an open replica connection.
 *
 * Tries the MySQL 8.0.22+ statement first and falls back to the legacy
 * statement so the check also works against older servers. Failures are
 * handled whether mysqli reports them by exception (the default since
 * PHP 8.1) or by returning false.
 *
 * @param mysqli $conn Open connection to the replica.
 *
 * @return array|null The status row, or null when the server returned no rows.
 *
 * @throws Exception When neither statement could be executed.
 */
function _fetch_replication_status(mysqli $conn): ?array
{
    $errors = [];

    foreach (['SHOW REPLICA STATUS', 'SHOW SLAVE STATUS'] as $sql) {
        try {
            $result = $conn->query($sql);
        } catch (mysqli_sql_exception $e) {
            $errors[] = $e->getMessage();
            continue;
        }

        if ($result === false) {
            $errors[] = $conn->error;
            continue;
        }

        $row = $result->fetch_assoc();
        $result->free();

        return is_array($row) ? $row : null;
    }

    throw new Exception("Failed to execute 'SHOW REPLICA STATUS': " . implode('; ', $errors));
}

/**
 * Check replication health and lag on a single replica.
 *
 * Connects to the replica, reads its replication status and maps it to a
 * Nagios-style state:
 *   - CRITICAL: connection or query failure, or the IO/SQL thread is not running
 *   - WARNING:  lag is above $threshold
 *   - OK:       lag is at or below $threshold
 *   - UNKNOWN:  the server reports no lag value
 *
 * @param array $primary_config Primary connection settings. Not used by the
 *                              check; kept so existing callers keep working.
 * @param array $replica_config Replica connection settings
 *                              (host, port, user, password, database).
 * @param int   $threshold      Maximum acceptable lag in seconds.
 *
 * @return array Keys: host, status, message, lag_seconds (int|null), error (string|null).
 */
function check_replication_lag(array $primary_config, array $replica_config, int $threshold): array
{
    $lag_info = [
        'host' => $replica_config['host'],
        'status' => 'UNKNOWN',
        'message' => '',
        'lag_seconds' => null,
        'error' => null
    ];

    $replica_conn = null;
    $connected = false;

    try {
        // Connect to the replica
        $replica_conn = new mysqli(
            $replica_config['host'],
            $replica_config['user'],
            $replica_config['password'],
            $replica_config['database'],
            $replica_config['port']
        );

        if ($replica_conn->connect_error) {
            throw new Exception("Replica connection failed: " . $replica_conn->connect_error);
        }
        $connected = true;

        $replica_status = _fetch_replication_status($replica_conn);
        if ($replica_status === null) {
            throw new Exception("'SHOW REPLICA STATUS' returned no rows.");
        }

        // Accept both the current and the legacy column names.
        $io_running = $replica_status['Replica_IO_Running'] ?? $replica_status['Slave_IO_Running'] ?? null;
        $sql_running = $replica_status['Replica_SQL_Running'] ?? $replica_status['Slave_SQL_Running'] ?? null;
        $lag_raw = $replica_status['Seconds_Behind_Source'] ?? $replica_status['Seconds_Behind_Master'] ?? null;

        if ($io_running !== 'Yes') {
            $lag_info['status'] = 'CRITICAL';
            $lag_info['message'] = "Replication IO thread is not running.";
            $lag_info['error'] = $replica_status['Last_IO_Error'] ?? 'N/A';
        } elseif ($sql_running !== 'Yes') {
            $lag_info['status'] = 'CRITICAL';
            $lag_info['message'] = "Replication SQL thread is not running.";
            $lag_info['error'] = $replica_status['Last_SQL_Error'] ?? 'N/A';
        } elseif ($lag_raw === null) {
            // The server gave no lag figure (column missing or NULL), so the
            // state cannot be decided from this row alone.
            $lag_info['status'] = 'UNKNOWN';
            $lag_info['message'] = "Could not determine 'Seconds_Behind_Source'. Manual calculation might be needed.";
        } else {
            $lag_seconds = (int)$lag_raw;
            $lag_info['lag_seconds'] = $lag_seconds;

            if ($lag_seconds > $threshold) {
                $lag_info['status'] = 'WARNING';
                $lag_info['message'] = sprintf("Replication lag is %d seconds (threshold: %d seconds).", $lag_seconds, $threshold);
            } else {
                $lag_info['status'] = 'OK';
                $lag_info['message'] = sprintf("Replication lag is %d seconds.", $lag_seconds);
            }
        }
    } catch (Exception $e) {
        $lag_info['status'] = 'CRITICAL';
        $lag_info['message'] = "An error occurred: " . $e->getMessage();
        $lag_info['error'] = $e->getMessage();
    } finally {
        // Release the connection on every path, including failures after connect.
        if ($connected) {
            $replica_conn->close();
        }
    }

    return $lag_info;
}

// --- Main Execution ---

// Nagios plugin exit codes, keyed by state name.
$exit_codes = ['OK' => 0, 'WARNING' => 1, 'CRITICAL' => 2, 'UNKNOWN' => 3];
// Precedence used to pick the overall state: the highest value wins.
$severity = ['OK' => 0, 'UNKNOWN' => 1, 'WARNING' => 2, 'CRITICAL' => 3];

$overall_status = 'OK';
$output_messages = [];

// A check that inspected nothing must not report OK.
if (count($db_replicas) === 0) {
    $overall_status = 'UNKNOWN';
    $output_messages[] = 'No replicas configured.';
}

foreach ($db_replicas as $replica) {
    $result = check_replication_lag($db_primary, $replica, $replication_lag_threshold_seconds);
    $output_messages[] = sprintf("[%s] %s: %s", $result['status'], $result['host'], $result['message']);

    // Treat any unexpected state name as UNKNOWN rather than silently ignoring it.
    $status = isset($severity[$result['status']]) ? $result['status'] : 'UNKNOWN';
    if ($severity[$status] > $severity[$overall_status]) {
        $overall_status = $status;
    }
}

// Single-line, Nagios-compatible output. Replicas are separated with ";"
// because "|" marks the start of performance data in plugin output.
echo $overall_status . ": " . implode("; ", $output_messages) . "\n";

// Exit with the status code matching the overall state
exit($exit_codes[$overall_status]);
?>

Prerequisites:

PHP installed on the monitoring server.
The mysqli extension enabled for PHP.
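A dedicated MySQL user (e.g., monitor_user) with minimal privileges: REPLICATION CLIENT and REPLICATION SLAVE on the primary, and REPLICATION CLIENT on the replicas. REPLICATION CLIENT is the privilege that SHOW REPLICA STATUS requires; the script as written never queries the primary, so the REPLICATION SLAVE grant (which allows an account to read the binary log stream) can be omitted unless you extend the check to need it. This user should ideally not have access to sensitive data.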
Network connectivity from the monitoring server to all MySQL hosts on the specified ports.

OVH MySQL Cluster Configuration for Monitoring

When using OVH’s managed MySQL services, you’ll typically interact with them via their API or control panel. For monitoring, ensure you have static IP addresses or hostnames for your primary and replica instances. If you’re using a load balancer in front of your replicas, you’ll monitor the load balancer’s health and individual replica health behind it.

Creating the Monitoring User:

-- On the MySQL Primary:
-- Creates the monitoring account and grants it global replication
-- privileges only; no schema or table privileges are granted.
CREATE USER 'monitor_user'@'%' IDENTIFIED BY 'monitor_password';
GRANT REPLICATION CLIENT, REPLICATION SLAVE ON *.* TO 'monitor_user'@'%';
-- NOTE(review): FLUSH PRIVILEGES is normally only needed after editing the
-- grant tables directly; it should be harmless after GRANT. Confirm.
FLUSH PRIVILEGES;

-- On each MySQL Replica:
-- NOTE(review): if account statements replicate from the primary, this user
-- may already exist on the replica. Verify before running CREATE USER here.
CREATE USER 'monitor_user'@'%' IDENTIFIED BY 'monitor_password';
GRANT REPLICATION CLIENT ON *.* TO 'monitor_user'@'%';
FLUSH PRIVILEGES;

Replace '%' with the specific IP address or subnet of your monitoring server for enhanced security.

Cron Job for Scheduled Checks

To automate this check, add it to your crontab. For example, to run the check every minute:

# minute hour day-of-month month day-of-week  command
# Runs the check every minute; stdout and stderr are appended to the log file.
* * * * * /usr/bin/php /path/to/your/mysql_replication_check.php >> /var/log/mysql_replication_check.log 2>&1

Ensure the PHP executable path is correct for your system. The output redirection logs both standard output and standard error to a file, which is crucial for debugging.

Integrating with Monitoring Systems (e.g., Nagios/Icinga)

The script is designed to output status codes compatible with Nagios/Icinga. You can place the script in your monitoring plugin directory (e.g., /usr/local/nagios/libexec/) and define a command and service check.

# In commands.cfg (or equivalent)
# Runs the replication check script with the PHP CLI. $USER1$ is a resource
# macro (typically the plugin directory; confirm in your resource file).
define command {
    command_name    check_mysql_replication_lag
    command_line    /usr/bin/php $USER1$/mysql_replication_check.php
}

# In your host/service definition
# Attaches the command above to the monitoring server as a service check.
# NOTE(review): interval values are in Nagios time units (commonly minutes; confirm interval_length).
define service {
    use                     generic-service
    host_name               your_monitoring_server
    service_description     MySQL Replication Lag
    check_command           check_mysql_replication_lag
    max_check_attempts      3
    check_interval          1
    retry_interval          1
    notification_interval   60
    notification_period     24x7
    notification_options    w,c,r
}

This setup will trigger alerts when the script outputs WARNING or CRITICAL.

Advanced: Per-Replica Lag Thresholds and Alerting

For critical applications, you might want different lag thresholds for different replicas. For instance, a replica serving a critical dashboard might need a stricter threshold than one used for occasional reporting.

Modify the $db_replicas array to include a threshold key for each replica:

// Replica list with an additional optional 'threshold' key per entry:
// the lag limit, in seconds, that applies to that replica only.
$db_replicas = [
    [
        'host' => 'your_mysql_replica_1_host',
        'port' => 3306,
        'user' => 'monitor_user',
        'password' => 'monitor_password',
        'database' => 'information_schema',
        'threshold' => 30 // Stricter threshold for replica 1
    ],
    [
        'host' => 'your_mysql_replica_2_host',
        'port' => 3306,
        'user' => 'monitor_user',
        'password' => 'monitor_password',
        'database' => 'information_schema',
        'threshold' => 90 // More lenient threshold for replica 2
    ],
];

Then, update the check_replication_lag function to use the replica-specific threshold:

/**
 * Abridged variant of check_replication_lag() with per-replica thresholds.
 *
 * Only the changed part is shown: the lag is compared against the replica's
 * own 'threshold' value when present, otherwise against $default_threshold.
 * The elided sections ("...") must still build $lag_info, compute
 * $lag_seconds and return $lag_info.
 */
function check_replication_lag(array $primary_config, array $replica_config, int $default_threshold): array
{
    // ... (previous code) ...

    $replica_threshold = $replica_config['threshold'] ?? $default_threshold; // Use replica-specific or default

    // ... (rest of the lag calculation logic using $replica_threshold) ...
    if ($lag_seconds > $replica_threshold) {
        $lag_info['status'] = 'WARNING';
        $lag_info['message'] = sprintf("Replication lag is %d seconds (threshold: %d seconds).", $lag_seconds, $replica_threshold);
    } else {
        $lag_info['status'] = 'OK';
        $lag_info['message'] = sprintf("Replication lag is %d seconds.", $lag_seconds);
    }
    // ...
}

Monitoring WordPress Application Health

Beyond the database, the WordPress application itself needs robust health checks. This involves verifying that the web server is responding, PHP is executing correctly, and WordPress can connect to its database and serve content.

HTTP Health Check Endpoint

The simplest form of application health check is an HTTP endpoint that returns a 200 OK status if the application is fundamentally alive. For WordPress, this can be a simple PHP file.

// /wp-content/plugins/my-health-check/health-check.php
<?php
/**
 * Plugin Name: My Health Check
 * Description: Provides a simple health check endpoint.
 * Version: 1.0
 * Author: Your Name
 */

// Ensure this file is not accessed directly
if (!defined('ABSPATH')) {
    exit; // Exit if accessed directly
}

/**
 * Map the clean URL /health-check/ onto a query variable handled by index.php.
 *
 * The rule must point at index.php: a rule that targets this plugin file
 * would bypass WordPress, and the ABSPATH guard above would end the request.
 */
function my_health_check_add_rewrite_rule() {
    add_rewrite_rule('^health-check/?$', 'index.php?my_health_check=1', 'top');
}
add_action('init', 'my_health_check_add_rewrite_rule');

// WordPress discards query variables it does not know, so register ours.
add_filter('query_vars', function ($vars) {
    $vars[] = 'my_health_check';
    return $vars;
});

// Answer health check requests, and only those. Every other request must
// fall through untouched, otherwise the whole site would serve this JSON.
add_action('parse_request', function ($wp) {
    if (empty($wp->query_vars['my_health_check'])) {
        return;
    }

    // Basic check: can we connect to the DB?
    // Passing false makes check_connection() return false instead of
    // terminating the request when the database is unreachable.
    global $wpdb;
    $healthy = (bool) $wpdb->check_connection(false);

    // Further checks can be added here:
    // - Check external API dependencies
    // - Check file permissions
    // - Check transient data integrity

    // Headers have to be sent before any body output.
    status_header($healthy ? 200 : 503); // 503 = Service Unavailable
    nocache_headers();
    header('Content-Type: application/json');

    echo json_encode([
        'status' => $healthy ? 'OK' : 'ERROR',
        'message' => $healthy ? 'WordPress application is healthy.' : 'Failed to connect to the database.',
        'timestamp' => current_time('mysql')
    ]);
    exit;
});

// Register the rule and rebuild the rewrite rules once, on activation.
// The 'init' hook has already fired by the time activation runs, so the rule
// is added explicitly before flushing.
register_activation_hook(__FILE__, function () {
    my_health_check_add_rewrite_rule();
    flush_rewrite_rules();
});

// Remove the rule again when the plugin is deactivated.
register_deactivation_hook(__FILE__, 'flush_rewrite_rules');
?>

To make this work:

Create the directory wp-content/plugins/my-health-check/ in your WordPress installation.
Save the code above as health-check.php inside that directory.
Activate the “My Health Check” plugin via the WordPress admin dashboard.
Manually flush rewrite rules by going to Settings > Permalinks and clicking “Save Changes” (or implement register_activation_hook for automatic flushing).

Now, you can access https://your-wordpress-site.com/health-check/. A monitoring tool (like Prometheus’s Blackbox Exporter, or a simple `curl` command in cron) can poll this URL.

Monitoring with Prometheus Blackbox Exporter

The Prometheus Blackbox Exporter is ideal for probing endpoints over various protocols, including HTTP. Configure it to probe your WordPress health check URL.

# prometheus/blackbox.yml
modules:
  wordpress_health:
    prober: http
    timeout: 5s
    http:
      method: GET
      # Expect a 200 OK status code and a JSON response containing "status": "OK"
      valid_status_codes: [200]
      # The probe fails unless the body matches every pattern listed here.
      # PHP's json_encode() emits no whitespace around ":", so the pattern
      # accepts the key/value pair with or without spaces.
      fail_if_body_not_matches_regexp:
        - '"status"\s*:\s*"OK"'
      no_follow_redirects: true
      fail_if_ssl: false
      fail_if_not_ssl: false

# prometheus.yml (scrape config)
# Probes the WordPress health URL through the Blackbox Exporter's /probe
# endpoint using the wordpress_health module.
scrape_configs:
  - job_name: 'wordpress_blackbox'
    metrics_path: /probe
    params:
      module: [wordpress_health]
    static_configs:
      - targets:
          - https://your-wordpress-site.com/health-check/
    relabel_configs:
      # Pass the configured target URL to the exporter as the "target" URL parameter.
      - source_labels: [__address__]
        target_label: __param_target
      # Keep the probed URL as the instance label on the resulting series.
      - source_labels: [__param_target]
        target_label: instance
      # Send the actual scrape request to the exporter instead of the target.
      - target_label: __address__
        replacement: blackbox_exporter_host:9115 # Address of your Blackbox Exporter

This setup allows Prometheus to collect metrics on the availability and basic health of your WordPress application, independent of the application’s own metrics.

Nginx Configuration for Health Check Endpoint

While the WordPress plugin handles the logic, you might want Nginx to serve the health check endpoint directly for efficiency or to bypass WordPress entirely for a more fundamental check. This is useful if you suspect WordPress core or PHP itself is the bottleneck.

# In your WordPress Nginx site configuration
server {
    listen 80;
    server_name your-wordpress-site.com;
    root /var/www/your-wordpress-site.com/html; # Adjust path

    # ... other WordPress configurations ...

    # Minimal liveness check answered by Nginx itself (no PHP involved).
    location = /healthz {
        access_log off;
        # Set the MIME type with default_type; add_header would append a
        # second Content-Type header instead of replacing the default one.
        default_type text/plain;
        return 200 'OK'; # Simple text response
    }

    # Deeper check: hands the request to PHP-FPM, which runs healthz.php.
    location = /healthz.json {
        access_log off;
        include fastcgi_params;
        # Always execute healthz.php for this URL. The request URI ends in
        # .json, so $fastcgi_script_name cannot be used to locate the script.
        fastcgi_param SCRIPT_FILENAME $document_root/healthz.php;
        fastcgi_pass unix:/var/run/php/php8.1-fpm.sock; # Adjust PHP-FPM socket
    }

    # ... other WordPress configurations ...
}

And the corresponding /healthz.php file:

<?php
// /var/www/your-wordpress-site.com/html/healthz.php

// $wpdb only exists once WordPress has been bootstrapped. SHORTINIT asks
// WordPress to stop after its minimal setup instead of loading plugins and
// themes. Assumes this file sits in the WordPress root next to wp-load.php;
// adjust the path if it does not.
// NOTE(review): if the database is unreachable during this bootstrap,
// WordPress may end the request with its own error page (a non-200 response)
// before the check below runs. Confirm this is acceptable for your probe.
define('SHORTINIT', true);
require_once __DIR__ . '/wp-load.php';

// Basic check: DB connection
// Passing false makes check_connection() return false rather than bailing out.
global $wpdb;
$db_ok = isset($wpdb) && $wpdb->check_connection(false);

// Status line and headers must be sent before the body.
http_response_code($db_ok ? 200 : 503); // 503 = Service Unavailable
header('Content-Type: application/json');

if ($db_ok) {
    echo json_encode(['status' => 'OK', 'message' => 'DB connection successful.']);
} else {
    echo json_encode(['status' => 'ERROR', 'message' => 'Failed to connect to database.']);
}
?>

This Nginx-level check is faster and can confirm basic web server and PHP-FPM functionality before even hitting WordPress. The /healthz endpoint provides a minimal check, while /healthz.json leverages PHP-FPM and checks database connectivity.

Monitoring OVH Infrastructure Metrics

OVH provides infrastructure-level metrics through their API and control panel. It’s crucial to integrate these into your central monitoring system.

CPU, Memory, Disk I/O, and Network Usage

For dedicated servers or VPS instances hosting your WordPress application and MySQL, standard monitoring agents like Node Exporter (for Prometheus) or Telegraf (for InfluxDB/Grafana) are essential. These agents collect system-level metrics.

Node Exporter Configuration Snippet:

# Example systemd service file for Node Exporter
[Unit]
Description=Node Exporter
# Start only once the network is reported as online.
Wants=network-online.target
After=network-online.target

[Service]
# Runs as the prometheus user; the textfile collector reads metric files from
# the directory given below and metrics are served on port 9100.
User=prometheus
ExecStart=/usr/local/bin/node_exporter \
  --collector.textfile.directory=/var/lib/node_exporter/textfile_collector \
  --web.listen-address=":9100"

[Install]
WantedBy=multi-user.target

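Ensure the textfile_collector directory is populated with the output of custom checks. The textfile collector only reads files ending in .prom that are written in the Prometheus text exposition format, so the MySQL lag script’s Nagios-style output must first be converted into metric lines (for example, mysql_replication_lag_seconds{replica="your_mysql_replica_1_host"} 12) and written to a temporary file that is then renamed into place. Prometheus can then scrape these metrics through Node Exporter.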

OVH Specific Metrics via API/CLI

OVH offers APIs to retrieve information about your services, including load balancers, public cloud instances, and managed databases. You can write custom scripts to query these APIs and expose metrics in a Prometheus-compatible format.

Example: Fetching OVH Load Balancer Stats (Conceptual Python)

import hashlib
import json
import time

import requests
from prometheus_client import start_http_server, Gauge

# --- OVH API Configuration ---
OVH_ENDPOINT = "https://eu.api.ovh.com/1.0"
CONSUMER_KEY = "YOUR_CONSUMER_KEY"
CONSUMER_SECRET = "YOUR_CONSUMER_SECRET"
ACCESS_KEY = "YOUR_ACCESS_KEY"
APPLICATION_KEY = "YOUR_APPLICATION_KEY"
APPLICATION_SECRET = "YOUR_APPLICATION_SECRET"

# --- Prometheus Metrics ---
lb_status = Gauge('ovh_loadbalancer_status', 'Status of OVH Load Balancer (1=Active, 0=Inactive)', ['lb_id'])
lb_connections_total = Gauge('ovh_loadbalancer_connections_total', 'Total connections on OVH Load Balancer', ['lb_id'])
lb_bytes_in_total = Gauge('ovh_loadbalancer_bytes_in_total', 'Total bytes received by OVH Load Balancer', ['lb_id'])
lb_bytes_out_total = Gauge('ovh_loadbalancer_bytes_out_total', 'Total bytes sent by OVH Load Balancer', ['lb_id'])

def get_signature(method, uri, body=None, timestamp=None, *,
                  application_secret=None, consumer_key=None):
    """Return the OVH API request signature for one call.

    The signature is ``"$1$"`` followed by the SHA-1 hex digest of
    ``secret+consumer_key+METHOD+url+body+timestamp``.

    Args:
        method: HTTP method, e.g. ``"GET"``.
        uri: API path such as ``"/loadBalancer"`` (prefixed with
            ``OVH_ENDPOINT``) or a full ``http(s)://`` URL, used as is.
        body: Raw request body; ``None`` is signed as the empty string.
        timestamp: Unix time to sign. Must equal the value sent in the
            ``X-Ovh-Timestamp`` header; defaults to the current time.
        application_secret: Overrides ``APPLICATION_SECRET``.
        consumer_key: Overrides ``CONSUMER_KEY``.

    Returns:
        The value for the ``X-Ovh-Signature`` header.
    """
    if timestamp is None:
        timestamp = int(time.time())
    if application_secret is None:
        application_secret = APPLICATION_SECRET
    if consumer_key is None:
        consumer_key = CONSUMER_KEY

    # The signature covers the full request URL, not just the path.
    if uri.startswith(("http://", "https://")):
        url = uri
    else:
        url = f"{OVH_ENDPOINT}{uri}"

    message = "+".join(
        [application_secret, consumer_key, method.upper(), url, body or "", str(timestamp)]
    )
    return "$1$" + hashlib.sha1(message.encode("utf-8")).hexdigest()


def _ovh_get(uri):
    """Send a signed GET request to the OVH API and return the decoded JSON."""
    # One timestamp for both the signature and the header; the API rejects
    # requests where the two differ.
    timestamp = int(time.time())
    headers = {
        "X-Ovh-Application": APPLICATION_KEY,
        "X-Ovh-Consumer": CONSUMER_KEY,
        "X-Ovh-Signature": get_signature("GET", uri, timestamp=timestamp),
        "X-Ovh-Timestamp": str(timestamp),
    }
    # Without a timeout a stalled connection would block the exporter loop forever.
    response = requests.get(f"{OVH_ENDPOINT}{uri}", headers=headers, timeout=10)
    response.raise_for_status()
    return response.json()


def get_ovh_lbs():
    """Fetches list of OVH Load Balancers."""
    # NOTE(review): path kept from the original example; confirm it against
    # the OVH API console for your service type.
    return _ovh_get("/loadBalancer")


def get_lb_stats(lb_id):
    """Fetches statistics for a specific Load Balancer."""
    # NOTE(review): path kept from the original example; confirm it exists.
    return _ovh_get(f"/loadBalancer/{lb_id}/stats")

def collect_metrics():
    """Refresh the load balancer gauges from the OVH API.

    Each load balancer is handled independently: when its stats cannot be
    fetched the error is printed and its status gauge is set to 0. A failure
    while listing or iterating the load balancers is printed and ends the run.
    """
    try:
        for balancer in get_ovh_lbs():
            lb_id = balancer['id']
            try:
                stats = get_lb_stats(lb_id)
                is_active = balancer['state'] == 'ACTIVE'
                lb_status.labels(lb_id=lb_id).set(1 if is_active else 0)
                # Counters default to 0 when the API response omits a field.
                for gauge, field in (
                    (lb_connections_total, 'totalConnections'),
                    (lb_bytes_in_total, 'totalIn'),
                    (lb_bytes_out_total, 'totalOut'),
                ):
                    gauge.labels(lb_id=lb_id).set(stats.get(field, 0))
            except Exception as exc:
                print(f"Error fetching stats for LB {lb_id}: {exc}")
                # Report the balancer as down when its stats are unavailable.
                lb_status.labels(lb_id=lb_id).set(0)
    except Exception as exc:
        print(f"Error fetching LB list: {exc}")

if __name__ == '__main__':
    # Start up the server to expose the metrics.
    start_http_server(8000) # Expose metrics on port 8000
    print("OVH Metrics Exporter started on port 8000")
    # Collect metrics every 60 seconds
    # The loop never exits on its own; stop the process to end the exporter.
    while True:
        collect_metrics()
        time.sleep(60)

Note: This Python script requires the requests and prometheus_client libraries. You’ll need to obtain API credentials from your OVH account. The script should be run on a server that can reach the OVH API endpoints and then be scraped by Prometheus.

Conclusion

A comprehensive monitoring strategy for a WordPress application on OVH involves multiple layers: proactive database replication checks, application-level HTTP health endpoints, and infrastructure metrics. By combining custom scripts with standard monitoring tools and leveraging OVH’s API, you can build a resilient system that alerts you to issues before they impact your users.