Server Monitoring Best Practices: Keeping Your Magento 2 App and MongoDB Clusters Alive on Linode

Proactive Health Checks for Magento 2 on Linode

Maintaining a high-availability Magento 2 instance on Linode demands a multi-layered monitoring strategy. Beyond basic CPU and RAM utilization, we need to scrutinize application-specific metrics, background processes, and critical dependencies like Redis and Elasticsearch. This section details essential checks and their implementation.

Application-Level Health Endpoint

Magento 2, by default, doesn’t expose a comprehensive health check endpoint. We’ll create a custom module to provide this. This endpoint should verify database connectivity, Redis availability, Elasticsearch status, and the general ability to process a simple request.

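First, create the module skeleton under app/code/VendorName/HealthCheck — at minimum a registration.php and an etc/module.xml that declares VendorName_HealthCheck. Then, from your Magento root directory, enable the module and rebuild (re-run setup:di:compile after adding the route and controller below):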

# Run from the Magento root, in this order.
# Enable the module by name.
bin/magento module:enable VendorName_HealthCheck
# Apply pending setup/upgrade steps for all enabled modules.
bin/magento setup:upgrade
# Regenerate the compiled dependency-injection configuration.
bin/magento setup:di:compile
# Rebuild static view files; -f forces deployment regardless of the current mode.
bin/magento setup:static-content:deploy -f

Next, define the route and controller. Create app/code/VendorName/HealthCheck/etc/frontend/routes.xml:

<?xml version="1.0"?>
<!--
    Frontend route declaration for the health-check module.
    Maps the URL front name "health" to controllers in VendorName_HealthCheck.
-->
<config xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="urn:magento:framework:App/etc/routes.xsd">
    <router id="standard">
        <!-- frontName is the first URL path segment; id is the route identifier. -->
        <route id="health" frontName="health">
            <module name="VendorName_HealthCheck" />
        </route>
    </router>
</config>

Now, create the controller at app/code/VendorName/HealthCheck/Controller/Index/Index.php:

<?php

namespace VendorName\HealthCheck\Controller\Index;

use Magento\Framework\App\Action\Action;
use Magento\Framework\App\Action\Context;
use Magento\Framework\Controller\Result\JsonFactory;
use Magento\Framework\App\ResourceConnection;
use Magento\Framework\HTTP\Client\Curl;
use Magento\Framework\Module\Manager;
use Magento\Framework\Serialize\Serializer\Json as JsonSerializer;

/**
 * JSON health-check endpoint for the "health" front name.
 *
 * The response body has the shape
 * {"status": "healthy"|"unhealthy", "checks": {"database": {...}, "redis": {...}, "elasticsearch": {...}}}
 * where each check reports "healthy", "unhealthy" (with an "error" or details) or "skipped"
 * (with a "reason"). The HTTP status is 200 when no check is unhealthy and 503 otherwise, so
 * pollers that only look at the status code also see failures.
 *
 * NOTE(review): exception messages are returned verbatim in the response. If this route is
 * reachable from the public internet, restrict access to it or replace the messages with
 * generic text, since they can reveal hostnames and other infrastructure details.
 */
class Index extends Action
{
    /**
     * Upper bound, in seconds, for the search-cluster HTTP probe so that a hung
     * cluster cannot stall the health endpoint itself.
     */
    private const HTTP_TIMEOUT_SECONDS = 5;

    /** @var JsonFactory */
    protected $resultJsonFactory;

    /** @var ResourceConnection */
    protected $resourceConnection;

    /** @var Curl */
    protected $curl;

    /** @var Manager */
    protected $moduleManager;

    /** @var JsonSerializer */
    protected $jsonSerializer;

    /**
     * @param Context $context
     * @param JsonFactory $resultJsonFactory Builds the JSON result returned by execute().
     * @param ResourceConnection $resourceConnection Source of the database connection to probe.
     * @param Curl $curl HTTP client used for the search-cluster probe.
     * @param Manager $moduleManager Used to skip probes for disabled modules.
     * @param JsonSerializer $jsonSerializer Decodes the search cluster's JSON response.
     */
    public function __construct(
        Context $context,
        JsonFactory $resultJsonFactory,
        ResourceConnection $resourceConnection,
        Curl $curl,
        Manager $moduleManager,
        JsonSerializer $jsonSerializer
    ) {
        $this->resultJsonFactory = $resultJsonFactory;
        $this->resourceConnection = $resourceConnection;
        $this->curl = $curl;
        $this->moduleManager = $moduleManager;
        $this->jsonSerializer = $jsonSerializer;
        parent::__construct($context);
    }

    /**
     * Run every probe and report the aggregated result.
     *
     * @return \Magento\Framework\Controller\Result\Json
     */
    public function execute()
    {
        $checks = [
            'database' => $this->probe(function () {
                return $this->checkDatabase();
            }),
            'redis' => $this->probe(function () {
                return $this->checkRedis();
            }),
            'elasticsearch' => $this->probe(function () {
                return $this->checkElasticsearch();
            }),
        ];

        // "skipped" checks do not count against the overall status.
        $healthy = !in_array('unhealthy', array_column($checks, 'status'), true);

        $result = $this->resultJsonFactory->create();
        $result->setHttpResponseCode($healthy ? 200 : 503);
        $result->setData([
            'status' => $healthy ? 'healthy' : 'unhealthy',
            'checks' => $checks,
        ]);
        return $result;
    }

    /**
     * Run one check and convert any failure into an "unhealthy" entry.
     *
     * \Throwable is caught (not just \Exception) so that an \Error raised inside a
     * probe, e.g. a missing class or method, is reported in the JSON instead of
     * turning the whole endpoint into a 500.
     *
     * @param callable $check Returns the check's result array.
     * @return array
     */
    private function probe(callable $check)
    {
        try {
            return $check();
        } catch (\Throwable $e) {
            return ['status' => 'unhealthy', 'error' => $e->getMessage()];
        }
    }

    /**
     * Verify that the default database connection can execute a trivial query.
     *
     * @return array
     */
    private function checkDatabase()
    {
        $this->resourceConnection->getConnection()->query('SELECT 1');
        return ['status' => 'healthy'];
    }

    /**
     * Verify the default cache backend with a write/read round trip when it is Redis.
     *
     * Whether Redis is in use is read from the deployment configuration
     * (cache/frontend/default/backend) rather than from a module flag.
     *
     * @return array
     */
    private function checkRedis()
    {
        // Collaborators are resolved through the object manager, as the original code did,
        // so that the constructor signature stays unchanged.
        $objectManager = \Magento\Framework\App\ObjectManager::getInstance();

        $backend = $objectManager
            ->get(\Magento\Framework\App\DeploymentConfig::class)
            ->get('cache/frontend/default/backend');
        if (!is_string($backend) || stripos($backend, 'redis') === false) {
            return ['status' => 'skipped', 'reason' => 'Redis is not the configured default cache backend'];
        }

        $cache = $objectManager->get(\Magento\Framework\App\CacheInterface::class);
        // Per-request key so that concurrent health checks cannot remove each other's probe entry.
        $key = 'vendorname_healthcheck_' . bin2hex(random_bytes(8));
        $token = 'ok';

        try {
            $stored = $cache->save($token, $key, [], 30);
            if (!$stored || $cache->load($key) !== $token) {
                return ['status' => 'unhealthy', 'error' => 'Cache write/read round trip failed'];
            }
        } finally {
            $cache->remove($key);
        }

        return ['status' => 'healthy'];
    }

    /**
     * Query the search cluster's /_cluster/health endpoint.
     *
     * "green" and "yellow" cluster states are treated as healthy; anything else,
     * a non-200 response or an undecodable body is unhealthy.
     *
     * @return array
     */
    private function checkElasticsearch()
    {
        if (!$this->moduleManager->isEnabled('Magento_Elasticsearch')) {
            return ['status' => 'skipped', 'reason' => 'Elasticsearch module not enabled'];
        }

        $esConfig = \Magento\Framework\App\ObjectManager::getInstance()
            ->get(\Magento\Elasticsearch\Model\Config::class);

        $host = trim((string) $esConfig->getElasticsearchConfigData('server_hostname'));
        $port = trim((string) $esConfig->getElasticsearchConfigData('server_port'));
        if ($host === '') {
            $host = 'localhost';
        }
        if ($port === '') {
            $port = '9200';
        }
        // The configured hostname may or may not carry a scheme; default to plain HTTP.
        if (!preg_match('#^https?://#i', $host)) {
            $host = 'http://' . $host;
        }

        if ($esConfig->getElasticsearchConfigData('enable_auth')) {
            $this->curl->setCredentials(
                (string) $esConfig->getElasticsearchConfigData('username'),
                (string) $esConfig->getElasticsearchConfigData('password')
            );
        }

        $this->curl->setTimeout(self::HTTP_TIMEOUT_SECONDS);
        $this->curl->get(rtrim($host, '/') . ':' . $port . '/_cluster/health');

        $httpStatus = (int) $this->curl->getStatus();
        if ($httpStatus !== 200) {
            return ['status' => 'unhealthy', 'error' => 'Unexpected HTTP status ' . $httpStatus];
        }

        $esHealth = $this->jsonSerializer->unserialize($this->curl->getBody());
        $clusterStatus = is_array($esHealth) && isset($esHealth['status']) ? $esHealth['status'] : 'unknown';

        if (in_array($clusterStatus, ['green', 'yellow'], true)) {
            return ['status' => 'healthy', 'cluster_status' => $clusterStatus];
        }

        return ['status' => 'unhealthy', 'cluster_status' => $clusterStatus, 'details' => $esHealth];
    }
}

After deploying this module, you can access the health status at https://your-magento-domain.com/health. This JSON output can be polled by external monitoring tools.

Cron Job Monitoring

Magento’s cron jobs are vital for tasks like indexing, sending emails, and cache management. Stale or failed cron jobs can lead to significant issues. We’ll monitor the cron_schedule table.

A simple approach is to check for jobs that have been running for an unusually long time or have failed. You can set up a script that queries the database:

-- Jobs still marked as running more than 15 minutes after they started.
-- executed_at is the start time; scheduled_at would also flag a job that was
-- queued long ago but only just began. Magento writes these columns in UTC,
-- so compare against UTC_TIMESTAMP() rather than the server-local NOW().
SELECT
    COUNT(*) AS running_long_jobs
FROM
    cron_schedule
WHERE
    status = 'running'
    AND executed_at < UTC_TIMESTAMP() - INTERVAL 15 MINUTE;

-- Jobs that ended in error within the last hour.
SELECT
    COUNT(*) AS failed_jobs
FROM
    cron_schedule
WHERE
    status = 'error'
    AND executed_at > UTC_TIMESTAMP() - INTERVAL 1 HOUR;

This SQL can be executed periodically by a monitoring agent (e.g., Nagios, Zabbix, Prometheus with a MySQL exporter) or via a custom script that sends alerts. For instance, a Bash script that runs the queries with mysql -e:

#!/bin/bash
#
# Nagios-style check of Magento's cron_schedule table.
# Exit codes: 0 = OK, 2 = CRITICAL, 3 = UNKNOWN (the database could not be queried).

DB_USER="magento_user"
DB_PASS="magento_password"
DB_NAME="magento_db"
DB_HOST="localhost"

# Print the single value returned by the query given in $1.
# The password is handed to mysql through an option file on a file descriptor
# instead of -p on the command line, so it does not appear in the process list.
# -N drops the column header and -s the table borders, leaving only the number.
# NOTE: a password containing a double quote or backslash needs escaping here.
run_count_query() {
    mysql --defaults-extra-file=<(printf '[client]\npassword="%s"\n' "$DB_PASS") \
        -h "$DB_HOST" -u "$DB_USER" -N -s -e "$1" "$DB_NAME"
}

# Exit with UNKNOWN unless $2 is a non-negative integer; $1 names the value.
# Without this, a failed query yields an empty string, the numeric test below
# errors out, and the script would fall through to "OK".
require_count() {
    case "$2" in
        ''|*[!0-9]*)
            echo "UNKNOWN: could not read $1 from cron_schedule."
            exit 3
            ;;
    esac
}

# Check for long-running cron jobs (measured from executed_at, the start time;
# Magento stores these timestamps in UTC).
LONG_RUNNING_JOBS=$(run_count_query "SELECT COUNT(*) FROM cron_schedule WHERE status = 'running' AND executed_at < UTC_TIMESTAMP() - INTERVAL 15 MINUTE;")
require_count "the long-running job count" "$LONG_RUNNING_JOBS"

if [ "$LONG_RUNNING_JOBS" -gt 0 ]; then
    echo "CRITICAL: $LONG_RUNNING_JOBS Magento cron jobs are running too long."
    # Add alert mechanism here (e.g., send_alert "CRITICAL: $LONG_RUNNING_JOBS Magento cron jobs are running too long.")
    exit 2
fi

# Check for failed cron jobs
FAILED_JOBS=$(run_count_query "SELECT COUNT(*) FROM cron_schedule WHERE status = 'error' AND executed_at > UTC_TIMESTAMP() - INTERVAL 1 HOUR;")
require_count "the failed job count" "$FAILED_JOBS"

if [ "$FAILED_JOBS" -gt 0 ]; then
    echo "CRITICAL: $FAILED_JOBS Magento cron jobs failed in the last hour."
    # Add alert mechanism here (e.g., send_alert "CRITICAL: $FAILED_JOBS Magento cron jobs failed in the last hour.")
    exit 2
fi

echo "OK: Magento cron jobs are healthy."
exit 0

Log File Monitoring

Magento’s logs (var/log/system.log, var/log/exception.log) are invaluable for debugging. We need to monitor these for critical errors and excessive growth.

Tools like logwatch, goaccess, or more sophisticated solutions like the ELK stack (Elasticsearch, Logstash, Kibana) or Loki with Promtail and Grafana are essential. For a simpler setup, we can use tail with grep and trigger alerts.

# Example: scan the tail of system.log for error-level entries and show the tail of exception.log.
# Magento's log lines carry an uppercase level after the channel ("main.ERROR:", "main.CRITICAL:"),
# which a case-sensitive search for 'Error' alone would miss; the original 'Error|Fatal' patterns
# are kept so that messages such as "PHP Fatal error" still match.
tail -n 100 /var/www/html/var/log/system.log | grep -E '\.(ERROR|CRITICAL|ALERT|EMERGENCY):|Error|Fatal'
tail -n 100 /var/www/html/var/log/exception.log

A more robust approach involves a log shipping agent (like Promtail or Filebeat) sending logs to a central aggregator. For instance, using Promtail to ship logs to Loki:

# Promtail configuration snippet (promtail-local-config.yaml)
server:
  http_listen_port: 9080
  grpc_listen_port: 0

# Promtail records how far it has read each file here. /tmp is typically cleared
# on reboot, after which files are read again from the start; use a persistent
# path in production.
positions:
  filename: /tmp/positions.yaml

clients:
  - url: http://loki.yourdomain.com:3100/loki/api/v1/push

scrape_configs:
  - job_name: magento_system_log
    static_configs:
      - targets:
          - localhost
        labels:
          job: magento
          __path__: /var/www/html/var/log/system.log
    pipeline_stages:
      # Magento lines look like "[<timestamp>] <channel>.<LEVEL>: <message>".
      # The timestamp is bracketed, and the level is the uppercase word after the channel.
      - regex:
          expression: '^\[(?P<time>[^\]]+)\] (?P<channel>[\w-]+)\.(?P<level>[A-Z]+): (?P<message>.*)$'
      # Newer releases log ISO-8601 timestamps with microseconds and an offset;
      # older ones log "YYYY-MM-DD HH:MM:SS", handled by the fallback layout.
      - timestamp:
          source: time
          format: RFC3339Nano
          fallback_formats:
            - '2006-01-02 15:04:05'
      - labels:
          level:

  - job_name: magento_exception_log
    static_configs:
      - targets:
          - localhost
        labels:
          job: magento
          __path__: /var/www/html/var/log/exception.log
    pipeline_stages:
      # Same line format as system.log; stack-trace continuation lines do not
      # match and are shipped without a level label.
      - regex:
          expression: '^\[(?P<time>[^\]]+)\] (?P<channel>[\w-]+)\.(?P<level>[A-Z]+): (?P<message>.*)$'
      - timestamp:
          source: time
          format: RFC3339Nano
          fallback_formats:
            - '2006-01-02 15:04:05'
      - labels:
          level:

With logs in Loki, you can create Grafana dashboards to visualize error rates, filter by log level, and set up alerting rules for specific patterns (e.g., repeated ‘Fatal’ errors).

Resource Utilization Thresholds

Linode provides basic resource metrics. We need to set intelligent thresholds for CPU, RAM, Disk I/O, and Network. For Magento, high CPU can indicate inefficient code, indexing processes, or heavy traffic. High RAM usage might point to memory leaks or insufficient cache configuration.

CPU: A sustained CPU usage above 80% for more than 5 minutes is a strong indicator of an issue. Magento’s CLI commands (indexing, compilation) can spike CPU, but these should be short-lived.

RAM: Monitor both used RAM and swap. If swap usage is consistently high, it means the system is starved for physical memory. A threshold of 85% RAM utilization triggering an alert is reasonable.

Disk I/O: High I/O wait times (iowait) suggest the disk is a bottleneck. This is critical for database performance. Monitor %iowait and disk queue lengths.

Network: Monitor bandwidth usage. Sudden spikes could indicate DDoS attacks or unexpected traffic. Sustained high usage might require a Linode plan upgrade.

Linode’s native monitoring can provide these metrics. For more advanced alerting and correlation, integrate with tools like Prometheus Node Exporter and Grafana.

Monitoring MongoDB Clusters for Magento 2

When using MongoDB for session storage, caching, or as a primary database (less common for Magento core but possible for extensions), cluster health is paramount. We’ll focus on key MongoDB metrics.

Essential MongoDB Metrics

Use the mongostat and mongotop command-line utilities for real-time insights, and collect metrics over time for trend analysis and alerting.

Key Metrics to Monitor:

Connections: Current and available connections. High connection counts can indicate application issues or insufficient connection pooling.
Operations: Inserts, queries, updates, deletes per second. High query rates on slow collections are problematic.
Network: Bytes in/out.
Memory: Resident Set Size (RSS), Virtual Memory Size (VMS).
Disk: Disk reads/writes per second.
Replication Lag: For replica sets, the delay between the primary and secondaries. This is critical for read-scaling and failover readiness.
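Read/Write Time (mongotop): Time mongod spends reading and writing each collection during the sampling interval. Collections that dominate this time point to contention or hot spots and can severely degrade performance.
Query Performance: mongostat reports operation rates rather than individual queries, so use the database profiler or the slow-operation entries in the mongod log to find slow queries.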

Collecting MongoDB Metrics

For automated monitoring and alerting, use the MongoDB Agent (part of MongoDB Cloud Manager/Ops Manager) or integrate with open-source tools like Prometheus using the MongoDB Exporter.

Using MongoDB Exporter with Prometheus:

1. **Install MongoDB Exporter:** Download the binary for your Linode instance or compile from source.

# Example for Linux AMD64.
# The Prometheus MongoDB exporter is published by Percona (percona/mongodb_exporter).
# Check the project's Releases page for the current version and exact asset name;
# newer releases unpack into a versioned directory rather than the current one.
EXPORTER_VERSION="0.10.0"
wget "https://github.com/percona/mongodb_exporter/releases/download/v${EXPORTER_VERSION}/mongodb_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz"
tar -xzf "mongodb_exporter-${EXPORTER_VERSION}.linux-amd64.tar.gz"
sudo mv mongodb_exporter /usr/local/bin/

2. **Create a MongoDB User for the Exporter:**

// Run in the mongo shell as a user allowed to create users.
// The user is created in the admin database, which is why the exporter's
// connection string uses authSource=admin.
use admin
db.createUser({
  user: "exporter",
  pwd: "your_secure_password",
  roles: [
    // Read-only access to cluster-wide monitoring commands.
    { role: "clusterMonitor", db: "admin" },
    // Read access to all databases. NOTE(review): broader than monitoring may
    // need; consider narrowing if the exporter only requires specific databases.
    { role: "readAnyDatabase", db: "admin" }
  ]
})

3. **Configure and Run MongoDB Exporter:**

# The exporter reads its connection string from the --mongodb.uri flag or the
# MONGODB_URI environment variable (it does not take a YAML config file).
# Keep the URI, which contains the password, in a root-only environment file
# instead of on the command line, where it would be visible in the process list.
# See https://github.com/percona/mongodb_exporter for the flags your version supports
# (recent releases add options such as --collect-all to enable every collector).
sudo install -m 600 /dev/null /etc/default/mongodb_exporter
sudo tee /etc/default/mongodb_exporter >/dev/null <<'EOF'
MONGODB_URI=mongodb://exporter:your_secure_password@your_mongodb_host:27017/?authSource=admin
EOF

# Run the exporter as a systemd service for reliability.
# systemd reads the EnvironmentFile before starting the service, so the file
# can stay unreadable for the unprivileged service user.
sudo tee /etc/systemd/system/mongodb_exporter.service >/dev/null <<'EOF'
[Unit]
Description=MongoDB Exporter
Wants=network-online.target
After=network-online.target

[Service]
User=nobody
EnvironmentFile=/etc/default/mongodb_exporter
ExecStart=/usr/local/bin/mongodb_exporter
Restart=always

[Install]
WantedBy=multi-user.target
EOF

sudo systemctl daemon-reload
sudo systemctl enable mongodb_exporter
sudo systemctl start mongodb_exporter
sudo systemctl status mongodb_exporter

4. **Configure Prometheus:** Add a scrape job for the MongoDB exporter in your prometheus.yml:

# Scrape the MongoDB exporter running on this host.
scrape_configs:
  - job_name: "mongodb"
    static_configs:
      - targets:
          - "localhost:9216"  # Default port for mongodb_exporter

5. **Visualize and Alert in Grafana:** Import a pre-built MongoDB dashboard or create your own using metrics like mongodb_mongod_connections_current, mongodb_mongod_opcounters_total, mongodb_mongod_network_bytes_in_total, and crucially, mongodb_mongod_repl_set_member_state and mongodb_mongod_repl_set_oplog_stats_lag_seconds. Metric names differ between exporter versions, so confirm the exact names against your exporter's /metrics output before building dashboards or alerts.

Replication Lag Monitoring

For replica sets, monitoring replication lag is critical. A significant lag means secondaries are not up-to-date, impacting read scaling and failover capabilities. The mongodb_mongod_repl_set_oplog_stats_lag_seconds metric from the exporter is key.

Set up Grafana alerts for when this lag exceeds a defined threshold (e.g., 60 seconds for most applications, potentially lower for critical ones).

# Example Grafana Alerting Rule (JSON format for Grafana API)
{
  "folder": "MongoDB Alerts",
  "title": "MongoDB Replication Lag High",
  "condition": "A",
  "data": [
    {
      "refId": "A",
      "datasource": "Prometheus",
      "queryType": "range",
      "relativeTimeRange": {
        "from": 300,
        "to": 0
      },
      "model": {
        "datasource": "Prometheus",
        "editorMode": "builder",
        "expr": "avg by (instance) (mongodb_mongod_repl_set_oplog_stats_lag_seconds{job='mongodb'}) > 60",
        "hide": false,
        "intervalMs": 1000,
        "legendFormat": "{{instance}}",
        "refId": "A"
      }
    }
  ],
  "execErrState": "Alerting",
  "for": "5m",
  "labels": {
    "severity": "critical"
  },
  "annotations": {
    "summary": "MongoDB replication lag on {{ $labels.instance }} is over 60 seconds."
  },
  "interval": "1m",
  "noDataState": "NoData",
  "ruleGroupId": 1,
  "ruleGroupName": "MongoDB Alerts",
  "state": "Alerting",
  "uid": "unique_alert_uid_123",
  "updated": 1678886400000,
  "created": 1678886400000,
  "evaluateFor": "5m",
  "evaluateTarget": {
    "type": "query",
    "refId": "A"
  }
}

This rule triggers an alert if any MongoDB instance reports a replication lag greater than 60 seconds for 5 consecutive minutes.

Failover Readiness

Beyond lag, ensure replica set members are healthy and ready to take over. The mongodb_mongod_repl_set_member_state metric is crucial. States like ‘STARTUP’, ‘STARTUP2’, ‘RECOVERING’, ‘ROLLBACK’ indicate a node is not in a stable ‘PRIMARY’ or ‘SECONDARY’ state.

Monitor the count of healthy voting members, not just secondaries. A replica set needs a majority of its voting members to be reachable and healthy in order to elect a primary. Alert when the number of healthy members approaches that majority (e.g., a 3-node set needs at least 2 healthy members, so losing a single node already leaves no margin).

# Example Grafana Alerting Rule for unhealthy replica set members
{
  "folder": "MongoDB Alerts",
  "title": "MongoDB Replica Set Unhealthy Members",
  "condition": "A",
  "data": [
    {
      "refId": "A",
      "datasource": "Prometheus",
      "queryType": "range",
      "relativeTimeRange": {
        "from": 300,
        "to": 0
      },
      "model": {
        "datasource": "Prometheus",
        "editorMode": "builder",
        "expr": "count by (replset) (mongodb_mongod_repl_set_member_state{job='mongodb', state!~'PRIMARY|SECONDARY'}) > 0",
        "hide": false,
        "intervalMs": 1000,
        "legendFormat": "{{replset}}",
        "refId": "A"
      }
    }
  ],
  "execErrState": "Alerting",
  "for": "5m",
  "labels": {
    "severity": "critical"
  },
  "annotations": {
    "summary": "MongoDB replica set {{ $labels.replset }} has unhealthy members."
  },
  "interval": "1m",
  "noDataState": "NoData",
  "ruleGroupId": 1,
  "ruleGroupName": "MongoDB Alerts",
  "state": "Alerting",
  "uid": "unique_alert_uid_456",
  "updated": 1678886400000,
  "created": 1678886400000,
  "evaluateFor": "5m",
  "evaluateTarget": {
    "type": "query",
    "refId": "A"
  }
}

Implementing these granular checks for both Magento 2 and its MongoDB cluster provides a robust foundation for maintaining a stable and performant e-commerce platform on Linode.