Linux: Email Alert on MySQL Replication Failure

In my environment, MySQL replication is really important because we are splitting different web servers with different database server to balance the load between MySQL servers. It quite traditional ways because this is kind of old database servers which sustain until today.

There is simple way to setup a monitoring script to alert us via email on any replication error. This script need to be run on slave server of MySQL replication.

Variable as below:

Database server: MySQL 5.0.77
Hostname: mysql.mydomain.org
Database host: localhost
Database name: mymarket_data
Database user: root
Database password: [email protected]

1. Create directory and file for the script:

$ mkdir -p /root/scripts
$ touch /root/scripts/check_slave.php

2. Copy and paste following PHP scripts and put it into a file called check_slave.php:

<?php
/**
 * Description: This script checks the slave status on the configured host
 *              printing "BAD <description>" or "OK <description>" for failure
 *              or success respectively.
 *              The possible failures could include:
 *              1) connection failure
 *              2) query failure (permissions, network, etc.)
 *              3) fetch failure (???)
 *              4) slave or io thread is not running
 *              5) Unknown master state (seconds_behind_master is null)
 *              6) seconds_behind_master has exceeded the configured threshold
 *
 *              If none of these condition occur, we asssume success and return
 *              an "OK" response, otherwise we include the error we can find
 *              (mysqli_connect_error() or $mysqli->error, or problem
 *               description).  A monitoring system need only check for:
 *              /^BAD/ (alert) or /^OK/ (everybody happy)
 */
 
/* **************************
 * Change related value below
 * **************************
 */
    $host = "";
    $user = "";
    $pass = "";
    $mailto = ""; // Your email address
    $mailsubject = "";
    $mailfrom = "";
 
/* ******************************************
 * No need to change anything below this line
 * ******************************************
 */ $mailmessage = "BAD: ".$err_msg."\n";
    $mailheaders = "From:" . $mailfrom;
    error_reporting(E_ALL);
    header("Content-Type: text/plain"); # Not HTML
    $sql = "SHOW SLAVE STATUS";
    $skip_file = 'skip_alerts';
    $link = mysql_connect($host, $user, $pass, null);
 
    if($link)
        $result = mysql_query($sql, $link);
    else {
        printf("BAD: Connection Failed %s", mysql_error());
        mysql_close($link);
        return;
    }
 
    if($result)
        $status = mysql_fetch_assoc($result);
    else {
        printf("BAD: Query failed - %s\n", mysql_error($link));
        mysql_close($link);
        return;
    }
 
    mysql_close($link);
 
    $slave_lag_threshold = 120;
 
    $tests = array(
        'test_slave_io_thread' => array('Slave_IO_Running', "\$var === 'Yes'",
                                        'Slave IO Thread is not running'),
        'test_slave_sql_thread' => array('Slave_SQL_Running', "\$var === 'Yes'",
                                        'Slave SQL Thread is not running'),
        'test_last_err' => array('Last_Errno', "\$var == 0",
                                 "Error encountered during replication - "
                                 .$status['Last_Error']),
        'test_master_status' => array('Seconds_Behind_Master', "isset(\$var)",
                                        'Unknown master status (Seconds_Behind_Master IS NULL)'),
        'test_slave_lag' => array('Seconds_Behind_Master',
                                  "\$var < \$slave_lag_threshold",
                                  "Slave is ${status['Seconds_Behind_Master']}s behind master (threshold=$slave_lag_threshold)")
    );
 
    $epic_fail = false;
    if(is_file($skip_file))
        $epic_fail = false;
    else
    {
        foreach($tests as $test_name => $data) {
            list($field, $expr, $err_msg) = $data;
            $var = $status[$field];
            $val = eval("return $expr;");
            if(!$val) {
                mail($mailto,$mailsubject,$mailmessage,$mailheaders);
                $epic_fail = true;
            }
        }
    }
 
    if(!$epic_fail) {
        print "OK: Checks all completed successfully\n";
    }
?>

3. Change related value on following line:

/* **************************
 * Change related value below
 * **************************
 */
    $host = "localhost";
    $user = "root";
    $pass = "[email protected]";
    $mailto = "[email protected]"; // Your email address
    $mailsubject = "mysql.mydomain.org Replication Alert";
    $mailfrom = "[email protected]";

4. Setup cron to execute the script every 15 minutes:

*/15 * * * * /usr/bin/php -q /root/scripts/check_slave.php

5. Restart cron service:

$ service crond restart

Done. You should receive notification email if the master/slave replication failed!

 

Update

Following script is being modified by Joel Brock as stated in comment section below:

<!--?php 
/**
* Description: This script checks the slave status on the configured host
* printing "BAD ” or “OK ” for failure
* or success respectively.
* The possible failures could include:
* 1) connection failure
* 2) query failure (permissions, network, etc.)
* 3) fetch failure (???)
* 4) slave or io thread is not running
* 5) Unknown master state (seconds_behind_master is null)
* 6) seconds_behind_master has exceeded the configured threshold
*
* If none of these condition occur, we asssume success and return
* an “OK” response, otherwise we include the error we can find
* (mysqli_connect_error() or $mysqli--->error, or problem
* description). A monitoring system need only check for:
* /^BAD/ (alert) or /^OK/ (everybody happy)
*/
 
/* **************************
* Change related value below
* **************************
*/
$host = array(
// “cr-1″ => “192.168.0.81″,
// “cr-2″ => “192.168.0.82″,
// “cr-3″ => “192.168.0.83″,
// “cr-4″ => “192.168.0.84″,
// “cr-5″ => “192.168.0.85″,
// “jp-1″ => “192.168.100.81″,
// “jp-2″ => “192.168.100.82″,
// “jp-3″ => “192.168.100.83″,
// “jp-4″ => “192.168.100.84″,
// “jp-5″ => “192.168.100.85″,
“cr-test” => “192.168.0.87″
);
$user = “”;
$pass = “”;
$mailto = “”;
$mailfrom = “”;
 
/* ******************************************
* No need to change anything below this line
* ******************************************
*/
 
error_reporting(E_ALL);
header(“Content-Type: text/plain”); # Not HTML
foreach ($host as $key => $value) {
 
$mailsubject =[".$key."] SLAVE REPLICATION ALERT”;
$mailheaders = “From:. $mailfrom;
$sql = “SHOW SLAVE STATUS”;
$skip_file = ‘skip_alerts’;
$link = mysql_connect($value, $user, $pass, null);
 
if($link)
    $result = mysql_query($sql, $link);
else {
    printf(“BAD: Connection Failed %s”, mysql_error());
    mysql_close($link);
    return;
}
 
if($result)
    $status = mysql_fetch_assoc($result);
else {
    printf(“BAD: Query failed – %s\n”, mysql_error($link));
    mysql_close($link);
    return;
}
 
mysql_close($link);
 
$slave_lag_threshold = 120;
 
$tests = array(
    ‘test_slave_io_thread’ => array(‘Slave_IO_Running’, “\$var === ‘Yes’”,
    ‘Slave IO Thread is not running’),
    ‘test_slave_sql_thread’ => array(‘Slave_SQL_Running’, “\$var === ‘Yes’”,
    ‘Slave SQL Thread is not running’),
    ‘test_last_err’ => array(‘Last_Errno’, “\$var == 0,
    “Error encountered during replication – ”
    .$status['Last_Error']),
    ‘test_master_status’ => array(‘Seconds_Behind_Master’,isset(\$var),
    ‘Unknown master status (Seconds_Behind_Master IS NULL)),
    ‘test_slave_lag’ => array(‘Seconds_Behind_Master’,
    “\$var $data) {
        list($field, $expr, $err_msg) = $data;
        $var = $status[$field];
        $val = eval(return $expr;);
        $val1 = (!$val) ? $val1 + 1 : $val11;
        $mailmessage .= “BAD:. $key . ” replication failed. Reason:. $err_msg . “\n”;
        }
    if ($val1 > 0) {
        mail($mailto,$mailsubject,$mailmessage,$mailheaders);
        print $mailmessage . “\n”;
        $epic_fail = true;
        }
    }
 
    if(!$epic_fail) {
        print “OK: Checks all completed successfully on [".$key."]\n”;
    }
}
?>