Linux: Email Alert on MySQL Replication Failure

In my environment, MySQL replication is really important because we are splitting different web servers with different database server to balance the load between MySQL servers. It quite traditional ways because this is kind of old database servers which sustain until today.

There is simple way to setup a monitoring script to alert us via email on any replication error. This script need to be run on slave server of MySQL replication.

Variable as below:

Database server: MySQL 5.0.77
Hostname: mysql.mydomain.org
Database host: localhost
Database name: mymarket_data
Database user: root
Database password: [email protected]

1. Create directory and file for the script:

$ mkdir -p /root/scripts
$ touch /root/scripts/check_slave.php

2. Copy and paste following PHP scripts and put it into a file called check_slave.php:

<?php
/**
 * Description: This script checks the slave status on the configured host
 *              printing "BAD <description>" or "OK <description>" for failure
 *              or success respectively.
 *              The possible failures could include:
 *              1) connection failure
 *              2) query failure (permissions, network, etc.)
 *              3) fetch failure (???)
 *              4) slave or io thread is not running
 *              5) Unknown master state (seconds_behind_master is null)
 *              6) seconds_behind_master has exceeded the configured threshold
 *
 *              If none of these condition occur, we asssume success and return
 *              an "OK" response, otherwise we include the error we can find
 *              (mysqli_connect_error() or $mysqli->error, or problem
 *               description).  A monitoring system need only check for:
 *              /^BAD/ (alert) or /^OK/ (everybody happy)
 */
 
/* **************************
 * Change related value below
 * **************************
 */
    $host = "";
    $user = "";
    $pass = "";
    $mailto = ""; // Your email address
    $mailsubject = "";
    $mailfrom = "";
 
/* ******************************************
 * No need to change anything below this line
 * ******************************************
 */ $mailmessage = "BAD: ".$err_msg."\n";
    $mailheaders = "From:" . $mailfrom;
    error_reporting(E_ALL);
    header("Content-Type: text/plain"); # Not HTML
    $sql = "SHOW SLAVE STATUS";
    $skip_file = 'skip_alerts';
    $link = mysql_connect($host, $user, $pass, null);
 
    if($link)
        $result = mysql_query($sql, $link);
    else {
        printf("BAD: Connection Failed %s", mysql_error());
        mysql_close($link);
        return;
    }
 
    if($result)
        $status = mysql_fetch_assoc($result);
    else {
        printf("BAD: Query failed - %s\n", mysql_error($link));
        mysql_close($link);
        return;
    }
 
    mysql_close($link);
 
    $slave_lag_threshold = 120;
 
    $tests = array(
        'test_slave_io_thread' => array('Slave_IO_Running', "\$var === 'Yes'",
                                        'Slave IO Thread is not running'),
        'test_slave_sql_thread' => array('Slave_SQL_Running', "\$var === 'Yes'",
                                        'Slave SQL Thread is not running'),
        'test_last_err' => array('Last_Errno', "\$var == 0",
                                 "Error encountered during replication - "
                                 .$status['Last_Error']),
        'test_master_status' => array('Seconds_Behind_Master', "isset(\$var)",
                                        'Unknown master status (Seconds_Behind_Master IS NULL)'),
        'test_slave_lag' => array('Seconds_Behind_Master',
                                  "\$var < \$slave_lag_threshold",
                                  "Slave is ${status['Seconds_Behind_Master']}s behind master (threshold=$slave_lag_threshold)")
    );
 
    $epic_fail = false;
    if(is_file($skip_file))
        $epic_fail = false;
    else
    {
        foreach($tests as $test_name => $data) {
            list($field, $expr, $err_msg) = $data;
            $var = $status[$field];
            $val = eval("return $expr;");
            if(!$val) {
                mail($mailto,$mailsubject,$mailmessage,$mailheaders);
                $epic_fail = true;
            }
        }
    }
 
    if(!$epic_fail) {
        print "OK: Checks all completed successfully\n";
    }
?>

3. Change related value on following line:

/* **************************
 * Change related value below
 * **************************
 */
    $host = "localhost";
    $user = "root";
    $pass = "[email protected]";
    $mailto = "[email protected]"; // Your email address
    $mailsubject = "mysql.mydomain.org Replication Alert";
    $mailfrom = "[email protected]";

4. Setup cron to execute the script every 15 minutes:

*/15 * * * * /usr/bin/php -q /root/scripts/check_slave.php

5. Restart cron service:

$ service crond restart

Done. You should receive notification email if the master/slave replication failed!

 

Update

Following script is being modified by Joel Brock as stated in comment section below:

<!--?php 
/**
* Description: This script checks the slave status on the configured host
* printing "BAD ” or “OK ” for failure
* or success respectively.
* The possible failures could include:
* 1) connection failure
* 2) query failure (permissions, network, etc.)
* 3) fetch failure (???)
* 4) slave or io thread is not running
* 5) Unknown master state (seconds_behind_master is null)
* 6) seconds_behind_master has exceeded the configured threshold
*
* If none of these condition occur, we asssume success and return
* an “OK” response, otherwise we include the error we can find
* (mysqli_connect_error() or $mysqli--->error, or problem
* description). A monitoring system need only check for:
* /^BAD/ (alert) or /^OK/ (everybody happy)
*/
 
/* **************************
* Change related value below
* **************************
*/
$host = array(
// “cr-1″ => “192.168.0.81″,
// “cr-2″ => “192.168.0.82″,
// “cr-3″ => “192.168.0.83″,
// “cr-4″ => “192.168.0.84″,
// “cr-5″ => “192.168.0.85″,
// “jp-1″ => “192.168.100.81″,
// “jp-2″ => “192.168.100.82″,
// “jp-3″ => “192.168.100.83″,
// “jp-4″ => “192.168.100.84″,
// “jp-5″ => “192.168.100.85″,
“cr-test” => “192.168.0.87″
);
$user = “”;
$pass = “”;
$mailto = “”;
$mailfrom = “”;
 
/* ******************************************
* No need to change anything below this line
* ******************************************
*/
 
error_reporting(E_ALL);
header(“Content-Type: text/plain”); # Not HTML
foreach ($host as $key => $value) {
 
$mailsubject =[".$key."] SLAVE REPLICATION ALERT”;
$mailheaders = “From:. $mailfrom;
$sql = “SHOW SLAVE STATUS”;
$skip_file = ‘skip_alerts’;
$link = mysql_connect($value, $user, $pass, null);
 
if($link)
    $result = mysql_query($sql, $link);
else {
    printf(“BAD: Connection Failed %s”, mysql_error());
    mysql_close($link);
    return;
}
 
if($result)
    $status = mysql_fetch_assoc($result);
else {
    printf(“BAD: Query failed – %s\n”, mysql_error($link));
    mysql_close($link);
    return;
}
 
mysql_close($link);
 
$slave_lag_threshold = 120;
 
$tests = array(
    ‘test_slave_io_thread’ => array(‘Slave_IO_Running’, “\$var === ‘Yes’”,
    ‘Slave IO Thread is not running’),
    ‘test_slave_sql_thread’ => array(‘Slave_SQL_Running’, “\$var === ‘Yes’”,
    ‘Slave SQL Thread is not running’),
    ‘test_last_err’ => array(‘Last_Errno’, “\$var == 0,
    “Error encountered during replication – ”
    .$status['Last_Error']),
    ‘test_master_status’ => array(‘Seconds_Behind_Master’,isset(\$var),
    ‘Unknown master status (Seconds_Behind_Master IS NULL)),
    ‘test_slave_lag’ => array(‘Seconds_Behind_Master’,
    “\$var $data) {
        list($field, $expr, $err_msg) = $data;
        $var = $status[$field];
        $val = eval(return $expr;);
        $val1 = (!$val) ? $val1 + 1 : $val11;
        $mailmessage .= “BAD:. $key . ” replication failed. Reason:. $err_msg . “\n”;
        }
    if ($val1 > 0) {
        mail($mailto,$mailsubject,$mailmessage,$mailheaders);
        print $mailmessage . “\n”;
        $epic_fail = true;
        }
    }
 
    if(!$epic_fail) {
        print “OK: Checks all completed successfully on [".$key."]\n”;
    }
}
?>

16 thoughts on “Linux: Email Alert on MySQL Replication Failure

  1. Very very very helpful script, thanks!

    One small, line $mailmessage = “BAD: “.$err_msg.”\n\”; should not have that final backslash and should not be commented out which seems to have happened during the html paste.

    Thanks again!

    Reply

  2. I was just thinking about this as we prep for our first replication setup, Googled, found you, and now I can hit off more stuff on my todo list. Thanks for sharing!

    Reply

  3. Hello. Thank you very much for your script. It is exactly what I am looking for. However I am having a problem running it:

    PHP Notice: Undefined variable: err_msg in /root/scripts/check_slave.php on line 36
    PHP Fatal error: Call to undefined function mysql_connect() in /root/scripts/check_slave.php on line 42

    Any ideas?

    Reply

    1. It seems like you do not compile PHP with MySQL support. Make sure php-mysql package is installed. You can verify this with phpinfo page.
      If RedHat based, run:
      yum install php-mysql
      If Debian based, run:
      apt-get install php5-mysql

      Reply

      1. Thank you! That fixed the mysql_connect error but the err_msg is still an undefined variable.

        Reply

  4. Very excellent work!! I’ve made a couple changes to the script to adapt it to our needs. Changes include:

    -feed it an array of IPs and run centrally from the server/master, rather than locally on each slave
    -consolidated errors into 1 email, rather than 3 or more per slave failure (hint: feed it your cellphones email address for text notifications)
    -fixed the $err_msg undefined var error

    Not sure what’ll happen if i paste it in here, but here goes…

    <?php
    /**
    * Description: This script checks the slave status on the configured host
    * printing "BAD ” or “OK ” for failure
    * or success respectively.
    * The possible failures could include:
    * 1) connection failure
    * 2) query failure (permissions, network, etc.)
    * 3) fetch failure (???)
    * 4) slave or io thread is not running
    * 5) Unknown master state (seconds_behind_master is null)
    * 6) seconds_behind_master has exceeded the configured threshold
    *
    * If none of these condition occur, we asssume success and return
    * an “OK” response, otherwise we include the error we can find
    * (mysqli_connect_error() or $mysqli->error, or problem
    * description). A monitoring system need only check for:
    * /^BAD/ (alert) or /^OK/ (everybody happy)
    */

    /* **************************
    * Change related value below
    * **************************
    */
    $host = array(
    // “cr-1” => “192.168.0.81”,
    // “cr-2” => “192.168.0.82”,
    // “cr-3” => “192.168.0.83”,
    // “cr-4” => “192.168.0.84”,
    // “cr-5” => “192.168.0.85”,
    // “jp-1” => “192.168.100.81”,
    // “jp-2” => “192.168.100.82”,
    // “jp-3” => “192.168.100.83”,
    // “jp-4” => “192.168.100.84”,
    // “jp-5” => “192.168.100.85”,
    “cr-test” => “192.168.0.87”
    );
    $user = “”;
    $pass = “”;
    $mailto = “”;
    $mailfrom = “”;

    /* ******************************************
    * No need to change anything below this line
    * ******************************************
    */

    error_reporting(E_ALL);
    header(“Content-Type: text/plain”); # Not HTML
    foreach ($host as $key => $value) {

    $mailsubject = “[“.$key.”] SLAVE REPLICATION ALERT”;
    $mailheaders = “From:” . $mailfrom;
    $sql = “SHOW SLAVE STATUS”;
    $skip_file = ‘skip_alerts’;
    $link = mysql_connect($value, $user, $pass, null);

    if($link)
    $result = mysql_query($sql, $link);
    else {
    printf(“BAD: Connection Failed %s”, mysql_error());
    mysql_close($link);
    return;
    }

    if($result)
    $status = mysql_fetch_assoc($result);
    else {
    printf(“BAD: Query failed – %s\n”, mysql_error($link));
    mysql_close($link);
    return;
    }

    mysql_close($link);

    $slave_lag_threshold = 120;

    $tests = array(
    ‘test_slave_io_thread’ => array(‘Slave_IO_Running’, “\$var === ‘Yes'”,
    ‘Slave IO Thread is not running’),
    ‘test_slave_sql_thread’ => array(‘Slave_SQL_Running’, “\$var === ‘Yes'”,
    ‘Slave SQL Thread is not running’),
    ‘test_last_err’ => array(‘Last_Errno’, “\$var == 0”,
    “Error encountered during replication – ”
    .$status[‘Last_Error’]),
    ‘test_master_status’ => array(‘Seconds_Behind_Master’, “isset(\$var)”,
    ‘Unknown master status (Seconds_Behind_Master IS NULL)’),
    ‘test_slave_lag’ => array(‘Seconds_Behind_Master’,
    “\$var $data) {
    list($field, $expr, $err_msg) = $data;
    $var = $status[$field];
    $val = eval(“return $expr;”);
    $val1 = (!$val) ? $val1 + 1 : $val1 – 1;
    $mailmessage .= “BAD: ” . $key . ” replication failed. Reason: ” . $err_msg . “\n”;
    }
    if ($val1 > 0) {
    mail($mailto,$mailsubject,$mailmessage,$mailheaders);
    print $mailmessage . “\n”;
    $epic_fail = true;
    }
    }

    if(!$epic_fail) {
    print “OK: Checks all completed successfully on [“.$key.”]\n”;
    }
    }
    ?>

    Reply

    1. Hi Joel, I have updated your script into the post to be shared among readers. Thank you so much!

      Reply

      1. Nice! Thanks for the original script. Great approach. Elegant solution.

        Reply

  5. I obtain a
    PHP Parse error: syntax error, unexpected T_RETURN, expecting ‘)’
    Wwhen execute the modified script.

    Any ideas ?

    Reply

  6. I don`t know how the hell Joel Brock`s script could work :).
    I`ve corrected it

    /* ******************************************
    * No need to change anything below this line
    * ******************************************
    */
    error_reporting(E_ALL);
    header("Content-Type: text/plain"); # Not HTML
    foreach ($host as $key => $value) {

    $mailsubject = "[".$key."] SLAVE REPLICATION ALERT";
    $mailheaders = "From:" . $mailfrom;
    $sql = "SHOW SLAVE STATUS";
    $skip_file = 'skip_alerts';
    $link = mysql_connect($value, $user, $pass, null);

    if($link)
    $result = mysql_query($sql, $link);
    else {
    printf("BAD: Connection Failed %s", mysql_error());
    mysql_close($link);
    return;
    }

    if($result)
    $status = mysql_fetch_assoc($result);
    else {
    printf("BAD: Query failed – %s\n", mysql_error($link));
    mysql_close($link);
    return;
    }

    mysql_close($link);

    $slave_lag_threshold = 120;

    $tests = array(
    'test_slave_io_thread' => array('Slave_IO_Running', "\$var === 'Yes'",'Slave IO Thread is not running'),
    'test_slave_sql_thread' => array('Slave_SQL_Running', "\$var === 'Yes'",'Slave SQL Thread is not running'),
    'test_last_err' => array('Last_Errno', "\$var == 0","Error encountered during replication – ".$status['Last_Error']),
    'test_master_status' => array('Seconds_Behind_Master', "isset(\$var)",'Unknown master status', '(Seconds_Behind_Master IS NULL)'),
    'test_slave_lag' => array('Seconds_Behind_Master', "\$var 0) {
    mail($mailto,$mailsubject,$mailmessage,$mailheaders);
    print $mailmessage . "\n";
    $epic_fail = true;
    }
    }

    if(!$epic_fail) {
    print "OK: Checks all completed successfully on [".$key."]\n";
    }

    Reply

      1. Thanks qmic. Comment section is really not a good way to share codes!

        Reply

  7. Useful but messed up with pasting so I will try to paste my working version is here
    /* ******************************************
    * No need to change anything below this line
    * ******************************************
    */
    error_reporting(E_ALL);
    header(“Content-Type: text/plain”); # Not HTML
    foreach ($host as $key => $value) {

    $mailmessage = “”;
    $mailsubject = “[“.$key.”] SLAVE REPLICATION ALERT”;
    $mailheaders = “From:” . $mailfrom;
    $sql = “SHOW SLAVE STATUS”;
    $skip_file = ‘skip_alerts’;
    $link = mysqli_connect($value, $user, $pass, null);

    if($link)
    $result = mysqli_query($link,$sql);
    else {
    printf(“BAD: Connection Failed %s”, mysqli_error());
    mysqli_close($link);
    return;
    }

    if($result)
    $status = mysqli_fetch_assoc($result);
    else {
    printf(“BAD: Query failed – %s\n”, mysqli_error($link));
    mysqli_close($link);
    return;
    }
    mysqli_close($link);

    $slave_lag_threshold = 120;

    $tests = array(
    ‘test_slave_io_thread’ => array(‘Slave_IO_Running’, “\$var === ‘Yes'”,’Slave IO Thread is not running’),
    ‘test_slave_sql_thread’ => array(‘Slave_SQL_Running’, “\$var === ‘Yes'”,’Slave SQL Thread is not running’),
    ‘test_last_err’ => array(‘Last_Errno’, “\$var == 0”, “Error encountered during replication – “.$status[‘Last_Error’]),
    ‘test_master_status’ => array(‘Seconds_Behind_Master’, “isset(\$var)”,’Unknown master status (Seconds_Behind_Master IS NULL)’),
    ‘test_slave_lag’ => array(‘Seconds_Behind_Master’, “\$var $data) {
    list($field, $expr, $err_msg) = $data;
    $var = $status[$field];
    $val = eval(“return $expr;”);

    if (!$val)
    {
    $val1 =1;$mailmessage .= “BAD: ” . $key . ” replication failed. Reason: ” . $err_msg . “\n”;
    }
    }

    if ($val1 > 0) {
    mail($mailto,$mailsubject,$mailmessage,$mailheaders);
    print $mailmessage . “\n”;
    $epic_fail = true;
    }
    }

    if(!$epic_fail) {
    print “OK: Checks all completed successfully on [“.$key.”]\n”;
    }
    }

    Reply

Leave a Reply

Your email address will not be published. Required fields are marked *