Nagios Monitoring
If you run a Nagios server to monitor your servers, you can use this script to monitor the health of your cluster.
Create the script and set permissions on each database node in the cluster:
Code:
# Create an empty file for the plugin; its contents are pasted in during the next step.
sudo touch /usr/lib/nagios/plugins/check_galera_cluster.sh
# Make root the owner and group of the plugin file.
sudo chown root:root /usr/lib/nagios/plugins/check_galera_cluster.sh
# 755 = rwxr-xr-x: the owner may modify the file, everyone may read and execute it.
sudo chmod 755 /usr/lib/nagios/plugins/check_galera_cluster.sh
Edit the script:
Code:
sudo vi /usr/lib/nagios/plugins/check_galera_cluster.sh
Copy this into the script:
Code:
#!/bin/bash
# Name: check_galera_cluster
# Source: https://exchange.nagios.org/directory/Plugins/Databases/MySQL/check_galera_cluster/details
# Modified by: LHammonds (2019-09-06) (Version 1.1.4a)
# Script name without its directory, shown in the usage text. Parameter
# expansion is used instead of `basename $0` so that a script path containing
# spaces is handled correctly and no external process is spawned.
PROGNAME=${0##*/}
VERSION="Version 1.1.4a"
AUTHOR="Guillaume Coré <fridim@onfi.re>, Ales Nosek <ales.nosek@gmail.com>, Staf Wagemakers <staf@wagemakers.be>"

# Exit codes used by this plugin: OK, WARNING, CRITICAL, UNKNOWN.
ST_OK=0
ST_WR=1
ST_CR=2
ST_UK=3

# Number of problems found per severity; evaluated at the end of the script
# to choose the exit code.
warnAlerts=0
critAlerts=0
unknAlerts=0
# Write the plugin version and the author list to stdout as a single line.
# Globals read: VERSION, AUTHOR
# Arguments:    none are used; anything passed in is ignored.
print_version() {
  printf '%s %s\n' "$VERSION" "$AUTHOR"
}
# Print the version banner followed by the usage text, then terminate the
# script with the UNKNOWN status. This function never returns to its caller.
# Globals read: PROGNAME, ST_UK
# Outputs:      usage text on stdout
print_help() {
  # print_version takes no arguments, so none are passed.
  print_version
  printf '%s\n' \
    "" \
    "$PROGNAME is a Nagios plugin to monitor Galera cluster status." \
    "" \
    "$PROGNAME [-u USER] [-p PASSWORD] [-H HOST] [-P PORT] [-m file] [-w SIZE] [-c SIZE] [-s statefile] [-f FLOAT] [-0]" \
    "" \
    "Options:" \
    " u)" \
    " MySQL user." \
    " p)" \
    " MySQL password." \
    " H)" \
    " MySQL host." \
    " P)" \
    " MySQL port." \
    " m)" \
    " MySQL extra my.cnf configuration file." \
    " w)" \
    " Sets minimum number of nodes in the cluster when WARNING is raised. (default is same as critical)." \
    " c)" \
    " Sets minimum number of nodes in the cluster when CRITICAL is raised. (default is 2)." \
    " f)" \
    " Sets critical value of wsrep_flow_control_paused (default is 0.1)." \
    " 0)" \
    " Raise CRITICAL if the node is not primary" \
    " s)" \
    " Create state file, detect disconnected nodes"
  exit $ST_UK
}
# Built-in thresholds, used unless overridden on the command line.
fcp="0.1" # critical threshold for wsrep_flow_control_paused (option -f)
crit="2"  # cluster size at or below which CRITICAL is raised (option -c)
# Make sure a required external program can be found by the shell.
# Arguments: $1 - name of the program to look up
# Outputs:   a diagnostic on stdout when the check fails
# Returns:   0 when the program exists; otherwise the script exits with $ST_UK
check_executable() {
  local wanted="$1"

  [ -n "$wanted" ] || {
    echo "check_executable: no parameter given!"
    exit $ST_UK
  }

  # Nothing more to do when the program is available.
  command -v "$wanted" >/dev/null 2>&1 && return 0

  echo "UNKNOWN: Cannot find $wanted"
  exit $ST_UK
}
# The plugin shells out to the mysql client and to bc; stop early when either
# one is missing.
for required_tool in mysql bc; do
  check_executable "$required_tool"
done
# Parse the command-line options into the global settings used by the rest of
# the script.
#
# The option string is wrapped in plain ASCII quotes; typographic quotes would
# become part of the option list. Its leading ':' selects getopts' silent
# mode, in which OPTION is '?' for an unknown option and ':' for an option
# whose argument is missing, with the offending letter in OPTARG. That lets
# the plugin name the bad option itself instead of echoing "$1", which is
# simply the first argument.
while getopts ':hvu:p:H:P:w:c:f:m:s:0' OPTION; do
  case "$OPTION" in
    h)
      print_help
      exit $ST_UK
      ;;
    v)
      print_version
      exit $ST_UK
      ;;
    u)
      mysqluser=$OPTARG
      ;;
    p)
      password=$OPTARG
      ;;
    H)
      mysqlhost=$OPTARG
      ;;
    P)
      port=$OPTARG
      ;;
    m)
      myconfig=$OPTARG
      ;;
    w)
      warn=$OPTARG
      ;;
    c)
      crit=$OPTARG
      ;;
    f)
      fcp=$OPTARG
      ;;
    0)
      primary='TRUE'
      ;;
    s)
      stateFile=$OPTARG
      ;;
    :)
      echo "Missing value for argument: -$OPTARG"
      print_help
      exit $ST_UK
      ;;
    *)
      echo "Unknown argument: -$OPTARG"
      print_help
      exit $ST_UK
      ;;
  esac
done
# Without -w the warning threshold falls back to the critical threshold.
: "${warn:=$crit}"
# Build one command-line parameter from an option prefix and its value.
# Arguments: $1 - option prefix, e.g. "-u" or "--defaults-extra-file="
#            $2 - value; may be empty
# Outputs:   "$1$2" on stdout, or nothing at all when the value is empty
create_param() {
  if [ -n "$2" ]; then
    # printf with quoted arguments keeps the value intact; an unquoted echo
    # would word-split it and expand glob characters such as '*' in a password.
    printf '%s%s\n' "$1" "$2"
  fi
}
# Turn each connection setting into a mysql client parameter; settings that
# were not supplied produce an empty string.
param_mysqlhost=$(create_param -h "$mysqlhost")
param_port=$(create_param -P "$port")
param_mysqluser=$(create_param -u "$mysqluser")
# NOTE(review): a password passed this way is visible in the process list;
# prefer -m with a protected option file where possible.
param_password=$(create_param -p "$password")
param_configfile=$(create_param --defaults-extra-file= "$myconfig")
# --defaults-extra-file must be the first option on the client command line,
# so it is placed ahead of the other parameters; otherwise -m fails as soon as
# it is combined with any other connection option.
# The string is expanded unquoted at the call sites so that it splits into
# separate arguments.
param_mysql="$param_configfile $param_mysqlhost $param_port $param_mysqluser $param_password"
#
# Check that the server can be reached with the supplied parameters by asking
# the client for its status; all client output is discarded.
#
if ! mysql $param_mysql -B -N -e '\s;' >/dev/null 2>&1; then
  echo "CRITICAL: mysql connection check failed"
  exit $ST_CR
fi
#
# A node that reports no wsrep_cluster_state_uuid is treated as not belonging
# to any cluster; in that case the remaining checks are skipped.
#
rClusterStateUuid=$(mysql $param_mysql -B -N -e "show status like 'wsrep_cluster_state_uuid'; " | cut -f 2)
[ -n "$rClusterStateUuid" ] || {
  echo "CRITICAL: Node is not part of a cluster"
  exit $ST_CR
}
# Run one SQL statement through the mysql client and print the second
# tab-separated column of its output (the value column of SHOW STATUS).
# Arguments: $1 - SQL statement to execute
wsrep_value() {
  mysql $param_mysql -B -N -e "$1" | cut -f 2
}

# Collect the Galera status values inspected by the checks that follow; the
# value each check expects is noted on the right.
rClusterSize=$(wsrep_value "show status like 'wsrep_cluster_size'")
rClusterStatus=$(wsrep_value "show status like 'wsrep_cluster_status'")            # Primary
rFlowControl=$(wsrep_value "show status like 'wsrep_flow_control_paused'")         # < 0.1
rReady=$(wsrep_value "show status like 'wsrep_ready'")                             # ON
rConnected=$(wsrep_value "show status like 'wsrep_connected'")                     # ON
rLocalStateComment=$(wsrep_value "show status like 'wsrep_local_state_comment'")   # Synced
rIncommingAddresses=$(wsrep_value "show global status like 'wsrep_incoming_addresses';")
# Flow control: raise CRITICAL when wsrep_flow_control_paused exceeds the -f
# threshold. An empty value is reported as UNKNOWN and is not passed on to bc,
# which would only produce a syntax error. bc does the comparison because the
# values are fractional.
if [ -z "$rFlowControl" ]; then
  echo "UNKNOWN: wsrep_flow_control_paused is empty"
  unknAlerts=$((unknAlerts + 1))
elif [ "$(echo "$rFlowControl > $fcp" | bc)" = 1 ]; then
  echo "CRITICAL: wsrep_flow_control_paused is > $fcp"
  critAlerts=$((critAlerts + 1))
fi
# Node state checks. Each failed check prints its own CRITICAL line and is
# counted in critAlerts, the counter that the final exit-code decision reads.

# Only enforced when -0 was given.
if [ "$primary" = 'TRUE' ] && [ "$rClusterStatus" != 'Primary' ]; then
  echo "CRITICAL: Node is not primary (wsrep_cluster_status)"
  critAlerts=$((critAlerts + 1))
fi

if [ "$rReady" != 'ON' ]; then
  echo "CRITICAL: Node is not ready (wsrep_ready)"
  critAlerts=$((critAlerts + 1))
fi

if [ "$rConnected" != 'ON' ]; then
  echo "CRITICAL: Node is not connected (wsrep_connected)"
  critAlerts=$((critAlerts + 1))
fi

if [ "$rLocalStateComment" != 'Synced' ]; then
  echo "CRITICAL: Node is not synced - actual state is: $rLocalStateComment (wsrep_local_state_comment)"
  critAlerts=$((critAlerts + 1))
fi
# Cluster size check:
#   size >  warn          -> OK
#   size <= crit          -> CRITICAL
#   crit < size <= warn   -> WARNING
# A missing or non-numeric wsrep_cluster_size is reported as UNKNOWN instead
# of being fed to the integer comparisons, where an empty operand could
# evaluate as true and report a false OK.
case "$rClusterSize" in
  '' | *[!0-9]*)
    echo "UNKNOWN: wsrep_cluster_size is not a number: '$rClusterSize'"
    unknAlerts=$((unknAlerts + 1))
    ;;
  *)
    if [ "$rClusterSize" -gt "$warn" ]; then
      # only display the ok message if the state check is not enabled
      if [ -z "$stateFile" ]; then
        echo "OK: Number of NODES = $rClusterSize (wsrep_cluster_size)"
      fi
    elif [ "$rClusterSize" -le "$crit" ]; then
      echo "CRITICAL: Number of NODES = $rClusterSize (wsrep_cluster_size)"
      critAlerts=$((critAlerts + 1))
    elif [ "$rClusterSize" -le "$warn" ]; then
      echo "WARNING: Number of NODES = $rClusterSize (wsrep_cluster_size)"
      warnAlerts=$((warnAlerts + 1))
    else
      # Only reachable when a threshold is not an integer and every
      # comparison above failed.
      echo "UNKNOWN: thresholds are not integers (warning='$warn', critical='$crit')"
      exit $ST_UK
    fi
    ;;
esac
#
# Detect lost peers automatically (only when a state file was given with -s).
#
# track_cluster_members: remember every peer ever seen in "$stateFile" and
# compare that history with the peers currently listed in
# wsrep_incoming_addresses.
# Globals read:    stateFile, rConnected, rIncommingAddresses, rClusterSize,
#                  crit, warn
# Globals written: warnAlerts, unknAlerts
# Outputs:         one OK/WARNING/UNKNOWN status line on stdout (nothing when
#                  a single-node cluster runs with non-zero thresholds, or
#                  when the node is not connected)
track_cluster_members() {
  local currentNodes knownNodes allNodes newNodes missingNodes maxClusterSize

  if ! touch "$stateFile"; then
    echo "UNKNOWN: stateFile \"$stateFile\" is not writeable"
    unknAlerts=$((unknAlerts + 1))
    return 0
  fi

  # The peer list is only evaluated while this node is connected.
  [ "$rConnected" = "ON" ] || return 0

  # Peers connected right now, one per line, sorted for comm.
  currentNodes=$(printf '%s\n' "$rIncommingAddresses" | tr ',' '\n' | sed '/^[[:space:]]*$/d' | sort -u)
  # Peers seen on earlier runs. Sorting and de-duplicating on read keeps comm
  # reliable even when the file holds unsorted or repeated entries.
  knownNodes=$(sed '/^[[:space:]]*$/d' "$stateFile" | sort -u)
  # Every peer ever seen, including the current ones.
  allNodes=$(printf '%s\n' "$knownNodes" "$currentNodes" | sed '/^[[:space:]]*$/d' | sort -u)

  # Peers that joined since the last run / known peers that are gone now.
  newNodes=$(comm -23 <(printf '%s\n' "$currentNodes") <(printf '%s\n' "$knownNodes"))
  missingNodes=$(comm -13 <(printf '%s\n' "$currentNodes") <(printf '%s\n' "$knownNodes"))

  if [ -n "$newNodes" ]; then
    # Rewrite the file in sorted order rather than appending, so later comm
    # comparisons see sorted input and no peer is recorded twice.
    printf '%s\n' "$allNodes" > "$stateFile"
  fi

  # Number of peers that have ever been part of the cluster.
  maxClusterSize=$(printf '%s\n' "$allNodes" | grep -c .)

  if [ "$maxClusterSize" -eq "$rClusterSize" ]; then
    if [ "$maxClusterSize" -eq 1 ]; then
      if [ "$crit" -eq 0 ] && [ "$warn" -eq 0 ]; then
        echo "OK: Running single-node database cluster"
      fi
    else
      echo "OK: Running redundant $rClusterSize online / $maxClusterSize total"
    fi
  else
    # Join the peers with spaces so the status stays on a single line.
    echo "WARNING: Redundant $rClusterSize online / $maxClusterSize total, missing peers: ${missingNodes//$'\n'/ }"
    warnAlerts=$((warnAlerts + 1))
  fi
}

if [ -n "$stateFile" ]; then
  track_cluster_members
fi
#
# Choose the exit code from the alert counters. CRITICAL takes precedence over
# UNKNOWN, which takes precedence over WARNING; with no alerts the plugin
# exits with 0.
#
if [ "$critAlerts" -gt "0" ]; then
  exit $ST_CR
elif [ "$unknAlerts" -gt "0" ]; then
  exit $ST_UK
elif [ "$warnAlerts" -gt "0" ]; then
  exit $ST_WR
fi
exit 0
Modify the NRPE configuration on each database node in the cluster so that the Nagios server can run the custom script remotely:
Code:
sudo vi /etc/nagios/nrpe_local.cfg
Add the following line:
Code:
command[check_galera_cluster]=/usr/lib/nagios/plugins/check_galera_cluster.sh
Reload the NRPE service for the change to take effect:
Code:
sudo systemctl reload nagios-nrpe-server
Run the script manually to ensure it works:
Code:
sudo /usr/lib/nagios/plugins/check_galera_cluster.sh
If the cluster is online and working properly, it should display something like the following:
Code:
OK: Number of NODES = 3 (wsrep_cluster_size)
On your Nagios server, add the following service line to the database nodes you are monitoring which will specifically test the cluster status on each of them. In this example, we are modifying the existing srv-db1 node which already monitors apt updates, cpu/ram/hd usage, etc.
Code:
sudo vi /etc/nagios/servers/srv-db1
Add this section and customize it to fit your environment:
Code:
# Service check for Galera cluster health on host srv-db1.
define service{
# Template to inherit from; "critical-service" is not defined in this snippet and must already exist in your Nagios configuration.
use critical-service
host_name srv-db1
service_description Galera Cluster
# Asks NRPE on the monitored host to run the command registered there as check_galera_cluster.
check_command check_nrpe!check_galera_cluster
check_period 24x7
# NOTE(review): expressed in Nagios time units, presumably minutes with the default interval_length - confirm for your installation.
check_interval 10
# NOTE(review): "c" should limit notifications to the CRITICAL state - confirm against the Nagios object definition reference.
notification_options c
notification_period 24x7
contact_groups database-admins
}
On the Nagios server, reload the Nagios service for changes to take effect:
Code:
sudo systemctl reload nagios
Bookmarks