#!/usr/bin/bash
#
# Monitoring check: measures the age of pkgacct and rsync processes started
# by nc_cp_backup.sh and raises an alarm if any of them has been running
# longer than expected.
#
# Forked from nagios_hosting_check_backup-splited (check_cpbackup_process.sh)
# by Vladimir Kuprikov
#
# Exit codes: 0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN.

#######################################
# Print the help text to stdout.
#######################################
usage() {
  cat << EOF
This script measures age of pkgacct and rsync processes which have
nc_cp_backup.sh as parent and rises alarm if it takes longer than expected

Usage: $(basename "${0}") -c CTIME[D|H|M] -w WTIME[D|H|M]

    -h  Print this help
    -c  CRITICAL status if backups are older than CTIME (default CTIME = 5 days)
        (default assumes Days)
    -w  WARNING status if backups are older than WTIME (default WTIME = 3 days)
        (default assumes Days)
EOF
}

# Set variables
NCCPBACKUP="/root/bin/nc_cp_backup.sh"
PKGACCT="pkgacct"
RSYNC="rsync"
CPBACKUP_PAUSED="/backup/CPBACKUP_PAUSED"
declare -A UNITS=([days]=86400 [hours]=3600 [minutes]=60)
declare -A CPROC=()   # PID -> runtime (s) of processes over the critical limit
declare -A WPROC=()   # PID -> runtime (s) of processes over the warning limit

# Set defaults
WTIME=259200 # 3 days
CTIME=432000 # 5 days
UNIT="days"
WUNIT="${UNIT}"
CUNIT="${UNIT}"
OK=0
WARN=1
CRIT=2
UNKN=3
RC="${UNKN}"

#######################################
# Parse a time threshold such as "3", "3D", "12h" or "90M".
# Globals:   UNITS (read), UNKN (read),
#            TIME (written: threshold in seconds),
#            UNIT (written: "days", "hours" or "minutes")
# Arguments: $1 - non-negative integer with an optional D/H/M suffix
#                 (a bare number is taken as days)
# Outputs:   an error message on stdout when $1 cannot be parsed
# Returns:   does not return on bad input; exits with UNKN instead
#######################################
ident_time() {
  local ident num
  unset UNIT TIME

  ident="${1: -1}"
  case "${ident}" in
    [Dd])
      UNIT="days"
      num="${1%?}"
      ;;
    [Hh])
      UNIT="hours"
      num="${1%?}"
      ;;
    [Mm])
      UNIT="minutes"
      num="${1%?}"
      ;;
    [0-9])
      UNIT="days"
      num="${1}"
      ;;
    *)
      echo "Can not recognize time ${1}. Please use identifier [D|H|M]"
      exit "${UNKN}"
      ;;
  esac

  # Every form, including a bare number, must be a plain integer: the
  # thresholds are later compared with shell integer arithmetic.
  if ! [[ "${num}" =~ ^[0-9]+$ ]]; then
    echo "Can not recognize time ${1}."
    exit "${UNKN}"
  fi

  # 10# forces base 10 so that values like "08" are not parsed as octal.
  TIME="$(( 10#${num} * ${UNITS[${UNIT}]} ))" # time in seconds
}

#######################################
# Print the command line of a process with the NUL separators removed.
# Prints nothing if the PID is not numeric or the process is gone.
# Arguments: $1 - PID
#######################################
read_cmdline() {
  # A non-numeric (e.g. empty) PID would turn the path into /proc//cmdline,
  # which is the kernel command line, so reject it up front.
  [[ "${1}" =~ ^[0-9]+$ ]] || return 0
  [[ -r "/proc/${1}/cmdline" ]] || return 0
  # The process may exit between the check and the read; that is expected.
  tr -d '\0' 2>/dev/null < "/proc/${1}/cmdline"
}

#######################################
# Print the parent PID of a process.
# Prints nothing if the PID is not numeric or the process is gone.
# Arguments: $1 - PID
#######################################
parent_pid() {
  [[ "${1}" =~ ^[0-9]+$ ]] || return 0
  [[ -r "/proc/${1}/status" ]] || return 0
  # The status file is used because it has one labelled field per line.
  awk '/^PPid:/ { print $2; exit }' "/proc/${1}/status" 2>/dev/null
}

# Parse command-line options (getopts reads the positional parameters).
while getopts ":hw:c:" OPTS; do
  case "${OPTS}" in
    h)
      usage
      exit "${WARN}"
      ;;
    w)
      ident_time "${OPTARG}"
      WTIME="${TIME}"
      WUNIT="${UNIT}"
      ;;
    c)
      ident_time "${OPTARG}"
      CTIME="${TIME}"
      CUNIT="${UNIT}"
      ;;
    :)
      echo "Option -${OPTARG} requires an argument" >&2
      usage
      exit "${UNKN}"
      ;;
    \?)
      usage
      exit "${UNKN}"
      ;;
  esac
done

# Candidate processes: every ps line mentioning pkgacct or rsync, printed as
# "<start time>|<parent PID>|<PID>|<command>".
CPBACKUPPROC="$(
  ps ax -o lstart -o "|%P|%p|" -o command \
    | awk -v pkgacct="${PKGACCT}" -v rsync="${RSYNC}" '
        ((match($0,pkgacct) || match($0,rsync)) && ! match($0,"\\|awk")) {print $0}'
)"

if [[ -z "${CPBACKUPPROC}" ]]; then
  echo "[OK] ${NCCPBACKUP} is not running"
  exit "${OK}"
fi

# We are expecting the following tree of nc_cp_backup.sh processes
# (referring to the task https://track.namecheap.net/browse/TO-6668):
#
#   \_ bash /root/bin/nc_cp_backup.sh -a
#       \_ bash /root/bin/nc_cp_backup.sh -a
#       |   \_ bash /root/bin/nc_cp_backup.sh -a
#       |       \_ pkgacct - husqolkv - av: 4
#       |       \_ tail -1
#       \_ bash /root/bin/nc_cp_backup.sh -a
#       |   \_ bash /root/bin/nc_cp_backup.sh -a
#       |       \_ pkgacct - axiaatmy - av: 4
#       |       \_ tail -1
#       \_ sleep 1
#
# A backup worker is therefore a pkgacct or rsync process whose parent is
# nc_cp_backup.sh and whose grandparent has the same command line as the
# parent.

NOW="$(date +%s)"
flagSubProcRunning=0

while IFS='|' read -r mySDATE myP2ID myPID _; do
  myPID="${myPID// /}"   # removing spaces
  myP2ID="${myP2ID// /}" # removing spaces

  # Skip malformed lines and processes that have exited since ps ran.
  [[ "${myPID}" =~ ^[0-9]+$ && -d "/proc/${myPID}" ]] || continue

  CMDLINE="$(read_cmdline "${myPID}")"
  P2CMDLINE="$(read_cmdline "${myP2ID}")"  # cmdline of our parent
  myP3ID="$(parent_pid "${myP2ID}")"       # PID of the parent of our parent
  P3CMDLINE="$(read_cmdline "${myP3ID}")"  # cmdline of the parent of our parent

  # The parent must really be nc_cp_backup.sh. Comparing parent and
  # grandparent alone would also accept unrelated process chains that merely
  # share a command line, e.g. the forked children of any rsync run.
  if ! { [[ "${P2CMDLINE}" == *"${NCCPBACKUP##*/}"* \
            && "${P2CMDLINE}" == "${P3CMDLINE}" ]] \
         && [[ "${CMDLINE}" =~ ^${PKGACCT} || "${CMDLINE}" =~ ^${RSYNC} ]]; }; then
    continue
  fi

  # nc_cp_backup is running and has at least one backing-up sub-process.
  flagSubProcRunning=1

  STARTDATE="$(date -d "${mySDATE}" +"%s" 2>/dev/null)"
  if ! [[ "${STARTDATE}" =~ ^[0-9]+$ ]]; then # make sure STARTDATE is an integer
    echo "ERROR: can not get start date of process"
    exit "${UNKN}"
  fi

  RUNTIME="$(( NOW - STARTDATE ))"
  if (( RUNTIME >= CTIME )); then
    CPROC[${myPID}]="${RUNTIME}"
  elif (( RUNTIME >= WTIME )); then
    WPROC[${myPID}]="${RUNTIME}"
  fi
done <<< "${CPBACKUPPROC}"

# If no backing-up sub-process is running, check whether nc_cp_backup has a
# PAUSED flag:
#   WARNING:  file /backup/CPBACKUP_PAUSED older than 8 hours
#   CRITICAL: file /backup/CPBACKUP_PAUSED older than 12 hours
# NOTE(review): this is only reached when ps listed at least one candidate
# line above; confirm that skipping it otherwise is intended.
if [[ "${flagSubProcRunning}" == 0 && -e "${CPBACKUP_PAUSED}" ]]; then
  pauseAge=$(( ($(date +%s) - $(date +%s -r "${CPBACKUP_PAUSED}")) / 3600 )) # in hours
  if (( pauseAge >= 12 )); then
    echo "CRITICAL. CPBACKUP is PAUSED for more than 12 hours"
    exit "${CRIT}"
  elif (( pauseAge >= 8 )); then
    echo "WARNING. CPBACKUP is PAUSED for more than 8 hours"
    exit "${WARN}"
  fi
fi

# Report the worst state found; thresholds are shown in the unit the user
# supplied them in.
if (( ${#CPROC[@]} > 0 )); then
  STATUS="[CRITICAL]"
  STATUSTXT="${NCCPBACKUP} process(es): PID(s)={${!CPROC[*]}} exceeded running time of $(( CTIME / ${UNITS[${CUNIT}]} )) ${CUNIT}"
  RC="${CRIT}"
elif (( ${#WPROC[@]} > 0 )); then
  STATUS="[WARNING]"
  STATUSTXT="${NCCPBACKUP} process(es): PID(s)={${!WPROC[*]}} exceeded running time of $(( WTIME / ${UNITS[${WUNIT}]} )) ${WUNIT}"
  RC="${WARN}"
else
  STATUS="[OK]"
  STATUSTXT="${NCCPBACKUP} process is OK"
  RC="${OK}"
fi

echo "${STATUS} ${STATUSTXT}"
exit "${RC}"