#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Monitoring check for processes in disk sleep (D) or zombie (Z) state.

Each run counts the D and Z processes, compares them with the last entry of a
JSON-lines state file (LOG_FILE), prints one status line with perfdata and
exits with 0 (OK), 1 (WARNING), 2 (CRITICAL) or 3 (anything else).
"""

import argparse
import json
import os
import sys
import time

try:
    import psutil
except ImportError:
    # Deferred so main() can report the problem with exit code 3 instead of
    # dying at import time with a traceback (exit code 1, i.e. WARNING here).
    psutil = None

# Set default values for thresholds and monitoring time
WARNNUM = 10
CRITNUM = 20
MONTIME = 15
LOG_FILE = '/var/log/check_stalled_procs.json'

# Exit code for every status name; anything not listed exits with EXIT_UNKNOWN.
EXIT_CODES = {"OK": 0, "WARNING": 1, "CRITICAL": 2}
EXIT_UNKNOWN = 3


def parse_arguments(argv=None):
    """
    Parse command-line arguments for warning and critical thresholds, and monitoring time.

    :param argv: optional list of argument strings; when None, argparse
        falls back to sys.argv[1:].
    :return: namespace with integer attributes ``w``, ``c`` and ``t``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-w', type=int, default=WARNNUM,
                        help='Warning threshold for process count')
    parser.add_argument('-c', type=int, default=CRITNUM,
                        help='Critical threshold for process count')
    parser.add_argument('-t', type=int, default=MONTIME,
                        help='Monitoring time in minutes')
    return parser.parse_args(argv)


def get_process_info(state):
    """
    Retrieve information about processes in a given state.

    :param state: a psutil status constant (e.g. psutil.STATUS_ZOMBIE).
    :return: tuple ``(count, descriptions)`` where each description is a dict
        with keys ``pid``, ``user`` and ``cmd`` (first three cmdline items, or
        the process name when no command line is available).  On any error
        the message is printed and ``(0, [])`` is returned.
    """
    try:
        descriptions = []
        attrs = ['pid', 'status', 'cmdline', 'username', 'name']
        for proc in psutil.process_iter(attrs):
            info = proc.info
            if info['status'] != state:
                continue
            # psutil always fills the requested keys and stores None when a
            # value cannot be read, so a dict.get() default would never be
            # used; fall back with ``or`` instead.
            cmdline = info['cmdline']
            descriptions.append({
                'pid': info['pid'],
                'user': info.get('username') or 'unknown',
                'cmd': cmdline[:3] if cmdline else (info.get('name') or 'unknown'),
            })
        return len(descriptions), descriptions
    except Exception as e:
        # Deliberate best-effort boundary kept from the original behaviour:
        # report the problem and carry on with an empty result.
        print(f"Error retrieving process information: {e}")
        return 0, []


def read_last_log(log_file=None):
    """
    Read the last entry from the log file.

    :param log_file: path of the JSON-lines file; defaults to LOG_FILE.
    :return: the decoded last non-blank line as a dict, or None if the file
        does not exist, holds no entries, or its last entry is not a valid
        JSON object.
    :raises OSError: if the file exists but cannot be read.
    """
    path = LOG_FILE if log_file is None else log_file
    last_line = None
    try:
        # Stream the file instead of readlines(): only the last line matters
        # and the log grows with every recorded run.  Undecodable bytes are
        # replaced so a damaged line fails JSON parsing rather than decoding.
        with open(path, 'r', encoding='utf-8', errors='replace') as log:
            for line in log:
                if line.strip():
                    last_line = line
    except FileNotFoundError:
        return None

    if last_line is None:
        return None
    try:
        entry = json.loads(last_line)
    except ValueError:
        # Truncated or corrupt entry: treat it like a missing baseline.
        return None
    return entry if isinstance(entry, dict) else None


def write_log(current_time, d_count, z_count, status, d_desc, z_desc, log_file=None):
    """
    Write a log entry to the log file.

    Appends one JSON object per line with the keys ``time``, ``d_count``,
    ``z_count``, ``status``, ``d_desc`` and ``z_desc``.

    :param log_file: path of the JSON-lines file; defaults to LOG_FILE.
    :raises OSError: if the file cannot be opened for appending.
    """
    path = LOG_FILE if log_file is None else log_file
    log_entry = {
        'time': current_time,
        'd_count': d_count,
        'z_count': z_count,
        'status': status,
        'd_desc': d_desc,
        'z_desc': z_desc,
    }
    with open(path, 'a', encoding='utf-8') as log:
        log.write(json.dumps(log_entry) + '\n')


def _load_baseline(entry):
    """Return (time, d_count, z_count, status) from a log entry, or None if unusable."""
    if not entry:
        return None
    try:
        return (
            int(entry['time']),
            int(entry['d_count']),
            int(entry['z_count']),
            entry['status'],
        )
    except (KeyError, TypeError, ValueError):
        return None


def _evaluate_status(d_count, z_count, last_d_count, last_z_count,
                     last_status, elapsed, warn, crit, window):
    """Return (status, should_log) for the current counts against the last logged state."""
    if elapsed >= window:
        # Monitoring time has elapsed: judge by the highest count seen in
        # either the current or the last recorded sample.
        worst = max(d_count, last_d_count, z_count, last_z_count)
        if worst >= crit:
            return "CRITICAL", True
        if worst >= warn:
            return "WARNING", True
        return "OK", True
    if last_status != "OK" and d_count < warn and z_count < warn:
        # Recover early once both counts are back below the warning level.
        return "OK", True
    return last_status, False


def main():
    """
    Main function to monitor stalled processes and report their status.

    Prints the status line with perfdata and terminates the interpreter via
    sys.exit() with the code matching the status (3 for anything that is not
    OK, WARNING or CRITICAL, including a missing psutil or an inaccessible
    log file).
    """
    args = parse_arguments()

    if psutil is None:
        print("UNKNOWN - psutil module is not installed")
        sys.exit(EXIT_UNKNOWN)

    current_time = int(time.time())

    # Get the count and description of processes in disk sleep (D) state and zombie (Z) state
    d_count, d_desc = get_process_info(psutil.STATUS_DISK_SLEEP)
    z_count, z_desc = get_process_info(psutil.STATUS_ZOMBIE)

    try:
        baseline = _load_baseline(read_last_log())
        if baseline is None:
            # No usable previous entry: record the current sample as baseline.
            baseline = (current_time, d_count, z_count, "OK")
            write_log(current_time, d_count, z_count, "OK", d_desc, z_desc)
        last_time, last_d_count, last_z_count, last_status = baseline

        status, should_log = _evaluate_status(
            d_count, z_count, last_d_count, last_z_count, last_status,
            elapsed=current_time - last_time,
            warn=args.w, crit=args.c, window=args.t * 60,
        )
        if should_log:
            write_log(current_time, d_count, z_count, status, d_desc, z_desc)
    except OSError as err:
        print(f"UNKNOWN - cannot access {LOG_FILE}: {err}")
        sys.exit(EXIT_UNKNOWN)

    # Output the status, counts and perfdata
    output = f"{status} - Processes in D state: {d_count}, Z state: {z_count} | D={d_count};{args.w};{args.c}; Z={z_count};{args.w};{args.c};"
    print(output)

    # Exit with the appropriate code
    sys.exit(EXIT_CODES.get(status, EXIT_UNKNOWN))


if __name__ == "__main__":
    main()