[Pkg-nagios-changes] [pkg-mod-gearman] 24/48: restart worker if there are no checks going on (fixes #41)

Stig Sandbeck Mathisen ssm at debian.org
Sun Nov 24 22:38:10 UTC 2013


This is an automated email from the git hooks/post-receive script.

ssm pushed a commit to branch master
in repository pkg-mod-gearman.

commit 58cc8e9a7885a7d6848584ffc49cd59647003bf7
Author: Sven Nierlein <sven at nierlein.de>
Date:   Wed Oct 30 21:49:28 2013 +0100

    restart worker if there are no checks going on (fixes #41)
    
    Add safety hook which restarts all workers if there were no checks in 2
    minutes.  If that's the case, either the workers are broken or nagios is not
    running and sending checks at all. In the second case, restarting the workers
    doesn't hurt.
---
 include/worker.h       |  5 +++--
 worker/worker.c        | 18 ++++++++++++++++--
 worker/worker_client.c |  2 ++
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/include/worker.h b/include/worker.h
index 08eb071..c64d073 100644
--- a/include/worker.h
+++ b/include/worker.h
@@ -44,13 +44,14 @@
  * @{
  */
 
-int mod_gm_shm_key;        /**< key for the shared memory segment */
+int mod_gm_shm_key;             /**< key for the shared memory segment */
 
-#define SHM_SHIFT             4 /**< nr of global counter              */
+#define SHM_SHIFT             5 /**< nr of global counter              */
 #define SHM_JOBS_DONE         0 /**< shm id for jobs done counter      */
 #define SHM_WORKER_TOTAL      1 /**< shm id for total worker counter   */
 #define SHM_WORKER_RUNNING    2 /**< shm id for running worker counter */
 #define SHM_STATUS_WORKER_PID 3 /**< shm id for status worker pid      */
+#define SHM_WORKER_LAST_CHECK 4 /**< shm time of last check executed   */
 
 /** Mod-Gearman Worker
  *
diff --git a/worker/worker.c b/worker/worker.c
index e817de8..532126a 100644
--- a/worker/worker.c
+++ b/worker/worker.c
@@ -245,14 +245,29 @@ void check_worker_population() {
 
     gm_log( GM_LOG_TRACE3, "check_worker_population()\n");
 
+    now = (int)time(NULL);
+
     /* collect finished workers */
     while(waitpid(-1, &status, WNOHANG) > 0)
         gm_log( GM_LOG_TRACE, "waitpid() worker exited with: %d\n", status);
 
-
     /* set current worker number */
     count_current_worker(GM_ENABLED);
 
+    /* check last check time, force restart all worker if there is no result in 2 minutes */
+    if( shm[SHM_WORKER_LAST_CHECK] < (now - 120) ) {
+        gm_log( GM_LOG_INFO, "no checks in 2minutes, restarting all workers\n", shm[SHM_WORKER_LAST_CHECK]);
+        shm[SHM_WORKER_LAST_CHECK] = now;
+        for(x=SHM_SHIFT; x < mod_gm_opt->max_worker+SHM_SHIFT; x++) {
+            save_kill(shm[x], SIGINT);
+        }
+        sleep(3);
+        for(x=SHM_SHIFT; x < mod_gm_opt->max_worker+SHM_SHIFT; x++) {
+            save_kill(shm[x], SIGKILL);
+            shm[x] = -1;
+        }
+    }
+
     /* check if status worker died */
     if( shm[SHM_STATUS_WORKER_PID] == -1 ) {
         make_new_child(GM_WORKER_STATUS);
@@ -265,7 +280,6 @@ void check_worker_population() {
     }
 
     /* check every second if we need to increase worker population */
-    now = (int)time(NULL);
     if(last_time_increased >= now)
         return;
 
diff --git a/worker/worker_client.c b/worker/worker_client.c
index 888fcfc..584d41d 100644
--- a/worker/worker_client.c
+++ b/worker/worker_client.c
@@ -454,6 +454,8 @@ void set_state(int status) {
     if(status == GM_JOB_END) {
         shm[SHM_JOBS_DONE]++; /* increase jobs done */
 
+        shm[SHM_WORKER_LAST_CHECK] = (int)time(NULL); /* set last job date */
+
         /* status slot changed to -1 -> exit */
         if( shm[shm_index] == -1 ) {
             gm_log( GM_LOG_TRACE, "worker finished: %d\n", getpid() );

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-nagios/pkg-mod-gearman



More information about the Pkg-nagios-changes mailing list