[Pkg-nagios-changes] [pkg-mod-gearman] 24/48: restart worker if there are no checks going on (fixes #41)
Stig Sandbeck Mathisen
ssm at debian.org
Sun Nov 24 22:38:10 UTC 2013
This is an automated email from the git hooks/post-receive script.
ssm pushed a commit to branch master
in repository pkg-mod-gearman.
commit 58cc8e9a7885a7d6848584ffc49cd59647003bf7
Author: Sven Nierlein <sven at nierlein.de>
Date: Wed Oct 30 21:49:28 2013 +0100
restart worker if there are no checks going on (fixes #41)
Add a safety hook which restarts all workers if there were no checks in 2
minutes. If that's the case, either the workers are broken or nagios is not
running and sending checks at all. In the second case, restarting the workers
doesn't hurt.
---
include/worker.h | 5 +++--
worker/worker.c | 18 ++++++++++++++++--
worker/worker_client.c | 2 ++
3 files changed, 21 insertions(+), 4 deletions(-)
diff --git a/include/worker.h b/include/worker.h
index 08eb071..c64d073 100644
--- a/include/worker.h
+++ b/include/worker.h
@@ -44,13 +44,14 @@
* @{
*/
-int mod_gm_shm_key; /**< key for the shared memory segment */
+int mod_gm_shm_key; /**< key for the shared memory segment */
-#define SHM_SHIFT 4 /**< nr of global counter */
+#define SHM_SHIFT 5 /**< nr of global counter */
#define SHM_JOBS_DONE 0 /**< shm id for jobs done counter */
#define SHM_WORKER_TOTAL 1 /**< shm id for total worker counter */
#define SHM_WORKER_RUNNING 2 /**< shm id for running worker counter */
#define SHM_STATUS_WORKER_PID 3 /**< shm id for status worker pid */
+#define SHM_WORKER_LAST_CHECK 4 /**< shm time of last check executed */
/** Mod-Gearman Worker
*
diff --git a/worker/worker.c b/worker/worker.c
index e817de8..532126a 100644
--- a/worker/worker.c
+++ b/worker/worker.c
@@ -245,14 +245,29 @@ void check_worker_population() {
gm_log( GM_LOG_TRACE3, "check_worker_population()\n");
+ now = (int)time(NULL);
+
/* collect finished workers */
while(waitpid(-1, &status, WNOHANG) > 0)
gm_log( GM_LOG_TRACE, "waitpid() worker exited with: %d\n", status);
-
/* set current worker number */
count_current_worker(GM_ENABLED);
+ /* check last check time, force restart all worker if there is no result in 2 minutes */
+ if( shm[SHM_WORKER_LAST_CHECK] < (now - 120) ) {
+ gm_log( GM_LOG_INFO, "no checks in 2minutes, restarting all workers\n", shm[SHM_WORKER_LAST_CHECK]);
+ shm[SHM_WORKER_LAST_CHECK] = now;
+ for(x=SHM_SHIFT; x < mod_gm_opt->max_worker+SHM_SHIFT; x++) {
+ save_kill(shm[x], SIGINT);
+ }
+ sleep(3);
+ for(x=SHM_SHIFT; x < mod_gm_opt->max_worker+SHM_SHIFT; x++) {
+ save_kill(shm[x], SIGKILL);
+ shm[x] = -1;
+ }
+ }
+
/* check if status worker died */
if( shm[SHM_STATUS_WORKER_PID] == -1 ) {
make_new_child(GM_WORKER_STATUS);
@@ -265,7 +280,6 @@ void check_worker_population() {
}
/* check every second if we need to increase worker population */
- now = (int)time(NULL);
if(last_time_increased >= now)
return;
diff --git a/worker/worker_client.c b/worker/worker_client.c
index 888fcfc..584d41d 100644
--- a/worker/worker_client.c
+++ b/worker/worker_client.c
@@ -454,6 +454,8 @@ void set_state(int status) {
if(status == GM_JOB_END) {
shm[SHM_JOBS_DONE]++; /* increase jobs done */
+ shm[SHM_WORKER_LAST_CHECK] = (int)time(NULL); /* set last job date */
+
/* status slot changed to -1 -> exit */
if( shm[shm_index] == -1 ) {
gm_log( GM_LOG_TRACE, "worker finished: %d\n", getpid() );
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-nagios/pkg-mod-gearman
More information about the Pkg-nagios-changes
mailing list