[Pkg-nagios-changes] [SCM] Debian packaging for mod gearman. branch, upstream, updated. upstream/1.0.6-34-gd3fb2ec
Sven Nierlein
sven at nierlein.de
Wed Aug 24 09:22:03 UTC 2011
The following commit has been merged in the upstream branch:
commit 22399305975662a415ccebf77118494d887a6ce2
Author: Sven Nierlein <sven at nierlein.de>
Date: Tue Aug 16 01:26:40 2011 +0200
better fork_on_exec handling
- replace dead workers to maintain the worker population
- better timeout handling when using fork_on_exec=off
diff --git a/Changes b/Changes
index 712ed67..1d8759c 100644
--- a/Changes
+++ b/Changes
@@ -7,6 +7,8 @@ This file documents the revision history for mod_gearman.
of duplicate execution
- removed version output to stderr while reloading/starting the core
- check server definition for duplicates
+ - replace dead workers to maintain the worker population
+ - better timeout handling when using fork_on_exec=off
1.0.8 Fri Jul 22 22:21:34 CEST 2011
- use identifier for error messages if set
diff --git a/README.asciidoc b/README.asciidoc
index 0881643..05ebf99 100644
--- a/README.asciidoc
+++ b/README.asciidoc
@@ -583,8 +583,9 @@ Disabled when set to 0. Default: 1000
====
fork_on_exec::
-Use this option to disable an extra fork for each plugin execution. This option
-will reduce the load on the worker host. Default: yes
+Use this option to disable an extra fork for each plugin execution.
+Disabling this option will reduce the load on the worker host, but may
+cause trouble with unclean plugins. Default: yes
+
====
fork_on_exec=no
diff --git a/common/gearman.c b/common/gearman.c
index 75512d3..ae5e168 100644
--- a/common/gearman.c
+++ b/common/gearman.c
@@ -112,6 +112,8 @@ int create_client_dup( char ** server_list, gearman_client_st *client ) {
x++;
}
+ current_client_dup = client;
+
return GM_OK;
}
@@ -150,6 +152,9 @@ int create_client( char ** server_list, gearman_client_st *client ) {
}
assert(x != 0);
+
+ current_client = client;
+
return GM_OK;
}
diff --git a/common/utils.c b/common/utils.c
index ab8b976..f1a5ea2 100644
--- a/common/utils.c
+++ b/common/utils.c
@@ -24,8 +24,11 @@
#include "utils.h"
#include "crypt.h"
#include "base64.h"
+#include "gearman.h"
pid_t current_child_pid = 0;
+char temp_buffer1[GM_BUFFERSIZE];
+char temp_buffer2[GM_BUFFERSIZE];
/* escapes newlines in a string */
char *escape_newlines(char *rawbuf) {
@@ -1248,7 +1251,9 @@ int execute_safe_command(gm_job_t * exec_job, int fork_exec, char * identifier)
bufdup = strdup(buffer);
snprintf( buffer, sizeof( buffer )-1, "CRITICAL: Return code of %d is out of bounds. (worker: %s)\n%s\n", (int)(return_code), identifier, bufdup);
free(bufdup);
- return_code = STATE_CRITICAL;
+ if(return_code != 25 && mod_gm_opt->workaround_rc_25 == GM_DISABLED) {
+ return_code = STATE_CRITICAL;
+ }
}
exec_job->output = strdup(buffer);
@@ -1325,6 +1330,10 @@ void check_alarm_handler(int sig) {
pid_t pid = getpid();
gm_log( GM_LOG_TRACE, "check_alarm_handler(%i)\n", sig );
+ if(current_job != NULL && mod_gm_opt->fork_on_exec == GM_DISABLED) {
+ send_timeout_result(current_job);
+ gearman_job_send_complete(current_gearman_job, NULL, 0);
+ }
if(current_child_pid > 0) {
gm_log( GM_LOG_TRACE, "send SIGINT to %d\n", current_child_pid);
@@ -1841,3 +1850,136 @@ char * get_param_server(char * servername, char * server_list[GM_LISTSIZE], int
return new_server;
}
+
+void send_timeout_result(gm_job_t * exec_job) {
+ struct timeval end_time;
+ char buffer[GM_BUFFERSIZE];
+ buffer[0] = '\x0';
+
+ gm_log( GM_LOG_TRACE, "send_timeout_result()\n");
+
+ gettimeofday(&end_time, NULL);
+ exec_job->finish_time = end_time;
+
+ exec_job->return_code = 2;
+ exec_job->early_timeout = 1;
+ if ( !strcmp( exec_job->type, "service" ) )
+ snprintf( buffer, sizeof( buffer ) -1, "(Service Check Timed Out On Worker: %s)\n", mod_gm_opt->identifier);
+ if ( !strcmp( exec_job->type, "host" ) )
+ snprintf( buffer, sizeof( buffer ) -1, "(Host Check Timed Out On Worker: %s)\n", mod_gm_opt->identifier);
+ free(exec_job->output);
+ exec_job->output = strdup( buffer );
+
+ send_result_back(exec_job);
+
+ return;
+}
+
+
+/* send results back */
+void send_result_back(gm_job_t * exec_job) {
+ gm_log( GM_LOG_TRACE, "send_result_back()\n" );
+
+ if(exec_job->result_queue == NULL) {
+ return;
+ }
+ if(exec_job->output == NULL) {
+ return;
+ }
+
+ /* workaround for rc 25 bug
+ * duplicate jobs from gearmand result in exit code 25 of plugins
+ * because they are executed twice and get killed because of using
+ * the same ressource.
+ * Sending results (when exit code is 25 ) will be skipped with this
+ * enabled.
+ */
+ if( exec_job->return_code == 25 && mod_gm_opt->workaround_rc_25 == GM_ENABLED ) {
+ return;
+ }
+
+ gm_log( GM_LOG_TRACE, "queue: %s\n", exec_job->result_queue );
+ temp_buffer1[0]='\x0';
+ snprintf( temp_buffer1, sizeof( temp_buffer1 )-1, "host_name=%s\ncore_start_time=%i.%i\nstart_time=%i.%i\nfinish_time=%i.%i\nlatency=%f\nreturn_code=%i\nexited_ok=%i\n",
+ exec_job->host_name,
+ ( int )exec_job->core_start_time.tv_sec,
+ ( int )exec_job->core_start_time.tv_usec,
+ ( int )exec_job->start_time.tv_sec,
+ ( int )exec_job->start_time.tv_usec,
+ ( int )exec_job->finish_time.tv_sec,
+ ( int )exec_job->finish_time.tv_usec,
+ exec_job->latency,
+ exec_job->return_code,
+ exec_job->exited_ok
+ );
+ temp_buffer1[sizeof( temp_buffer1 )-1]='\x0';
+
+ if(exec_job->service_description != NULL) {
+ temp_buffer2[0]='\x0';
+ strncat(temp_buffer2, "service_description=", (sizeof(temp_buffer2)-1));
+ strncat(temp_buffer2, exec_job->service_description, (sizeof(temp_buffer2)-1));
+ strncat(temp_buffer2, "\n", (sizeof(temp_buffer2)-1));
+
+ strncat(temp_buffer1, temp_buffer2, (sizeof(temp_buffer1)-1));
+ }
+ temp_buffer1[sizeof( temp_buffer1 )-1]='\x0';
+
+ if(exec_job->output != NULL) {
+ temp_buffer2[0]='\x0';
+ strncat(temp_buffer2, "output=", (sizeof(temp_buffer2)-1));
+ if(mod_gm_opt->debug_result) {
+ strncat(temp_buffer2, "(", (sizeof(temp_buffer2)-1));
+ strncat(temp_buffer2, hostname, (sizeof(temp_buffer2)-1));
+ strncat(temp_buffer2, ") - ", (sizeof(temp_buffer2)-1));
+ }
+ strncat(temp_buffer2, exec_job->output, (sizeof(temp_buffer2)-1));
+ strncat(temp_buffer2, "\n\n\n", (sizeof(temp_buffer2)-1));
+ strncat(temp_buffer1, temp_buffer2, (sizeof(temp_buffer1)-1));
+ }
+ strncat(temp_buffer1, "\n", (sizeof(temp_buffer1)-2));
+ temp_buffer1[sizeof( temp_buffer1 )-1]='\x0';
+
+ gm_log( GM_LOG_TRACE, "data:\n%s\n", temp_buffer1);
+
+ if(add_job_to_queue( current_client,
+ mod_gm_opt->server_list,
+ exec_job->result_queue,
+ NULL,
+ temp_buffer1,
+ GM_JOB_PRIO_NORMAL,
+ GM_DEFAULT_JOB_RETRIES,
+ mod_gm_opt->transportmode,
+ TRUE
+ ) == GM_OK) {
+ gm_log( GM_LOG_TRACE, "send_result_back() finished successfully\n" );
+ }
+ else {
+ gm_log( GM_LOG_TRACE, "send_result_back() finished unsuccessfully\n" );
+ }
+
+ if( mod_gm_opt->dupserver_num ) {
+ strncpy(temp_buffer2, "type=passive\n", (sizeof(temp_buffer1)-2));
+ strncat(temp_buffer2, temp_buffer1, (sizeof(temp_buffer2)-2));
+ temp_buffer2[sizeof( temp_buffer2 )-1]='\x0';
+ if( add_job_to_queue( current_client_dup,
+ mod_gm_opt->dupserver_list,
+ exec_job->result_queue,
+ NULL,
+ temp_buffer2,
+ GM_JOB_PRIO_NORMAL,
+ GM_DEFAULT_JOB_RETRIES,
+ mod_gm_opt->transportmode,
+ TRUE
+ ) == GM_OK) {
+ gm_log( GM_LOG_TRACE, "send_result_back() finished successfully for duplicate server.\n" );
+ }
+ else {
+ gm_log( GM_LOG_TRACE, "send_result_back() finished unsuccessfully for duplicate server\n" );
+ }
+ }
+ else {
+ gm_log( GM_LOG_TRACE, "send_result_back() has no duplicate servers to send to.\n" );
+ }
+
+ return;
+}
diff --git a/etc/mod_gearman.conf.in b/etc/mod_gearman.conf.in
index 3196e1b..a2163dc 100644
--- a/etc/mod_gearman.conf.in
+++ b/etc/mod_gearman.conf.in
@@ -161,3 +161,22 @@ idle-timeout=30
# Use this to control how fast the amount of workers will go down
# after high load times
max-jobs=50
+
+# defines the rate of spawned workers per second as long
+# as there are jobs waiting
+spawn-rate=1
+
+# Use this option to disable an extra fork for each plugin execution. Disabling
+# this option will reduce the load on the worker host but can lead to problems with
+# unclean plugins.
+fork_on_exec=yes
+
+# Workarounds
+
+# workaround for rc 25 bug
+# duplicate jobs from gearmand result in exit code 25 of plugins
+# because they are executed twice and get killed because of using
+# the same resource.
+# Sending results (when exit code is 25) will be skipped with this
+# enabled.
+workaround_rc_25=off
diff --git a/extras/shared.conf b/extras/shared.conf
index 12ee381..778d6cc 100644
--- a/extras/shared.conf
+++ b/extras/shared.conf
@@ -182,6 +182,10 @@ max-jobs=50
# as there are jobs waiting
spawn-rate=1
+# Use this option to disable an extra fork for each plugin execution. Disabling
+# this option will reduce the load on the worker host but can lead to problems with
+# unclean plugins. Default: yes
+fork_on_exec=yes
# Workarounds
diff --git a/include/common.h b/include/common.h
index c11910f..b0ac570 100644
--- a/include/common.h
+++ b/include/common.h
@@ -99,6 +99,7 @@
#define GM_DEFAULT_MAX_WORKER 20 /**< maximum number of concurrent worker */
#define GM_DEFAULT_JOB_MAX_AGE 600 /**< discard jobs older than that */
#define GM_DEFAULT_SPAWN_RATE 1 /**< number of spawned worker per seconds */
+#define GM_DEFAULT_WORKER_LOOP_SLEEP 1 /**< sleep in worker main loop */
/* transport modes */
#define GM_ENCODE_AND_ENCRYPT 1
@@ -238,6 +239,8 @@ typedef struct gm_job_struct {
/** options structure */
mod_gm_opt_t *mod_gm_opt;
+gm_job_t * current_job;
+char hostname[GM_BUFFERSIZE];
/*
* @}
diff --git a/include/gearman.h b/include/gearman.h
index 1e848e5..efa67f7 100644
--- a/include/gearman.h
+++ b/include/gearman.h
@@ -37,6 +37,10 @@
typedef void*( mod_gm_worker_fn)(gearman_job_st *job, void *context, size_t *result_size, gearman_return_t *ret_ptr);
+gearman_client_st *current_client;
+gearman_client_st *current_client_dup;
+gearman_job_st *current_gearman_job;
+
int create_client( char ** server_list, gearman_client_st * client);
int create_client_dup( char ** server_list, gearman_client_st * client);
int create_worker( char ** server_list, gearman_worker_st * worker);
diff --git a/include/utils.h b/include/utils.h
index e19a945..05709dd 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -493,6 +493,29 @@ void write_core_log(char *data);
*/
char * get_param_server(char * servername, char * server_list[GM_LISTSIZE], int server_num);
+
+/**
+ * send_timeout_result
+ *
+ * send back a timeout result
+ *
+ * @param[in] exec_job - the exec job with all results
+ *
+ * @return nothing
+ */
+void send_timeout_result(gm_job_t * exec_job);
+
+/**
+ * send_result_back
+ *
+ * send back result
+ *
+ * @param[in] exec_job - the exec job with all results
+ *
+ * @return nothing
+ */
+void send_result_back(gm_job_t * exec_job);
+
/**
* @}
*/
diff --git a/include/worker.h b/include/worker.h
index 0e73833..da688a8 100644
--- a/include/worker.h
+++ b/include/worker.h
@@ -185,9 +185,11 @@ int get_next_shm_index(void);
/**
* count and set the current number of worker
*
+ * @param[in] restart - set to GM_ENABLED if stale worker should be replaced
+ *
* @return nothing
*/
-void count_current_worker(void);
+void count_current_worker(int restart);
/**
* @}
diff --git a/include/worker_client.h b/include/worker_client.h
index c042c8f..af2c578 100644
--- a/include/worker_client.h
+++ b/include/worker_client.h
@@ -49,7 +49,6 @@ void worker_loop(void);
void *get_job( gearman_job_st *, void *, size_t *, gearman_return_t * );
void do_exec_job(void);
int set_worker( gearman_worker_st *worker );
-void send_result_back(void);
void idle_sighandler(int sig);
void set_state(int status);
void clean_worker_exit(int sig);
@@ -62,4 +61,3 @@ void write_debug_file(char ** text);
/**
* @}
*/
-
diff --git a/worker/worker.c b/worker/worker.c
index 628f39d..08c81d1 100644
--- a/worker/worker.c
+++ b/worker/worker.c
@@ -152,7 +152,7 @@ void monitor_loop() {
/* maintain the population */
while (1) {
/* check number of workers every second */
- sleep(1);
+ sleep(GM_DEFAULT_WORKER_LOOP_SLEEP);
/* collect finished workers */
while(waitpid(-1, &status, WNOHANG) > 0)
@@ -165,7 +165,7 @@ void monitor_loop() {
/* count current worker and jobs */
-void count_current_worker() {
+void count_current_worker(int restart) {
int x;
gm_log( GM_LOG_TRACE, "count_current_worker()\n");
@@ -190,11 +190,16 @@ void count_current_worker() {
current_number_of_jobs = 0;
for(x=4; x < mod_gm_opt->max_worker+4; x++) {
/* verify worker is alive */
+ gm_log( GM_LOG_TRACE, "worker slot: shm[%d] = %d\n", x, shm[x]);
if( shm[x] != -1 && pid_alive(shm[x]) == FALSE ) {
gm_log( GM_LOG_TRACE, "removed stale worker %d, old pid: %d\n", x, shm[x]);
shm[x] = -1;
+ /* immediately start new worker, otherwise the fork rate cannot be guaranteed */
+ if(restart == GM_ENABLED) {
+ make_new_child(GM_WORKER_MULTI);
+ current_number_of_workers++;
+ }
}
- gm_log( GM_LOG_TRACE, "worker slot: shm[%d] = %d\n", x, shm[x]);
if(shm[x] != -1) {
current_number_of_workers++;
}
@@ -218,7 +223,7 @@ void check_worker_population() {
gm_log( GM_LOG_TRACE, "check_worker_population()\n");
/* set current worker number */
- count_current_worker();
+ count_current_worker(GM_ENABLED);
/* check if status worker died */
if( shm[3] == -1 ) {
@@ -302,7 +307,6 @@ int parse_arguments(int argc, char **argv) {
int i;
int errors = 0;
int verify;
- char hostname[GM_BUFFERSIZE];
mod_gm_opt_t * mod_gm_new_opt;
mod_gm_new_opt = malloc(sizeof(mod_gm_opt_t));
set_default_options(mod_gm_new_opt);
@@ -583,7 +587,7 @@ void stop_childs(int mode) {
if(waited > GM_CHILD_SHUTDOWN_TIMEOUT) {
break;
}
- count_current_worker();
+ count_current_worker(GM_DISABLED);
if(current_number_of_workers == 0)
return;
gm_log( GM_LOG_TRACE, "still waiting (%d) %d childs missing...\n", waited, current_number_of_workers);
@@ -591,7 +595,7 @@ void stop_childs(int mode) {
if(mode == GM_WORKER_STOP) {
killpg(0, SIGINT);
- count_current_worker();
+ count_current_worker(GM_DISABLED);
if(current_number_of_workers == 0)
return;
@@ -609,7 +613,7 @@ void stop_childs(int mode) {
}
/* kill them the hard way */
- count_current_worker();
+ count_current_worker(GM_DISABLED);
if(current_number_of_workers == 0)
return;
for(x=3; x < mod_gm_opt->max_worker+4; x++) {
@@ -623,7 +627,7 @@ void stop_childs(int mode) {
}
/* count childs a last time */
- count_current_worker();
+ count_current_worker(GM_DISABLED);
if(current_number_of_workers == 0)
return;
diff --git a/worker/worker_client.c b/worker/worker_client.c
index 3910a2f..1cf18d3 100644
--- a/worker/worker_client.c
+++ b/worker/worker_client.c
@@ -30,13 +30,11 @@
char temp_buffer1[GM_BUFFERSIZE];
char temp_buffer2[GM_BUFFERSIZE];
-char hostname[GM_BUFFERSIZE];
gearman_worker_st worker;
gearman_client_st client;
gearman_client_st client_dup;
-gm_job_t * current_job;
pid_t current_pid;
gm_job_t * exec_job;
@@ -165,6 +163,7 @@ void *get_job( gearman_job_st *job, void *context, size_t *result_size, gearman_
sigprocmask(SIG_BLOCK, &block_mask, &old_mask);
/* get the data */
+ current_gearman_job = job;
wsize = gearman_job_workload_size(job);
strncpy(workload, (const char*)gearman_job_workload(job), wsize);
workload[wsize] = '\0';
@@ -263,6 +262,8 @@ void *get_job( gearman_job_st *job, void *context, size_t *result_size, gearman_
exit( EXIT_SUCCESS );
}
+ current_gearman_job = NULL;
+
return NULL;
}
@@ -316,7 +317,7 @@ void do_exec_job( ) {
if ( !strcmp( exec_job->type, "service" ) || !strcmp( exec_job->type, "host" ) ) {
exec_job->output = strdup("(Could Not Start Check In Time)");
- send_result_back();
+ send_result_back(exec_job);
}
return;
@@ -326,119 +327,12 @@ void do_exec_job( ) {
/* run the command */
gm_log( GM_LOG_TRACE, "command: %s\n", exec_job->command_line);
+ current_job = exec_job;
execute_safe_command(exec_job, mod_gm_opt->fork_on_exec, mod_gm_opt->identifier );
+ current_job = NULL;
if ( !strcmp( exec_job->type, "service" ) || !strcmp( exec_job->type, "host" ) ) {
- send_result_back();
- }
-
- return;
-}
-
-
-/* send results back */
-void send_result_back() {
- gm_log( GM_LOG_TRACE, "send_result_back()\n" );
-
- if(exec_job->result_queue == NULL) {
- return;
- }
- if(exec_job->output == NULL) {
- return;
- }
-
- /* workaround for rc 25 bug
- * duplicate jobs from gearmand result in exit code 25 of plugins
- * because they are executed twice and get killed because of using
- * the same ressource.
- * Sending results (when exit code is 25 ) will be skipped with this
- * enabled.
- */
- if( exec_job->return_code == 25 && mod_gm_opt->workaround_rc_25 == GM_ENABLED ) {
- return;
- }
-
- gm_log( GM_LOG_TRACE, "queue: %s\n", exec_job->result_queue );
- temp_buffer1[0]='\x0';
- snprintf( temp_buffer1, sizeof( temp_buffer1 )-1, "host_name=%s\ncore_start_time=%i.%i\nstart_time=%i.%i\nfinish_time=%i.%i\nlatency=%f\nreturn_code=%i\nexited_ok=%i\n",
- exec_job->host_name,
- ( int )exec_job->core_start_time.tv_sec,
- ( int )exec_job->core_start_time.tv_usec,
- ( int )exec_job->start_time.tv_sec,
- ( int )exec_job->start_time.tv_usec,
- ( int )exec_job->finish_time.tv_sec,
- ( int )exec_job->finish_time.tv_usec,
- exec_job->latency,
- exec_job->return_code,
- exec_job->exited_ok
- );
- temp_buffer1[sizeof( temp_buffer1 )-1]='\x0';
-
- if(exec_job->service_description != NULL) {
- temp_buffer2[0]='\x0';
- strncat(temp_buffer2, "service_description=", (sizeof(temp_buffer2)-1));
- strncat(temp_buffer2, exec_job->service_description, (sizeof(temp_buffer2)-1));
- strncat(temp_buffer2, "\n", (sizeof(temp_buffer2)-1));
-
- strncat(temp_buffer1, temp_buffer2, (sizeof(temp_buffer1)-1));
- }
- temp_buffer1[sizeof( temp_buffer1 )-1]='\x0';
-
- if(exec_job->output != NULL) {
- temp_buffer2[0]='\x0';
- strncat(temp_buffer2, "output=", (sizeof(temp_buffer2)-1));
- if(mod_gm_opt->debug_result) {
- strncat(temp_buffer2, "(", (sizeof(temp_buffer2)-1));
- strncat(temp_buffer2, hostname, (sizeof(temp_buffer2)-1));
- strncat(temp_buffer2, ") - ", (sizeof(temp_buffer2)-1));
- }
- strncat(temp_buffer2, exec_job->output, (sizeof(temp_buffer2)-1));
- strncat(temp_buffer2, "\n\n\n", (sizeof(temp_buffer2)-1));
- strncat(temp_buffer1, temp_buffer2, (sizeof(temp_buffer1)-1));
- }
- strncat(temp_buffer1, "\n", (sizeof(temp_buffer1)-2));
- temp_buffer1[sizeof( temp_buffer1 )-1]='\x0';
-
- gm_log( GM_LOG_TRACE, "data:\n%s\n", temp_buffer1);
-
- if(add_job_to_queue( &client,
- mod_gm_opt->server_list,
- exec_job->result_queue,
- NULL,
- temp_buffer1,
- GM_JOB_PRIO_NORMAL,
- GM_DEFAULT_JOB_RETRIES,
- mod_gm_opt->transportmode,
- TRUE
- ) == GM_OK) {
- gm_log( GM_LOG_TRACE, "send_result_back() finished successfully\n" );
- }
- else {
- gm_log( GM_LOG_TRACE, "send_result_back() finished unsuccessfully\n" );
- }
-
- if( mod_gm_opt->dupserver_num ) {
- strncpy(temp_buffer2, "type=passive\n", (sizeof(temp_buffer1)-2));
- strncat(temp_buffer2, temp_buffer1, (sizeof(temp_buffer2)-2));
- temp_buffer2[sizeof( temp_buffer2 )-1]='\x0';
- if( add_job_to_queue( &client_dup,
- mod_gm_opt->dupserver_list,
- exec_job->result_queue,
- NULL,
- temp_buffer2,
- GM_JOB_PRIO_NORMAL,
- GM_DEFAULT_JOB_RETRIES,
- mod_gm_opt->transportmode,
- TRUE
- ) == GM_OK) {
- gm_log( GM_LOG_TRACE, "send_result_back() finished successfully for duplicate server.\n" );
- }
- else {
- gm_log( GM_LOG_TRACE, "send_result_back() finished unsuccessfully for duplicate server\n" );
- }
- }
- else {
- gm_log( GM_LOG_TRACE, "send_result_back() has no duplicate servers to send to.\n" );
+ send_result_back(exec_job);
}
return;
--
Debian packaging for mod gearman.
More information about the Pkg-nagios-changes
mailing list