[Qa-jenkins-scm] [Git][qa/jenkins.debian.net][master] reproducible trbo system health check: separate nodes which have been automatically marked as down

Holger Levsen gitlab at salsa.debian.org
Mon Jul 20 14:22:15 BST 2020



Holger Levsen pushed to branch master at Debian QA / jenkins.debian.net


Commits:
283a5697 by Holger Levsen at 2020-07-20T15:22:03+02:00
reproducible trbo system health check: separate nodes which have been automatically marked as down

Signed-off-by: Holger Levsen <holger at layer-acht.org>

- - - - -


1 changed file:

- bin/reproducible_system_health.sh


Changes:

=====================================
bin/reproducible_system_health.sh
=====================================
@@ -28,10 +28,13 @@ STATUS=-1
 INPUTS=0
 SCORE=0
 INVALID=0
+SUSPICIOUS=0
 FAILED_JOBS=$(mktemp --tmpdir=$TMPDIR trbo-status-XXXXXXX)
 UNSTABLE_JOBS=$(mktemp --tmpdir=$TMPDIR trbo-status-XXXXXXX)
 IGNORED_JOBS=$(mktemp --tmpdir=$TMPDIR trbo-status-XXXXXXX)
 BAD_JOBS=$(mktemp --tmpdir=$TMPDIR trbo-status-XXXXXXX)
+FAILED_SUSPECTS=$(mktemp --tmpdir=$TMPDIR trbo-status-XXXXXXX)
+UNSTABLE_SUSPECTS=$(mktemp --tmpdir=$TMPDIR trbo-status-XXXXXXX)
 # gather data
 echo "$(date -u) - starting up."
 cd /var/lib/jenkins/jobs/
@@ -78,18 +81,24 @@ for JOB in reproducible_* ; do
 			arm64)	NODE="codethink-sled${NODE_ALIAS#codethink}-arm64.debian.net" ;;
 			armhf)	NODE="${NODE_ALIAS}-armhf-rb.debian.net" ;;
 		esac
-		# check the offline list from git (and not the one updated by jenkins)
+		# check the offline list from git (and the one updated by jenkins)
 		# as this should only ignore nodes humans (and not jenkins) have
 		# acknowledged they are down...
+		# jobs on nodes marked down by jenkins are remarked but still counted
+		SUSPECT=false
 		if grep -q $NODE $JENKINS_OFFLINE_GIT_LIST >/dev/null 2>&1 ; then
 			echo "   <li><a href=\"https://jenkins.debian.net/job/$JOB/\">$JOB</a></li>" >> ${IGNORED_JOBS}
-			echo "  ignored job: $JOB (node is offline)"
+			echo "  ignored job: $JOB (node is marked offline)"
 			let INVALID+=1
 			continue
+		elif grep -q $NODE $JENKINS_OFFLINE_LIST >/dev/null 2>&1 ; then
+			SUSPECT=true
+			let SUSPICIOUS+=1
+			# we still count this job for the overall status...
 		fi
 	fi
 	#
-	# node is not known offline, let's go
+	# node is not known offline (in git), let's go
 	#
 	let INPUTS+=1
 	FILE=$JOB/builds/permalinks
@@ -106,7 +115,11 @@ for JOB in reproducible_* ; do
 		if $(grep -q "Kernel needs upgrade" $JOB/builds/$LAST/log) ; then
 			NOTE=" (reboot needed for kernel upgrade)"
 		fi
-		echo "   <li><a href=\"https://jenkins.debian.net/job/$JOB/\">$JOB</a>$NOTE</li>" >> ${UNSTABLE_JOBS}
+		if ! $SUSPECT ; then
+			echo "   <li><a href=\"https://jenkins.debian.net/job/$JOB/\">$JOB</a>$NOTE</li>" >> ${UNSTABLE_JOBS}
+		else
+			echo "   <li><a href=\"https://jenkins.debian.net/job/$JOB/\">$JOB</a>$NOTE</li>" >> ${UNSTABLE_SUSPECTS}
+		fi
 	else
 		case $JOB in
 			reproducible_maintenance_amd64_jenkins)			MODIFIER=250 ;; # main node
@@ -135,10 +148,18 @@ for JOB in reproducible_* ; do
 			continue
 		elif [ $MODIFIER -eq 1 ] ; then
 			echo "  failed job: $JOB"
-			echo "$MODIFIER|   <li><a href=\"https://jenkins.debian.net/job/$JOB/\">$JOB</a></li>" >> ${FAILED_JOBS}
+			if ! $SUSPECT ; then
+				echo "$MODIFIER|   <li><a href=\"https://jenkins.debian.net/job/$JOB/\">$JOB</a></li>" >> ${FAILED_JOBS}
+			else
+				echo "$MODIFIER|   <li><a href=\"https://jenkins.debian.net/job/$JOB/\">$JOB</a></li>" >> ${FAILED_SUSPECTS}
+			fi
 		else
 			echo "  failed job: $JOB - $MODIFIER"
-			echo "$MODIFIER|   <li><a href=\"https://jenkins.debian.net/job/$JOB/\">$JOB</a> <em>($MODIFIER)</em></li>" >> ${FAILED_JOBS}
+			if ! $SUSPECT ; then
+				echo "$MODIFIER|   <li><a href=\"https://jenkins.debian.net/job/$JOB/\">$JOB</a> <em>($MODIFIER)</em></li>" >> ${FAILED_JOBS}
+			else
+				echo "$MODIFIER|   <li><a href=\"https://jenkins.debian.net/job/$JOB/\">$JOB</a> <em>($MODIFIER)</em></li>" >> ${FAILED_SUSPECTS}
+			fi
 		fi
 		let SCORE-=$MODIFIER || SCORE=0
 	fi
@@ -168,6 +189,7 @@ cat > $HEALTH_FILE.html <<- EOF
   Score: $SCORE (a stable jobs adds 3, an unstable job adds 1 and a failed job substracts something between 1 and 500 (indicated in brackets after the job name below), depending on the importance of the job for the setup.)
   <br/>
   Jobs considered: $INPUTS
+  (including $SUSPICIOUS on nodes automatically marked offline)
   <br/>
   Jobs ignored: $INVALID
  </p>
@@ -183,6 +205,18 @@ cat > $HEALTH_FILE.html <<- EOF
    $(cat ${UNSTABLE_JOBS})
   </ul>
  </p>
+ <p>
+  Failed jobs on nodes automatically marked down by jenkins:
+  <ul>
+   $(cat ${FAILED_SUSPECTS} | sort -t '|' -n -r | cut -d '|' -f2- | sort)
+  </ul>
+ </p>
+ <p>
+  Unstable jobs on nodes automatically marked down by jenkins:
+  <ul>
+   $(cat ${UNSTABLE_SUSPECTS})
+  </ul>
+ </p>
  <p>
   Ignored jobs (because these nodes are known and
   <a href="https://salsa.debian.org/qa/jenkins.debian.net/-/blob/master/jenkins-home/offline_nodes">documented</a>
@@ -204,4 +238,4 @@ cat > $HEALTH_FILE.html <<- EOF
 EOF
 echo "$(date -u) - $(basename $HEALTH_FILE).html updated, visible at $REPRODUCIBLE_URL/$(basename $HEALTH_FILE).html."
 echo "$(date -u) - the end."
-rm -f ${FAILED_JOBS} ${UNSTABLE_JOBS} ${IGNORED_JOBS} ${BAD_JOBS}
+rm -f ${FAILED_JOBS} ${UNSTABLE_JOBS} ${IGNORED_JOBS} ${BAD_JOBS} ${FAILED_SUSPECTS} ${UNSTABLE_SUSPECTS}



View it on GitLab: https://salsa.debian.org/qa/jenkins.debian.net/-/commit/283a5697f50cfd477acf1c516bab9611858c94f4

-- 
View it on GitLab: https://salsa.debian.org/qa/jenkins.debian.net/-/commit/283a5697f50cfd477acf1c516bab9611858c94f4
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/qa-jenkins-scm/attachments/20200720/3db07cb9/attachment-0001.html>


More information about the Qa-jenkins-scm mailing list