[Qa-jenkins-scm] [Git][qa/jenkins.debian.net][master] reproducible trbo system health check: separate nodes which have been automatically marked as down
Holger Levsen
gitlab at salsa.debian.org
Mon Jul 20 14:22:15 BST 2020
Holger Levsen pushed to branch master at Debian QA / jenkins.debian.net
Commits:
283a5697 by Holger Levsen at 2020-07-20T15:22:03+02:00
reproducible trbo system health check: separate nodes which have been automatically marked as down
Signed-off-by: Holger Levsen <holger at layer-acht.org>
- - - - -
1 changed file:
- bin/reproducible_system_health.sh
Changes:
=====================================
bin/reproducible_system_health.sh
=====================================
@@ -28,10 +28,13 @@ STATUS=-1
INPUTS=0
SCORE=0
INVALID=0
+SUSPICIOUS=0
FAILED_JOBS=$(mktemp --tmpdir=$TMPDIR trbo-status-XXXXXXX)
UNSTABLE_JOBS=$(mktemp --tmpdir=$TMPDIR trbo-status-XXXXXXX)
IGNORED_JOBS=$(mktemp --tmpdir=$TMPDIR trbo-status-XXXXXXX)
BAD_JOBS=$(mktemp --tmpdir=$TMPDIR trbo-status-XXXXXXX)
+FAILED_SUSPECTS=$(mktemp --tmpdir=$TMPDIR trbo-status-XXXXXXX)
+UNSTABLE_SUSPECTS=$(mktemp --tmpdir=$TMPDIR trbo-status-XXXXXXX)
# gather data
echo "$(date -u) - starting up."
cd /var/lib/jenkins/jobs/
@@ -78,18 +81,24 @@ for JOB in reproducible_* ; do
arm64) NODE="codethink-sled${NODE_ALIAS#codethink}-arm64.debian.net" ;;
armhf) NODE="${NODE_ALIAS}-armhf-rb.debian.net" ;;
esac
- # check the offline list from git (and not the one updated by jenkins)
+ # check the offline list from git (and the one updated by jenkins)
# as this should only ignore nodes humans (and not jenkins) have
# acknowledged they are down...
+ # jobs on nodes marked down by jenkins are noted but still counted
+ SUSPECT=false
if grep -q $NODE $JENKINS_OFFLINE_GIT_LIST >/dev/null 2>&1 ; then
echo " <li><a href=\"https://jenkins.debian.net/job/$JOB/\">$JOB</a></li>" >> ${IGNORED_JOBS}
- echo " ignored job: $JOB (node is offline)"
+ echo " ignored job: $JOB (node is marked offline)"
let INVALID+=1
continue
+ elif grep -q $NODE $JENKINS_OFFLINE_LIST >/dev/null 2>&1 ; then
+ SUSPECT=true
+ let SUSPICIOUS+=1
+ # we still count this job for the overall status...
fi
fi
#
- # node is not known offline, let's go
+ # node is not known offline (in git), let's go
#
let INPUTS+=1
FILE=$JOB/builds/permalinks
@@ -106,7 +115,11 @@ for JOB in reproducible_* ; do
if $(grep -q "Kernel needs upgrade" $JOB/builds/$LAST/log) ; then
NOTE=" (reboot needed for kernel upgrade)"
fi
- echo " <li><a href=\"https://jenkins.debian.net/job/$JOB/\">$JOB</a>$NOTE</li>" >> ${UNSTABLE_JOBS}
+ if ! $SUSPECT ; then
+ echo " <li><a href=\"https://jenkins.debian.net/job/$JOB/\">$JOB</a>$NOTE</li>" >> ${UNSTABLE_JOBS}
+ else
+ echo " <li><a href=\"https://jenkins.debian.net/job/$JOB/\">$JOB</a>$NOTE</li>" >> ${UNSTABLE_SUSPECTS}
+ fi
else
case $JOB in
reproducible_maintenance_amd64_jenkins) MODIFIER=250 ;; # main node
@@ -135,10 +148,18 @@ for JOB in reproducible_* ; do
continue
elif [ $MODIFIER -eq 1 ] ; then
echo " failed job: $JOB"
- echo "$MODIFIER| <li><a href=\"https://jenkins.debian.net/job/$JOB/\">$JOB</a></li>" >> ${FAILED_JOBS}
+ if ! $SUSPECT ; then
+ echo "$MODIFIER| <li><a href=\"https://jenkins.debian.net/job/$JOB/\">$JOB</a></li>" >> ${FAILED_JOBS}
+ else
+ echo "$MODIFIER| <li><a href=\"https://jenkins.debian.net/job/$JOB/\">$JOB</a></li>" >> ${FAILED_SUSPECTS}
+ fi
else
echo " failed job: $JOB - $MODIFIER"
- echo "$MODIFIER| <li><a href=\"https://jenkins.debian.net/job/$JOB/\">$JOB</a> <em>($MODIFIER)</em></li>" >> ${FAILED_JOBS}
+ if ! $SUSPECT ; then
+ echo "$MODIFIER| <li><a href=\"https://jenkins.debian.net/job/$JOB/\">$JOB</a> <em>($MODIFIER)</em></li>" >> ${FAILED_JOBS}
+ else
+ echo "$MODIFIER| <li><a href=\"https://jenkins.debian.net/job/$JOB/\">$JOB</a> <em>($MODIFIER)</em></li>" >> ${FAILED_SUSPECTS}
+ fi
fi
let SCORE-=$MODIFIER || SCORE=0
fi
@@ -168,6 +189,7 @@ cat > $HEALTH_FILE.html <<- EOF
Score: $SCORE (a stable jobs adds 3, an unstable job adds 1 and a failed job substracts something between 1 and 500 (indicated in brackets after the job name below), depending on the importance of the job for the setup.)
<br/>
Jobs considered: $INPUTS
+ (including $SUSPICIOUS on nodes automatically marked offline)
<br/>
Jobs ignored: $INVALID
</p>
@@ -183,6 +205,18 @@ cat > $HEALTH_FILE.html <<- EOF
$(cat ${UNSTABLE_JOBS})
</ul>
</p>
+ <p>
+ Failed jobs on nodes automatically marked down by jenkins:
+ <ul>
+ $(cat ${FAILED_SUSPECTS} | sort -t '|' -n -r | cut -d '|' -f2- | sort)
+ </ul>
+ </p>
+ <p>
+ Unstable jobs on nodes automatically marked down by jenkins:
+ <ul>
+ $(cat ${UNSTABLE_SUSPECTS})
+ </ul>
+ </p>
<p>
Ignored jobs (because these nodes are known and
<a href="https://salsa.debian.org/qa/jenkins.debian.net/-/blob/master/jenkins-home/offline_nodes">documented</a>
@@ -204,4 +238,4 @@ cat > $HEALTH_FILE.html <<- EOF
EOF
echo "$(date -u) - $(basename $HEALTH_FILE).html updated, visible at $REPRODUCIBLE_URL/$(basename $HEALTH_FILE).html."
echo "$(date -u) - the end."
-rm -f ${FAILED_JOBS} ${UNSTABLE_JOBS} ${IGNORED_JOBS} ${BAD_JOBS}
+rm -f ${FAILED_JOBS} ${UNSTABLE_JOBS} ${IGNORED_JOBS} ${BAD_JOBS} ${FAILED_SUSPECTS} ${UNSTABLE_SUSPECTS}
View it on GitLab: https://salsa.debian.org/qa/jenkins.debian.net/-/commit/283a5697f50cfd477acf1c516bab9611858c94f4
--
View it on GitLab: https://salsa.debian.org/qa/jenkins.debian.net/-/commit/283a5697f50cfd477acf1c516bab9611858c94f4
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/qa-jenkins-scm/attachments/20200720/3db07cb9/attachment-0001.html>
More information about the Qa-jenkins-scm
mailing list