[Qa-jenkins-scm] [Git][qa/jenkins.debian.net][master] 2 commits: reproducible: maintenance job: detect hanging health check runs too (not only failed ones)
Holger Levsen
gitlab at salsa.debian.org
Thu Mar 7 22:22:34 GMT 2019
Holger Levsen pushed to branch master at Debian QA / jenkins.debian.net
Commits:
b9941d09 by Holger Levsen at 2019-03-07T22:10:11Z
reproducible: maintenance job: detect hanging health check runs too (not only failed ones)
Signed-off-by: Holger Levsen <holger at layer-acht.org>
- - - - -
0276c1f9 by Holger Levsen at 2019-03-07T22:22:01Z
reproducible: maintenance job: also analyse node maintenance job runs to determine whether to mark nodes offline
Signed-off-by: Holger Levsen <holger at layer-acht.org>
- - - - -
1 changed file:
- bin/reproducible_maintenance.sh
Changes:
=====================================
bin/reproducible_maintenance.sh
=====================================
@@ -150,10 +150,24 @@ if [ "$HOSTNAME" = "$MAINNODE" ] ; then
#
echo "$(date -u) - Looking for unhealthy nodes."
cd ~/jobs
+ DUMMY_FILE=$(mktemp --tmpdir=$TMPDIR maintenance-XXXXXXX)
SICK=""
- for i in reproducible_node_health_check_* ; do
- NODE_ALIAS=$(echo $i | cut -d '_' -f6)
- NODE_ARCH=$(echo $i | cut -d '_' -f5)
+ for i in reproducible_node_health_check_* reproducible_maintenance_* ; do
+ case $i in
+ reproducible_node_health_check_*)
+ NODE_ALIAS=$(echo $i | cut -d '_' -f6)
+ NODE_ARCH=$(echo $i | cut -d '_' -f5)
+ FORCE_DATE=$(date -u -d "1 hour ago" '+%Y-%m-%d %H:%M')
+ MAXDIFF=4
+ ;;
+ reproducible_maintenance_*)
+ NODE_ALIAS=$(echo $i | cut -d '_' -f4)
+ NODE_ARCH=$(echo $i | cut -d '_' -f3)
+ FORCE_DATE=$(date -u -d "5 hour ago" '+%Y-%m-%d %H:%M')
+ MAXDIFF=2
+ ;;
+ esac
+ touch -d "$FORCE_DATE" $DUMMY_FILE
case $NODE_ARCH in
amd64) NODE="profitbricks-build${NODE_ALIAS#profitbricks}-amd64.debian.net" ;;
i386) NODE="profitbricks-build${NODE_ALIAS#profitbricks}-i386.debian.net" ;;
@@ -174,7 +188,9 @@ if [ "$HOSTNAME" = "$MAINNODE" ] ; then
fi
if [ $DIFF -eq -1 ] ; then
echo "Problems analysing $i build logs, ignoring $NODE."
- elif [ $DIFF -gt 4 ] ; then
+ # either the diff is greater than $MAXDIFF (=more than $MAXDIFF consecutive job runs failed)
+ # or the last successful run is older than $FORCE_DATE, i.e. 1 hour for health checks and 5 hours for maintenance jobs (=a job is still running/hanging)
+ elif [ $DIFF -gt $MAXDIFF ] || [ $LAST -ot $DUMMY_FILE ] ; then
echo -n "$i job has issues since more than an hour"
if grep -q $NODE ~/offline_nodes >/dev/null 2>&1 ; then
echo " and $NODE already marked as offline, good."
@@ -198,6 +214,7 @@ if [ "$HOSTNAME" = "$MAINNODE" ] ; then
fi
irc_message debian-reproducible "$MESSAGE To make this permanent, edit jenkins-home/offline_nodes in git."
fi
+ rm -f $DUMMY_FILE
fi
echo "$(date -u) - updating the schroots and pbuilder now..."
View it on GitLab: https://salsa.debian.org/qa/jenkins.debian.net/compare/2e2966b7ae836fb1147ab8d0a5eb9996b2b80c41...0276c1f97d552f63f90c50a066b7d8177fa9a5bc
--
View it on GitLab: https://salsa.debian.org/qa/jenkins.debian.net/compare/2e2966b7ae836fb1147ab8d0a5eb9996b2b80c41...0276c1f97d552f63f90c50a066b7d8177fa9a5bc
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/qa-jenkins-scm/attachments/20190307/3d1dab88/attachment-0001.html>
More information about the Qa-jenkins-scm mailing list