[Qa-jenkins-scm] [Git][qa/jenkins.debian.net][master] 2 commits: reproducible: maintenance job: detect hanging health check runs too (not only failed ones)

Holger Levsen gitlab at salsa.debian.org
Thu Mar 7 22:22:34 GMT 2019


Holger Levsen pushed to branch master at Debian QA / jenkins.debian.net


Commits:
b9941d09 by Holger Levsen at 2019-03-07T22:10:11Z
reproducible: maintenance job: detect hanging health check runs too (not only failed ones)

Signed-off-by: Holger Levsen <holger at layer-acht.org>

- - - - -
0276c1f9 by Holger Levsen at 2019-03-07T22:22:01Z
reproducible: maintenance job: also analyse node maintenance job runs to determine whether to mark nodes offline

Signed-off-by: Holger Levsen <holger at layer-acht.org>

- - - - -


1 changed file:

- bin/reproducible_maintenance.sh


Changes:

=====================================
bin/reproducible_maintenance.sh
=====================================
@@ -150,10 +150,24 @@ if [ "$HOSTNAME" = "$MAINNODE" ] ; then
 	#
 	echo "$(date -u) - Looking for unhealthy nodes."
 	cd ~/jobs
+	DUMMY_FILE=$(mktemp --tmpdir=$TMPDIR maintenance-XXXXXXX)
 	SICK=""
-	for i in reproducible_node_health_check_* ; do
-		NODE_ALIAS=$(echo $i | cut -d '_' -f6)
-		NODE_ARCH=$(echo $i | cut -d '_' -f5)
+	for i in reproducible_node_health_check_* reproducible_maintenance_* ; do
+		case $i in
+			reproducible_node_health_check_*)
+				NODE_ALIAS=$(echo $i | cut -d '_' -f6)
+				NODE_ARCH=$(echo $i | cut -d '_' -f5)
+				FORCE_DATE=$(date -u -d "1 hour ago" '+%Y-%m-%d %H:%M')
+				MAXDIFF=4
+				;;
+			reproducible_maintenance_*)
+				NODE_ALIAS=$(echo $i | cut -d '_' -f4)
+				NODE_ARCH=$(echo $i | cut -d '_' -f3)
+				FORCE_DATE=$(date -u -d "5 hour ago" '+%Y-%m-%d %H:%M')
+				MAXDIFF=2
+				;;
+		esac
+		touch -d "$FORCE_DATE" $DUMMY_FILE
 		case $NODE_ARCH in
 			amd64)	NODE="profitbricks-build${NODE_ALIAS#profitbricks}-amd64.debian.net" ;;
 			i386)	NODE="profitbricks-build${NODE_ALIAS#profitbricks}-i386.debian.net" ;;
@@ -174,7 +188,9 @@ if [ "$HOSTNAME" = "$MAINNODE" ] ; then
 		fi
 		if [ $DIFF -eq -1 ] ; then
 			echo "Problems analysing $i build logs, ignoring $NODE."
-		elif [ $DIFF -gt 4 ] ; then
+		# either the diff is greater than $MAXDIFF (=the last $MAXDIFF job runs failed)
+		# or the last successful run is older than $FORCE_DATE: 1 hour for health checks, 5 hours for maintenance jobs (=a job is still running/hanging)
+		elif [ $DIFF -gt $MAXDIFF ] || [ $LAST -ot $DUMMY_FILE ] ; then
 			echo -n "$i job has issues since more than an hour"
 			if grep -q $NODE ~/offline_nodes >/dev/null 2>&1 ; then
 				echo " and $NODE already marked as offline, good."
@@ -198,6 +214,7 @@ if [ "$HOSTNAME" = "$MAINNODE" ] ; then
 		fi
 		irc_message debian-reproducible "$MESSAGE To make this permanent, edit jenkins-home/offline_nodes in git."
 	fi
+	rm -f $DUMMY_FILE
 fi
 
 echo "$(date -u) - updating the schroots and pbuilder now..."



View it on GitLab: https://salsa.debian.org/qa/jenkins.debian.net/compare/2e2966b7ae836fb1147ab8d0a5eb9996b2b80c41...0276c1f97d552f63f90c50a066b7d8177fa9a5bc

-- 
View it on GitLab: https://salsa.debian.org/qa/jenkins.debian.net/compare/2e2966b7ae836fb1147ab8d0a5eb9996b2b80c41...0276c1f97d552f63f90c50a066b7d8177fa9a5bc
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/qa-jenkins-scm/attachments/20190307/3d1dab88/attachment-0001.html>


More information about the Qa-jenkins-scm mailing list