[Qa-jenkins-scm] [Git][qa/jenkins.debian.net][master] 4 commits: reproducible node health: automatically restart failed avahi and acpid services

Holger Levsen gitlab at salsa.debian.org
Sat Aug 1 16:44:24 BST 2020



Holger Levsen pushed to branch master at Debian QA / jenkins.debian.net


Commits:
46e9494b by Holger Levsen at 2020-08-01T17:36:58+02:00
reproducible node health: automatically restart failed avahi and acpid services

Signed-off-by: Holger Levsen <holger at layer-acht.org>

- - - - -
0b9152a5 by Holger Levsen at 2020-08-01T17:39:59+02:00
reproducible trbo system health check: detect reproducible_build.sh zombies

Signed-off-by: Holger Levsen <holger at layer-acht.org>

- - - - -
d919db69 by Holger Levsen at 2020-08-01T17:42:59+02:00
reproducible Debian: mark some armhf nodes as down after reboots due to 10.5 upgrades

Signed-off-by: Holger Levsen <holger at layer-acht.org>

- - - - -
25dc000f by Holger Levsen at 2020-08-01T17:44:14+02:00
reproducible: fix naming scheme example for armhf and add one for osuosl nodes

Signed-off-by: Holger Levsen <holger at layer-acht.org>

- - - - -


8 changed files:

- bin/reproducible_node_health_check.sh
- bin/reproducible_system_health.sh
- hosts/common/etc/sudoers.d/jenkins
- hosts/jenkins-test-vm/etc/sudoers.d/jenkins
- hosts/jenkins/etc/sudoers.d/jenkins
- hosts/profitbricks-build10-amd64/etc/sudoers.d/jenkins
- hosts/profitbricks-build9-amd64/etc/sudoers.d/jenkins
- jenkins-home/offline_nodes


Changes:

=====================================
bin/reproducible_node_health_check.sh
=====================================
@@ -151,7 +151,13 @@ if ! systemctl is-system-running > /dev/null; then
 	echo "$(date -u) - problematic services found:"
 	cat $SERVICES
 	echo "$(date -u) - trying to fix problematic services."
-	for UNIT in acpid rc-local ; do
+	for UNIT in avahi-daemon acpid ; do
+		if [ -n "$(grep $UNIT $SERVICES)" ] ; then
+			echo "$(date -u) - restarting failed service $UNIT..."
+		        sudo systemctl restart $UNIT
+		fi
+	done
+	for UNIT in rc-local ; do
 		if [ -n "$(grep $UNIT $SERVICES)" ] ; then
 			echo "$(date -u) - resetting failed unit $UNIT..."
 		        sudo systemctl reset-failed $UNIT
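
Context for the hunk above: $SERVICES is populated earlier in the script (outside this hunk) with the list of failed units. A minimal stand-alone sketch of the same pattern; the mktemp setup here is an assumption for illustration, not code from the repository:

    # collect the currently failed units (assumption: this mirrors the
    # script's earlier setup of $SERVICES)
    SERVICES=$(mktemp)
    systemctl list-units --failed --no-legend > "$SERVICES"
    # restart only the services we know are safe to recover this way
    for UNIT in avahi-daemon acpid ; do
        if grep -q "$UNIT" "$SERVICES" ; then
            echo "$(date -u) - restarting failed service $UNIT..."
            sudo systemctl restart "$UNIT"
        fi
    done
    rm -f "$SERVICES"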


=====================================
bin/reproducible_system_health.sh
=====================================
@@ -147,12 +147,16 @@ for JOB_NAME in reproducible_* ; do
 			small_note " (unkillable unwanted processes)"
 		elif $(grep -q "failed failed pbuilder_build" $LOG) ; then
 			small_note " (pbuilder build scope failed)"
+		elif $(grep -q "failed failed Avahi mDNS/DNS-SD Stack" $LOG) ; then
+			small_note " (avahi failed)"
 		elif $(grep -q "failed failed Rotate log files" $LOG) ; then
 			small_note " (logrotate failed)"
 		elif $(grep -q "Warning: Tried, but failed to delete these schroots:" $LOG) ; then
 			small_note " (failed to delete schroots)"
 		elif $(grep -q "Warning: Tried, but failed to delete these schroot sessions:" $LOG) ; then
 			small_note " (failed to delete schroot sessions)"
+		elif $(grep -q "Warning: found reproducible_build.sh processes which have pid 1 as parent (and not sshd)" $LOG) ; then
+			small_note " (reproducible_build.sh zombies)"
 		fi
 		if ! $SUSPECT ; then
 			echo "   <li><a href=\"$JOB_URL/\"><img src=\"$JOB_URL/badge/icon\">$JOB_NAME</a>$NOTE</li>" >> ${UNSTABLE_JOBS}


=====================================
hosts/common/etc/sudoers.d/jenkins
=====================================
@@ -4,6 +4,7 @@ jenkins ALL=  \
 	/usr/bin/sbuild-createchroot *, \
 	/usr/bin/sbuild-update *, \
 	/bin/systemctl reset-failed*, \
+	/bin/systemctl restart*, \
 	/usr/bin/tee /schroots/*, \
 	/usr/bin/tee -a /schroots/*, \
 	/usr/bin/tee /etc/schroot/chroot.d/jenkins*, \
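
This line (and the identical additions to the per-host sudoers files below) is what allows the unprivileged jenkins user to perform the restarts shown in the health check above without a password, e.g.:

    # run as the jenkins user; -n makes sudo fail rather than prompt
    sudo -n /bin/systemctl restart avahi-daemon

Note that the glob is broad: /bin/systemctl restart* matches any unit name, mirroring the existing reset-failed* rule.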


=====================================
hosts/jenkins-test-vm/etc/sudoers.d/jenkins
=====================================
@@ -2,6 +2,7 @@ jenkins ALL=  \
 	NOPASSWD: /usr/sbin/debootstrap *, \
 	/usr/bin/mmdebstrap *, \
 	/bin/systemctl reset-failed*, \
+	/bin/systemctl restart*, \
 	/usr/bin/tee /schroots/*, \
 	/usr/bin/tee -a /schroots/*, \
 	/usr/bin/tee /etc/schroot/chroot.d/jenkins*, \


=====================================
hosts/jenkins/etc/sudoers.d/jenkins
=====================================
@@ -2,6 +2,7 @@ jenkins ALL=  \
 	NOPASSWD: /usr/sbin/debootstrap *, \
 	/usr/bin/mmdebstrap *, \
 	/bin/systemctl reset-failed*, \
+	/bin/systemctl restart*, \
 	/usr/bin/tee /schroots/*, \
 	/usr/bin/tee -a /schroots/*, \
 	/usr/bin/tee /etc/schroot/chroot.d/jenkins*, \


=====================================
hosts/profitbricks-build10-amd64/etc/sudoers.d/jenkins
=====================================
@@ -2,6 +2,7 @@ jenkins ALL=  \
 	NOPASSWD: /usr/sbin/debootstrap *, \
 	/usr/bin/mmdebstrap *, \
 	/bin/systemctl reset-failed*, \
+	/bin/systemctl restart*, \
 	/usr/bin/tee /schroots/*, \
 	/usr/bin/tee -a /schroots/*, \
 	/usr/bin/tee /etc/schroot/chroot.d/jenkins*, \


=====================================
hosts/profitbricks-build9-amd64/etc/sudoers.d/jenkins
=====================================
@@ -2,6 +2,7 @@ jenkins ALL=  \
 	NOPASSWD: /usr/sbin/debootstrap *, \
 	/usr/bin/mmdebstrap *, \
 	/bin/systemctl reset-failed*, \
+	/bin/systemctl restart*, \
 	/usr/sbin/chroot /chroots/*, \
 	/bin/rm -rf --one-file-system /chroots/*, \
 	/bin/umount -l /chroots/*, \


=====================================
jenkins-home/offline_nodes
=====================================
@@ -3,9 +3,10 @@
 # as offline in the jenkins UI.
 #
 # The code greps for the hostname, i.e.
-# foobar-armhf.debian.net
+# foobar-armhf-rb.debian.net
 # codethink-sled23-arm64.debian.net
 # profitbricks-build42-amd64.debian.net
+# osuosl-build123-amd64.debian.net
 #
 # (name coming from the basename of the workspace directory for start-agent.sh
 # and from the list in reproducible_build_service.sh for reproducible_worker.sh)
@@ -13,5 +14,13 @@
 
 # Also see https://pad.sfconservancy.org/p/rb-build-nodes-keep
 
+# armhf nodes which didn't come back after reboots following the 10.5 point release updates
+ff64a-armhf-rb.debian.net
+jtk1a-armhf-rb.debian.net
+jtx1a-armhf-rb.debian.net
+odu3a-armhf-rb.debian.net
+odxu4a-armhf-rb.debian.net
+
+
 # Down here nodes are automatically added by the maintenance job when they have
 # been failing their health check for too long.
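
As the file's header says, consumers simply grep for the hostname, so matching is a plain substring search. A hypothetical sketch of that lookup, using one of the nodes just added (the path and variable name are illustrative):

    # hypothetical lookup; the real consumers are start-agent.sh and
    # reproducible_build_service.sh, per the comments above
    NODE=ff64a-armhf-rb.debian.net
    if grep -q "$NODE" jenkins-home/offline_nodes ; then
        echo "$NODE is marked offline, not starting a worker for it."
    fi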



View it on GitLab: https://salsa.debian.org/qa/jenkins.debian.net/-/compare/e1adfc062b4da6397719236d6d16047c4b049d7a...25dc000f7a66b7814644c8a42ae9c52c5ba7db36
