[Qa-jenkins-scm] [Git][qa/jenkins.debian.net][master] 4 commits: reproducible node health: automatically restart failed avahi and acpid services
Holger Levsen
gitlab at salsa.debian.org
Sat Aug 1 16:44:24 BST 2020
Holger Levsen pushed to branch master at Debian QA / jenkins.debian.net
Commits:
46e9494b by Holger Levsen at 2020-08-01T17:36:58+02:00
reproducible node health: automatically restart failed avahi and acpid services
Signed-off-by: Holger Levsen <holger at layer-acht.org>
- - - - -
0b9152a5 by Holger Levsen at 2020-08-01T17:39:59+02:00
reproducible trbo system health check: detect reproducible_build.sh zombies
Signed-off-by: Holger Levsen <holger at layer-acht.org>
- - - - -
d919db69 by Holger Levsen at 2020-08-01T17:42:59+02:00
reproducible Debian: mark some armhf nodes as down after reboots due to 10.5 upgrades
Signed-off-by: Holger Levsen <holger at layer-acht.org>
- - - - -
25dc000f by Holger Levsen at 2020-08-01T17:44:14+02:00
reproducible: fix naming scheme example for armhf and add one for osuosl nodes
Signed-off-by: Holger Levsen <holger at layer-acht.org>
- - - - -
8 changed files:
- bin/reproducible_node_health_check.sh
- bin/reproducible_system_health.sh
- hosts/common/etc/sudoers.d/jenkins
- hosts/jenkins-test-vm/etc/sudoers.d/jenkins
- hosts/jenkins/etc/sudoers.d/jenkins
- hosts/profitbricks-build10-amd64/etc/sudoers.d/jenkins
- hosts/profitbricks-build9-amd64/etc/sudoers.d/jenkins
- jenkins-home/offline_nodes
Changes:
=====================================
bin/reproducible_node_health_check.sh
=====================================
@@ -151,7 +151,13 @@ if ! systemctl is-system-running > /dev/null; then
echo "$(date -u) - problematic services found:"
cat $SERVICES
echo "$(date -u) - trying to fix problematic services."
- for UNIT in acpid rc-local ; do
+ for UNIT in avahi-daemon acpid ; do
+ if [ -n "$(grep $UNIT $SERVICES)" ] ; then
+ echo "$(date -u) - restarting failed service $UNIT..."
+ sudo systemctl restart $UNIT
+ fi
+ done
+ for UNIT in rc-local ; do
if [ -n "$(grep $UNIT $SERVICES)" ] ; then
echo "$(date -u) - resetting failed unit $UNIT..."
sudo systemctl reset-failed $UNIT
=====================================
bin/reproducible_system_health.sh
=====================================
@@ -147,12 +147,16 @@ for JOB_NAME in reproducible_* ; do
small_note " (unkillable unwanted processes)"
elif $(grep -q "failed failed pbuilder_build" $LOG) ; then
small_note " (pbuilder build scope failed)"
+ elif $(grep -q "failed failed Avahi mDNS/DNS-SD Stack" $LOG) ; then
+ small_note " (avahi failed)"
elif $(grep -q "failed failed Rotate log files" $LOG) ; then
small_note " (logrotate failed)"
elif $(grep -q "Warning: Tried, but failed to delete these schroots:" $LOG) ; then
small_note " (failed to delete schroots)"
elif $(grep -q "Warning: Tried, but failed to delete these schroot sessions:" $LOG) ; then
small_note " (failed to delete schroot sessions)"
+ elif $(grep -q "Warning: found reproducible_build.sh processes which have pid 1 as parent (and not sshd)" $LOG) ; then
+ small_note " (reproducible_build.sh zombies)"
fi
if ! $SUSPECT ; then
echo " <li><a href=\"$JOB_URL/\"><img src=\"$JOB_URL/badge/icon\">$JOB_NAME</a>$NOTE</li>" >> ${UNSTABLE_JOBS}
=====================================
hosts/common/etc/sudoers.d/jenkins
=====================================
@@ -4,6 +4,7 @@ jenkins ALL= \
/usr/bin/sbuild-createchroot *, \
/usr/bin/sbuild-update *, \
/bin/systemctl reset-failed*, \
+ /bin/systemctl restart*, \
/usr/bin/tee /schroots/*, \
/usr/bin/tee -a /schroots/*, \
/usr/bin/tee /etc/schroot/chroot.d/jenkins*, \
=====================================
hosts/jenkins-test-vm/etc/sudoers.d/jenkins
=====================================
@@ -2,6 +2,7 @@ jenkins ALL= \
NOPASSWD: /usr/sbin/debootstrap *, \
/usr/bin/mmdebstrap *, \
/bin/systemctl reset-failed*, \
+ /bin/systemctl restart*, \
/usr/bin/tee /schroots/*, \
/usr/bin/tee -a /schroots/*, \
/usr/bin/tee /etc/schroot/chroot.d/jenkins*, \
=====================================
hosts/jenkins/etc/sudoers.d/jenkins
=====================================
@@ -2,6 +2,7 @@ jenkins ALL= \
NOPASSWD: /usr/sbin/debootstrap *, \
/usr/bin/mmdebstrap *, \
/bin/systemctl reset-failed*, \
+ /bin/systemctl restart*, \
/usr/bin/tee /schroots/*, \
/usr/bin/tee -a /schroots/*, \
/usr/bin/tee /etc/schroot/chroot.d/jenkins*, \
=====================================
hosts/profitbricks-build10-amd64/etc/sudoers.d/jenkins
=====================================
@@ -2,6 +2,7 @@ jenkins ALL= \
NOPASSWD: /usr/sbin/debootstrap *, \
/usr/bin/mmdebstrap *, \
/bin/systemctl reset-failed*, \
+ /bin/systemctl restart*, \
/usr/bin/tee /schroots/*, \
/usr/bin/tee -a /schroots/*, \
/usr/bin/tee /etc/schroot/chroot.d/jenkins*, \
=====================================
hosts/profitbricks-build9-amd64/etc/sudoers.d/jenkins
=====================================
@@ -2,6 +2,7 @@ jenkins ALL= \
NOPASSWD: /usr/sbin/debootstrap *, \
/usr/bin/mmdebstrap *, \
/bin/systemctl reset-failed*, \
+ /bin/systemctl restart*, \
/usr/sbin/chroot /chroots/*, \
/bin/rm -rf --one-file-system /chroots/*, \
/bin/umount -l /chroots/*, \
=====================================
jenkins-home/offline_nodes
=====================================
@@ -3,9 +3,10 @@
# as offline in the jenkins UI.
#
# The code greps for the hostname, i.e.
-# foobar-armhf.debian.net
+# foobar-armhf-rb.debian.net
# codethink-sled23-arm64.debian.net
# profitbricks-build42-amd64.debian.net
+# osuosl-build123-amd64.debian.net
#
# (name coming from the basename of the workspace directory for start-agent.sh
# and from the list in reproducible_build_service.sh for reproducible_worker.sh)
@@ -13,5 +14,13 @@
# Also see https://pad.sfconservancy.org/p/rb-build-nodes-keep
+# armhf nodes which didn't come back after reboots following 10.5 point release updates
+ff64a-armhf-rb.debian.net
+jtk1a-armhf-rb.debian.net
+jtx1a-armhf-rb.debian.net
+odu3a-armhf-rb.debian.net
+odxu4a-armhf-rb.debian.net
+
+
# Down here nodes are automatically added by the maintenance job when they have
# been failing their health check for too long.
View it on GitLab: https://salsa.debian.org/qa/jenkins.debian.net/-/compare/e1adfc062b4da6397719236d6d16047c4b049d7a...25dc000f7a66b7814644c8a42ae9c52c5ba7db36
--
View it on GitLab: https://salsa.debian.org/qa/jenkins.debian.net/-/compare/e1adfc062b4da6397719236d6d16047c4b049d7a...25dc000f7a66b7814644c8a42ae9c52c5ba7db36
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/qa-jenkins-scm/attachments/20200801/a7df7d14/attachment-0001.html>
More information about the Qa-jenkins-scm mailing list