[Git][qa/jenkins.debian.net][master] 2 commits: reproducible maintenance: be verbose about the proxy used
Holger Levsen (@holger)
gitlab at salsa.debian.org
Tue Oct 24 10:43:38 BST 2023
Holger Levsen pushed to branch master at Debian QA / jenkins.debian.net
Commits:
f2b6ed80 by Holger Levsen at 2023-10-24T11:37:06+02:00
reproducible maintenance: be verbose about the proxy used
Signed-off-by: Holger Levsen <holger at layer-acht.org>
- - - - -
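In f2b6ed80 the extra verbosity amounts to logging which proxy is actually in use before the connectivity test, so a failing curl can be attributed to a concrete proxy. A minimal sketch of the resulting check (assuming MIRROR, HOSTNAME and http_proxy are set earlier in the script, as in the diff below):

    # log the proxy being used, then test it; abort the maintenance run if it is down
    echo "$(date -u) - testing whether the proxy $http_proxy works..."
    if ! curl "$MIRROR" > /dev/null ; then
        echo "Error: curl $MIRROR failed, probably the proxy is down for $HOSTNAME"
        exit 1
    fi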
3824405f by Holger Levsen at 2023-10-24T11:43:11+02:00
reproducible maintenance: first do local cleanup tasks, then do tasks which require network+proxy
Signed-off-by: Holger Levsen <holger at layer-acht.org>
- - - - -
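In 3824405f the reordering means a broken proxy no longer aborts the run before the purely local housekeeping has been done. A simplified outline of the resulting task order (condensed from the diff below):

    # 1. local cleanup, no network needed:
    #    - kill leftover build processes older than ~24h
    #    - remove old artifacts, live-build leftovers and rsync leftovers
    #    - fix file and kernel permissions
    # 2. tasks which require the network and a working proxy:
    #    - test the proxy with curl $MIRROR (now logging $http_proxy)
    #    - on the main node: mark unhealthy nodes offline and notify by mail
    #    - update the chdists, pbuilder base tarballs and the alpine / Arch Linux schroots
    # 3. on the main node: reschedule builds which failed due to network errors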
1 changed file:
- bin/reproducible_maintenance.sh
Changes:
=====================================
bin/reproducible_maintenance.sh
=====================================
@@ -172,219 +172,6 @@ if [ -n "$OLDSTUFF" ] ; then
DIRTY=true
fi
-#
-# check for working proxy
-#
-echo "$(date -u) - testing whether the proxy works..."
-curl $MIRROR > /dev/null
-if [ $? -ne 0 ] ; then
- echo "Error: curl $MIRROR failed, probably the proxy is down for $HOSTNAME"
- exit 1
-fi
-
-if [ "$HOSTNAME" = "$MAINNODE" ] ; then
- #
- # find nodes with problems and temporarily turn them offline
- #
- echo "$(date -u) - Looking for unhealthy nodes."
- cd ~/jobs
- DUMMY_FILE=$(mktemp --tmpdir=$TMPDIR maintenance-XXXXXXX)
- SICK=""
- for i in reproducible_node_health_check_* reproducible_maintenance_* ; do
- case $i in
- reproducible_node_health_check_amd64_jenkins|reproducible_maintenance_amd64_jenkins)
- echo "Skipping $i..."
- continue
- ;;
- reproducible_node_health_check_*)
- NODE_ALIAS=$(echo $i | cut -d '_' -f6)
- NODE_ARCH=$(echo $i | cut -d '_' -f5)
- FORCE_DATE=$(date -u -d "3 hour ago" '+%Y-%m-%d %H:%M')
- MAXDIFF=12
- ;;
- reproducible_maintenance_*)
- NODE_ALIAS=$(echo $i | cut -d '_' -f4)
- NODE_ARCH=$(echo $i | cut -d '_' -f3)
- FORCE_DATE=$(date -u -d "8 hour ago" '+%Y-%m-%d %H:%M')
- MAXDIFF=3
- ;;
- esac
- touch -d "$FORCE_DATE" $DUMMY_FILE
- case $NODE_ARCH in
- amd64)
- case "$NODE_ALIAS" in
- ionos*) NODE="$NODE_ALIAS-amd64.debian.net" ;;
- osuosl*) NODE="osuosl${NODE_ALIAS#osuosl}-amd64.debian.net" ;;
- esac ;;
- i386) NODE="$NODE_ALIAS-i386.debian.net" ;;
- arm64) NODE="codethink${NODE_ALIAS#codethink}-arm64.debian.net" ;;
- armhf) NODE="${NODE_ALIAS}-armhf-rb.debian.net" ;;
- esac
- case "$NODE" in
- ionos9-amd64.debian.net|ionos10-amd64.debian.net)
- # ionos9 and ionos10 are not used for r-b and sometimes are too busy
- # to run healthcheck / maintenance jobs
- echo "Skipping ${NODE}..."
- continue
- ;;
- esac
- cd $i/builds
- LAST=$(ls -rt1 | tail -1)
- GOOD=$(awk '/^lastSuccessfulBuild/ {print $2}' permalinks)
- if [ "$LAST" = "$GOOD" ] ; then
- DIFF=0
- else
- let DIFF=$LAST-$GOOD 2>/dev/null|| DIFF=-1
- fi
- if [ $DIFF -eq -1 ] ; then
- echo "Warning: Problems analysing $i build logs, ignoring $NODE."
- # either the diff is greater than $MAXDIFF (=the last $MAXDIFF job runs failed)
- # or the last successful run is older than an hour (=a job is still running/hanging)
- elif [ $DIFF -gt $MAXDIFF ] || [ $LAST -ot $DUMMY_FILE ] ; then
- echo -n "$i job has issues since more than an hour"
- if grep -q $NODE $JENKINS_OFFLINE_LIST >/dev/null 2>&1 ; then
- echo " and $NODE already marked as offline, good."
- else
- echo $NODE >> $JENKINS_OFFLINE_LIST
- echo " so $NODE has (temporarily) been marked as offline now."
- SICK="$SICK $NODE"
- fi
- else
- echo "$NODE is doing fine, good."
- fi
- cd ../..
- done
- if [ -n "$SICK" ] ; then
- SICK=$(echo "$SICK" | sed 's#.debian.net##g' | sed 's#-rb##g' | sed 's# ##' )
- if [[ $SICK =~ ' ' ]]; then
- SICK=$(echo "$SICK" | sed 's# # and #g')
- MESSAGE="$SICK have health problems and have temporarily been marked as offline."
- else
- MESSAGE="$SICK has health problems and has temporarily been marked as offline."
- fi
- MESSAGE="$MESSAGE To make this permanent, edit jenkins-home/offline_nodes in git."
- RECIPIENTS="mattia at debian.org holger at debian.org"
- if [[ $MESSAGE =~ armhf ]]; then
- RECIPIENTS="$RECIPIENTS vagrant at reproducible-builds.org"
- fi
- for TO in $RECIPIENTS ; do
- echo -e "$MESSAGE" | mail -s "jenkins nodes temporarily marked offline" $TO
- done
-
- fi
- rm -f $DUMMY_FILE
-fi
-
-echo "$(date -u) - updating the chdists, schroots and pbuilder now..."
-# use host architecture (only)
-ARCH=$(dpkg --print-architecture)
-# use host apt proxy configuration for pbuilder
-if [ -n "$http_proxy" ] ; then
- pbuilder_http_proxy="--http-proxy $http_proxy"
-fi
-for s in $SUITES ; do
- for i in osuosl ionos3 ionos7 ionos9 ionos10 ; do
- if [ "${HOSTNAME:0:${#i}}" = "$i" ]; then
- # this node is not used to do Debian rebuilds, skip it all
- continue 2
- fi
- done
- #
- # chdist update
- #
- distname="$s-$ARCH"
- echo "$(date -u) - updating the $s/$ARCH chdist now."
- if [ ! -d "$CHPATH/$distname" ]; then
- echo "$(date -u) - chdist not existing, creating one now..."
- if ! chdist --data-dir="$CHPATH" --arch="$ARCH" create "$distname" "$MIRROR" "$s" main ; then
- echo "Error: failed to create the $s/$ARCH chdist."
- exit 1
- fi
- . /srv/jenkins/bin/jenkins_node_definitions.sh
- get_node_information "$HOSTNAME"
- if "$NODE_RUN_IN_THE_FUTURE" ; then
- echo "This node is reported to run in the future, configuring APT to ignore the Release file expiration..."
- echo 'Acquire::Check-Valid-Until "false";' > "$CHPATH/$distname/etc/apt/apt.conf.d/398future"
- fi
- fi
- if ! chdist --data-dir="$CHPATH" apt-get "$distname" -q update ; then
- echo "Warning: failed to update the $s/$ARCH chdist."
- DIRTY=true
- fi
- #
- # pbuilder update
- #
- # skip main node
- if [ "$HOSTNAME" = "$MAINNODE" ] ; then
- continue
- else
- echo "$(date -u) - updating pbuilder for $s/$ARCH now."
- fi
- for i in 1 2 3 4 ; do
- [ ! -f /var/cache/pbuilder/$s-reproducible-base.tgz ] || sudo pbuilder --update $pbuilder_http_proxy --basetgz /var/cache/pbuilder/$s-reproducible-base.tgz
- RESULT=$?
- if [ $RESULT -eq 1 ] ; then
- # sleep 61-120 secs
- echo "Sleeping some time... (to workaround network problems like 'Hash Sum mismatch'...)"
- /bin/sleep $(echo "scale=1 ; ($(shuf -i 1-600 -n 1)/10)+60" | bc )
- echo "$(date -u) - Retrying to update pbuilder for $s/$ARCH."
- elif [ $RESULT -eq 0 ] ; then
- break
- fi
- done
- if [ $RESULT -eq 1 ] ; then
- echo "Warning: failed to update pbuilder for $s/$ARCH."
- DIRTY=true
- fi
-done
-set -e
-
-# for alpine
-set +e
-case $HOSTNAME in
- osuosl1-amd64|osuosl2-amd64|jenkins)
- echo "$(date -u) - updating alpine schroot now."
- if $(schroot -l|grep -q alpine) ; then
- schroot --directory /tmp -c source:jenkins-reproducible-alpine -u root -- apk update
- schroot --directory /tmp -c source:jenkins-reproducible-alpine -u root -- apk upgrade
- RESULT=$?
- if [ $RESULT -eq 1 ] ; then
- echo "Warning: failed to update alpine schroot."
- DIRTY=true
- else
- echo "$(date -u) - updating alpine schroot done."
- fi
- else
- echo "No alpine schroot found, how strange."
- fi
- ;;
- *) ;;
-esac
-set -e
-
-# for Arch Linux
-set +e
-case $HOSTNAME in
- osuosl1-amd64|osuosl2-amd64|jenkins)
- echo "$(date -u) - updating Arch Linux schroot now."
- schroot --directory /tmp -c source:jenkins-reproducible-archlinux -u root -- pacman -Syu --noconfirm
- RESULT=$?
- if [ $RESULT -eq 1 ] ; then
- echo "Let's see if /var/lib/pacman/db.lck exists in the schroot..."
- if [ "$(schroot --directory /tmp -c source:jenkins-reproducible-archlinux -u root -- ls /var/lib/pacman/db.lck)" = "/var/lib/pacman/db.lck" ] ; then
- echo "Warning: failed to update Arch Linux schroot, pacman/db.lck exists."
- else
- echo "Warning: failed to update Arch Linux schroot."
- fi
- DIRTY=true
- else
- echo "$(date -u) - updating Arch Linux schroot done."
- fi
- ;;
- *) ;;
-esac
-set -e
-
# delete build services logfiles
dir=/var/lib/jenkins/userContent/reproducible/debian/build_service/
if [ "$HOSTNAME" = "$MAINNODE" ] ; then
@@ -555,23 +342,366 @@ if [ -d $dir ] ; then
fi
fi
-if [ "$HOSTNAME" = "$MAINNODE" ] ; then
- #
- # find failed builds due to network problems and reschedule them
- #
- # only grep through the last 5h (300 minutes) of builds...
- # (ignore "*None.rbuild.log" because these are build which were just started)
- # this job runs every 4h
- echo "$(date -u) - Rescheduling failed builds due to network issues."
- FAILED_BUILDS=$(find $DEBIAN_BASE/rbuild -type f ! -name "*None.rbuild.log" ! -mmin +300 -exec zgrep -l -E 'E: Failed to fetch.*(Unable to connect to|Connection failed|Size mismatch|Cannot initiate the connection to|Bad Gateway|Service Unavailable)' {} \; 2>/dev/null || true)
- if [ -n "$FAILED_BUILDS" ] ; then
+# find+terminate processes which should not be there
+echo "$(date -u) - Looking for processes which should not be there."
+HAYSTACK=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXXX)
+RESULT=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXXX)
+TOKILL=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXXX)
+PBUIDS="1234 1111 2222"
+ps axo pid,user,size,pcpu,cmd > $HAYSTACK
+for i in $PBUIDS ; do
+ for PROCESS in $(pgrep -u $i -P 1 || true) ; do
+ # faked-sysv comes and goes...
+ grep ^$PROCESS $HAYSTACK | grep -v faked-sysv >> $RESULT 2> /dev/null || true
+ done
+done
+if [ -s $RESULT ] ; then
+ for PROCESS in $(cat $RESULT | cut -d " " -f1 | grep -v ^UID | xargs echo) ; do
+ AGE=$(ps -p $PROCESS -o etimes= || echo 0)
+ # a single build may take a day, so... (first build: 18h, 2nd: 24h)
+ if [ $AGE -gt $(( 24*60*60 )) ] ; then
+ echo "$PROCESS" >> $TOKILL
+ fi
+ done
+ if [ -s $TOKILL ] ; then
+ DIRTY=true
+ PSCALL=""
echo
- echo "The following builds have failed due to network problems and will be rescheduled now:"
- echo "$FAILED_BUILDS"
+ echo "Info: processes found which should not be there, killing them now:"
+ for PROCESS in $(cat $TOKILL) ; do
+ PSCALL=${PSCALL:+"$PSCALL,"}"$PROCESS"
+ done
+ ps -F -p $PSCALL
echo
- echo "Rescheduling packages: "
- REQUESTER="jenkins maintenance job"
- REASON="maintenance reschedule: reschedule builds which failed due to network errors"
+ for PROCESS in $(cat $TOKILL) ; do
+ sudo kill -9 $PROCESS 2>&1 || true
+ echo "'sudo kill -9 $PROCESS' done."
+ done
+ echo
+ fi
+fi
+rm $HAYSTACK $RESULT $TOKILL
+# There are naughty processes spawning children and leaving them to their grandparents
+PSCALL=""
+for i in $PBUIDS ; do
+ for p in $(pgrep -u $i) ; do
+ AGE=$(ps -p $p -o etimes= || echo 0)
+ # let's be generous and consider 26 hours here...
+ if [ $AGE -gt $(( 26*60*60 )) ] ; then
+ sudo kill -9 $p 2>&1 || (echo "Could not kill:" ; ps -F -p "$p")
+ sleep 2
+ # check it's gone
+ AGE=$(ps -p $p -o etimes= || echo 0)
+ if [ $AGE -gt $(( 14*60*60 )) ] ; then
+ PSCALL=${PSCALL:+"$PSCALL,"}"$p"
+ fi
+ fi
+ done
+done
+if [ -n "$PSCALL" ] ; then
+ # ignore some well known zombie processes
+ KNOWN_ZOMBIE_PROCESSES="(buf-ring.t|poll-race-mshot.t|ringbuf-read.t|send_recvmsg.t)"
+ if [ $(ps -F -p "$PSCALL" | grep -E -v "$KNOWN_ZOMBIE_PROCESSES" | wc -l) -lt 10 ] ; then
+ echo "Info: ignoring less than ten processes which should not be there and which could not be killed, because those are probably just a few harmless zombies, which can only be removed by rebooting...."
+ else
+ echo "Warning: found more than ten processes which should not be there and which could not be killed. Please investigate and reboot or ignore them...:"
+ fi
+ ps -F -p "$PSCALL" | grep -E -v "$KNOWN_ZOMBIE_PROCESSES"
+ echo
+fi
+
+# find builds which should not be there
+RESULTS=$(pgrep -f reproducible_build.sh --parent 1 || true)
+if [ -n "$RESULTS" ] ; then
+ DIRTY=true
+ echo "Warning: found reproducible_build.sh processes which have pid 1 as parent (and not sshd), thus something went wrong… please investigate."
+ echo -e "$RESULTS"
+fi
+
+# remove debian ci builds artifacts older than a day
+echo "$(date -u) - Checking for artifacts older than a day."
+ARTIFACTS=$(find $DEBIAN_BASE/artifacts/r00t-me/* -maxdepth 1 -type d -mtime +1 -exec ls -lad {} \; 2>/dev/null|| true)
+if [ -n "$ARTIFACTS" ] ; then
+ echo
+ echo "Removed old artifacts:"
+ find $DEBIAN_BASE/artifacts/r00t-me/* -maxdepth 1 -type d -mtime +1 -exec rm -rv --one-file-system {} \; || true
+ echo
+fi
+
+# remove artifacts from the debian live build jobs that are older than twelve hours
+echo "$(date -u) - Checking for artifacts from debian live build jobs that are older than 12h"
+ARTIFACTS=$(find $DEBIAN_BASE/live_build/artifacts/r00t-me/* -maxdepth 1 -type f -mmin +720 -exec ls -lad {} \; 2>/dev/null|| true)
+if [ -n "$ARTIFACTS" ] ; then
+ echo
+ echo "Removed old debian-live artifacts:"
+ find $DEBIAN_BASE/live_build/artifacts/r00t-me/* -maxdepth 1 -type f -mmin +720 -exec rm -v --one-file-system {} \; || true
+ echo
+fi
+
+# remove leftovers from rsyncing live-build results
+echo "$(date -u) - Checking for leftovers from rsyncing live-build results, that are older than a day."
+dir=/var/lib/jenkins/userContent
+ARTIFACTS=$(find $dir/reproducible-rsync-????-* -maxdepth 1 -type d -mtime +1 -exec ls -lad {} \; 2>/dev/null|| true)
+if [ -n "$ARTIFACTS" ] ; then
+ echo
+ echo "Removed leftovers from rsyncing live-build artifacts:"
+ find $dir/reproducible-rsync-????-* -maxdepth 1 -type d -mtime +1 -exec rm -rv --one-file-system {} \; || true
+ echo
+fi
+
+# find + chmod files with bad permissions
+echo "$(date -u) - Checking for files with bad permissions."
+# automatically fix rbuild files with wrong permissions...
+# (we know it happens (very rarely) but... shrugs.)
+[ ! -d $DEBIAN_BASE/rbuild ] || find $DEBIAN_BASE/rbuild ! -perm 644 -type f -exec chmod -v 644 {} \; 2>/dev/null|| true
+BADPERMS=$(find $DEBIAN_BASE/{buildinfo,dbd,dbdtxt,dbdjson,logs,logdiffs,rbuild,artifacts,buster,bullseye,bookworm,trixie,unstable,experimental,rb-pkg} ! -perm 644 -type f 2>/dev/null|| true)
+if [ -n "$BADPERMS" ] ; then
+ DIRTY=true
+ echo
+ echo "Warning: Found files with bad permissions (!=644):"
+ echo "Please fix permission manually"
+ echo "$BADPERMS" | xargs echo chmod -v 644
+ echo
+fi
+
+# find kernels we cannot read and fix that
+BADPERMS=$(find /boot/vmlinuz* ! -perm 644)
+if [ -n "$BADPERMS" ] ; then
+ echo "Fixing kernel permissions:"
+ echo $BADPERMS
+ sudo chmod +r /boot/vmlinuz*
+fi
+
+#
+# check for working proxy
+#
+echo "$(date -u) - testing whether the proxy $http_proxy works..."
+curl $MIRROR > /dev/null
+if [ $? -ne 0 ] ; then
+ echo "Error: curl $MIRROR failed, probably the proxy is down for $HOSTNAME"
+ exit 1
+fi
+
+if [ "$HOSTNAME" = "$MAINNODE" ] ; then
+ #
+ # find nodes with problems and temporarily turn them offline
+ #
+ echo "$(date -u) - Looking for unhealthy nodes."
+ cd ~/jobs
+ DUMMY_FILE=$(mktemp --tmpdir=$TMPDIR maintenance-XXXXXXX)
+ SICK=""
+ for i in reproducible_node_health_check_* reproducible_maintenance_* ; do
+ case $i in
+ reproducible_node_health_check_amd64_jenkins|reproducible_maintenance_amd64_jenkins)
+ echo "Skipping $i..."
+ continue
+ ;;
+ reproducible_node_health_check_*)
+ NODE_ALIAS=$(echo $i | cut -d '_' -f6)
+ NODE_ARCH=$(echo $i | cut -d '_' -f5)
+ FORCE_DATE=$(date -u -d "3 hour ago" '+%Y-%m-%d %H:%M')
+ MAXDIFF=12
+ ;;
+ reproducible_maintenance_*)
+ NODE_ALIAS=$(echo $i | cut -d '_' -f4)
+ NODE_ARCH=$(echo $i | cut -d '_' -f3)
+ FORCE_DATE=$(date -u -d "8 hour ago" '+%Y-%m-%d %H:%M')
+ MAXDIFF=3
+ ;;
+ esac
+ touch -d "$FORCE_DATE" $DUMMY_FILE
+ case $NODE_ARCH in
+ amd64)
+ case "$NODE_ALIAS" in
+ ionos*) NODE="$NODE_ALIAS-amd64.debian.net" ;;
+ osuosl*) NODE="osuosl${NODE_ALIAS#osuosl}-amd64.debian.net" ;;
+ esac ;;
+ i386) NODE="$NODE_ALIAS-i386.debian.net" ;;
+ arm64) NODE="codethink${NODE_ALIAS#codethink}-arm64.debian.net" ;;
+ armhf) NODE="${NODE_ALIAS}-armhf-rb.debian.net" ;;
+ esac
+ case "$NODE" in
+ ionos9-amd64.debian.net|ionos10-amd64.debian.net)
+ # ionos9 and ionos10 are not used for r-b and sometimes are too busy
+ # to run healthcheck / maintenance jobs
+ echo "Skipping ${NODE}..."
+ continue
+ ;;
+ esac
+ cd $i/builds
+ LAST=$(ls -rt1 | tail -1)
+ GOOD=$(awk '/^lastSuccessfulBuild/ {print $2}' permalinks)
+ if [ "$LAST" = "$GOOD" ] ; then
+ DIFF=0
+ else
+ let DIFF=$LAST-$GOOD 2>/dev/null|| DIFF=-1
+ fi
+ if [ $DIFF -eq -1 ] ; then
+ echo "Warning: Problems analysing $i build logs, ignoring $NODE."
+ # either the diff is greater than $MAXDIFF (=the last $MAXDIFF job runs failed)
+ # or the last successful run is older than an hour (=a job is still running/hanging)
+ elif [ $DIFF -gt $MAXDIFF ] || [ $LAST -ot $DUMMY_FILE ] ; then
+ echo -n "$i job has issues since more than an hour"
+ if grep -q $NODE $JENKINS_OFFLINE_LIST >/dev/null 2>&1 ; then
+ echo " and $NODE already marked as offline, good."
+ else
+ echo $NODE >> $JENKINS_OFFLINE_LIST
+ echo " so $NODE has (temporarily) been marked as offline now."
+ SICK="$SICK $NODE"
+ fi
+ else
+ echo "$NODE is doing fine, good."
+ fi
+ cd ../..
+ done
+ if [ -n "$SICK" ] ; then
+ SICK=$(echo "$SICK" | sed 's#.debian.net##g' | sed 's#-rb##g' | sed 's# ##' )
+ if [[ $SICK =~ ' ' ]]; then
+ SICK=$(echo "$SICK" | sed 's# # and #g')
+ MESSAGE="$SICK have health problems and have temporarily been marked as offline."
+ else
+ MESSAGE="$SICK has health problems and has temporarily been marked as offline."
+ fi
+ MESSAGE="$MESSAGE To make this permanent, edit jenkins-home/offline_nodes in git."
+ RECIPIENTS="mattia at debian.org holger at debian.org"
+ if [[ $MESSAGE =~ armhf ]]; then
+ RECIPIENTS="$RECIPIENTS vagrant at reproducible-builds.org"
+ fi
+ for TO in $RECIPIENTS ; do
+ echo -e "$MESSAGE" | mail -s "jenkins nodes temporarily marked offline" $TO
+ done
+
+ fi
+ rm -f $DUMMY_FILE
+fi
+
+echo "$(date -u) - updating the chdists, schroots and pbuilder now..."
+# use host architecture (only)
+ARCH=$(dpkg --print-architecture)
+# use host apt proxy configuration for pbuilder
+if [ -n "$http_proxy" ] ; then
+ pbuilder_http_proxy="--http-proxy $http_proxy"
+fi
+for s in $SUITES ; do
+ for i in osuosl ionos3 ionos7 ionos9 ionos10 ; do
+ if [ "${HOSTNAME:0:${#i}}" = "$i" ]; then
+ # this node is not used to do Debian rebuilds, skip it all
+ continue 2
+ fi
+ done
+ #
+ # chdist update
+ #
+ distname="$s-$ARCH"
+ echo "$(date -u) - updating the $s/$ARCH chdist now."
+ if [ ! -d "$CHPATH/$distname" ]; then
+ echo "$(date -u) - chdist not existing, creating one now..."
+ if ! chdist --data-dir="$CHPATH" --arch="$ARCH" create "$distname" "$MIRROR" "$s" main ; then
+ echo "Error: failed to create the $s/$ARCH chdist."
+ exit 1
+ fi
+ . /srv/jenkins/bin/jenkins_node_definitions.sh
+ get_node_information "$HOSTNAME"
+ if "$NODE_RUN_IN_THE_FUTURE" ; then
+ echo "This node is reported to run in the future, configuring APT to ignore the Release file expiration..."
+ echo 'Acquire::Check-Valid-Until "false";' > "$CHPATH/$distname/etc/apt/apt.conf.d/398future"
+ fi
+ fi
+ if ! chdist --data-dir="$CHPATH" apt-get "$distname" -q update ; then
+ echo "Warning: failed to update the $s/$ARCH chdist."
+ DIRTY=true
+ fi
+ #
+ # pbuilder update
+ #
+ # skip main node
+ if [ "$HOSTNAME" = "$MAINNODE" ] ; then
+ continue
+ else
+ echo "$(date -u) - updating pbuilder for $s/$ARCH now."
+ fi
+ for i in 1 2 3 4 ; do
+ [ ! -f /var/cache/pbuilder/$s-reproducible-base.tgz ] || sudo pbuilder --update $pbuilder_http_proxy --basetgz /var/cache/pbuilder/$s-reproducible-base.tgz
+ RESULT=$?
+ if [ $RESULT -eq 1 ] ; then
+ # sleep 61-120 secs
+ echo "Sleeping some time... (to workaround network problems like 'Hash Sum mismatch'...)"
+ /bin/sleep $(echo "scale=1 ; ($(shuf -i 1-600 -n 1)/10)+60" | bc )
+ echo "$(date -u) - Retrying to update pbuilder for $s/$ARCH."
+ elif [ $RESULT -eq 0 ] ; then
+ break
+ fi
+ done
+ if [ $RESULT -eq 1 ] ; then
+ echo "Warning: failed to update pbuilder for $s/$ARCH."
+ DIRTY=true
+ fi
+done
+set -e
+
+# for alpine
+set +e
+case $HOSTNAME in
+ osuosl1-amd64|osuosl2-amd64|jenkins)
+ echo "$(date -u) - updating alpine schroot now."
+ if $(schroot -l|grep -q alpine) ; then
+ schroot --directory /tmp -c source:jenkins-reproducible-alpine -u root -- apk update
+ schroot --directory /tmp -c source:jenkins-reproducible-alpine -u root -- apk upgrade
+ RESULT=$?
+ if [ $RESULT -eq 1 ] ; then
+ echo "Warning: failed to update alpine schroot."
+ DIRTY=true
+ else
+ echo "$(date -u) - updating alpine schroot done."
+ fi
+ else
+ echo "No alpine schroot found, how strange."
+ fi
+ ;;
+ *) ;;
+esac
+set -e
+
+# for Arch Linux
+set +e
+case $HOSTNAME in
+ osuosl1-amd64|osuosl2-amd64|jenkins)
+ echo "$(date -u) - updating Arch Linux schroot now."
+ schroot --directory /tmp -c source:jenkins-reproducible-archlinux -u root -- pacman -Syu --noconfirm
+ RESULT=$?
+ if [ $RESULT -eq 1 ] ; then
+ echo "Let's see if /var/lib/pacman/db.lck exists in the schroot..."
+ if [ "$(schroot --directory /tmp -c source:jenkins-reproducible-archlinux -u root -- ls /var/lib/pacman/db.lck)" = "/var/lib/pacman/db.lck" ] ; then
+ echo "Warning: failed to update Arch Linux schroot, pacman/db.lck exists."
+ else
+ echo "Warning: failed to update Arch Linux schroot."
+ fi
+ DIRTY=true
+ else
+ echo "$(date -u) - updating Arch Linux schroot done."
+ fi
+ ;;
+ *) ;;
+esac
+set -e
+
+if [ "$HOSTNAME" = "$MAINNODE" ] ; then
+ #
+ # find failed builds due to network problems and reschedule them
+ #
+ # only grep through the last 5h (300 minutes) of builds...
+ # (ignore "*None.rbuild.log" because these are build which were just started)
+ # this job runs every 4h
+ echo "$(date -u) - Rescheduling failed builds due to network issues."
+ FAILED_BUILDS=$(find $DEBIAN_BASE/rbuild -type f ! -name "*None.rbuild.log" ! -mmin +300 -exec zgrep -l -E 'E: Failed to fetch.*(Unable to connect to|Connection failed|Size mismatch|Cannot initiate the connection to|Bad Gateway|Service Unavailable)' {} \; 2>/dev/null || true)
+ if [ -n "$FAILED_BUILDS" ] ; then
+ echo
+ echo "The following builds have failed due to network problems and will be rescheduled now:"
+ echo "$FAILED_BUILDS"
+ echo
+ echo "Rescheduling packages: "
+ REQUESTER="jenkins maintenance job"
+ REASON="maintenance reschedule: reschedule builds which failed due to network errors"
for SUITE in $(echo $FAILED_BUILDS | sed "s# #\n#g" | cut -d "/" -f9 | sort -u) ; do
for ARCH in $(echo $FAILED_BUILDS | sed "s# #\n#g" | cut -d "/" -f10 | sort -u) ; do
CANDIDATES=$(for PKG in $(echo $FAILED_BUILDS | sed "s# #\n#g" | grep "/$SUITE/$ARCH/" | cut -d "/" -f11 | cut -d "_" -f1) ; do echo "$PKG" ; done)
@@ -686,137 +816,6 @@ if [ "$HOSTNAME" = "$MAINNODE" ] ; then
fi
-# find+terminate processes which should not be there
-echo "$(date -u) - Looking for processes which should not be there."
-HAYSTACK=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXXX)
-RESULT=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXXX)
-TOKILL=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXXX)
-PBUIDS="1234 1111 2222"
-ps axo pid,user,size,pcpu,cmd > $HAYSTACK
-for i in $PBUIDS ; do
- for PROCESS in $(pgrep -u $i -P 1 || true) ; do
- # faked-sysv comes and goes...
- grep ^$PROCESS $HAYSTACK | grep -v faked-sysv >> $RESULT 2> /dev/null || true
- done
-done
-if [ -s $RESULT ] ; then
- for PROCESS in $(cat $RESULT | cut -d " " -f1 | grep -v ^UID | xargs echo) ; do
- AGE=$(ps -p $PROCESS -o etimes= || echo 0)
- # a single build may take a day, so... (first build: 18h, 2nd: 24h)
- if [ $AGE -gt $(( 24*60*60 )) ] ; then
- echo "$PROCESS" >> $TOKILL
- fi
- done
- if [ -s $TOKILL ] ; then
- DIRTY=true
- PSCALL=""
- echo
- echo "Info: processes found which should not be there, killing them now:"
- for PROCESS in $(cat $TOKILL) ; do
- PSCALL=${PSCALL:+"$PSCALL,"}"$PROCESS"
- done
- ps -F -p $PSCALL
- echo
- for PROCESS in $(cat $TOKILL) ; do
- sudo kill -9 $PROCESS 2>&1 || true
- echo "'sudo kill -9 $PROCESS' done."
- done
- echo
- fi
-fi
-rm $HAYSTACK $RESULT $TOKILL
-# There are naughty processes spawning children and leaving them to their grandparents
-PSCALL=""
-for i in $PBUIDS ; do
- for p in $(pgrep -u $i) ; do
- AGE=$(ps -p $p -o etimes= || echo 0)
- # let's be generous and consider 26 hours here...
- if [ $AGE -gt $(( 26*60*60 )) ] ; then
- sudo kill -9 $p 2>&1 || (echo "Could not kill:" ; ps -F -p "$p")
- sleep 2
- # check it's gone
- AGE=$(ps -p $p -o etimes= || echo 0)
- if [ $AGE -gt $(( 14*60*60 )) ] ; then
- PSCALL=${PSCALL:+"$PSCALL,"}"$p"
- fi
- fi
- done
-done
-if [ -n "$PSCALL" ] ; then
- # ignore some well known zombie processes
- KNOWN_ZOMBIE_PROCESSES="(buf-ring.t|poll-race-mshot.t|ringbuf-read.t|send_recvmsg.t)"
- if [ $(ps -F -p "$PSCALL" | grep -E -v "$KNOWN_ZOMBIE_PROCESSES" | wc -l) -lt 10 ] ; then
- echo "Info: ignoring less than ten processes which should not be there and which could not be killed, because those are probably just a few harmless zombies, which can only be removed by rebooting...."
- else
- echo "Warning: found more than ten processes which should not be there and which could not be killed. Please investigate and reboot or ignore them...:"
- fi
- ps -F -p "$PSCALL" | grep -E -v "$KNOWN_ZOMBIE_PROCESSES"
- echo
-fi
-
-# find builds which should not be there
-RESULTS=$(pgrep -f reproducible_build.sh --parent 1 || true)
-if [ -n "$RESULTS" ] ; then
- DIRTY=true
- echo "Warning: found reproducible_build.sh processes which have pid 1 as parent (and not sshd), thus something went wrong… please investigate."
- echo -e "$RESULTS"
-fi
-
-# remove debian ci builds artifacts older than a day
-echo "$(date -u) - Checking for artifacts older than a day."
-ARTIFACTS=$(find $DEBIAN_BASE/artifacts/r00t-me/* -maxdepth 1 -type d -mtime +1 -exec ls -lad {} \; 2>/dev/null|| true)
-if [ -n "$ARTIFACTS" ] ; then
- echo
- echo "Removed old artifacts:"
- find $DEBIAN_BASE/artifacts/r00t-me/* -maxdepth 1 -type d -mtime +1 -exec rm -rv --one-file-system {} \; || true
- echo
-fi
-
-# remove artifacts from the debian live build jobs that are older than twelve hours
-echo "$(date -u) - Checking for artifacts from debian live build jobs that are older than 12h"
-ARTIFACTS=$(find $DEBIAN_BASE/live_build/artifacts/r00t-me/* -maxdepth 1 -type f -mmin +720 -exec ls -lad {} \; 2>/dev/null|| true)
-if [ -n "$ARTIFACTS" ] ; then
- echo
- echo "Removed old debian-live artifacts:"
- find $DEBIAN_BASE/live_build/artifacts/r00t-me/* -maxdepth 1 -type f -mmin +720 -exec rm -v --one-file-system {} \; || true
- echo
-fi
-
-# remove leftovers from rsyncing live-build results
-echo "$(date -u) - Checking for leftovers from rsyncing live-build results, that are older than a day."
-dir=/var/lib/jenkins/userContent
-ARTIFACTS=$(find $dir/reproducible-rsync-????-* -maxdepth 1 -type d -mtime +1 -exec ls -lad {} \; 2>/dev/null|| true)
-if [ -n "$ARTIFACTS" ] ; then
- echo
- echo "Removed leftovers from rsyncing live-build artifacts:"
- find $dir/reproducible-rsync-????-* -maxdepth 1 -type d -mtime +1 -exec rm -rv --one-file-system {} \; || true
- echo
-fi
-
-
-# find + chmod files with bad permissions
-echo "$(date -u) - Checking for files with bad permissions."
-# automatically fix rbuild files with wrong permissions...
-# (we know it happens (very rarely) but... shrugs.)
-[ ! -d $DEBIAN_BASE/rbuild ] || find $DEBIAN_BASE/rbuild ! -perm 644 -type f -exec chmod -v 644 {} \; 2>/dev/null|| true
-BADPERMS=$(find $DEBIAN_BASE/{buildinfo,dbd,dbdtxt,dbdjson,logs,logdiffs,rbuild,artifacts,buster,bullseye,bookworm,trixie,unstable,experimental,rb-pkg} ! -perm 644 -type f 2>/dev/null|| true)
-if [ -n "$BADPERMS" ] ; then
- DIRTY=true
- echo
- echo "Warning: Found files with bad permissions (!=644):"
- echo "Please fix permission manually"
- echo "$BADPERMS" | xargs echo chmod -v 644
- echo
-fi
-
-# find kernels we cannot read and fix that
-BADPERMS=$(find /boot/vmlinuz* ! -perm 644)
-if [ -n "$BADPERMS" ] ; then
- echo "Fixing kernel permissions:"
- echo $BADPERMS
- sudo chmod +r /boot/vmlinuz*
-fi
-
# daily mails
if [ "$HOSTNAME" = "$MAINNODE" ] && [ $(date -u +%H) -eq 0 ] ; then
echo "$(date -u) - sending daily emails about problems found in logfiles:"
View it on GitLab: https://salsa.debian.org/qa/jenkins.debian.net/-/compare/73a595683a7b1e455991b3c0adbade4662edf582...3824405f61d555cf11a1cc8a88916990ce186308
--
You're receiving this email because of your account on salsa.debian.org.