[Git][qa/jenkins.debian.net][master] 2 commits: reproducible maintenance: be verbose about the proxy used

Holger Levsen (@holger) gitlab at salsa.debian.org
Tue Oct 24 10:43:38 BST 2023



Holger Levsen pushed to branch master at Debian QA / jenkins.debian.net


Commits:
f2b6ed80 by Holger Levsen at 2023-10-24T11:37:06+02:00
reproducible maintenance: be verbose about the proxy used

Signed-off-by: Holger Levsen <holger at layer-acht.org>

- - - - -
3824405f by Holger Levsen at 2023-10-24T11:43:11+02:00
reproducible maintenance: first do local cleanup tasks, then do tasks which require network+proxy

Signed-off-by: Holger Levsen <holger at layer-acht.org>

- - - - -


1 changed file:

- bin/reproducible_maintenance.sh


Changes:

=====================================
bin/reproducible_maintenance.sh
=====================================
@@ -172,219 +172,6 @@ if [ -n "$OLDSTUFF" ] ; then
 	DIRTY=true
 fi
 
-#
-# check for working proxy
-#
-echo "$(date -u) - testing whether the proxy works..."
-curl $MIRROR > /dev/null
-if [ $? -ne 0 ] ; then
-	echo "Error: curl $MIRROR failed, probably the proxy is down for $HOSTNAME"
-	exit 1
-fi
-
-if [ "$HOSTNAME" = "$MAINNODE" ] ; then
-	#
-	# find nodes with problems and temporarily turn them offline
-	#
-	echo "$(date -u) - Looking for unhealthy nodes."
-	cd ~/jobs
-	DUMMY_FILE=$(mktemp --tmpdir=$TMPDIR maintenance-XXXXXXX)
-	SICK=""
-	for i in reproducible_node_health_check_* reproducible_maintenance_* ; do
-		case $i in
-			reproducible_node_health_check_amd64_jenkins|reproducible_maintenance_amd64_jenkins)
-				echo "Skipping $i..."
-				continue
-				;;
-			reproducible_node_health_check_*)
-				NODE_ALIAS=$(echo $i | cut -d '_' -f6)
-				NODE_ARCH=$(echo $i | cut -d '_' -f5)
-				FORCE_DATE=$(date -u -d "3 hour ago" '+%Y-%m-%d %H:%M')
-				MAXDIFF=12
-				;;
-			reproducible_maintenance_*)
-				NODE_ALIAS=$(echo $i | cut -d '_' -f4)
-				NODE_ARCH=$(echo $i | cut -d '_' -f3)
-				FORCE_DATE=$(date -u -d "8 hour ago" '+%Y-%m-%d %H:%M')
-				MAXDIFF=3
-				;;
-		esac
-		touch -d "$FORCE_DATE" $DUMMY_FILE
-		case $NODE_ARCH in
-			amd64)
-				case "$NODE_ALIAS" in
-					ionos*)		NODE="$NODE_ALIAS-amd64.debian.net" ;;
-					osuosl*)	NODE="osuosl${NODE_ALIAS#osuosl}-amd64.debian.net" ;;
-				esac ;;
-			i386)	NODE="$NODE_ALIAS-i386.debian.net" ;;
-			arm64)	NODE="codethink${NODE_ALIAS#codethink}-arm64.debian.net" ;;
-			armhf)	NODE="${NODE_ALIAS}-armhf-rb.debian.net" ;;
-		esac
-		case "$NODE" in
-			ionos9-amd64.debian.net|ionos10-amd64.debian.net)
-				# ionos9 and ionos10 are not used for r-b and sometimes are too busy
-				# to run healthcheck / maintenance jobs
-				echo "Skipping ${NODE}..."
-				continue
-				;;
-		esac
-		cd $i/builds
-		LAST=$(ls -rt1 | tail -1)
-		GOOD=$(awk '/^lastSuccessfulBuild/ {print $2}' permalinks)
-		if [ "$LAST" = "$GOOD" ] ; then
-			DIFF=0
-		else
-			let DIFF=$LAST-$GOOD 2>/dev/null|| DIFF=-1
-		fi
-		if [ $DIFF -eq -1 ] ; then
-			echo "Warning: Problems analysing $i build logs, ignoring $NODE."
-		# either the diff is greater than $MAXDIFF (=the last $MAXDIFF job runs failed)
-		# or the last successful run is older than an hour (=a job is still running/hanging)
-		elif [ $DIFF -gt $MAXDIFF ] || [ $LAST -ot $DUMMY_FILE ] ; then
-			echo -n "$i job has issues since more than an hour"
-			if grep -q $NODE $JENKINS_OFFLINE_LIST >/dev/null 2>&1 ; then
-				echo " and $NODE already marked as offline, good."
-			else
-				echo $NODE >> $JENKINS_OFFLINE_LIST
-				echo " so $NODE has (temporarily) been marked as offline now."
-				SICK="$SICK $NODE"
-			fi
-		else
-			echo "$NODE is doing fine, good."
-		fi
-		cd ../..
-	done
-	if [ -n "$SICK" ] ; then
-		SICK=$(echo "$SICK" | sed 's#.debian.net##g' | sed 's#-rb##g' | sed 's# ##' )
-		if [[ $SICK =~ ' ' ]]; then
-			SICK=$(echo "$SICK" | sed 's# # and #g')
-			MESSAGE="$SICK have health problems and have temporarily been marked as offline."
-		else
-			MESSAGE="$SICK has health problems and has temporarily been marked as offline."
-		fi
-		MESSAGE="$MESSAGE To make this permanent, edit jenkins-home/offline_nodes in git."
-		RECIPIENTS="mattia at debian.org holger at debian.org"
-		if [[ $MESSAGE =~ armhf ]]; then
-			RECIPIENTS="$RECIPIENTS vagrant at reproducible-builds.org"
-		fi
-		for TO in $RECIPIENTS ; do
-			echo -e "$MESSAGE" | mail -s "jenkins nodes temporarily marked offline" $TO
-		done
-
-	fi
-	rm -f $DUMMY_FILE
-fi
-
-echo "$(date -u) - updating the chdists, schroots and pbuilder now..."
-# use host architecture (only)
-ARCH=$(dpkg --print-architecture)
-# use host apt proxy configuration for pbuilder
-if [ -n "$http_proxy" ] ; then
-	pbuilder_http_proxy="--http-proxy $http_proxy"
-fi
-for s in $SUITES ; do
-	for i in osuosl ionos3 ionos7 ionos9 ionos10 ; do
-		if [ "${HOSTNAME:0:${#i}}" = "$i" ]; then
-			# this node is not used to do Debian rebuilds, skip it all
-			continue 2
-		fi
-	done
-	#
-	# chdist update
-	#
-	distname="$s-$ARCH"
-	echo "$(date -u) - updating the $s/$ARCH chdist now."
-	if [ ! -d "$CHPATH/$distname" ]; then
-		echo "$(date -u) - chdist not existing, creating one now..."
-		if ! chdist --data-dir="$CHPATH" --arch="$ARCH" create "$distname" "$MIRROR" "$s" main ; then
-			echo "Error: failed to create the $s/$ARCH chdist."
-			exit 1
-		fi
-		. /srv/jenkins/bin/jenkins_node_definitions.sh
-		get_node_information "$HOSTNAME"
-		if "$NODE_RUN_IN_THE_FUTURE" ; then
-			echo "This node is reported to run in the future, configuring APT to ignore the Release file expiration..."
-			echo 'Acquire::Check-Valid-Until "false";' > "$CHPATH/$distname/etc/apt/apt.conf.d/398future"
-		fi
-	fi
-	if ! chdist --data-dir="$CHPATH" apt-get "$distname" -q update ; then
-		echo "Warning: failed to update the $s/$ARCH chdist."
-		DIRTY=true
-	fi
-	#
-	# pbuilder update
-	#
-	# skip main node
-	if [ "$HOSTNAME" = "$MAINNODE" ] ; then
-		continue
-	else
-		echo "$(date -u) - updating pbuilder for $s/$ARCH now."
-	fi
-	for i in 1 2 3 4 ; do
-		[ ! -f /var/cache/pbuilder/$s-reproducible-base.tgz ] || sudo pbuilder --update $pbuilder_http_proxy --basetgz /var/cache/pbuilder/$s-reproducible-base.tgz
-		RESULT=$?
-		if [ $RESULT -eq 1 ] ; then
-			# sleep 61-120 secs
-			echo "Sleeping some time... (to workaround network problems like 'Hash Sum mismatch'...)"
-			/bin/sleep $(echo "scale=1 ; ($(shuf -i 1-600 -n 1)/10)+60" | bc )
-			echo "$(date -u) - Retrying to update pbuilder for $s/$ARCH."
-		elif [ $RESULT -eq 0 ] ; then
-			break
-		fi
-	done
-	if [ $RESULT -eq 1 ] ; then
-		echo "Warning: failed to update pbuilder for $s/$ARCH."
-		DIRTY=true
-	fi
-done
-set -e
-
-# for alpine
-set +e
-case $HOSTNAME in
-	osuosl1-amd64|osuosl2-amd64|jenkins)
-		echo "$(date -u) - updating alpine schroot now."
-		if $(schroot -l|grep -q alpine) ; then
-			schroot --directory /tmp -c source:jenkins-reproducible-alpine -u root -- apk update
-			schroot --directory /tmp -c source:jenkins-reproducible-alpine -u root -- apk upgrade
-			RESULT=$?
-			if [ $RESULT -eq 1 ] ; then
-				echo "Warning: failed to update alpine schroot."
-				DIRTY=true
-			else
-				echo "$(date -u) - updating alpine schroot done."
-			fi
-		else
-			echo "No alpine schroot found, how strange."
-		fi
-		;;
-	*)	;;
-esac
-set -e
-
-# for Arch Linux
-set +e
-case $HOSTNAME in
-	osuosl1-amd64|osuosl2-amd64|jenkins)
-		echo "$(date -u) - updating Arch Linux schroot now."
-		schroot --directory /tmp -c source:jenkins-reproducible-archlinux -u root -- pacman -Syu --noconfirm
-		RESULT=$?
-		if [ $RESULT -eq 1 ] ; then
-			echo "Let's see if /var/lib/pacman/db.lck exists in the schroot..."
-			if [ "$(schroot --directory /tmp -c source:jenkins-reproducible-archlinux -u root -- ls /var/lib/pacman/db.lck)" = "/var/lib/pacman/db.lck" ] ; then
-				echo "Warning: failed to update Arch Linux schroot, pacman/db.lck exists."
-			else
-				echo "Warning: failed to update Arch Linux schroot."
-			fi
-			DIRTY=true
-		else
-			echo "$(date -u) - updating Arch Linux schroot done."
-		fi
-		;;
-	*)	;;
-esac
-set -e
-
 # delete build services logfiles
 dir=/var/lib/jenkins/userContent/reproducible/debian/build_service/
 if [ "$HOSTNAME" = "$MAINNODE" ] ; then
@@ -555,23 +342,366 @@ if [ -d $dir ] ; then
 	fi
 fi
 
-if [ "$HOSTNAME" = "$MAINNODE" ] ; then
-	#
-	# find failed builds due to network problems and reschedule them
-	#
-	# only grep through the last 5h (300 minutes) of builds...
-	# (ignore "*None.rbuild.log" because these are build which were just started)
-	# this job runs every 4h
-	echo "$(date -u) - Rescheduling failed builds due to network issues."
-	FAILED_BUILDS=$(find $DEBIAN_BASE/rbuild -type f ! -name "*None.rbuild.log" ! -mmin +300 -exec zgrep -l -E 'E: Failed to fetch.*(Unable to connect to|Connection failed|Size mismatch|Cannot initiate the connection to|Bad Gateway|Service Unavailable)' {} \; 2>/dev/null || true)
-	if [ -n "$FAILED_BUILDS" ] ; then
+# find+terminate processes which should not be there
+echo "$(date -u) - Looking for processes which should not be there."
+HAYSTACK=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXXX)
+RESULT=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXXX)
+TOKILL=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXXX)
+PBUIDS="1234 1111 2222"
+ps axo pid,user,size,pcpu,cmd > $HAYSTACK
+for i in $PBUIDS ; do
+	for PROCESS in $(pgrep -u $i -P 1 || true) ; do
+		# faked-sysv comes and goes...
+		grep ^$PROCESS $HAYSTACK | grep -v faked-sysv >> $RESULT 2> /dev/null || true
+	done
+done
+if [ -s $RESULT ] ; then
+	for PROCESS in $(cat $RESULT | cut -d " " -f1 | grep -v ^UID | xargs echo) ; do
+		AGE=$(ps -p $PROCESS -o etimes= || echo 0)
+		# a single build may take day, so... (first build: 18h, 2nd: 24h)
+		if [ $AGE -gt $(( 24*60*60 )) ] ; then
+			echo "$PROCESS" >> $TOKILL
+		fi
+	done
+	if [ -s $TOKILL ] ; then
+		DIRTY=true
+		PSCALL=""
 		echo
-		echo "The following builds have failed due to network problems and will be rescheduled now:"
-		echo "$FAILED_BUILDS"
+		echo "Info: processes found which should not be there, killing them now:"
+		for PROCESS in $(cat $TOKILL) ; do
+			PSCALL=${PSCALL:+"$PSCALL,"}"$PROCESS"
+		done
+		ps -F -p $PSCALL
 		echo
-		echo "Rescheduling packages: "
-		REQUESTER="jenkins maintenance job"
-		REASON="maintenance reschedule: reschedule builds which failed due to network errors"
+		for PROCESS in $(cat $TOKILL) ; do
+			sudo kill -9 $PROCESS 2>&1 || true
+			echo "'sudo kill -9 $PROCESS' done."
+		done
+		echo
+	fi
+fi
+rm $HAYSTACK $RESULT $TOKILL
+# There are naughty processes spawning childs and leaving them to their grandparents
+PSCALL=""
+for i in $PBUIDS ; do
+	for p in $(pgrep -u $i) ; do
+		AGE=$(ps -p $p -o etimes= || echo 0)
+		# let's be generous and consider 26 hours here...
+		if [ $AGE -gt $(( 26*60*60 )) ] ; then
+			sudo kill -9 $p 2>&1 || (echo "Could not kill:" ; ps -F -p "$p")
+			sleep 2
+			# check it's gone
+			AGE=$(ps -p $p -o etimes= || echo 0)
+			if [ $AGE -gt $(( 14*60*60 )) ] ; then
+				PSCALL=${PSCALL:+"$PSCALL,"}"$p"
+			fi
+		fi
+	done
+done
+if [ -n "$PSCALL" ] ; then
+	# ignore some well known zombie processes
+	KNOWN_ZOMBIE_PROCESSES="(buf-ring.t|poll-race-mshot.t|ringbuf-read.t|send_recvmsg.t)"
+	if [ $(ps -F -p "$PSCALL" | grep -E -v "$KNOWN_ZOMBIE_PROCESSES" | wc -l) -lt 10 ] ; then
+		echo "Info: ignoring less than ten processes which should not be there and which could not be killed, because those are probably just a few harmless zombies, which can only be removed by rebooting...."
+	else
+		echo "Warning: found more than ten processes which should not be there and which could not be killed. Please investigate and reboot or ignore them...:"
+	fi
+	ps -F -p "$PSCALL" | grep -E -v "$KNOWN_ZOMBIE_PROCESSES"
+	echo
+fi
+
+# find builds which should not be there
+RESULTS=$(pgrep -f reproducible_build.sh --parent 1 || true)
+if [ -n "$RESULTS" ] ; then
+	DIRTY=true
+	echo "Warning: found reproducible_build.sh processes which have pid 1 as parent (and not sshd), thus something went wrong… please investigate."
+	echo -e "$RESULTS"
+fi
+
+# remove debian ci builds artifacts older than a day
+echo "$(date -u) - Checking for artifacts older than a day."
+ARTIFACTS=$(find $DEBIAN_BASE/artifacts/r00t-me/* -maxdepth 1 -type d -mtime +1 -exec ls -lad {} \; 2>/dev/null|| true)
+if [ -n "$ARTIFACTS" ] ; then
+	echo
+	echo "Removed old artifacts:"
+	find $DEBIAN_BASE/artifacts/r00t-me/* -maxdepth 1 -type d -mtime +1 -exec rm -rv --one-file-system {} \; || true
+	echo
+fi
+
+# remove artifacts from the debian live build jobs, older twelve hours
+echo "$(date -u) - Checking for artifacts from debian live build jobs, that are older than 12h"
+ARTIFACTS=$(find $DEBIAN_BASE/live_build/artifacts/r00t-me/* -maxdepth 1 -type f -mmin +720 -exec ls -lad {} \; 2>/dev/null|| true)
+if [ -n "$ARTIFACTS" ] ; then
+	echo
+	echo "Removed old debian-live artifacts:"
+	find $DEBIAN_BASE/live_build/artifacts/r00t-me/* -maxdepth 1 -type f -mmin +720 -exec rm -v --one-file-system {} \; || true
+	echo
+fi
+
+# remove leftovers from rsyncing live-build results
+echo "$(date -u) - Checking for leftovers from rsyncing live-build results, that are older than a day."
+dir=/var/lib/jenkins/userContent
+ARTIFACTS=$(find $dir/reproducible-rsync-????-* -maxdepth 1 -type d -mtime +1 -exec ls -lad {} \; 2>/dev/null|| true)
+if [ -n "$ARTIFACTS" ] ; then
+	echo
+	echo "Removed leftovers from rsyncing live-build artifacts:"
+	find $dir/reproducible-rsync-????-* -maxdepth 1 -type d -mtime +1 -exec rm -rv --one-file-system {} \; || true
+	echo
+fi
+
+# find + chmod files with bad permissions
+echo "$(date -u) - Checking for files with bad permissions."
+# automatically fix rbuild files with wrong permissions...
+# (we know it happens (very rarely) but... shrugs.)
+[ ! -d $DEBIAN_BASE/rbuild ] || find $DEBIAN_BASE/rbuild ! -perm 644 -type f -exec chmod -v 644 {} \; 2>/dev/null|| true
+BADPERMS=$(find $DEBIAN_BASE/{buildinfo,dbd,dbdtxt,dbdjson,logs,logdiffs,rbuild,artifacts,buster,bullseye,bookworm,trixie,unstable,experimental,rb-pkg} ! -perm 644 -type f 2>/dev/null|| true)
+if [ -n "$BADPERMS" ] ; then
+    DIRTY=true
+    echo
+    echo "Warning: Found files with bad permissions (!=644):"
+    echo "Please fix permission manually"
+    echo "$BADPERMS" | xargs echo chmod -v 644
+    echo
+fi
+
+# find kernels we cannot read and fix that
+BADPERMS=$(find /boot/vmlinuz* ! -perm 644)
+if [ -n "$BADPERMS" ] ; then
+    echo "Fixing kernel permissions:"
+    echo $BADPERMS
+    sudo chmod +r /boot/vmlinuz*
+fi
+
+#
+# check for working proxy
+#
+echo "$(date -u) - testing whether the proxy $http_proxy works..."
+curl $MIRROR > /dev/null
+if [ $? -ne 0 ] ; then
+	echo "Error: curl $MIRROR failed, probably the proxy is down for $HOSTNAME"
+	exit 1
+fi
+
+if [ "$HOSTNAME" = "$MAINNODE" ] ; then
+	#
+	# find nodes with problems and temporarily turn them offline
+	#
+	echo "$(date -u) - Looking for unhealthy nodes."
+	cd ~/jobs
+	DUMMY_FILE=$(mktemp --tmpdir=$TMPDIR maintenance-XXXXXXX)
+	SICK=""
+	for i in reproducible_node_health_check_* reproducible_maintenance_* ; do
+		case $i in
+			reproducible_node_health_check_amd64_jenkins|reproducible_maintenance_amd64_jenkins)
+				echo "Skipping $i..."
+				continue
+				;;
+			reproducible_node_health_check_*)
+				NODE_ALIAS=$(echo $i | cut -d '_' -f6)
+				NODE_ARCH=$(echo $i | cut -d '_' -f5)
+				FORCE_DATE=$(date -u -d "3 hour ago" '+%Y-%m-%d %H:%M')
+				MAXDIFF=12
+				;;
+			reproducible_maintenance_*)
+				NODE_ALIAS=$(echo $i | cut -d '_' -f4)
+				NODE_ARCH=$(echo $i | cut -d '_' -f3)
+				FORCE_DATE=$(date -u -d "8 hour ago" '+%Y-%m-%d %H:%M')
+				MAXDIFF=3
+				;;
+		esac
+		touch -d "$FORCE_DATE" $DUMMY_FILE
+		case $NODE_ARCH in
+			amd64)
+				case "$NODE_ALIAS" in
+					ionos*)		NODE="$NODE_ALIAS-amd64.debian.net" ;;
+					osuosl*)	NODE="osuosl${NODE_ALIAS#osuosl}-amd64.debian.net" ;;
+				esac ;;
+			i386)	NODE="$NODE_ALIAS-i386.debian.net" ;;
+			arm64)	NODE="codethink${NODE_ALIAS#codethink}-arm64.debian.net" ;;
+			armhf)	NODE="${NODE_ALIAS}-armhf-rb.debian.net" ;;
+		esac
+		case "$NODE" in
+			ionos9-amd64.debian.net|ionos10-amd64.debian.net)
+				# ionos9 and ionos10 are not used for r-b and sometimes are too busy
+				# to run healthcheck / maintenance jobs
+				echo "Skipping ${NODE}..."
+				continue
+				;;
+		esac
+		cd $i/builds
+		LAST=$(ls -rt1 | tail -1)
+		GOOD=$(awk '/^lastSuccessfulBuild/ {print $2}' permalinks)
+		if [ "$LAST" = "$GOOD" ] ; then
+			DIFF=0
+		else
+			let DIFF=$LAST-$GOOD 2>/dev/null|| DIFF=-1
+		fi
+		if [ $DIFF -eq -1 ] ; then
+			echo "Warning: Problems analysing $i build logs, ignoring $NODE."
+		# either the diff is greater than $MAXDIFF (=the last $MAXDIFF job runs failed)
+		# or the last successful run is older than an hour (=a job is still running/hanging)
+		elif [ $DIFF -gt $MAXDIFF ] || [ $LAST -ot $DUMMY_FILE ] ; then
+			echo -n "$i job has issues since more than an hour"
+			if grep -q $NODE $JENKINS_OFFLINE_LIST >/dev/null 2>&1 ; then
+				echo " and $NODE already marked as offline, good."
+			else
+				echo $NODE >> $JENKINS_OFFLINE_LIST
+				echo " so $NODE has (temporarily) been marked as offline now."
+				SICK="$SICK $NODE"
+			fi
+		else
+			echo "$NODE is doing fine, good."
+		fi
+		cd ../..
+	done
+	if [ -n "$SICK" ] ; then
+		SICK=$(echo "$SICK" | sed 's#.debian.net##g' | sed 's#-rb##g' | sed 's# ##' )
+		if [[ $SICK =~ ' ' ]]; then
+			SICK=$(echo "$SICK" | sed 's# # and #g')
+			MESSAGE="$SICK have health problems and have temporarily been marked as offline."
+		else
+			MESSAGE="$SICK has health problems and has temporarily been marked as offline."
+		fi
+		MESSAGE="$MESSAGE To make this permanent, edit jenkins-home/offline_nodes in git."
+		RECIPIENTS="mattia at debian.org holger at debian.org"
+		if [[ $MESSAGE =~ armhf ]]; then
+			RECIPIENTS="$RECIPIENTS vagrant at reproducible-builds.org"
+		fi
+		for TO in $RECIPIENTS ; do
+			echo -e "$MESSAGE" | mail -s "jenkins nodes temporarily marked offline" $TO
+		done
+
+	fi
+	rm -f $DUMMY_FILE
+fi
+
+echo "$(date -u) - updating the chdists, schroots and pbuilder now..."
+# use host architecture (only)
+ARCH=$(dpkg --print-architecture)
+# use host apt proxy configuration for pbuilder
+if [ -n "$http_proxy" ] ; then
+	pbuilder_http_proxy="--http-proxy $http_proxy"
+fi
+for s in $SUITES ; do
+	for i in osuosl ionos3 ionos7 ionos9 ionos10 ; do
+		if [ "${HOSTNAME:0:${#i}}" = "$i" ]; then
+			# this node is not used to do Debian rebuilds, skip it all
+			continue 2
+		fi
+	done
+	#
+	# chdist update
+	#
+	distname="$s-$ARCH"
+	echo "$(date -u) - updating the $s/$ARCH chdist now."
+	if [ ! -d "$CHPATH/$distname" ]; then
+		echo "$(date -u) - chdist not existing, creating one now..."
+		if ! chdist --data-dir="$CHPATH" --arch="$ARCH" create "$distname" "$MIRROR" "$s" main ; then
+			echo "Error: failed to create the $s/$ARCH chdist."
+			exit 1
+		fi
+		. /srv/jenkins/bin/jenkins_node_definitions.sh
+		get_node_information "$HOSTNAME"
+		if "$NODE_RUN_IN_THE_FUTURE" ; then
+			echo "This node is reported to run in the future, configuring APT to ignore the Release file expiration..."
+			echo 'Acquire::Check-Valid-Until "false";' > "$CHPATH/$distname/etc/apt/apt.conf.d/398future"
+		fi
+	fi
+	if ! chdist --data-dir="$CHPATH" apt-get "$distname" -q update ; then
+		echo "Warning: failed to update the $s/$ARCH chdist."
+		DIRTY=true
+	fi
+	#
+	# pbuilder update
+	#
+	# skip main node
+	if [ "$HOSTNAME" = "$MAINNODE" ] ; then
+		continue
+	else
+		echo "$(date -u) - updating pbuilder for $s/$ARCH now."
+	fi
+	for i in 1 2 3 4 ; do
+		[ ! -f /var/cache/pbuilder/$s-reproducible-base.tgz ] || sudo pbuilder --update $pbuilder_http_proxy --basetgz /var/cache/pbuilder/$s-reproducible-base.tgz
+		RESULT=$?
+		if [ $RESULT -eq 1 ] ; then
+			# sleep 61-120 secs
+			echo "Sleeping some time... (to workaround network problems like 'Hash Sum mismatch'...)"
+			/bin/sleep $(echo "scale=1 ; ($(shuf -i 1-600 -n 1)/10)+60" | bc )
+			echo "$(date -u) - Retrying to update pbuilder for $s/$ARCH."
+		elif [ $RESULT -eq 0 ] ; then
+			break
+		fi
+	done
+	if [ $RESULT -eq 1 ] ; then
+		echo "Warning: failed to update pbuilder for $s/$ARCH."
+		DIRTY=true
+	fi
+done
+set -e
+
+# for alpine
+set +e
+case $HOSTNAME in
+	osuosl1-amd64|osuosl2-amd64|jenkins)
+		echo "$(date -u) - updating alpine schroot now."
+		if $(schroot -l|grep -q alpine) ; then
+			schroot --directory /tmp -c source:jenkins-reproducible-alpine -u root -- apk update
+			schroot --directory /tmp -c source:jenkins-reproducible-alpine -u root -- apk upgrade
+			RESULT=$?
+			if [ $RESULT -eq 1 ] ; then
+				echo "Warning: failed to update alpine schroot."
+				DIRTY=true
+			else
+				echo "$(date -u) - updating alpine schroot done."
+			fi
+		else
+			echo "No alpine schroot found, how strange."
+		fi
+		;;
+	*)	;;
+esac
+set -e
+
+# for Arch Linux
+set +e
+case $HOSTNAME in
+	osuosl1-amd64|osuosl2-amd64|jenkins)
+		echo "$(date -u) - updating Arch Linux schroot now."
+		schroot --directory /tmp -c source:jenkins-reproducible-archlinux -u root -- pacman -Syu --noconfirm
+		RESULT=$?
+		if [ $RESULT -eq 1 ] ; then
+			echo "Let's see if /var/lib/pacman/db.lck exists in the schroot..."
+			if [ "$(schroot --directory /tmp -c source:jenkins-reproducible-archlinux -u root -- ls /var/lib/pacman/db.lck)" = "/var/lib/pacman/db.lck" ] ; then
+				echo "Warning: failed to update Arch Linux schroot, pacman/db.lck exists."
+			else
+				echo "Warning: failed to update Arch Linux schroot."
+			fi
+			DIRTY=true
+		else
+			echo "$(date -u) - updating Arch Linux schroot done."
+		fi
+		;;
+	*)	;;
+esac
+set -e
+
+if [ "$HOSTNAME" = "$MAINNODE" ] ; then
+	#
+	# find failed builds due to network problems and reschedule them
+	#
+	# only grep through the last 5h (300 minutes) of builds...
+	# (ignore "*None.rbuild.log" because these are build which were just started)
+	# this job runs every 4h
+	echo "$(date -u) - Rescheduling failed builds due to network issues."
+	FAILED_BUILDS=$(find $DEBIAN_BASE/rbuild -type f ! -name "*None.rbuild.log" ! -mmin +300 -exec zgrep -l -E 'E: Failed to fetch.*(Unable to connect to|Connection failed|Size mismatch|Cannot initiate the connection to|Bad Gateway|Service Unavailable)' {} \; 2>/dev/null || true)
+	if [ -n "$FAILED_BUILDS" ] ; then
+		echo
+		echo "The following builds have failed due to network problems and will be rescheduled now:"
+		echo "$FAILED_BUILDS"
+		echo
+		echo "Rescheduling packages: "
+		REQUESTER="jenkins maintenance job"
+		REASON="maintenance reschedule: reschedule builds which failed due to network errors"
 		for SUITE in $(echo $FAILED_BUILDS | sed "s# #\n#g" | cut -d "/" -f9 | sort -u) ; do
 			for ARCH in $(echo $FAILED_BUILDS | sed "s# #\n#g" | cut -d "/" -f10 | sort -u) ; do
 				CANDIDATES=$(for PKG in $(echo $FAILED_BUILDS | sed "s# #\n#g" | grep "/$SUITE/$ARCH/" | cut -d "/" -f11 | cut -d "_" -f1) ; do echo "$PKG" ; done)
@@ -686,137 +816,6 @@ if [ "$HOSTNAME" = "$MAINNODE" ] ; then
 
 fi
 
-# find+terminate processes which should not be there
-echo "$(date -u) - Looking for processes which should not be there."
-HAYSTACK=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXXX)
-RESULT=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXXX)
-TOKILL=$(mktemp --tmpdir=$TEMPDIR maintenance-XXXXXXXXXXX)
-PBUIDS="1234 1111 2222"
-ps axo pid,user,size,pcpu,cmd > $HAYSTACK
-for i in $PBUIDS ; do
-	for PROCESS in $(pgrep -u $i -P 1 || true) ; do
-		# faked-sysv comes and goes...
-		grep ^$PROCESS $HAYSTACK | grep -v faked-sysv >> $RESULT 2> /dev/null || true
-	done
-done
-if [ -s $RESULT ] ; then
-	for PROCESS in $(cat $RESULT | cut -d " " -f1 | grep -v ^UID | xargs echo) ; do
-		AGE=$(ps -p $PROCESS -o etimes= || echo 0)
-		# a single build may take day, so... (first build: 18h, 2nd: 24h)
-		if [ $AGE -gt $(( 24*60*60 )) ] ; then
-			echo "$PROCESS" >> $TOKILL
-		fi
-	done
-	if [ -s $TOKILL ] ; then
-		DIRTY=true
-		PSCALL=""
-		echo
-		echo "Info: processes found which should not be there, killing them now:"
-		for PROCESS in $(cat $TOKILL) ; do
-			PSCALL=${PSCALL:+"$PSCALL,"}"$PROCESS"
-		done
-		ps -F -p $PSCALL
-		echo
-		for PROCESS in $(cat $TOKILL) ; do
-			sudo kill -9 $PROCESS 2>&1 || true
-			echo "'sudo kill -9 $PROCESS' done."
-		done
-		echo
-	fi
-fi
-rm $HAYSTACK $RESULT $TOKILL
-# There are naughty processes spawning childs and leaving them to their grandparents
-PSCALL=""
-for i in $PBUIDS ; do
-	for p in $(pgrep -u $i) ; do
-		AGE=$(ps -p $p -o etimes= || echo 0)
-		# let's be generous and consider 26 hours here...
-		if [ $AGE -gt $(( 26*60*60 )) ] ; then
-			sudo kill -9 $p 2>&1 || (echo "Could not kill:" ; ps -F -p "$p")
-			sleep 2
-			# check it's gone
-			AGE=$(ps -p $p -o etimes= || echo 0)
-			if [ $AGE -gt $(( 14*60*60 )) ] ; then
-				PSCALL=${PSCALL:+"$PSCALL,"}"$p"
-			fi
-		fi
-	done
-done
-if [ -n "$PSCALL" ] ; then
-	# ignore some well known zombie processes
-	KNOWN_ZOMBIE_PROCESSES="(buf-ring.t|poll-race-mshot.t|ringbuf-read.t|send_recvmsg.t)"
-	if [ $(ps -F -p "$PSCALL" | grep -E -v "$KNOWN_ZOMBIE_PROCESSES" | wc -l) -lt 10 ] ; then
-		echo "Info: ignoring less than ten processes which should not be there and which could not be killed, because those are probably just a few harmless zombies, which can only be removed by rebooting...."
-	else
-		echo "Warning: found more than ten processes which should not be there and which could not be killed. Please investigate and reboot or ignore them...:"
-	fi
-	ps -F -p "$PSCALL" | grep -E -v "$KNOWN_ZOMBIE_PROCESSES"
-	echo
-fi
-
-# find builds which should not be there
-RESULTS=$(pgrep -f reproducible_build.sh --parent 1 || true)
-if [ -n "$RESULTS" ] ; then
-	DIRTY=true
-	echo "Warning: found reproducible_build.sh processes which have pid 1 as parent (and not sshd), thus something went wrong… please investigate."
-	echo -e "$RESULTS"
-fi
-
-# remove debian ci builds artifacts older than a day
-echo "$(date -u) - Checking for artifacts older than a day."
-ARTIFACTS=$(find $DEBIAN_BASE/artifacts/r00t-me/* -maxdepth 1 -type d -mtime +1 -exec ls -lad {} \; 2>/dev/null|| true)
-if [ -n "$ARTIFACTS" ] ; then
-	echo
-	echo "Removed old artifacts:"
-	find $DEBIAN_BASE/artifacts/r00t-me/* -maxdepth 1 -type d -mtime +1 -exec rm -rv --one-file-system {} \; || true
-	echo
-fi
-
-# remove artifacts from the debian live build jobs, older twelve hours
-echo "$(date -u) - Checking for artifacts from debian live build jobs, that are older than 12h"
-ARTIFACTS=$(find $DEBIAN_BASE/live_build/artifacts/r00t-me/* -maxdepth 1 -type f -mmin +720 -exec ls -lad {} \; 2>/dev/null|| true)
-if [ -n "$ARTIFACTS" ] ; then
-	echo
-	echo "Removed old debian-live artifacts:"
-	find $DEBIAN_BASE/live_build/artifacts/r00t-me/* -maxdepth 1 -type f -mmin +720 -exec rm -v --one-file-system {} \; || true
-	echo
-fi
-
-# remove leftovers from rsyncing live-build results
-echo "$(date -u) - Checking for leftovers from rsyncing live-build results, that are older than a day."
-dir=/var/lib/jenkins/userContent
-ARTIFACTS=$(find $dir/reproducible-rsync-????-* -maxdepth 1 -type d -mtime +1 -exec ls -lad {} \; 2>/dev/null|| true)
-if [ -n "$ARTIFACTS" ] ; then
-	echo
-	echo "Removed leftovers from rsyncing live-build artifacts:"
-	find $dir/reproducible-rsync-????-* -maxdepth 1 -type d -mtime +1 -exec rm -rv --one-file-system {} \; || true
-	echo
-fi
-
-
-# find + chmod files with bad permissions
-echo "$(date -u) - Checking for files with bad permissions."
-# automatically fix rbuild files with wrong permissions...
-# (we know it happens (very rarely) but... shrugs.)
-[ ! -d $DEBIAN_BASE/rbuild ] || find $DEBIAN_BASE/rbuild ! -perm 644 -type f -exec chmod -v 644 {} \; 2>/dev/null|| true
-BADPERMS=$(find $DEBIAN_BASE/{buildinfo,dbd,dbdtxt,dbdjson,logs,logdiffs,rbuild,artifacts,buster,bullseye,bookworm,trixie,unstable,experimental,rb-pkg} ! -perm 644 -type f 2>/dev/null|| true)
-if [ -n "$BADPERMS" ] ; then
-    DIRTY=true
-    echo
-    echo "Warning: Found files with bad permissions (!=644):"
-    echo "Please fix permission manually"
-    echo "$BADPERMS" | xargs echo chmod -v 644
-    echo
-fi
-
-# find kernels we cannot read and fix that
-BADPERMS=$(find /boot/vmlinuz* ! -perm 644)
-if [ -n "$BADPERMS" ] ; then
-    echo "Fixing kernel permissions:"
-    echo $BADPERMS
-    sudo chmod +r /boot/vmlinuz*
-fi
-
 # daily mails
 if [ "$HOSTNAME" = "$MAINNODE" ] && [ $(date -u +%H) -eq 0 ]  ; then
 	echo "$(date -u) - sending daily emails about problems found in logfiles:"



View it on GitLab: https://salsa.debian.org/qa/jenkins.debian.net/-/compare/73a595683a7b1e455991b3c0adbade4662edf582...3824405f61d555cf11a1cc8a88916990ce186308

-- 
View it on GitLab: https://salsa.debian.org/qa/jenkins.debian.net/-/compare/73a595683a7b1e455991b3c0adbade4662edf582...3824405f61d555cf11a1cc8a88916990ce186308
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/qa-jenkins-scm/attachments/20231024/0da0a9ab/attachment-0001.htm>


More information about the Qa-jenkins-scm mailing list