Bug#918314: gmp: please add NEON-optimized variant for armhf

Yuriy M. Kaminskiy yumkam+debian at gmail.com
Fri Jan 4 22:45:40 GMT 2019


Source: gmp
Version: 2:6.1.2+dfsg-1
Severity: wishlist
Tags: patch

Dear Maintainer,

Because --enable-fat works only on i386^1, NEON-optimized code is never
used on armhf, making gmp much slower than it could be.

As a workaround, I suggest compiling and installing a separate
NEON-optimized library in $(libdir)/neon/vfp and relying on ld.so for
runtime detection.

A debdiff is attached. It has passed limited testing: [1] cross-compiled
(which implies nocheck) with `pbuilder --host-arch=armhf --arch=i386`,
installed on rpi3b+/raspbian-stretch, and benchmarked; [2] natively
recompiled on rpi3b+/raspbian (passed the regression tests), then
installed and benchmarked.

No idea how well this would work on hurd, kfreebsd, etc.

I hardcoded armv7 for the NEON code: Debian's armhf requires at least
armv7, so this should be acceptable. I have not tested it, but this
should not break the "fake armhf" from Raspbian (rpi1 is armv6 without
NEON, so the new NEON-optimized variant would not be picked; newer rpi
models are armv7+ and have NEON).

Potential pitfalls: this enables code that was previously untested (at
least by Debian) on the affected platforms, so some bugs may lurk there.

About expected user-visible effect, on Raspberry Pi 3B+ (BCM2837B0, 
4-core Cortex-A53 @1.4GHz), `gnutls-cli --benchmark-tls-kx`:

Before:
Testing key exchanges (RSA/DH bits: 3072, EC bits: 256)
(TLS1.2)-(DHE-RSA-3072)-(AES-128-CBC)-(SHA1)  5.50 transactions/sec
            (avg. handshake time: 181.71 ms, sample variance: 0.43)
(TLS1.2)-(ECDHE-RSA-SECP256R1)-(AES-128-CBC)-(SHA1)  12.08 transactions/sec
            (avg. handshake time: 82.77 ms, sample variance: 0.18)
(TLS1.2)-(ECDHE-RSA-X25519)-(AES-128-CBC)-(SHA1)  12.32 transactions/sec
            (avg. handshake time: 81.03 ms, sample variance: 0.03)
(TLS1.2)-(ECDHE-ECDSA-SECP256R1)-(AES-128-CBC)-(SHA1)  77.88 transactions/sec
            (avg. handshake time: 12.73 ms, sample variance: 0.20)
(TLS1.2)-(ECDHE-ECDSA-X25519)-(AES-128-CBC)-(SHA1)  88.98 transactions/sec
            (avg. handshake time: 11.13 ms, sample variance: 0.11)
    (TLS1.2)-(RSA)-(AES-128-CBC)-(SHA1)  13.15 transactions/sec
            (avg. handshake time: 75.86 ms, sample variance: 0.12)

After:
Testing key exchanges (RSA/DH bits: 3072, EC bits: 256)
(TLS1.2)-(DHE-RSA-3072)-(AES-128-CBC)-(SHA1)  8.40 transactions/sec
            (avg. handshake time: 118.98 ms, sample variance: 0.27)
(TLS1.2)-(ECDHE-RSA-SECP256R1)-(AES-128-CBC)-(SHA1)  18.42 transactions/sec
            (avg. handshake time: 54.23 ms, sample variance: 0.18)
(TLS1.2)-(ECDHE-RSA-X25519)-(AES-128-CBC)-(SHA1)  18.88 transactions/sec
            (avg. handshake time: 52.82 ms, sample variance: 0.15)
(TLS1.2)-(ECDHE-ECDSA-SECP256R1)-(AES-128-CBC)-(SHA1)  93.89 transactions/sec
            (avg. handshake time: 10.54 ms, sample variance: 0.25)
(TLS1.2)-(ECDHE-ECDSA-X25519)-(AES-128-CBC)-(SHA1)  106.83 transactions/sec
            (avg. handshake time: 9.25 ms, sample variance: 0.19)
    (TLS1.2)-(RSA)-(AES-128-CBC)-(SHA1)  20.46 transactions/sec
            (avg. handshake time: 48.78 ms, sample variance: 0.18)

That is, roughly a 20% to 50% speedup.

^1 --enable-fat works on amd64 too, but Debian disables it; maybe it is
time to reconsider? Some related bugs were fixed upstream since the last
attempt (which resulted in #671866). That said, on my CPU amd64+fat is
slower than the current Debian-packaged "fat-free" code, so I am not
very interested.

-- System Information:
Debian Release: 9.6
   APT prefers stable-updates
   APT policy: (500, 'stable-updates'), (500, 'stable-debug'), (500, 
'proposed-updates-debug'), (500, 'proposed-updates'), (500, 'stable')
Architecture: i386 (x86_64)
Foreign Architectures: amd64

Kernel: Linux 4.9.0-6-amd64 (SMP w/2 CPU cores)
Locale: LANG=ru_RU.KOI8-R, LC_CTYPE=ru_RU.KOI8-R (charmap=KOI8-R), 
LANGUAGE=ru_RU.KOI8-R (charmap=KOI8-R)
Shell: /bin/sh linked to /bin/dash
Init: systemd (via /run/systemd/system)

-------------- next part --------------
diff -Nru gmp-6.1.2+dfsg/debian/rules gmp-6.1.2+dfsg/debian/rules
--- gmp-6.1.2+dfsg/debian/rules	2016-12-21 08:38:23.000000000 +0300
+++ gmp-6.1.2+dfsg/debian/rules	2019-01-02 22:52:33.000000000 +0300
@@ -68,9 +68,20 @@
 
 confflags_ma = $(confflags) $(confflags_build) --libdir=/usr/lib/$(DEB_HOST_MULTIARCH)
 
+FLAVORS = main
+LIBDIR_main =
+
 CC   = $(DEB_HOST_GNU_TYPE)-gcc
 CXX   = $(DEB_HOST_GNU_TYPE)-g++
 
+ifneq (,$(filter armhf, $(DEB_HOST_ARCH)))
+FLAVORS += neon
+
+LIBDIR_neon = neon/vfp
+neon_host_type = $(patsubst arm-%,armcortexa7neon-unknown-%,$(DEB_HOST_GNU_TYPE))
+confflags_neon = --host=$(neon_host_type) --target=$(neon_host_type) --libdir=/usr/lib/$(DEB_HOST_MULTIARCH)/$(LIBDIR_neon)
+CFLAGS_neon = -march=armv7-a -mfpu=neon
+endif
 
 get-orig-source: gmp-$(ORIG_SRC_VERSION).tar.xz
 	tar --xz -xf $<
@@ -88,25 +99,34 @@
 gmp-$(ORIG_SRC_VERSION).tar.xz:
 	wget https://gmplib.org/download/gmp/$@
 
-configure: configure-stamp
-configure-stamp:
-	mkdir -p build
-	cd build && ../configure $(confflags_ma) \
-	    AR=$(AR) CC="$(CC)" CFLAGS="$(CFLAGS)" \
-	    CXX="$(CXX)" CXXFLAGS="$(CXXFLAGS)"
+$(patsubst %,configure-%,$(FLAVORS)): configure-%: configure-stamp-%
+$(patsubst %,configure-stamp-%,$(FLAVORS)): configure-stamp-%:
+	mkdir -p build-$*
+	cd build-$* && ../configure $(confflags_ma) \
+	    $(confflags_$*) \
+	    AR=$(AR) CC="$(CC)" CFLAGS="$(CFLAGS) $(CFLAGS_$*)" \
+	    CXX="$(CXX)" CXXFLAGS="$(CXXFLAGS) $(CFLAGS_$*)"
 	touch $@
 
-build: build-stamp
-build-stamp: configure
+$(patsubst %,build-%,$(FLAVORS)): build-%: build-stamp-%
+$(patsubst %,build-stamp-%,$(FLAVORS)): build-stamp-%: configure-%
 	dh_testdir
-	$(MAKE) $(JOBSFLAG) -C build
-	$(MAKE_CHECK) -C build
+	$(MAKE) $(JOBSFLAG) -C build-$*
+	$(MAKE_CHECK) -C build-$*
+	touch $@
+
+build: $(patsubst %,build-%,$(FLAVORS))
+build-stamp: $(patsubst %,build-stamp-%,$(FLAVORS))
 	touch $@
 
 clean:
 	dh_testdir
 	dh_testroot
 	rm -rf build build-stamp
+	rm -rf $(patsubst %,build-%,$(FLAVORS))
+	rm -rf $(patsubst %,build-stamp-%,$(FLAVORS))
+	rm -rf configure-stamp
+	rm -rf $(patsubst %,configure-stamp-%,$(FLAVORS))
 	dh_clean
 
 install-prep:
@@ -115,13 +135,17 @@
 	dh_prep
 	dh_installdirs
 
+$(patsubst %,install-%,$(FLAVORS)): install-%: build-stamp-%
+	$(MAKE) DESTDIR=`pwd`/debian/tmp includeexecdir=/usr/include/$(DEB_HOST_MULTIARCH) -C build-$* install-exec
+	dh_install -plibgmp10 usr/lib/*/$(LIBDIR_$*)/libgmp.so.*
+
 install: build-stamp install-prep
 	rm -rf debian/tmp
+	$(MAKE) -f debian/rules $(patsubst %,install-%,$(FLAVORS))
 	# Install places gmp.h in 'includeexecdir' which is non-standard and cannot be set at compile time,
 	# so override it at install.
-	$(MAKE) DESTDIR=`pwd`/debian/tmp includeexecdir=/usr/include/$(DEB_HOST_MULTIARCH) -C build install
+	$(MAKE) DESTDIR=`pwd`/debian/tmp includeexecdir=/usr/include/$(DEB_HOST_MULTIARCH) -C build-main install
 
-	dh_install -plibgmp10 usr/lib/*/libgmp.so.*
 	dh_install -plibgmpxx4ldbl usr/lib/*/libgmpxx.so.*
 
 	dh_install -plibgmp-dev usr/lib/*/lib*.so



More information about the debian-science-maintainers mailing list