[med-svn] [Git][med-team/scrappie][master] 3 commits: Attempt using simde to compile across more arches
Nilesh Patra
gitlab at salsa.debian.org
Mon Dec 7 08:48:09 GMT 2020
Nilesh Patra pushed to branch master at Debian Med / scrappie
Commits:
a00691d9 by Nilesh Patra at 2020-12-07T13:54:58+05:30
Attempt using simde to compile across more arches
- - - - -
7a831032 by Nilesh Patra at 2020-12-07T14:13:53+05:30
Add libsimde-dev to build deps
- - - - -
f19dfb8d by Nilesh Patra at 2020-12-07T14:14:35+05:30
Add simde flags
- - - - -
4 changed files:
- debian/control
- + debian/patches/series
- + debian/patches/simde.patch
- debian/rules
Changes:
=====================================
debian/control
=====================================
@@ -7,7 +7,8 @@ Build-Depends: debhelper-compat (= 13),
libcunit1-dev,
libhdf5-dev,
libopenblas-dev,
- cmake
+ cmake,
+ libsimde-dev
Standards-Version: 4.5.0
Homepage: https://github.com/nanoporetech/scrappie
Vcs-Browser: https://salsa.debian.org/med-team/scrappie
=====================================
debian/patches/series
=====================================
@@ -0,0 +1 @@
+simde.patch
=====================================
debian/patches/simde.patch
=====================================
@@ -0,0 +1,398 @@
+--- a/interface/scrappie.h
++++ b/interface/scrappie.h
+@@ -5,7 +5,8 @@
+ extern "C" {
+ # endif
+
+-# include <immintrin.h>
++# define SIMDE_ENABLE_NATIVE_ALIASES
++# include <simde/x86/avx.h>
+ # include <inttypes.h>
+ # include <stdbool.h>
+
+--- a/src/scrappie_matrix.h
++++ b/src/scrappie_matrix.h
+@@ -2,7 +2,8 @@
+ #ifndef SCRAPPIE_MATRIX_H
+ # define SCRAPPIE_MATRIX_H
+
+-# include <immintrin.h>
++# define SIMDE_ENABLE_NATIVE_ALIASES
++# include <simde/x86/avx.h>
+ # include <stdbool.h>
+ # include <stdint.h>
+ # include <stdio.h>
+--- a/src/sse_mathfun.h
++++ b/src/sse_mathfun.h
+@@ -40,8 +40,6 @@
+ #define SSE_MATHFUN_H
+
+
+-#include <xmmintrin.h>
+-
+ /* yes I know, the top of this file is quite ugly */
+
+ #ifdef _MSC_VER /* visual c++ */
+@@ -55,12 +53,9 @@
+ /* __m128 is ugly to write */
+ typedef __m128 v4sf; // vector of 4 float (sse1)
+
+-#ifdef USE_SSE2
+-# include <emmintrin.h>
++#define SIMDE_ENABLE_NATIVE_ALIASES
++#include <simde/x86/sse2.h>
+ typedef __m128i v4si; // vector of 4 int (sse2)
+-#else
+-typedef __m64 v2si; // vector of 2 int (mmx)
+-#endif
+
+ /* declare some SSE constants -- why can't I figure a better way to do that? */
+ #define _PS_CONST(Name, Val) \
+@@ -99,61 +94,24 @@
+ _PS_CONST(cephes_log_q1, -2.12194440e-4);
+ _PS_CONST(cephes_log_q2, 0.693359375);
+
+-#ifndef USE_SSE2
+-typedef union xmm_mm_union {
+- __m128 xmm;
+- __m64 mm[2];
+-} xmm_mm_union;
+-
+-#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
+- xmm_mm_union u; u.xmm = xmm_; \
+- mm0_ = u.mm[0]; \
+- mm1_ = u.mm[1]; \
+-}
+-
+-#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
+- xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
+- }
+-
+-#endif // USE_SSE2
+
+ /* natural logarithm computed for 4 simultaneous float
+ return NaN for x <= 0
+ */
+ static inline v4sf __attribute__((__always_inline__)) log_ps(v4sf x) {
+-#ifdef USE_SSE2
+ v4si emm0;
+-#else
+- v2si mm0, mm1;
+-#endif
+ v4sf one = *(v4sf*)_ps_1;
+-
+ v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
+
+ x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos); /* cut off denormalized stuff */
+
+-#ifndef USE_SSE2
+- /* part 1: x = frexpf(x, &e); */
+- COPY_XMM_TO_MM(x, mm0, mm1);
+- mm0 = _mm_srli_pi32(mm0, 23);
+- mm1 = _mm_srli_pi32(mm1, 23);
+-#else
+ emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
+-#endif
+ /* keep only the fractional part */
+ x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
+ x = _mm_or_ps(x, *(v4sf*)_ps_0p5);
+
+-#ifndef USE_SSE2
+- /* now e=mm0:mm1 contain the really base-2 exponent */
+- mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
+- mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
+- v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
+- _mm_empty(); /* bye bye mmx */
+-#else
+ emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
+ v4sf e = _mm_cvtepi32_ps(emm0);
+-#endif
+
+ e = _mm_add_ps(e, one);
+
+@@ -224,11 +182,7 @@
+
+ static inline __attribute__((__always_inline__)) v4sf exp_ps(v4sf x) {
+ v4sf tmp = _mm_setzero_ps(), fx;
+-#ifdef USE_SSE2
+ v4si emm0;
+-#else
+- v2si mm0, mm1;
+-#endif
+ v4sf one = *(v4sf*)_ps_1;
+
+ x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
+@@ -239,17 +193,9 @@
+ fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
+
+ /* how to perform a floorf with SSE: just below */
+-#ifndef USE_SSE2
+- /* step 1 : cast to int */
+- tmp = _mm_movehl_ps(tmp, fx);
+- mm0 = _mm_cvttps_pi32(fx);
+- mm1 = _mm_cvttps_pi32(tmp);
+- /* step 2 : cast back to float */
+- tmp = _mm_cvtpi32x2_ps(mm0, mm1);
+-#else
++
+ emm0 = _mm_cvttps_epi32(fx);
+ tmp = _mm_cvtepi32_ps(emm0);
+-#endif
+ /* if greater, substract 1 */
+ v4sf mask = _mm_cmpgt_ps(tmp, fx);
+ mask = _mm_and_ps(mask, one);
+@@ -278,24 +224,10 @@
+ y = _mm_add_ps(y, one);
+
+ /* build 2^n */
+-#ifndef USE_SSE2
+- z = _mm_movehl_ps(z, fx);
+- mm0 = _mm_cvttps_pi32(fx);
+- mm1 = _mm_cvttps_pi32(z);
+- mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
+- mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
+- mm0 = _mm_slli_pi32(mm0, 23);
+- mm1 = _mm_slli_pi32(mm1, 23);
+-
+- v4sf pow2n;
+- COPY_MM_TO_XMM(mm0, mm1, pow2n);
+- _mm_empty();
+-#else
+ emm0 = _mm_cvttps_epi32(fx);
+ emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
+ emm0 = _mm_slli_epi32(emm0, 23);
+ v4sf pow2n = _mm_castsi128_ps(emm0);
+-#endif
+ y = _mm_mul_ps(y, pow2n);
+ return y;
+ }
+@@ -342,12 +274,8 @@
+ */
+ static v4sf sin_ps(v4sf x) { // any x
+ v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
+-
+-#ifdef USE_SSE2
+ v4si emm0, emm2;
+-#else
+- v2si mm0, mm1, mm2, mm3;
+-#endif
++
+ sign_bit = x;
+ /* take the absolute value */
+ x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+@@ -357,7 +285,6 @@
+ /* scale by 4/Pi */
+ y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+
+-#ifdef USE_SSE2
+ /* store the integer part of y in mm0 */
+ emm2 = _mm_cvttps_epi32(y);
+ /* j=(j+1) & (~1) (see the cephes sources) */
+@@ -381,34 +308,6 @@
+ v4sf poly_mask = _mm_castsi128_ps(emm2);
+ sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+
+-#else
+- /* store the integer part of y in mm0:mm1 */
+- xmm2 = _mm_movehl_ps(xmm2, y);
+- mm2 = _mm_cvttps_pi32(y);
+- mm3 = _mm_cvttps_pi32(xmm2);
+- /* j=(j+1) & (~1) (see the cephes sources) */
+- mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+- mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+- mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+- mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+- y = _mm_cvtpi32x2_ps(mm2, mm3);
+- /* get the swap sign flag */
+- mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+- mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+- mm0 = _mm_slli_pi32(mm0, 29);
+- mm1 = _mm_slli_pi32(mm1, 29);
+- /* get the polynom selection mask */
+- mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+- mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+- mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+- mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+- v4sf swap_sign_bit, poly_mask;
+- COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
+- COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+- sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+- _mm_empty(); /* good-bye mmx */
+-#endif
+-
+ /* The magic pass: "Extended precision modular arithmetic"
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
+ xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+@@ -459,18 +358,14 @@
+ /* almost the same as sin_ps */
+ static v4sf cos_ps(v4sf x) { // any x
+ v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+-#ifdef USE_SSE2
+ v4si emm0, emm2;
+-#else
+- v2si mm0, mm1, mm2, mm3;
+-#endif
++
+ /* take the absolute value */
+ x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+
+ /* scale by 4/Pi */
+ y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+
+-#ifdef USE_SSE2
+ /* store the integer part of y in mm0 */
+ emm2 = _mm_cvttps_epi32(y);
+ /* j=(j+1) & (~1) (see the cephes sources) */
+@@ -489,43 +384,7 @@
+
+ v4sf sign_bit = _mm_castsi128_ps(emm0);
+ v4sf poly_mask = _mm_castsi128_ps(emm2);
+-#else
+- /* store the integer part of y in mm0:mm1 */
+- xmm2 = _mm_movehl_ps(xmm2, y);
+- mm2 = _mm_cvttps_pi32(y);
+- mm3 = _mm_cvttps_pi32(xmm2);
+-
+- /* j=(j+1) & (~1) (see the cephes sources) */
+- mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+- mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+- mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+- mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+-
+- y = _mm_cvtpi32x2_ps(mm2, mm3);
+-
+-
+- mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
+- mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
+-
+- /* get the swap sign flag in mm0:mm1 and the
+- polynom selection mask in mm2:mm3 */
+-
+- mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
+- mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
+- mm0 = _mm_slli_pi32(mm0, 29);
+- mm1 = _mm_slli_pi32(mm1, 29);
+-
+- mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+- mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+-
+- mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+- mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+
+- v4sf sign_bit, poly_mask;
+- COPY_MM_TO_XMM(mm0, mm1, sign_bit);
+- COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+- _mm_empty(); /* good-bye mmx */
+-#endif
+ /* The magic pass: "Extended precision modular arithmetic"
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
+ xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+@@ -578,11 +437,8 @@
+ it is almost as fast, and gives you a free cosine with your sine */
+ static void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
+ v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
+-#ifdef USE_SSE2
+ v4si emm0, emm2, emm4;
+-#else
+- v2si mm0, mm1, mm2, mm3, mm4, mm5;
+-#endif
++
+ sign_bit_sin = x;
+ /* take the absolute value */
+ x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+@@ -592,7 +448,6 @@
+ /* scale by 4/Pi */
+ y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+
+-#ifdef USE_SSE2
+ /* store the integer part of y in emm2 */
+ emm2 = _mm_cvttps_epi32(y);
+
+@@ -612,40 +467,6 @@
+ emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+ emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+ v4sf poly_mask = _mm_castsi128_ps(emm2);
+-#else
+- /* store the integer part of y in mm2:mm3 */
+- xmm3 = _mm_movehl_ps(xmm3, y);
+- mm2 = _mm_cvttps_pi32(y);
+- mm3 = _mm_cvttps_pi32(xmm3);
+-
+- /* j=(j+1) & (~1) (see the cephes sources) */
+- mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+- mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+- mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+- mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+-
+- y = _mm_cvtpi32x2_ps(mm2, mm3);
+-
+- mm4 = mm2;
+- mm5 = mm3;
+-
+- /* get the swap sign flag for the sine */
+- mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+- mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+- mm0 = _mm_slli_pi32(mm0, 29);
+- mm1 = _mm_slli_pi32(mm1, 29);
+- v4sf swap_sign_bit_sin;
+- COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
+-
+- /* get the polynom selection mask for the sine */
+-
+- mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+- mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+- mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+- mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+- v4sf poly_mask;
+- COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+-#endif
+
+ /* The magic pass: "Extended precision modular arithmetic"
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
+@@ -659,23 +480,10 @@
+ x = _mm_add_ps(x, xmm2);
+ x = _mm_add_ps(x, xmm3);
+
+-#ifdef USE_SSE2
+ emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
+ emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
+ emm4 = _mm_slli_epi32(emm4, 29);
+ v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
+-#else
+- /* get the sign flag for the cosine */
+- mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
+- mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
+- mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
+- mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
+- mm4 = _mm_slli_pi32(mm4, 29);
+- mm5 = _mm_slli_pi32(mm5, 29);
+- v4sf sign_bit_cos;
+- COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
+- _mm_empty(); /* good-bye mmx */
+-#endif
+
+ sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+--- a/src/util.h
++++ b/src/util.h
+@@ -1,7 +1,8 @@
+ #pragma once
+ #ifndef UTIL_H
+ # define UTIL_H
+-# include <immintrin.h>
++# define SIMDE_ENABLE_NATIVE_ALIASES
++# include <simde/x86/avx.h>
+ # include <math.h>
+ # include <stdbool.h>
+ # include <stdint.h>
+--- a/python/build.py
++++ b/python/build.py
+@@ -58,7 +58,7 @@
+ scrappie_seq_helpers
+ util'''.split()
+ ],
+- extra_compile_args=['-std=c99', '-msse3', '-O3']
++ extra_compile_args=['-std=c99', '-O3']
+ )
+
+ with open('pyscrap.h', 'r') as fh:
=====================================
debian/rules
=====================================
@@ -1,6 +1,8 @@
#!/usr/bin/make -f
export DH_VERBOSE = 1
export DEB_BUILD_MAINT_OPTIONS = hardening=+all
+export DEB_CFLAGS_MAINT_APPEND+=-DSIMDE_ENABLE_OPENMP -fopenmp-simd -O3
+export DEB_CXXFLAGS_MAINT_APPEND+=-DSIMDE_ENABLE_OPENMP -fopenmp-simd -O3
#export DEB_CFLAGS_MAINT_APPEND = -Wall -pedantic
#export DEB_LDFLAGS_MAINT_APPEND = -Wl,--as-needed
View it on GitLab: https://salsa.debian.org/med-team/scrappie/-/compare/e095d7fc493927356f2d3d2fd7b1fbdc52638af7...f19dfb8db2c7dd24e41e1aa0edf996cf81a02894
--
View it on GitLab: https://salsa.debian.org/med-team/scrappie/-/compare/e095d7fc493927356f2d3d2fd7b1fbdc52638af7...f19dfb8db2c7dd24e41e1aa0edf996cf81a02894
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20201207/9955820b/attachment-0001.html>
More information about the debian-med-commit mailing list