[med-svn] [Git][med-team/scrappie][master] 3 commits: Attempt using simde to compile across more arches

Nilesh Patra gitlab at salsa.debian.org
Mon Dec 7 08:48:09 GMT 2020



Nilesh Patra pushed to branch master at Debian Med / scrappie


Commits:
a00691d9 by Nilesh Patra at 2020-12-07T13:54:58+05:30
Attempt using simde to compile across more arches

- - - - -
7a831032 by Nilesh Patra at 2020-12-07T14:13:53+05:30
Add libsimde-dev to build deps

- - - - -
f19dfb8d by Nilesh Patra at 2020-12-07T14:14:35+05:30
Add simde flags

- - - - -


4 changed files:

- debian/control
- + debian/patches/series
- + debian/patches/simde.patch
- debian/rules


Changes:

=====================================
debian/control
=====================================
@@ -7,7 +7,8 @@ Build-Depends: debhelper-compat (= 13),
                libcunit1-dev,
                libhdf5-dev,
                libopenblas-dev,
-               cmake
+               cmake,
+               libsimde-dev
 Standards-Version: 4.5.0
 Homepage: https://github.com/nanoporetech/scrappie
 Vcs-Browser: https://salsa.debian.org/med-team/scrappie


=====================================
debian/patches/series
=====================================
@@ -0,0 +1 @@
+simde.patch


=====================================
debian/patches/simde.patch
=====================================
@@ -0,0 +1,398 @@
+--- a/interface/scrappie.h
++++ b/interface/scrappie.h
+@@ -5,7 +5,8 @@
+ extern "C" {
+ #    endif
+ 
+-#    include <immintrin.h>
++#    define SIMDE_ENABLE_NATIVE_ALIASES
++#    include <simde/x86/avx.h>
+ #    include <inttypes.h>
+ #    include <stdbool.h>
+ 
+--- a/src/scrappie_matrix.h
++++ b/src/scrappie_matrix.h
+@@ -2,7 +2,8 @@
+ #ifndef SCRAPPIE_MATRIX_H
+ #    define SCRAPPIE_MATRIX_H
+ 
+-#    include <immintrin.h>
++#    define SIMDE_ENABLE_NATIVE_ALIASES
++#    include <simde/x86/avx.h>
+ #    include <stdbool.h>
+ #    include <stdint.h>
+ #    include <stdio.h>
+--- a/src/sse_mathfun.h
++++ b/src/sse_mathfun.h
+@@ -40,8 +40,6 @@
+ #define SSE_MATHFUN_H
+ 
+ 
+-#include <xmmintrin.h>
+-
+ /* yes I know, the top of this file is quite ugly */
+ 
+ #ifdef _MSC_VER /* visual c++ */
+@@ -55,12 +53,9 @@
+ /* __m128 is ugly to write */
+ typedef __m128 v4sf;  // vector of 4 float (sse1)
+ 
+-#ifdef USE_SSE2
+-# include <emmintrin.h>
++#define SIMDE_ENABLE_NATIVE_ALIASES
++#include <simde/x86/sse2.h>
+ typedef __m128i v4si; // vector of 4 int (sse2)
+-#else
+-typedef __m64 v2si;   // vector of 2 int (mmx)
+-#endif
+ 
+ /* declare some SSE constants -- why can't I figure a better way to do that? */
+ #define _PS_CONST(Name, Val)                                            \
+@@ -99,61 +94,24 @@
+ _PS_CONST(cephes_log_q1, -2.12194440e-4);
+ _PS_CONST(cephes_log_q2, 0.693359375);
+ 
+-#ifndef USE_SSE2
+-typedef union xmm_mm_union {
+-  __m128 xmm;
+-  __m64 mm[2];
+-} xmm_mm_union;
+-
+-#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) {          \
+-    xmm_mm_union u; u.xmm = xmm_;                   \
+-    mm0_ = u.mm[0];                                 \
+-    mm1_ = u.mm[1];                                 \
+-}
+-
+-#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                         \
+-    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm;      \
+-  }
+-
+-#endif // USE_SSE2
+ 
+ /* natural logarithm computed for 4 simultaneous float
+    return NaN for x <= 0
+ */
+ static inline v4sf __attribute__((__always_inline__)) log_ps(v4sf x) {
+-#ifdef USE_SSE2
+   v4si emm0;
+-#else
+-  v2si mm0, mm1;
+-#endif
+   v4sf one = *(v4sf*)_ps_1;
+-
+   v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
+ 
+   x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */
+ 
+-#ifndef USE_SSE2
+-  /* part 1: x = frexpf(x, &e); */
+-  COPY_XMM_TO_MM(x, mm0, mm1);
+-  mm0 = _mm_srli_pi32(mm0, 23);
+-  mm1 = _mm_srli_pi32(mm1, 23);
+-#else
+   emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
+-#endif
+   /* keep only the fractional part */
+   x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
+   x = _mm_or_ps(x, *(v4sf*)_ps_0p5);
+ 
+-#ifndef USE_SSE2
+-  /* now e=mm0:mm1 contain the really base-2 exponent */
+-  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
+-  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
+-  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
+-  _mm_empty(); /* bye bye mmx */
+-#else
+   emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
+   v4sf e = _mm_cvtepi32_ps(emm0);
+-#endif
+ 
+   e = _mm_add_ps(e, one);
+ 
+@@ -224,11 +182,7 @@
+ 
+ static inline __attribute__((__always_inline__)) v4sf exp_ps(v4sf x) {
+   v4sf tmp = _mm_setzero_ps(), fx;
+-#ifdef USE_SSE2
+   v4si emm0;
+-#else
+-  v2si mm0, mm1;
+-#endif
+   v4sf one = *(v4sf*)_ps_1;
+ 
+   x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
+@@ -239,17 +193,9 @@
+   fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
+ 
+   /* how to perform a floorf with SSE: just below */
+-#ifndef USE_SSE2
+-  /* step 1 : cast to int */
+-  tmp = _mm_movehl_ps(tmp, fx);
+-  mm0 = _mm_cvttps_pi32(fx);
+-  mm1 = _mm_cvttps_pi32(tmp);
+-  /* step 2 : cast back to float */
+-  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
+-#else
++
+   emm0 = _mm_cvttps_epi32(fx);
+   tmp  = _mm_cvtepi32_ps(emm0);
+-#endif
+   /* if greater, substract 1 */
+   v4sf mask = _mm_cmpgt_ps(tmp, fx);
+   mask = _mm_and_ps(mask, one);
+@@ -278,24 +224,10 @@
+   y = _mm_add_ps(y, one);
+ 
+   /* build 2^n */
+-#ifndef USE_SSE2
+-  z = _mm_movehl_ps(z, fx);
+-  mm0 = _mm_cvttps_pi32(fx);
+-  mm1 = _mm_cvttps_pi32(z);
+-  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
+-  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
+-  mm0 = _mm_slli_pi32(mm0, 23);
+-  mm1 = _mm_slli_pi32(mm1, 23);
+-
+-  v4sf pow2n;
+-  COPY_MM_TO_XMM(mm0, mm1, pow2n);
+-  _mm_empty();
+-#else
+   emm0 = _mm_cvttps_epi32(fx);
+   emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
+   emm0 = _mm_slli_epi32(emm0, 23);
+   v4sf pow2n = _mm_castsi128_ps(emm0);
+-#endif
+   y = _mm_mul_ps(y, pow2n);
+   return y;
+ }
+@@ -342,12 +274,8 @@
+ */
+ static v4sf sin_ps(v4sf x) { // any x
+   v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
+-
+-#ifdef USE_SSE2
+   v4si emm0, emm2;
+-#else
+-  v2si mm0, mm1, mm2, mm3;
+-#endif
++
+   sign_bit = x;
+   /* take the absolute value */
+   x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+@@ -357,7 +285,6 @@
+   /* scale by 4/Pi */
+   y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+ 
+-#ifdef USE_SSE2
+   /* store the integer part of y in mm0 */
+   emm2 = _mm_cvttps_epi32(y);
+   /* j=(j+1) & (~1) (see the cephes sources) */
+@@ -381,34 +308,6 @@
+   v4sf poly_mask = _mm_castsi128_ps(emm2);
+   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+ 
+-#else
+-  /* store the integer part of y in mm0:mm1 */
+-  xmm2 = _mm_movehl_ps(xmm2, y);
+-  mm2 = _mm_cvttps_pi32(y);
+-  mm3 = _mm_cvttps_pi32(xmm2);
+-  /* j=(j+1) & (~1) (see the cephes sources) */
+-  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+-  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+-  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+-  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+-  y = _mm_cvtpi32x2_ps(mm2, mm3);
+-  /* get the swap sign flag */
+-  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+-  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+-  mm0 = _mm_slli_pi32(mm0, 29);
+-  mm1 = _mm_slli_pi32(mm1, 29);
+-  /* get the polynom selection mask */
+-  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+-  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+-  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+-  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+-  v4sf swap_sign_bit, poly_mask;
+-  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
+-  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+-  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+-  _mm_empty(); /* good-bye mmx */
+-#endif
+-
+   /* The magic pass: "Extended precision modular arithmetic"
+      x = ((x - y * DP1) - y * DP2) - y * DP3; */
+   xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+@@ -459,18 +358,14 @@
+ /* almost the same as sin_ps */
+ static v4sf cos_ps(v4sf x) { // any x
+   v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+-#ifdef USE_SSE2
+   v4si emm0, emm2;
+-#else
+-  v2si mm0, mm1, mm2, mm3;
+-#endif
++
+   /* take the absolute value */
+   x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+ 
+   /* scale by 4/Pi */
+   y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+ 
+-#ifdef USE_SSE2
+   /* store the integer part of y in mm0 */
+   emm2 = _mm_cvttps_epi32(y);
+   /* j=(j+1) & (~1) (see the cephes sources) */
+@@ -489,43 +384,7 @@
+ 
+   v4sf sign_bit = _mm_castsi128_ps(emm0);
+   v4sf poly_mask = _mm_castsi128_ps(emm2);
+-#else
+-  /* store the integer part of y in mm0:mm1 */
+-  xmm2 = _mm_movehl_ps(xmm2, y);
+-  mm2 = _mm_cvttps_pi32(y);
+-  mm3 = _mm_cvttps_pi32(xmm2);
+-
+-  /* j=(j+1) & (~1) (see the cephes sources) */
+-  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+-  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+-  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+-  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+-
+-  y = _mm_cvtpi32x2_ps(mm2, mm3);
+-
+-
+-  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
+-  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
+-
+-  /* get the swap sign flag in mm0:mm1 and the
+-     polynom selection mask in mm2:mm3 */
+-
+-  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
+-  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
+-  mm0 = _mm_slli_pi32(mm0, 29);
+-  mm1 = _mm_slli_pi32(mm1, 29);
+-
+-  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+-  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+-
+-  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+-  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+ 
+-  v4sf sign_bit, poly_mask;
+-  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
+-  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+-  _mm_empty(); /* good-bye mmx */
+-#endif
+   /* The magic pass: "Extended precision modular arithmetic"
+      x = ((x - y * DP1) - y * DP2) - y * DP3; */
+   xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+@@ -578,11 +437,8 @@
+    it is almost as fast, and gives you a free cosine with your sine */
+ static void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
+   v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
+-#ifdef USE_SSE2
+   v4si emm0, emm2, emm4;
+-#else
+-  v2si mm0, mm1, mm2, mm3, mm4, mm5;
+-#endif
++
+   sign_bit_sin = x;
+   /* take the absolute value */
+   x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+@@ -592,7 +448,6 @@
+   /* scale by 4/Pi */
+   y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+ 
+-#ifdef USE_SSE2
+   /* store the integer part of y in emm2 */
+   emm2 = _mm_cvttps_epi32(y);
+ 
+@@ -612,40 +467,6 @@
+   emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+   v4sf poly_mask = _mm_castsi128_ps(emm2);
+-#else
+-  /* store the integer part of y in mm2:mm3 */
+-  xmm3 = _mm_movehl_ps(xmm3, y);
+-  mm2 = _mm_cvttps_pi32(y);
+-  mm3 = _mm_cvttps_pi32(xmm3);
+-
+-  /* j=(j+1) & (~1) (see the cephes sources) */
+-  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+-  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+-  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+-  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+-
+-  y = _mm_cvtpi32x2_ps(mm2, mm3);
+-
+-  mm4 = mm2;
+-  mm5 = mm3;
+-
+-  /* get the swap sign flag for the sine */
+-  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+-  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+-  mm0 = _mm_slli_pi32(mm0, 29);
+-  mm1 = _mm_slli_pi32(mm1, 29);
+-  v4sf swap_sign_bit_sin;
+-  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
+-
+-  /* get the polynom selection mask for the sine */
+-
+-  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+-  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+-  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+-  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+-  v4sf poly_mask;
+-  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+-#endif
+ 
+   /* The magic pass: "Extended precision modular arithmetic"
+      x = ((x - y * DP1) - y * DP2) - y * DP3; */
+@@ -659,23 +480,10 @@
+   x = _mm_add_ps(x, xmm2);
+   x = _mm_add_ps(x, xmm3);
+ 
+-#ifdef USE_SSE2
+   emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
+   emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
+   emm4 = _mm_slli_epi32(emm4, 29);
+   v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
+-#else
+-  /* get the sign flag for the cosine */
+-  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
+-  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
+-  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
+-  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
+-  mm4 = _mm_slli_pi32(mm4, 29);
+-  mm5 = _mm_slli_pi32(mm5, 29);
+-  v4sf sign_bit_cos;
+-  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
+-  _mm_empty(); /* good-bye mmx */
+-#endif
+ 
+   sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+ 
+--- a/src/util.h
++++ b/src/util.h
+@@ -1,7 +1,8 @@
+ #pragma once
+ #ifndef UTIL_H
+ #    define UTIL_H
+-#    include <immintrin.h>
++#    define SIMDE_ENABLE_NATIVE_ALIASES
++#    include <simde/x86/avx.h>
+ #    include <math.h>
+ #    include <stdbool.h>
+ #    include <stdint.h>
+--- a/python/build.py
++++ b/python/build.py
+@@ -58,7 +58,7 @@
+             scrappie_seq_helpers
+             util'''.split()
+     ],
+-    extra_compile_args=['-std=c99', '-msse3', '-O3']
++    extra_compile_args=['-std=c99', '-O3']
+ )
+ 
+ with open('pyscrap.h', 'r') as fh:


=====================================
debian/rules
=====================================
@@ -1,6 +1,8 @@
 #!/usr/bin/make -f
 export DH_VERBOSE = 1
 export DEB_BUILD_MAINT_OPTIONS = hardening=+all
+export DEB_CFLAGS_MAINT_APPEND+=-DSIMDE_ENABLE_OPENMP -fopenmp-simd -O3
+export DEB_CXXFLAGS_MAINT_APPEND+=-DSIMDE_ENABLE_OPENMP -fopenmp-simd -O3
 
 #export DEB_CFLAGS_MAINT_APPEND  = -Wall -pedantic
 #export DEB_LDFLAGS_MAINT_APPEND = -Wl,--as-needed



View it on GitLab: https://salsa.debian.org/med-team/scrappie/-/compare/e095d7fc493927356f2d3d2fd7b1fbdc52638af7...f19dfb8db2c7dd24e41e1aa0edf996cf81a02894


