[med-svn] [Git][med-team/scrappie][master] Update simde fix

Mon Dec 7 11:34:25 GMT 2020


Nilesh Patra pushed to branch master at Debian Med / scrappie


Commits:
d47c7b0f by Nilesh Patra at 2020-12-07T17:02:20+05:30
Update simde fix

- - - - -


1 changed file:

- debian/patches/simde.patch


Changes:

=====================================
debian/patches/simde.patch
=====================================
@@ -33,7 +33,7 @@
  /* yes I know, the top of this file is quite ugly */
  
  #ifdef _MSC_VER /* visual c++ */
-@@ -55,12 +53,9 @@
+@@ -55,12 +53,10 @@
  /* __m128 is ugly to write */
  typedef __m128 v4sf;  // vector of 4 float (sse1)
  
@@ -45,10 +45,11 @@
 -#else
 -typedef __m64 v2si;   // vector of 2 int (mmx)
 -#endif
++typedef float __v4sf __attribute__ ((__vector_size__ (16)));
  
  /* declare some SSE constants -- why can't I figure a better way to do that? */
  #define _PS_CONST(Name, Val)                                            \
-@@ -99,61 +94,24 @@
+@@ -99,61 +95,24 @@
  _PS_CONST(cephes_log_q1, -2.12194440e-4);
  _PS_CONST(cephes_log_q2, 0.693359375);
  
@@ -110,7 +111,7 @@
  
    e = _mm_add_ps(e, one);
  
-@@ -224,11 +182,7 @@
+@@ -224,11 +183,7 @@
  
  static inline __attribute__((__always_inline__)) v4sf exp_ps(v4sf x) {
    v4sf tmp = _mm_setzero_ps(), fx;
@@ -122,7 +123,7 @@
    v4sf one = *(v4sf*)_ps_1;
  
    x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
-@@ -239,17 +193,9 @@
+@@ -239,17 +194,9 @@
    fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
  
    /* how to perform a floorf with SSE: just below */
@@ -141,7 +142,7 @@
    /* if greater, substract 1 */
    v4sf mask = _mm_cmpgt_ps(tmp, fx);
    mask = _mm_and_ps(mask, one);
-@@ -278,24 +224,10 @@
+@@ -278,24 +225,10 @@
    y = _mm_add_ps(y, one);
  
    /* build 2^n */
@@ -166,7 +167,7 @@
    y = _mm_mul_ps(y, pow2n);
    return y;
  }
-@@ -342,12 +274,8 @@
+@@ -342,12 +275,8 @@
  */
  static v4sf sin_ps(v4sf x) { // any x
    v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
@@ -180,7 +181,7 @@
    sign_bit = x;
    /* take the absolute value */
    x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
-@@ -357,7 +285,6 @@
+@@ -357,7 +286,6 @@
    /* scale by 4/Pi */
    y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
  
@@ -188,7 +189,7 @@
    /* store the integer part of y in mm0 */
    emm2 = _mm_cvttps_epi32(y);
    /* j=(j+1) & (~1) (see the cephes sources) */
-@@ -381,34 +308,6 @@
+@@ -381,34 +309,6 @@
    v4sf poly_mask = _mm_castsi128_ps(emm2);
    sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  
@@ -223,7 +224,7 @@
    /* The magic pass: "Extended precision modular arithmetic"
       x = ((x - y * DP1) - y * DP2) - y * DP3; */
    xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
-@@ -459,18 +358,14 @@
+@@ -459,18 +359,14 @@
  /* almost the same as sin_ps */
  static v4sf cos_ps(v4sf x) { // any x
    v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
@@ -243,7 +244,7 @@
    /* store the integer part of y in mm0 */
    emm2 = _mm_cvttps_epi32(y);
    /* j=(j+1) & (~1) (see the cephes sources) */
-@@ -489,43 +384,7 @@
+@@ -489,43 +385,7 @@
  
    v4sf sign_bit = _mm_castsi128_ps(emm0);
    v4sf poly_mask = _mm_castsi128_ps(emm2);
@@ -287,7 +288,7 @@
    /* The magic pass: "Extended precision modular arithmetic"
       x = ((x - y * DP1) - y * DP2) - y * DP3; */
    xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
-@@ -578,11 +437,8 @@
+@@ -578,11 +438,8 @@
     it is almost as fast, and gives you a free cosine with your sine */
  static void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
    v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
@@ -300,7 +301,7 @@
    sign_bit_sin = x;
    /* take the absolute value */
    x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
-@@ -592,7 +448,6 @@
+@@ -592,7 +449,6 @@
    /* scale by 4/Pi */
    y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
  
@@ -308,7 +309,7 @@
    /* store the integer part of y in emm2 */
    emm2 = _mm_cvttps_epi32(y);
  
-@@ -612,40 +467,6 @@
+@@ -612,40 +468,6 @@
    emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    v4sf poly_mask = _mm_castsi128_ps(emm2);
@@ -349,7 +350,7 @@
  
    /* The magic pass: "Extended precision modular arithmetic"
       x = ((x - y * DP1) - y * DP2) - y * DP3; */
-@@ -659,23 +480,10 @@
+@@ -659,23 +481,10 @@
    x = _mm_add_ps(x, xmm2);
    x = _mm_add_ps(x, xmm3);
  
@@ -396,3 +397,37 @@
  )
  
  with open('pyscrap.h', 'r') as fh:
+--- a/src/decode.c
++++ b/src/decode.c
+@@ -7,30 +7,6 @@
+ #define NBASE 4
+ #define BIG_FLOAT 1.e30f
+ 
+-#ifndef __SSE2__
+-#    error "Compilation of function decode_transducer requires a processor that supports at least SSE2"
+-#endif
+-
+-#ifndef __SSE4_1__
+-/**  Multiply two vectors of 32 bit integers together
+- *
+- *   Emulates the SSE4.1 instruction _mm_mullo_epi32 on hardware that only supports
+- *   SSE2.  See https://software.intel.com/en-us/forums/intel-c-compiler/topic/288768
+- *
+- *   @param x first vector to multiply
+- *   @param y second vector to multiply
+- *
+- *   @returns vector of integers containing the lower 32 bits of x * y
+- **/
+-static inline __m128i __attribute__((__gnu_inline__, __always_inline__)) _mm_mullo_epi32(const __m128i x, const __m128i y) {
+-    __m128i tmp1 = _mm_mul_epu32(x, y);
+-    __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(x, 4), _mm_srli_si128(y, 4));
+-    return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)),
+-                              _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0)));
+-
+-}
+-#endif
+-
+ float viterbi_backtrace(float const *score, size_t n, const_scrappie_imatrix traceback, int * seq){
+     RETURN_NULL_IF(NULL == score, NAN);
+     RETURN_NULL_IF(NULL == seq, NAN);
+



View it on GitLab: https://salsa.debian.org/med-team/scrappie/-/commit/d47c7b0f8d7e908cd3aefa2711f8c3f613e871fd

-- 
View it on GitLab: https://salsa.debian.org/med-team/scrappie/-/commit/d47c7b0f8d7e908cd3aefa2711f8c3f613e871fd
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20201207/ed9312f8/attachment-0001.html>