[med-svn] [Git][med-team/scrappie][master] Update simde fix
Nilesh Patra
gitlab at salsa.debian.org
Mon Dec 7 11:34:25 GMT 2020
Nilesh Patra pushed to branch master at Debian Med / scrappie
Commits:
d47c7b0f by Nilesh Patra at 2020-12-07T17:02:20+05:30
Update simde fix
- - - - -
1 changed file:
- debian/patches/simde.patch
Changes:
=====================================
debian/patches/simde.patch
=====================================
@@ -33,7 +33,7 @@
/* yes I know, the top of this file is quite ugly */
#ifdef _MSC_VER /* visual c++ */
-@@ -55,12 +53,9 @@
+@@ -55,12 +53,10 @@
/* __m128 is ugly to write */
typedef __m128 v4sf; // vector of 4 float (sse1)
@@ -45,10 +45,11 @@
-#else
-typedef __m64 v2si; // vector of 2 int (mmx)
-#endif
++typedef float __v4sf __attribute__ ((__vector_size__ (16)));
/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val) \
-@@ -99,61 +94,24 @@
+@@ -99,61 +95,24 @@
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);
@@ -110,7 +111,7 @@
e = _mm_add_ps(e, one);
-@@ -224,11 +182,7 @@
+@@ -224,11 +183,7 @@
static inline __attribute__((__always_inline__)) v4sf exp_ps(v4sf x) {
v4sf tmp = _mm_setzero_ps(), fx;
@@ -122,7 +123,7 @@
v4sf one = *(v4sf*)_ps_1;
x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
-@@ -239,17 +193,9 @@
+@@ -239,17 +194,9 @@
fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
/* how to perform a floorf with SSE: just below */
@@ -141,7 +142,7 @@
/* if greater, substract 1 */
v4sf mask = _mm_cmpgt_ps(tmp, fx);
mask = _mm_and_ps(mask, one);
-@@ -278,24 +224,10 @@
+@@ -278,24 +225,10 @@
y = _mm_add_ps(y, one);
/* build 2^n */
@@ -166,7 +167,7 @@
y = _mm_mul_ps(y, pow2n);
return y;
}
-@@ -342,12 +274,8 @@
+@@ -342,12 +275,8 @@
*/
static v4sf sin_ps(v4sf x) { // any x
v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
@@ -180,7 +181,7 @@
sign_bit = x;
/* take the absolute value */
x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
-@@ -357,7 +285,6 @@
+@@ -357,7 +286,6 @@
/* scale by 4/Pi */
y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
@@ -188,7 +189,7 @@
/* store the integer part of y in mm0 */
emm2 = _mm_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
-@@ -381,34 +308,6 @@
+@@ -381,34 +309,6 @@
v4sf poly_mask = _mm_castsi128_ps(emm2);
sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
@@ -223,7 +224,7 @@
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
-@@ -459,18 +358,14 @@
+@@ -459,18 +359,14 @@
/* almost the same as sin_ps */
static v4sf cos_ps(v4sf x) { // any x
v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
@@ -243,7 +244,7 @@
/* store the integer part of y in mm0 */
emm2 = _mm_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
-@@ -489,43 +384,7 @@
+@@ -489,43 +385,7 @@
v4sf sign_bit = _mm_castsi128_ps(emm0);
v4sf poly_mask = _mm_castsi128_ps(emm2);
@@ -287,7 +288,7 @@
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
-@@ -578,11 +437,8 @@
+@@ -578,11 +438,8 @@
it is almost as fast, and gives you a free cosine with your sine */
static void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
@@ -300,7 +301,7 @@
sign_bit_sin = x;
/* take the absolute value */
x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
-@@ -592,7 +448,6 @@
+@@ -592,7 +449,6 @@
/* scale by 4/Pi */
y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
@@ -308,7 +309,7 @@
/* store the integer part of y in emm2 */
emm2 = _mm_cvttps_epi32(y);
-@@ -612,40 +467,6 @@
+@@ -612,40 +468,6 @@
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
v4sf poly_mask = _mm_castsi128_ps(emm2);
@@ -349,7 +350,7 @@
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
-@@ -659,23 +480,10 @@
+@@ -659,23 +481,10 @@
x = _mm_add_ps(x, xmm2);
x = _mm_add_ps(x, xmm3);
@@ -396,3 +397,37 @@
)
with open('pyscrap.h', 'r') as fh:
+--- a/src/decode.c
++++ b/src/decode.c
+@@ -7,30 +7,6 @@
+ #define NBASE 4
+ #define BIG_FLOAT 1.e30f
+
+-#ifndef __SSE2__
+-# error "Compilation of function decode_transducer requires a processor that supports at least SSE2"
+-#endif
+-
+-#ifndef __SSE4_1__
+-/** Multiply two vectors of 32 bit integers together
+- *
+- * Emulates the SSE4.1 instruction _mm_mullo_epi32 on hardware that only supports
+- * SSE2. See https://software.intel.com/en-us/forums/intel-c-compiler/topic/288768
+- *
+- * @param x first vector to multiply
+- * @param y second vector to multiply
+- *
+- * @returns vector of integers containing the lower 32 bits of x * y
+- **/
+-static inline __m128i __attribute__((__gnu_inline__, __always_inline__)) _mm_mullo_epi32(const __m128i x, const __m128i y) {
+- __m128i tmp1 = _mm_mul_epu32(x, y);
+- __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(x, 4), _mm_srli_si128(y, 4));
+- return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)),
+- _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0)));
+-
+-}
+-#endif
+-
+ float viterbi_backtrace(float const *score, size_t n, const_scrappie_imatrix traceback, int * seq){
+ RETURN_NULL_IF(NULL == score, NAN);
+ RETURN_NULL_IF(NULL == seq, NAN);
+
View it on GitLab: https://salsa.debian.org/med-team/scrappie/-/commit/d47c7b0f8d7e908cd3aefa2711f8c3f613e871fd
--
View it on GitLab: https://salsa.debian.org/med-team/scrappie/-/commit/d47c7b0f8d7e908cd3aefa2711f8c3f613e871fd
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20201207/ed9312f8/attachment-0001.html>
More information about the debian-med-commit mailing list