[med-svn] [Git][med-team/ksw2][master] 2 commits: DEP3

Andreas Tille gitlab at salsa.debian.org
Fri May 15 12:24:58 BST 2020



Andreas Tille pushed to branch master at Debian Med / ksw2


Commits:
18d4d87a by Andreas Tille at 2020-05-15T12:13:54+02:00
DEP3

- - - - -
1492eb6a by Andreas Tille at 2020-05-15T13:24:36+02:00
TODO: Adapt build system to add libsimde

- - - - -


5 changed files:

- debian/changelog
- debian/control
- debian/patches/autoconf.patch
- debian/patches/series
- + debian/patches/simde.patch


Changes:

=====================================
debian/changelog
=====================================
@@ -1,5 +1,6 @@
 ksw2 (0.0+git20190429.f06f9b3-1) UNRELEASED; urgency=medium
 
   * Initial release (Closes: #nnnn)
+  TODO: Adapt build system to add libsimde
 
  -- Andreas Tille <tille at debian.org>  Fri, 15 May 2020 11:45:35 +0200


=====================================
debian/control
=====================================
@@ -8,7 +8,8 @@ Build-Depends: debhelper-compat (= 12),
                d-shlibs,
                autoconf-archive,
                pkg-config,
-               zlib1g-dev
+               zlib1g-dev,
+               libsimde-dev
 Standards-Version: 4.5.0
 Vcs-Browser: https://salsa.debian.org/med-team/ksw2
 Vcs-Git: https://salsa.debian.org/med-team/ksw2.git


=====================================
debian/patches/autoconf.patch
=====================================
@@ -1,3 +1,7 @@
+Author: Andreas Tille <tille at debian.org>
+Last-Update: Fri, 15 May 2020 11:45:35 +0200
+Description: Provide automake build system to get proper library packaging
+
 --- /dev/null
 +++ b/Makefile.am
 @@ -0,0 +1,19 @@


=====================================
debian/patches/series
=====================================
@@ -1 +1,2 @@
 autoconf.patch
+simde.patch


=====================================
debian/patches/simde.patch
=====================================
@@ -0,0 +1,469 @@
+Author: Michael R. Crusoe <michael.crusoe at gmail.com>
+Description: Add support for more architectures
+
+using the SIMD Everywhere library
+--- a/ksw2_extd2_sse.c
++++ b/ksw2_extd2_sse.c
+@@ -3,29 +3,19 @@
+ #include <assert.h>
+ #include "ksw2.h"
+ 
+-#ifdef __SSE2__
+-#include <emmintrin.h>
++#define SIMDE_ENABLE_NATIVE_ALIASES
++#include "simde/x86/sse4.1.h"
+ 
+-#ifdef KSW_SSE2_ONLY
+-#undef __SSE4_1__
+-#endif
+-
+-#ifdef __SSE4_1__
+-#include <smmintrin.h>
+-#endif
+-
+-#ifdef KSW_CPU_DISPATCH
+-#ifdef __SSE4_1__
++#if defined(SIMDE_SSE4_1_NATIVE)
+ void ksw_extd2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
+ 				   int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
+-#else
++#elif defined(SIMDE_SSE2_NATIVE)
+ void ksw_extd2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
+-				   int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
+-#endif
++                                  int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
+ #else
+ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
+-				   int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
+-#endif // ~KSW_CPU_DISPATCH
++                                  int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
++#endif
+ {
+ #define __dp_code_block1 \
+ 	z = _mm_load_si128(&s[t]); \
+@@ -161,13 +151,8 @@ void ksw_extd2_sse(void *km, int qlen, c
+ 				st = _mm_loadu_si128((__m128i*)&qrr[t]);
+ 				mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_));
+ 				tmp = _mm_cmpeq_epi8(sq, st);
+-#ifdef __SSE4_1__
+ 				tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
+ 				tmp = _mm_blendv_epi8(tmp,     sc_N_,   mask);
+-#else
+-				tmp = _mm_or_si128(_mm_andnot_si128(tmp,  sc_mis_), _mm_and_si128(tmp,  sc_mch_));
+-				tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp),     _mm_and_si128(mask, sc_N_));
+-#endif
+ 				_mm_storeu_si128((__m128i*)((int8_t*)s + t), tmp);
+ 			}
+ 		} else {
+@@ -184,7 +169,6 @@ void ksw_extd2_sse(void *km, int qlen, c
+ 			for (t = st_; t <= en_; ++t) {
+ 				__m128i z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp;
+ 				__dp_code_block1;
+-#ifdef __SSE4_1__
+ 				z = _mm_max_epi8(z, a);
+ 				z = _mm_max_epi8(z, b);
+ 				z = _mm_max_epi8(z, a2);
+@@ -195,27 +179,6 @@ void ksw_extd2_sse(void *km, int qlen, c
+ 				_mm_store_si128(&y[t],  _mm_sub_epi8(_mm_max_epi8(b,  zero_), qe_));
+ 				_mm_store_si128(&x2[t], _mm_sub_epi8(_mm_max_epi8(a2, zero_), qe2_));
+ 				_mm_store_si128(&y2[t], _mm_sub_epi8(_mm_max_epi8(b2, zero_), qe2_));
+-#else
+-				tmp = _mm_cmpgt_epi8(a,  z);
+-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
+-				tmp = _mm_cmpgt_epi8(b,  z);
+-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
+-				tmp = _mm_cmpgt_epi8(a2, z);
+-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2));
+-				tmp = _mm_cmpgt_epi8(b2, z);
+-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b2));
+-				tmp = _mm_cmplt_epi8(sc_mch_, z);
+-				z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z));
+-				__dp_code_block2;
+-				tmp = _mm_cmpgt_epi8(a, zero_);
+-				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_and_si128(tmp, a),  qe_));
+-				tmp = _mm_cmpgt_epi8(b, zero_);
+-				_mm_store_si128(&y[t],  _mm_sub_epi8(_mm_and_si128(tmp, b),  qe_));
+-				tmp = _mm_cmpgt_epi8(a2, zero_);
+-				_mm_store_si128(&x2[t], _mm_sub_epi8(_mm_and_si128(tmp, a2), qe2_));
+-				tmp = _mm_cmpgt_epi8(b2, zero_);
+-				_mm_store_si128(&y2[t], _mm_sub_epi8(_mm_and_si128(tmp, b2), qe2_));
+-#endif
+ 			}
+ 		} else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
+ 			__m128i *pr = p + (size_t)r * n_col_ - st_;
+@@ -223,7 +186,6 @@ void ksw_extd2_sse(void *km, int qlen, c
+ 			for (t = st_; t <= en_; ++t) {
+ 				__m128i d, z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp;
+ 				__dp_code_block1;
+-#ifdef __SSE4_1__
+ 				d = _mm_and_si128(_mm_cmpgt_epi8(a, z), _mm_set1_epi8(1));       // d = a  > z? 1 : 0
+ 				z = _mm_max_epi8(z, a);
+ 				d = _mm_blendv_epi8(d, _mm_set1_epi8(2), _mm_cmpgt_epi8(b,  z)); // d = b  > z? 2 : d
+@@ -233,22 +195,6 @@ void ksw_extd2_sse(void *km, int qlen, c
+ 				d = _mm_blendv_epi8(d, _mm_set1_epi8(4), _mm_cmpgt_epi8(b2, z)); // d = a2 > z? 3 : d
+ 				z = _mm_max_epi8(z, b2);
+ 				z = _mm_min_epi8(z, sc_mch_);
+-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
+-				tmp = _mm_cmpgt_epi8(a,  z);
+-				d = _mm_and_si128(tmp, _mm_set1_epi8(1));
+-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
+-				tmp = _mm_cmpgt_epi8(b,  z);
+-				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(2)));
+-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
+-				tmp = _mm_cmpgt_epi8(a2, z);
+-				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(3)));
+-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2));
+-				tmp = _mm_cmpgt_epi8(b2, z);
+-				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(4)));
+-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b2));
+-				tmp = _mm_cmplt_epi8(sc_mch_, z);
+-				z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z));
+-#endif
+ 				__dp_code_block2;
+ 				tmp = _mm_cmpgt_epi8(a, zero_);
+ 				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_and_si128(tmp, a),  qe_));
+@@ -270,7 +216,6 @@ void ksw_extd2_sse(void *km, int qlen, c
+ 			for (t = st_; t <= en_; ++t) {
+ 				__m128i d, z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp;
+ 				__dp_code_block1;
+-#ifdef __SSE4_1__
+ 				d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), _mm_set1_epi8(1));    // d = z > a?  0 : 1
+ 				z = _mm_max_epi8(z, a);
+ 				d = _mm_blendv_epi8(_mm_set1_epi8(2), d, _mm_cmpgt_epi8(z, b));  // d = z > b?  d : 2
+@@ -280,22 +225,6 @@ void ksw_extd2_sse(void *km, int qlen, c
+ 				d = _mm_blendv_epi8(_mm_set1_epi8(4), d, _mm_cmpgt_epi8(z, b2)); // d = z > b2? d : 4
+ 				z = _mm_max_epi8(z, b2);
+ 				z = _mm_min_epi8(z, sc_mch_);
+-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
+-				tmp = _mm_cmpgt_epi8(z, a);
+-				d = _mm_andnot_si128(tmp, _mm_set1_epi8(1));
+-				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a));
+-				tmp = _mm_cmpgt_epi8(z, b);
+-				d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(2)));
+-				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b));
+-				tmp = _mm_cmpgt_epi8(z, a2);
+-				d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(3)));
+-				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a2));
+-				tmp = _mm_cmpgt_epi8(z, b2);
+-				d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(4)));
+-				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b2));
+-				tmp = _mm_cmplt_epi8(sc_mch_, z);
+-				z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z));
+-#endif
+ 				__dp_code_block2;
+ 				tmp = _mm_cmpgt_epi8(zero_, a);
+ 				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_andnot_si128(tmp, a),  qe_));
+@@ -330,13 +259,8 @@ void ksw_extd2_sse(void *km, int qlen, c
+ 					_mm_storeu_si128((__m128i*)&H[t], H1);
+ 					t_ = _mm_set1_epi32(t);
+ 					tmp = _mm_cmpgt_epi32(H1, max_H_);
+-#ifdef __SSE4_1__
+ 					max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
+ 					max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
+-#else
+-					max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
+-					max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
+-#endif
+ 				}
+ 				_mm_storeu_si128((__m128i*)HH, max_H_);
+ 				_mm_storeu_si128((__m128i*)tt, max_t_);
+@@ -391,4 +315,3 @@ void ksw_extd2_sse(void *km, int qlen, c
+ 		kfree(km, mem2); kfree(km, off);
+ 	}
+ }
+-#endif // __SSE2__
+--- a/ksw2_exts2_sse.c
++++ b/ksw2_exts2_sse.c
+@@ -3,25 +3,15 @@
+ #include <assert.h>
+ #include "ksw2.h"
+ 
+-#ifdef __SSE2__
+-#include <emmintrin.h>
++#define SIMDE_ENABLE_NATIVE_ALIASES
++#include "simde/x86/sse4.1.h"
+ 
+-#ifdef KSW_SSE2_ONLY
+-#undef __SSE4_1__
+-#endif
+-
+-#ifdef __SSE4_1__
+-#include <smmintrin.h>
+-#endif
+-
+-#ifdef KSW_CPU_DISPATCH
+-#ifdef __SSE4_1__
++#if defined(SIMDE_SSE4_1_NATIVE)
+ void ksw_exts2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
+ 				   int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez)
+-#else
++#elif defined(SIMDE_SSE2_NATIVE)
+ void ksw_exts2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
+ 				   int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez)
+-#endif
+ #else
+ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
+ 				   int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez)
+@@ -161,13 +151,8 @@ void ksw_exts2_sse(void *km, int qlen, c
+ 				st = _mm_loadu_si128((__m128i*)&qrr[t]);
+ 				mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_));
+ 				tmp = _mm_cmpeq_epi8(sq, st);
+-#ifdef __SSE4_1__
+ 				tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
+ 				tmp = _mm_blendv_epi8(tmp,     sc_N_,   mask);
+-#else
+-				tmp = _mm_or_si128(_mm_andnot_si128(tmp,  sc_mis_), _mm_and_si128(tmp,  sc_mch_));
+-				tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp),     _mm_and_si128(mask, sc_N_));
+-#endif
+ 				_mm_storeu_si128((__m128i*)((int8_t*)s + t), tmp);
+ 			}
+ 		} else {
+@@ -184,7 +169,6 @@ void ksw_exts2_sse(void *km, int qlen, c
+ 			for (t = st_; t <= en_; ++t) {
+ 				__m128i z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp;
+ 				__dp_code_block1;
+-#ifdef __SSE4_1__
+ 				z = _mm_max_epi8(z, a);
+ 				z = _mm_max_epi8(z, b);
+ 				z = _mm_max_epi8(z, a2a);
+@@ -193,23 +177,6 @@ void ksw_exts2_sse(void *km, int qlen, c
+ 				_mm_store_si128(&y[t],  _mm_sub_epi8(_mm_max_epi8(b,  zero_), qe_));
+ 				tmp = _mm_load_si128(&donor[t]);
+ 				_mm_store_si128(&x2[t], _mm_sub_epi8(_mm_max_epi8(a2, tmp), q2_));
+-#else
+-				tmp = _mm_cmpgt_epi8(a,  z);
+-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
+-				tmp = _mm_cmpgt_epi8(b,  z);
+-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
+-				tmp = _mm_cmpgt_epi8(a2a, z);
+-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2a));
+-				__dp_code_block2;
+-				tmp = _mm_cmpgt_epi8(a, zero_);
+-				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_and_si128(tmp, a),  qe_));
+-				tmp = _mm_cmpgt_epi8(b, zero_);
+-				_mm_store_si128(&y[t],  _mm_sub_epi8(_mm_and_si128(tmp, b),  qe_));
+-				tmp = _mm_load_si128(&donor[t]); // TODO: check if this is correct
+-				tmp = _mm_cmpgt_epi8(a2, tmp);
+-				tmp = _mm_or_si128(_mm_andnot_si128(tmp, tmp), _mm_and_si128(tmp, a2));
+-				_mm_store_si128(&x2[t], _mm_sub_epi8(tmp, q2_));
+-#endif
+ 			}
+ 		} else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
+ 			__m128i *pr = p + r * n_col_ - st_;
+@@ -217,24 +184,12 @@ void ksw_exts2_sse(void *km, int qlen, c
+ 			for (t = st_; t <= en_; ++t) {
+ 				__m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2;
+ 				__dp_code_block1;
+-#ifdef __SSE4_1__
+ 				d = _mm_and_si128(_mm_cmpgt_epi8(a, z), _mm_set1_epi8(1));       // d = a  > z? 1 : 0
+ 				z = _mm_max_epi8(z, a);
+ 				d = _mm_blendv_epi8(d, _mm_set1_epi8(2), _mm_cmpgt_epi8(b,  z)); // d = b  > z? 2 : d
+ 				z = _mm_max_epi8(z, b);
+ 				d = _mm_blendv_epi8(d, _mm_set1_epi8(3), _mm_cmpgt_epi8(a2a, z)); // d = a2 > z? 3 : d
+ 				z = _mm_max_epi8(z, a2a);
+-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
+-				tmp = _mm_cmpgt_epi8(a,  z);
+-				d = _mm_and_si128(tmp, _mm_set1_epi8(1));
+-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
+-				tmp = _mm_cmpgt_epi8(b,  z);
+-				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(2)));
+-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
+-				tmp = _mm_cmpgt_epi8(a2a, z);
+-				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(3)));
+-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2a));
+-#endif
+ 				__dp_code_block2;
+ 				tmp = _mm_cmpgt_epi8(a, zero_);
+ 				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_and_si128(tmp, a),  qe_));
+@@ -245,11 +200,7 @@ void ksw_exts2_sse(void *km, int qlen, c
+ 
+ 				tmp2 = _mm_load_si128(&donor[t]);
+ 				tmp = _mm_cmpgt_epi8(a2, tmp2);
+-#ifdef __SSE4_1__
+ 				tmp2 = _mm_max_epi8(a2, tmp2);
+-#else
+-				tmp2 = _mm_or_si128(_mm_andnot_si128(tmp, tmp2), _mm_and_si128(tmp, a2));
+-#endif
+ 				_mm_store_si128(&x2[t], _mm_sub_epi8(tmp2, q2_));
+ 				d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x20)));
+ 				_mm_store_si128(&pr[t], d);
+@@ -260,24 +211,12 @@ void ksw_exts2_sse(void *km, int qlen, c
+ 			for (t = st_; t <= en_; ++t) {
+ 				__m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2;
+ 				__dp_code_block1;
+-#ifdef __SSE4_1__
+ 				d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), _mm_set1_epi8(1));    // d = z > a?  0 : 1
+ 				z = _mm_max_epi8(z, a);
+ 				d = _mm_blendv_epi8(_mm_set1_epi8(2), d, _mm_cmpgt_epi8(z, b));  // d = z > b?  d : 2
+ 				z = _mm_max_epi8(z, b);
+ 				d = _mm_blendv_epi8(_mm_set1_epi8(3), d, _mm_cmpgt_epi8(z, a2a)); // d = z > a2? d : 3
+ 				z = _mm_max_epi8(z, a2a);
+-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
+-				tmp = _mm_cmpgt_epi8(z, a);
+-				d = _mm_andnot_si128(tmp, _mm_set1_epi8(1));
+-				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a));
+-				tmp = _mm_cmpgt_epi8(z, b);
+-				d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(2)));
+-				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b));
+-				tmp = _mm_cmpgt_epi8(z, a2a);
+-				d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(3)));
+-				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a2a));
+-#endif
+ 				__dp_code_block2;
+ 				tmp = _mm_cmpgt_epi8(zero_, a);
+ 				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_andnot_si128(tmp, a),  qe_));
+@@ -288,11 +227,7 @@ void ksw_exts2_sse(void *km, int qlen, c
+ 
+ 				tmp2 = _mm_load_si128(&donor[t]);
+ 				tmp = _mm_cmpgt_epi8(tmp2, a2);
+-#ifdef __SSE4_1__
+ 				tmp2 = _mm_max_epi8(tmp2, a2);
+-#else
+-				tmp2 = _mm_or_si128(_mm_andnot_si128(tmp, a2), _mm_and_si128(tmp, tmp2));
+-#endif
+ 				_mm_store_si128(&x2[t], _mm_sub_epi8(tmp2, q2_));
+ 				d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0
+ 				_mm_store_si128(&pr[t], d);
+@@ -316,13 +251,8 @@ void ksw_exts2_sse(void *km, int qlen, c
+ 					_mm_storeu_si128((__m128i*)&H[t], H1);
+ 					t_ = _mm_set1_epi32(t);
+ 					tmp = _mm_cmpgt_epi32(H1, max_H_);
+-#ifdef __SSE4_1__
+ 					max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
+ 					max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
+-#else
+-					max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
+-					max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
+-#endif
+ 				}
+ 				_mm_storeu_si128((__m128i*)HH, max_H_);
+ 				_mm_storeu_si128((__m128i*)tt, max_t_);
+@@ -373,4 +303,3 @@ void ksw_exts2_sse(void *km, int qlen, c
+ 		kfree(km, mem2); kfree(km, off);
+ 	}
+ }
+-#endif // __SSE2__
+--- a/ksw2_extz2_sse.c
++++ b/ksw2_extz2_sse.c
+@@ -2,26 +2,16 @@
+ #include <assert.h>
+ #include "ksw2.h"
+ 
+-#ifdef __SSE2__
+-#include <emmintrin.h>
++#define SIMDE_ENABLE_NATIVE_ALIASES
++#include "simde/x86/sse4.1.h"
+ 
+-#ifdef KSW_SSE2_ONLY
+-#undef __SSE4_1__
+-#endif
+-
+-#ifdef __SSE4_1__
+-#include <smmintrin.h>
+-#endif
+-
+-#ifdef KSW_CPU_DISPATCH
+-#ifdef __SSE4_1__
++#if defined(SIMDE_SSE4_1_NATIVE)
+ void ksw_extz2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
+-#else
++#elif defined(SIMDE_SSE2_NATIVE)
+ void ksw_extz2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
+-#endif
+ #else
+ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
+-#endif // ~KSW_CPU_DISPATCH
++#endif
+ {
+ #define __dp_code_block1 \
+ 	z = _mm_add_epi8(_mm_load_si128(&s[t]), qe2_); \
+@@ -129,13 +119,8 @@ void ksw_extz2_sse(void *km, int qlen, c
+ 				st = _mm_loadu_si128((__m128i*)&qrr[t]);
+ 				mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_));
+ 				tmp = _mm_cmpeq_epi8(sq, st);
+-#ifdef __SSE4_1__
+ 				tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
+ 				tmp = _mm_blendv_epi8(tmp,     sc_N_,   mask);
+-#else
+-				tmp = _mm_or_si128(_mm_andnot_si128(tmp,  sc_mis_), _mm_and_si128(tmp,  sc_mch_));
+-				tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp),     _mm_and_si128(mask, sc_N_));
+-#endif
+ 				_mm_storeu_si128((__m128i*)((uint8_t*)s + t), tmp);
+ 			}
+ 		} else {
+@@ -151,22 +136,10 @@ void ksw_extz2_sse(void *km, int qlen, c
+ 			for (t = st_; t <= en_; ++t) {
+ 				__m128i z, a, b, xt1, vt1, ut, tmp;
+ 				__dp_code_block1;
+-#ifdef __SSE4_1__
+ 				z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
+-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8()
+-				z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
+-				z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
+-#endif
+ 				__dp_code_block2;
+-#ifdef __SSE4_1__
+ 				_mm_store_si128(&x[t], _mm_max_epi8(a, zero_));
+ 				_mm_store_si128(&y[t], _mm_max_epi8(b, zero_));
+-#else
+-				tmp = _mm_cmpgt_epi8(a, zero_);
+-				_mm_store_si128(&x[t], _mm_and_si128(a, tmp));
+-				tmp = _mm_cmpgt_epi8(b, zero_);
+-				_mm_store_si128(&y[t], _mm_and_si128(b, tmp));
+-#endif
+ 			}
+ 		} else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
+ 			__m128i *pr = p + (size_t)r * n_col_ - st_;
+@@ -175,16 +148,9 @@ void ksw_extz2_sse(void *km, int qlen, c
+ 				__m128i d, z, a, b, xt1, vt1, ut, tmp;
+ 				__dp_code_block1;
+ 				d = _mm_and_si128(_mm_cmpgt_epi8(a, z), flag1_); // d = a > z? 1 : 0
+-#ifdef __SSE4_1__
+ 				z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
+ 				tmp = _mm_cmpgt_epi8(b, z);
+ 				d = _mm_blendv_epi8(d, flag2_, tmp);             // d = b > z? 2 : d
+-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
+-				z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
+-				z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
+-				tmp = _mm_cmpgt_epi8(b, z);
+-				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, flag2_)); // d = b > z? 2 : d; emulating blendv
+-#endif
+ 				__dp_code_block2;
+ 				tmp = _mm_cmpgt_epi8(a, zero_);
+ 				_mm_store_si128(&x[t], _mm_and_si128(tmp, a));
+@@ -201,16 +167,9 @@ void ksw_extz2_sse(void *km, int qlen, c
+ 				__m128i d, z, a, b, xt1, vt1, ut, tmp;
+ 				__dp_code_block1;
+ 				d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), flag1_); // d = z > a? 0 : 1
+-#ifdef __SSE4_1__
+ 				z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
+ 				tmp = _mm_cmpgt_epi8(z, b);
+ 				d = _mm_blendv_epi8(flag2_, d, tmp);             // d = z > b? d : 2
+-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
+-				z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
+-				z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
+-				tmp = _mm_cmpgt_epi8(z, b);
+-				d = _mm_or_si128(_mm_andnot_si128(tmp, flag2_), _mm_and_si128(tmp, d)); // d = z > b? d : 2; emulating blendv
+-#endif
+ 				__dp_code_block2;
+ 				tmp = _mm_cmpgt_epi8(zero_, a);
+ 				_mm_store_si128(&x[t], _mm_andnot_si128(tmp, a));
+@@ -241,13 +200,8 @@ void ksw_extz2_sse(void *km, int qlen, c
+ 					_mm_storeu_si128((__m128i*)&H[t], H1);
+ 					t_ = _mm_set1_epi32(t);
+ 					tmp = _mm_cmpgt_epi32(H1, max_H_);
+-#ifdef __SSE4_1__
+ 					max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
+ 					max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
+-#else
+-					max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
+-					max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
+-#endif
+ 				}
+ 				_mm_storeu_si128((__m128i*)HH, max_H_);
+ 				_mm_storeu_si128((__m128i*)tt, max_t_);
+@@ -302,4 +256,3 @@ void ksw_extz2_sse(void *km, int qlen, c
+ 		kfree(km, mem2); kfree(km, off);
+ 	}
+ }
+-#endif // __SSE2__



View it on GitLab: https://salsa.debian.org/med-team/ksw2/-/compare/065e9821653b88bd0b5b54a3d184dfd86fb84f2a...1492eb6a912877c22cf0c2fa653968d5b7fa6e2d

-- 
View it on GitLab: https://salsa.debian.org/med-team/ksw2/-/compare/065e9821653b88bd0b5b54a3d184dfd86fb84f2a...1492eb6a912877c22cf0c2fa653968d5b7fa6e2d
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200515/fce96f75/attachment-0001.html>


More information about the debian-med-commit mailing list