[med-svn] [Git][med-team/rapmap][master] Add non-x86 compatibility with libsimde-dev
Michael R. Crusoe
gitlab at salsa.debian.org
Fri Jun 5 10:32:06 BST 2020
Michael R. Crusoe pushed to branch master at Debian Med / rapmap
Commits:
f6be793f by Michael R. Crusoe at 2020-06-05T11:09:20+02:00
Add non-x86 compatibility with libsimde-dev
- - - - -
6 changed files:
- debian/changelog
- debian/control
- debian/patches/portable_pause
- debian/patches/series
- + debian/patches/simde
- debian/rules
Changes:
=====================================
debian/changelog
=====================================
@@ -1,3 +1,9 @@
+rapmap (0.15.0+dfsg-2) UNRELEASED; urgency=medium
+
+ * Add non-x86 compatibility with libsimde-dev
+
+ -- Michael R. Crusoe <crusoe at debian.org> Fri, 05 Jun 2020 09:40:30 +0200
+
rapmap (0.15.0+dfsg-1) unstable; urgency=medium
* New upstream version
=====================================
debian/control
=====================================
@@ -11,6 +11,7 @@ Build-Depends: debhelper-compat (= 12),
libcereal-dev,
libjellyfish-2.0-dev,
libjemalloc-dev,
+ libsimde-dev,
libsparsehash-dev,
libspdlog-dev,
libtclap-dev,
@@ -26,6 +27,7 @@ Package: rapmap
Architecture: amd64
Depends: ${shlibs:Depends},
${misc:Depends}
+Built-Using: ${simde:Built-Using}
Description: rapid sensitive and accurate DNA read mapping via quasi-mapping
RapMap is a testing ground for ideas in quasi-mapping / (lightweight /
pseudo) transcriptome alignment. That means that, at this point, it is
=====================================
debian/patches/portable_pause
=====================================
@@ -1,18 +1,40 @@
-From: Michael R. Crusoe <michael.crusoe at gmail.com>
+From: Michael R. Crusoe <crusoe at debian.org>
Subject: Add portable pauses
---- rapmap.orig/include/FastxParserThreadUtils.hpp
-+++ rapmap/include/FastxParserThreadUtils.hpp
-@@ -18,7 +18,13 @@
+Forwarded: https://github.com/COMBINE-lab/RapMap/pull/49
+--- a/include/FastxParserThreadUtils.hpp
++++ b/include/FastxParserThreadUtils.hpp
+@@ -6,6 +6,9 @@
+ #include <chrono>
+ #include <random>
+ #include <pthread.h>
++#if defined(__SSE2__)
++#include <xmmintrin.h> // _mm_pause
++#endif
+
+ // Most of this code is taken directly from https://github.com/geidav/spinlocks-bench/blob/master/os.hpp.
+ // However, things may be renamed, modified, or randomly mangled over time.
+@@ -18,7 +21,23 @@ namespace fastx_parser {
static const size_t MAX_BACKOFF_ITERS = 1024;
ALWAYS_INLINE static void cpuRelax() {
-+#if defined(__aarch64__) || defined(arm64)
-+ asm volatile("yield" ::: "memory");
-+#elif defined(__PPC64__) || defined(PPC64) || defined(__ppc64__)
-+ asm("ori 0,0,0");
-+#else
- asm("pause");
-+#endif
+- asm("pause");
++ #if defined(__SSE2__) // AMD and Intel
++ _mm_pause();
++ #elif defined(__i386__) || defined(__x86_64__)
++ asm volatile("pause");
++ #elif defined(__aarch64__)
++ asm volatile("wfe");
++ #elif defined(__armel__) || defined(__ARMEL__)
++ asm volatile ("nop" ::: "memory");
++ #elif defined(__arm__) || defined(__aarch64__)
++ __asm__ __volatile__ ("yield" ::: "memory");
++ #elif defined(__ia64__) // IA64
++ __asm__ __volatile__ ("hint @pause");
++ #elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
++ __asm__ __volatile__ ("or 27,27,27" ::: "memory");
++ #else // everything else.
++ asm volatile ("nop" ::: "memory");
++ #endif
}
ALWAYS_INLINE void yieldSleep() {
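Note: cpuRelax() above is the busy-wait hint consumed by the exponential-backoff
spin loop in the same header (see MAX_BACKOFF_ITERS). A minimal, self-contained
sketch of that pattern, with hypothetical names and only the hint instructions it
needs (not code from this patch):

#include <stdatomic.h>
#include <sched.h>
#include <stddef.h>

static inline void cpu_relax(void) {
#if defined(__SSE2__)
    __builtin_ia32_pause();                     /* what _mm_pause() expands to */
#elif defined(__aarch64__)
    __asm__ __volatile__("yield" ::: "memory");
#else
    __asm__ __volatile__("" ::: "memory");      /* plain compiler barrier */
#endif
}

static void spin_lock(atomic_flag *lock) {
    size_t iters = 1;
    const size_t max_iters = 1024;              /* mirrors MAX_BACKOFF_ITERS */
    while (atomic_flag_test_and_set_explicit(lock, memory_order_acquire)) {
        for (size_t i = 0; i < iters; ++i)
            cpu_relax();
        if (iters < max_iters) iters *= 2;      /* exponential backoff */
        else sched_yield();                     /* past the cap, yield the CPU */
    }
}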
=====================================
debian/patches/series
=====================================
@@ -4,3 +4,4 @@ avoid_privacy_breach.patch
spelling
portable_pause
no_gomp_needed
+simde
=====================================
debian/patches/simde
=====================================
@@ -0,0 +1,642 @@
+From: Michael R. Crusoe <crusoe at debian.org>
+Subject: Add non-x86 portability using SIMD Everywhere
+--- rapmap.orig/src/metro/metrohash128crc.cpp
++++ rapmap/src/metro/metrohash128crc.cpp
+@@ -24,7 +24,8 @@
+ //
+
+
+-#include <nmmintrin.h>
++#define SIMDE_ENABLE_NATIVE_ALIASES
++#include "simde/x86/sse4.2.h"
+ #include <string.h>
+ #include "metro/metrohash.h"
+ #include "metro/platform.h"
+--- rapmap.orig/src/ksw2pp/ksw2_extd2_sse.c
++++ rapmap/src/ksw2pp/ksw2_extd2_sse.c
+@@ -3,29 +3,19 @@
+ #include <assert.h>
+ #include "ksw2pp/ksw2.h"
+
+-#ifdef __SSE2__
+-#include <emmintrin.h>
++#define SIMDE_ENABLE_NATIVE_ALIASES
++#include "simde/x86/sse4.1.h"
+
+-#ifdef KSW_SSE2_ONLY
+-#undef __SSE4_1__
+-#endif
+-
+-#ifdef __SSE4_1__
+-#include <smmintrin.h>
+-#endif
+-
+-#ifdef KSW_CPU_DISPATCH
+ #ifdef __SSE4_1__
+ void ksw_extd2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
+ int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
+-#else
++#elif defined(__SSE2__)
+ void ksw_extd2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
+ int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
+-#endif
+ #else
+ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
+ int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
+-#endif // ~KSW_CPU_DISPATCH
++#endif
+ {
+ #define __dp_code_block1 \
+ z = _mm_load_si128(&s[t]); \
+@@ -161,13 +151,8 @@
+ st = _mm_loadu_si128((__m128i*)&qrr[t]);
+ mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_));
+ tmp = _mm_cmpeq_epi8(sq, st);
+-#ifdef __SSE4_1__
+ tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
+ tmp = _mm_blendv_epi8(tmp, sc_N_, mask);
+-#else
+- tmp = _mm_or_si128(_mm_andnot_si128(tmp, sc_mis_), _mm_and_si128(tmp, sc_mch_));
+- tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp), _mm_and_si128(mask, sc_N_));
+-#endif
+ _mm_storeu_si128((__m128i*)((int8_t*)s + t), tmp);
+ }
+ } else {
+@@ -184,7 +169,6 @@
+ for (t = st_; t <= en_; ++t) {
+ __m128i z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp;
+ __dp_code_block1;
+-#ifdef __SSE4_1__
+ z = _mm_max_epi8(z, a);
+ z = _mm_max_epi8(z, b);
+ z = _mm_max_epi8(z, a2);
+@@ -195,27 +179,6 @@
+ _mm_store_si128(&y[t], _mm_sub_epi8(_mm_max_epi8(b, zero_), qe_));
+ _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_max_epi8(a2, zero_), qe2_));
+ _mm_store_si128(&y2[t], _mm_sub_epi8(_mm_max_epi8(b2, zero_), qe2_));
+-#else
+- tmp = _mm_cmpgt_epi8(a, z);
+- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
+- tmp = _mm_cmpgt_epi8(b, z);
+- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
+- tmp = _mm_cmpgt_epi8(a2, z);
+- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2));
+- tmp = _mm_cmpgt_epi8(b2, z);
+- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b2));
+- tmp = _mm_cmplt_epi8(sc_mch_, z);
+- z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z));
+- __dp_code_block2;
+- tmp = _mm_cmpgt_epi8(a, zero_);
+- _mm_store_si128(&x[t], _mm_sub_epi8(_mm_and_si128(tmp, a), qe_));
+- tmp = _mm_cmpgt_epi8(b, zero_);
+- _mm_store_si128(&y[t], _mm_sub_epi8(_mm_and_si128(tmp, b), qe_));
+- tmp = _mm_cmpgt_epi8(a2, zero_);
+- _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_and_si128(tmp, a2), qe2_));
+- tmp = _mm_cmpgt_epi8(b2, zero_);
+- _mm_store_si128(&y2[t], _mm_sub_epi8(_mm_and_si128(tmp, b2), qe2_));
+-#endif
+ }
+ } else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
+ __m128i *pr = p + (size_t)r * n_col_ - st_;
+@@ -223,7 +186,6 @@
+ for (t = st_; t <= en_; ++t) {
+ __m128i d, z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp;
+ __dp_code_block1;
+-#ifdef __SSE4_1__
+ d = _mm_and_si128(_mm_cmpgt_epi8(a, z), _mm_set1_epi8(1)); // d = a > z? 1 : 0
+ z = _mm_max_epi8(z, a);
+ d = _mm_blendv_epi8(d, _mm_set1_epi8(2), _mm_cmpgt_epi8(b, z)); // d = b > z? 2 : d
+@@ -233,22 +195,6 @@
+ d = _mm_blendv_epi8(d, _mm_set1_epi8(4), _mm_cmpgt_epi8(b2, z)); // d = a2 > z? 3 : d
+ z = _mm_max_epi8(z, b2);
+ z = _mm_min_epi8(z, sc_mch_);
+-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
+- tmp = _mm_cmpgt_epi8(a, z);
+- d = _mm_and_si128(tmp, _mm_set1_epi8(1));
+- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
+- tmp = _mm_cmpgt_epi8(b, z);
+- d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(2)));
+- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
+- tmp = _mm_cmpgt_epi8(a2, z);
+- d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(3)));
+- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2));
+- tmp = _mm_cmpgt_epi8(b2, z);
+- d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(4)));
+- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b2));
+- tmp = _mm_cmplt_epi8(sc_mch_, z);
+- z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z));
+-#endif
+ __dp_code_block2;
+ tmp = _mm_cmpgt_epi8(a, zero_);
+ _mm_store_si128(&x[t], _mm_sub_epi8(_mm_and_si128(tmp, a), qe_));
+@@ -270,7 +216,6 @@
+ for (t = st_; t <= en_; ++t) {
+ __m128i d, z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp;
+ __dp_code_block1;
+-#ifdef __SSE4_1__
+ d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), _mm_set1_epi8(1)); // d = z > a? 0 : 1
+ z = _mm_max_epi8(z, a);
+ d = _mm_blendv_epi8(_mm_set1_epi8(2), d, _mm_cmpgt_epi8(z, b)); // d = z > b? d : 2
+@@ -280,22 +225,6 @@
+ d = _mm_blendv_epi8(_mm_set1_epi8(4), d, _mm_cmpgt_epi8(z, b2)); // d = z > b2? d : 4
+ z = _mm_max_epi8(z, b2);
+ z = _mm_min_epi8(z, sc_mch_);
+-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
+- tmp = _mm_cmpgt_epi8(z, a);
+- d = _mm_andnot_si128(tmp, _mm_set1_epi8(1));
+- z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a));
+- tmp = _mm_cmpgt_epi8(z, b);
+- d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(2)));
+- z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b));
+- tmp = _mm_cmpgt_epi8(z, a2);
+- d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(3)));
+- z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a2));
+- tmp = _mm_cmpgt_epi8(z, b2);
+- d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(4)));
+- z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b2));
+- tmp = _mm_cmplt_epi8(sc_mch_, z);
+- z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z));
+-#endif
+ __dp_code_block2;
+ tmp = _mm_cmpgt_epi8(zero_, a);
+ _mm_store_si128(&x[t], _mm_sub_epi8(_mm_andnot_si128(tmp, a), qe_));
+@@ -330,13 +259,8 @@
+ _mm_storeu_si128((__m128i*)&H[t], H1);
+ t_ = _mm_set1_epi32(t);
+ tmp = _mm_cmpgt_epi32(H1, max_H_);
+-#ifdef __SSE4_1__
+ max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
+ max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
+-#else
+- max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
+- max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
+-#endif
+ }
+ _mm_storeu_si128((__m128i*)HH, max_H_);
+ _mm_storeu_si128((__m128i*)tt, max_t_);
+@@ -391,4 +315,3 @@
+ kfree(km, mem2); kfree(km, off);
+ }
+ }
+-#endif // __SSE2__
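Note: the deleted #else branches above were hand-rolled SSE2 emulations of the
SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8(); they become dead weight
once SIMDe provides those intrinsics on every target. For reference, the select
identities they relied on (a sketch, not patch code):

#include <emmintrin.h>

/* blendv(a, b, mask) == (a & ~mask) | (b & mask), valid when mask bytes are
 * 0x00 or 0xFF, e.g. the output of _mm_cmpgt_epi8(). */
static inline __m128i blendv_epi8_sse2(__m128i a, __m128i b, __m128i mask) {
    return _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b));
}

/* Signed byte max via the same select: where a > b take a, else b. */
static inline __m128i max_epi8_sse2(__m128i a, __m128i b) {
    __m128i gt = _mm_cmpgt_epi8(a, b);
    return _mm_or_si128(_mm_and_si128(gt, a), _mm_andnot_si128(gt, b));
}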
+--- rapmap.orig/src/ksw2pp/ksw2_extf2_sse.c
++++ rapmap/src/ksw2pp/ksw2_extf2_sse.c
+@@ -1,22 +1,16 @@
+ #include <string.h>
+ #include "ksw2pp/ksw2.h"
+
+-#ifdef __SSE2__
+-#include <emmintrin.h>
++#define SIMDE_ENABLE_NATIVE_ALIASES
++#include "simde/x86/sse4.1.h"
+
+ #ifdef __SSE4_1__
+-#include <smmintrin.h>
+-#endif
+-
+-#ifdef KSW_CPU_DISPATCH
+-#ifdef __SSE4_1__
+ void ksw_extf2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez)
+-#else
+- void ksw_extf2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez)
+-#endif
++#elif defined(__SSE2__)
++void ksw_extf2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez)
+ #else
+ void ksw_extf2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez)
+-#endif // ~KSW_CPU_DISPATCH
++#endif
+ {
+ int32_t r, t, tlen_, qlen_, last_st, last_en, H0 = 0, last_H0_t = 0;
+ uint8_t *qr, *sf, *mem;
+@@ -60,11 +54,7 @@
+ sq = _mm_loadu_si128((__m128i*)&sf[t]);
+ st = _mm_loadu_si128((__m128i*)&qrr[t]);
+ tmp = _mm_cmpeq_epi8(sq, st);
+-#ifdef __SSE4_1__
+ tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
+-#else
+- tmp = _mm_or_si128(_mm_andnot_si128(tmp, sc_mis_), _mm_and_si128(tmp, sc_mch_));
+-#endif
+ _mm_storeu_si128((__m128i*)((uint8_t*)s + t), tmp);
+ }
+ for (t = st_; t <= en_; ++t) {
+@@ -75,12 +65,7 @@
+ vt1 = _mm_or_si128(_mm_slli_si128(vt1, 1), v1_); // vt1 <- v[r-1][t-1..t+14]
+ v1_ = tmp;
+ ut = _mm_load_si128(&u[t]); // ut <- u[t..t+15]
+-#ifdef __SSE4_1__
+ z = _mm_max_epi8(z, vt1); // z = z > a? z : a (signed)
+-#else
+- z = _mm_and_si128(z, _mm_cmpgt_epi8(z, _mm_setzero_si128())); // z = z > 0? z : 0;
+- z = _mm_max_epu8(z, vt1); // z = max(z, a); this works because both are non-negative
+-#endif
+ z = _mm_max_epu8(z, ut); // z = max(z, b); this works because both are non-negative
+ _mm_store_si128(&u[t], _mm_sub_epi8(z, vt1)); // u[r][t..t+15] <- z - v[r-1][t-1..t+14]
+ _mm_store_si128(&v[t], _mm_sub_epi8(z, ut)); // v[r][t..t+15] <- z - u[r-1][t..t+15]
+@@ -104,4 +89,3 @@
+ else ez->zdropped = 1;
+ kfree(km, mem);
+ }
+-#endif // __SSE2__
+--- rapmap.orig/src/ksw2pp/ksw2_exts2_sse.c
++++ rapmap/src/ksw2pp/ksw2_exts2_sse.c
+@@ -3,29 +3,19 @@
+ #include <assert.h>
+ #include "ksw2pp/ksw2.h"
+
+-#ifdef __SSE2__
+-#include <emmintrin.h>
++#define SIMDE_ENABLE_NATIVE_ALIASES
++#include "simde/x86/sse4.1.h"
+
+-#ifdef KSW_SSE2_ONLY
+-#undef __SSE4_1__
+-#endif
+-
+-#ifdef __SSE4_1__
+-#include <smmintrin.h>
+-#endif
+-
+-#ifdef KSW_CPU_DISPATCH
+ #ifdef __SSE4_1__
+ void ksw_exts2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
+ int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez)
+-#else
++#elif defined(__SSE2__)
+ void ksw_exts2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
+ int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez)
+-#endif
+ #else
+ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
+ int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez)
+-#endif // ~KSW_CPU_DISPATCH
++#endif
+ {
+ #define __dp_code_block1 \
+ z = _mm_load_si128(&s[t]); \
+@@ -161,13 +151,8 @@
+ st = _mm_loadu_si128((__m128i*)&qrr[t]);
+ mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_));
+ tmp = _mm_cmpeq_epi8(sq, st);
+-#ifdef __SSE4_1__
+ tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
+ tmp = _mm_blendv_epi8(tmp, sc_N_, mask);
+-#else
+- tmp = _mm_or_si128(_mm_andnot_si128(tmp, sc_mis_), _mm_and_si128(tmp, sc_mch_));
+- tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp), _mm_and_si128(mask, sc_N_));
+-#endif
+ _mm_storeu_si128((__m128i*)((int8_t*)s + t), tmp);
+ }
+ } else {
+@@ -184,7 +169,6 @@
+ for (t = st_; t <= en_; ++t) {
+ __m128i z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp;
+ __dp_code_block1;
+-#ifdef __SSE4_1__
+ z = _mm_max_epi8(z, a);
+ z = _mm_max_epi8(z, b);
+ z = _mm_max_epi8(z, a2a);
+@@ -193,23 +177,6 @@
+ _mm_store_si128(&y[t], _mm_sub_epi8(_mm_max_epi8(b, zero_), qe_));
+ tmp = _mm_load_si128(&donor[t]);
+ _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_max_epi8(a2, tmp), q2_));
+-#else
+- tmp = _mm_cmpgt_epi8(a, z);
+- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
+- tmp = _mm_cmpgt_epi8(b, z);
+- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
+- tmp = _mm_cmpgt_epi8(a2a, z);
+- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2a));
+- __dp_code_block2;
+- tmp = _mm_cmpgt_epi8(a, zero_);
+- _mm_store_si128(&x[t], _mm_sub_epi8(_mm_and_si128(tmp, a), qe_));
+- tmp = _mm_cmpgt_epi8(b, zero_);
+- _mm_store_si128(&y[t], _mm_sub_epi8(_mm_and_si128(tmp, b), qe_));
+- tmp = _mm_load_si128(&donor[t]); // TODO: check if this is correct
+- tmp = _mm_cmpgt_epi8(a2, tmp);
+- tmp = _mm_or_si128(_mm_andnot_si128(tmp, tmp), _mm_and_si128(tmp, a2));
+- _mm_store_si128(&x2[t], _mm_sub_epi8(tmp, q2_));
+-#endif
+ }
+ } else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
+ __m128i *pr = p + r * n_col_ - st_;
+@@ -217,24 +184,12 @@
+ for (t = st_; t <= en_; ++t) {
+ __m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2;
+ __dp_code_block1;
+-#ifdef __SSE4_1__
+ d = _mm_and_si128(_mm_cmpgt_epi8(a, z), _mm_set1_epi8(1)); // d = a > z? 1 : 0
+ z = _mm_max_epi8(z, a);
+ d = _mm_blendv_epi8(d, _mm_set1_epi8(2), _mm_cmpgt_epi8(b, z)); // d = b > z? 2 : d
+ z = _mm_max_epi8(z, b);
+ d = _mm_blendv_epi8(d, _mm_set1_epi8(3), _mm_cmpgt_epi8(a2a, z)); // d = a2 > z? 3 : d
+ z = _mm_max_epi8(z, a2a);
+-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
+- tmp = _mm_cmpgt_epi8(a, z);
+- d = _mm_and_si128(tmp, _mm_set1_epi8(1));
+- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
+- tmp = _mm_cmpgt_epi8(b, z);
+- d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(2)));
+- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
+- tmp = _mm_cmpgt_epi8(a2a, z);
+- d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(3)));
+- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2a));
+-#endif
+ __dp_code_block2;
+ tmp = _mm_cmpgt_epi8(a, zero_);
+ _mm_store_si128(&x[t], _mm_sub_epi8(_mm_and_si128(tmp, a), qe_));
+@@ -245,11 +200,7 @@
+
+ tmp2 = _mm_load_si128(&donor[t]);
+ tmp = _mm_cmpgt_epi8(a2, tmp2);
+-#ifdef __SSE4_1__
+ tmp2 = _mm_max_epi8(a2, tmp2);
+-#else
+- tmp2 = _mm_or_si128(_mm_andnot_si128(tmp, tmp2), _mm_and_si128(tmp, a2));
+-#endif
+ _mm_store_si128(&x2[t], _mm_sub_epi8(tmp2, q2_));
+ d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x20)));
+ _mm_store_si128(&pr[t], d);
+@@ -260,24 +211,12 @@
+ for (t = st_; t <= en_; ++t) {
+ __m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2;
+ __dp_code_block1;
+-#ifdef __SSE4_1__
+ d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), _mm_set1_epi8(1)); // d = z > a? 0 : 1
+ z = _mm_max_epi8(z, a);
+ d = _mm_blendv_epi8(_mm_set1_epi8(2), d, _mm_cmpgt_epi8(z, b)); // d = z > b? d : 2
+ z = _mm_max_epi8(z, b);
+ d = _mm_blendv_epi8(_mm_set1_epi8(3), d, _mm_cmpgt_epi8(z, a2a)); // d = z > a2? d : 3
+ z = _mm_max_epi8(z, a2a);
+-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
+- tmp = _mm_cmpgt_epi8(z, a);
+- d = _mm_andnot_si128(tmp, _mm_set1_epi8(1));
+- z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a));
+- tmp = _mm_cmpgt_epi8(z, b);
+- d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(2)));
+- z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b));
+- tmp = _mm_cmpgt_epi8(z, a2a);
+- d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(3)));
+- z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a2a));
+-#endif
+ __dp_code_block2;
+ tmp = _mm_cmpgt_epi8(zero_, a);
+ _mm_store_si128(&x[t], _mm_sub_epi8(_mm_andnot_si128(tmp, a), qe_));
+@@ -288,11 +227,7 @@
+
+ tmp2 = _mm_load_si128(&donor[t]);
+ tmp = _mm_cmpgt_epi8(tmp2, a2);
+-#ifdef __SSE4_1__
+ tmp2 = _mm_max_epi8(tmp2, a2);
+-#else
+- tmp2 = _mm_or_si128(_mm_andnot_si128(tmp, a2), _mm_and_si128(tmp, tmp2));
+-#endif
+ _mm_store_si128(&x2[t], _mm_sub_epi8(tmp2, q2_));
+ d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0
+ _mm_store_si128(&pr[t], d);
+@@ -316,13 +251,8 @@
+ _mm_storeu_si128((__m128i*)&H[t], H1);
+ t_ = _mm_set1_epi32(t);
+ tmp = _mm_cmpgt_epi32(H1, max_H_);
+-#ifdef __SSE4_1__
+ max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
+ max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
+-#else
+- max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
+- max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
+-#endif
+ }
+ _mm_storeu_si128((__m128i*)HH, max_H_);
+ _mm_storeu_si128((__m128i*)tt, max_t_);
+@@ -373,4 +303,3 @@
+ kfree(km, mem2); kfree(km, off);
+ }
+ }
+-#endif // __SSE2__
+--- rapmap.orig/src/ksw2pp/ksw2_extz2_sse.c
++++ rapmap/src/ksw2pp/ksw2_extz2_sse.c
+@@ -2,26 +2,16 @@
+ #include <assert.h>
+ #include "ksw2pp/ksw2.h"
+
+-#ifdef __SSE2__
+-#include <emmintrin.h>
+-
+-#ifdef KSW_SSE2_ONLY
+-#undef __SSE4_1__
+-#endif
++#define SIMDE_ENABLE_NATIVE_ALIASES
++#include "simde/x86/sse4.1.h"
+
+ #ifdef __SSE4_1__
+-#include <smmintrin.h>
+-#endif
+-
+-#ifdef KSW_CPU_DISPATCH
+-#ifdef __SSE4_1__
+ void ksw_extz2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
+-#else
++#elif defined(__SSE2__)
+ void ksw_extz2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
+-#endif
+ #else
+ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
+-#endif // ~KSW_CPU_DISPATCH
++#endif
+ {
+ #define __dp_code_block1 \
+ z = _mm_add_epi8(_mm_load_si128(&s[t]), qe2_); \
+@@ -129,13 +119,8 @@
+ st = _mm_loadu_si128((__m128i*)&qrr[t]);
+ mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_));
+ tmp = _mm_cmpeq_epi8(sq, st);
+-#ifdef __SSE4_1__
+ tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
+ tmp = _mm_blendv_epi8(tmp, sc_N_, mask);
+-#else
+- tmp = _mm_or_si128(_mm_andnot_si128(tmp, sc_mis_), _mm_and_si128(tmp, sc_mch_));
+- tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp), _mm_and_si128(mask, sc_N_));
+-#endif
+ _mm_storeu_si128((__m128i*)((uint8_t*)s + t), tmp);
+ }
+ } else {
+@@ -151,22 +136,10 @@
+ for (t = st_; t <= en_; ++t) {
+ __m128i z, a, b, xt1, vt1, ut, tmp;
+ __dp_code_block1;
+-#ifdef __SSE4_1__
+ z = _mm_max_epi8(z, a); // z = z > a? z : a (signed)
+-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8()
+- z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_)); // z = z > 0? z : 0;
+- z = _mm_max_epu8(z, a); // z = max(z, a); this works because both are non-negative
+-#endif
+ __dp_code_block2;
+-#ifdef __SSE4_1__
+ _mm_store_si128(&x[t], _mm_max_epi8(a, zero_));
+ _mm_store_si128(&y[t], _mm_max_epi8(b, zero_));
+-#else
+- tmp = _mm_cmpgt_epi8(a, zero_);
+- _mm_store_si128(&x[t], _mm_and_si128(a, tmp));
+- tmp = _mm_cmpgt_epi8(b, zero_);
+- _mm_store_si128(&y[t], _mm_and_si128(b, tmp));
+-#endif
+ }
+ } else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
+ __m128i *pr = p + (size_t)r * n_col_ - st_;
+@@ -175,16 +148,9 @@
+ __m128i d, z, a, b, xt1, vt1, ut, tmp;
+ __dp_code_block1;
+ d = _mm_and_si128(_mm_cmpgt_epi8(a, z), flag1_); // d = a > z? 1 : 0
+-#ifdef __SSE4_1__
+ z = _mm_max_epi8(z, a); // z = z > a? z : a (signed)
+ tmp = _mm_cmpgt_epi8(b, z);
+ d = _mm_blendv_epi8(d, flag2_, tmp); // d = b > z? 2 : d
+-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
+- z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_)); // z = z > 0? z : 0;
+- z = _mm_max_epu8(z, a); // z = max(z, a); this works because both are non-negative
+- tmp = _mm_cmpgt_epi8(b, z);
+- d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, flag2_)); // d = b > z? 2 : d; emulating blendv
+-#endif
+ __dp_code_block2;
+ tmp = _mm_cmpgt_epi8(a, zero_);
+ _mm_store_si128(&x[t], _mm_and_si128(tmp, a));
+@@ -201,16 +167,9 @@
+ __m128i d, z, a, b, xt1, vt1, ut, tmp;
+ __dp_code_block1;
+ d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), flag1_); // d = z > a? 0 : 1
+-#ifdef __SSE4_1__
+ z = _mm_max_epi8(z, a); // z = z > a? z : a (signed)
+ tmp = _mm_cmpgt_epi8(z, b);
+ d = _mm_blendv_epi8(flag2_, d, tmp); // d = z > b? d : 2
+-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
+- z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_)); // z = z > 0? z : 0;
+- z = _mm_max_epu8(z, a); // z = max(z, a); this works because both are non-negative
+- tmp = _mm_cmpgt_epi8(z, b);
+- d = _mm_or_si128(_mm_andnot_si128(tmp, flag2_), _mm_and_si128(tmp, d)); // d = z > b? d : 2; emulating blendv
+-#endif
+ __dp_code_block2;
+ tmp = _mm_cmpgt_epi8(zero_, a);
+ _mm_store_si128(&x[t], _mm_andnot_si128(tmp, a));
+@@ -241,13 +200,8 @@
+ _mm_storeu_si128((__m128i*)&H[t], H1);
+ t_ = _mm_set1_epi32(t);
+ tmp = _mm_cmpgt_epi32(H1, max_H_);
+-#ifdef __SSE4_1__
+ max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
+ max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
+-#else
+- max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
+- max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
+-#endif
+ }
+ _mm_storeu_si128((__m128i*)HH, max_H_);
+ _mm_storeu_si128((__m128i*)tt, max_t_);
+@@ -302,4 +256,3 @@
+ kfree(km, mem2); kfree(km, off);
+ }
+ }
+-#endif // __SSE2__
+--- rapmap.orig/src/ksw2pp/ksw2_gg2_sse.c
++++ rapmap/src/ksw2pp/ksw2_gg2_sse.c
+@@ -1,12 +1,8 @@
+ #include <stdio.h> // for debugging only
+ #include "ksw2pp/ksw2.h"
+
+-#ifdef __SSE2__
+-#include <emmintrin.h>
+-
+-#ifdef __SSE4_1__
+-#include <smmintrin.h>
+-#endif
++#define SIMDE_ENABLE_NATIVE_ALIASES
++#include "simde/x86/sse4.1.h"
+
+ int ksw_gg2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int *m_cigar_, int *n_cigar_, uint32_t **cigar_)
+ {
+@@ -86,16 +82,9 @@
+ b = _mm_add_epi8(_mm_load_si128(&y[t]), ut); // b <- y[r-1][t..t+15] + u[r-1][t..t+15]
+
+ d = _mm_and_si128(_mm_cmpgt_epi8(a, z), flag1_); // d = a > z? 1 : 0
+-#ifdef __SSE4_1__
+ z = _mm_max_epi8(z, a); // z = z > a? z : a (signed)
+ tmp = _mm_cmpgt_epi8(b, z);
+ d = _mm_blendv_epi8(d, flag2_, tmp); // d = b > z? 2 : d
+-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
+- z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_)); // z = z > 0? z : 0;
+- z = _mm_max_epu8(z, a); // z = max(z, a); this works because both are non-negative
+- tmp = _mm_cmpgt_epi8(b, z);
+- d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, flag2_)); // d = b > z? 2 : d; emulating blendv
+-#endif
+ z = _mm_max_epu8(z, b); // z = max(z, b); this works because both are non-negative
+ _mm_store_si128(&u[t], _mm_sub_epi8(z, vt1)); // u[r][t..t+15] <- z - v[r-1][t-1..t+14]
+ _mm_store_si128(&v[t], _mm_sub_epi8(z, ut)); // v[r][t..t+15] <- z - u[r-1][t..t+15]
+@@ -124,4 +113,3 @@
+ kfree(km, mem2); kfree(km, off);
+ return H0;
+ }
+-#endif // __SSE2__
+--- rapmap.orig/src/ksw2pp/KSW2Aligner.cpp
++++ rapmap/src/ksw2pp/KSW2Aligner.cpp
+@@ -27,10 +27,12 @@
+ asm volatile ("cpuid"
+ : "=a" (cpuid[0]), "=b" (cpuid[1]), "=c" (cpuid[2]), "=d" (cpuid[3])
+ : "0" (func_id), "2" (subfunc_id));
+-#else // on 32bit, ebx can NOT be used as PIC code
++#elif defined(__i386__) // on 32bit, ebx can NOT be used as PIC code
+ asm volatile ("xchgl %%ebx, %1; cpuid; xchgl %%ebx, %1"
+ : "=a" (cpuid[0]), "=r" (cpuid[1]), "=c" (cpuid[2]), "=d" (cpuid[3])
+ : "0" (func_id), "2" (subfunc_id));
++#else
++ cpuid[0] = 0;
+ #endif
+ }
+ #endif
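Note: the new #else branch simply reports no CPUID support on non-x86 targets.
As an aside, on GCC/clang an equivalent could lean on the compiler's <cpuid.h>
wrapper, which already handles the 32-bit PIC ebx restriction; a hedged sketch
with a hypothetical helper name (for subleaf queries like the subfunc_id above,
__get_cpuid_count is the matching wrapper):

#if defined(__x86_64__) || defined(__i386__)
#include <cpuid.h>
#endif

static void query_cpuid(unsigned int func_id, unsigned int out[4]) {
#if defined(__x86_64__) || defined(__i386__)
    if (__get_cpuid(func_id, &out[0], &out[1], &out[2], &out[3]))
        return;                                /* eax/ebx/ecx/edx filled */
#endif
    out[0] = out[1] = out[2] = out[3] = 0;     /* unsupported leaf or non-x86 */
}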
+--- rapmap.orig/src/CMakeLists.txt
++++ rapmap/src/CMakeLists.txt
+@@ -87,20 +87,36 @@
+ check_ipo_supported(RESULT HAS_IPO)
+
+ add_library(ksw2pp_sse2 OBJECT ${KSW2PP_ADVANCED_LIB_SRCS})
+-add_library(ksw2pp_sse4 OBJECT ${KSW2PP_ADVANCED_LIB_SRCS})
+ add_library(ksw2pp_basic OBJECT ${KSW2PP_BASIC_LIB_SRCS})
++set_target_properties(ksw2pp_basic PROPERTIES INCLUDE_DIRECTORIES ${GAT_SOURCE_DIR}/include)
+
+-set_target_properties(ksw2pp_sse2 PROPERTIES COMPILE_FLAGS "-O3 -msse2 -mno-sse4.1")
+-set_target_properties(ksw2pp_sse2 PROPERTIES COMPILE_DEFINITIONS "KSW_CPU_DISPATCH;KSW_SSE2_ONLY;HAVE_KALLOC")
+-set_target_properties(ksw2pp_sse4 PROPERTIES COMPILE_FLAGS "-O3 -msse4.1")
+-set_target_properties(ksw2pp_sse4 PROPERTIES COMPILE_DEFINITIONS "KSW_CPU_DISPATCH;HAVE_KALLOC")
+-set_target_properties(ksw2pp_basic PROPERTIES COMPILE_DEFINITIONS "KSW_CPU_DISPATCH;HAVE_KALLOC")
++if(NOT DEFINED CMAKE_SYSTEM_PROCESSOR)
++ EXECUTE_PROCESS( COMMAND uname -m COMMAND tr -d '\n' OUTPUT_VARIABLE ARCHITECTURE )
++ set(CMAKE_SYSTEM_PROCESSOR "${ARCHITECTURE}")
++endif()
+
+-set_target_properties(ksw2pp_basic PROPERTIES INCLUDE_DIRECTORIES ${GAT_SOURCE_DIR}/include)
+-set_target_properties(ksw2pp_sse4 PROPERTIES INCLUDE_DIRECTORIES ${GAT_SOURCE_DIR}/include)
++if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
++ set(CMAKE_SYSTEM_PROCESSOR "amd64")
++elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "i686")
++ set(CMAKE_SYSTEM_PROCESSOR "i386")
++endif()
++
++message("CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
++
++if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "amd64" OR "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "i386")
++ add_library(ksw2pp_sse4 OBJECT ${KSW2PP_ADVANCED_LIB_SRCS})
++ set_target_properties(ksw2pp_sse2 PROPERTIES COMPILE_FLAGS "-O3 -msse2 -mno-sse4.1")
++ set_target_properties(ksw2pp_sse2 PROPERTIES COMPILE_DEFINITIONS "KSW_CPU_DISPATCH;KSW_SSE2_ONLY;HAVE_KALLOC")
++ set_target_properties(ksw2pp_sse4 PROPERTIES COMPILE_FLAGS "-O3 -msse4.1")
++ set_target_properties(ksw2pp_sse4 PROPERTIES COMPILE_DEFINITIONS "KSW_CPU_DISPATCH;HAVE_KALLOC")
++ set_target_properties(ksw2pp_basic PROPERTIES COMPILE_DEFINITIONS "KSW_CPU_DISPATCH;HAVE_KALLOC")
++ set_target_properties(ksw2pp_sse4 PROPERTIES INCLUDE_DIRECTORIES ${GAT_SOURCE_DIR}/include)
++ add_library(ksw2pp STATIC $<TARGET_OBJECTS:ksw2pp_sse2> $<TARGET_OBJECTS:ksw2pp_sse4> $<TARGET_OBJECTS:ksw2pp_basic>)
++else()
++ add_library(ksw2pp STATIC $<TARGET_OBJECTS:ksw2pp_sse2> $<TARGET_OBJECTS:ksw2pp_basic>)
++endif()
+
+ # Build the ksw2pp library
+-add_library(ksw2pp STATIC $<TARGET_OBJECTS:ksw2pp_sse2> $<TARGET_OBJECTS:ksw2pp_sse4> $<TARGET_OBJECTS:ksw2pp_basic>)
+ set_target_properties(ksw2pp PROPERTIES COMPILE_DEFINITIONS "KSW_CPU_DISPATCH;HAVE_KALLOC")
+ if(HAS_IPO)
+ set_property(TARGET ksw2pp PROPERTY INTERPROCEDURAL_OPTIMIZATION True)
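Note: the CMake change above builds the -msse2/-msse4.1 kernel variants only on
amd64 and i386, where KSW_CPU_DISPATCH can actually pick between them at run
time; everywhere else a single SIMDe-backed build suffices. An illustrative
sketch of that dispatch pattern (hypothetical names, not the commit's code):

#include <stdbool.h>
#include <stdio.h>

static void kernel_sse2(void)  { puts("SSE2 kernel");   }  /* stand-in for the -msse2 object */
static void kernel_sse41(void) { puts("SSE4.1 kernel"); }  /* stand-in for the -msse4.1 object */

static bool have_sse41(void) {
#if defined(__x86_64__) || defined(__i386__)
    return __builtin_cpu_supports("sse4.1");   /* GCC/clang CPU feature probe */
#else
    return false;
#endif
}

int main(void) {
    void (*kernel)(void) = have_sse41() ? kernel_sse41 : kernel_sse2;
    kernel();                                  /* chosen once, at run time */
    return 0;
}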
=====================================
debian/rules
=====================================
@@ -24,3 +24,6 @@ override_dh_auto_build:
override_dh_clean:
dh_clean README.html sample_data/sample_quasi_index/ sample_data/sample_quasi_index_ph/ sample_data/sample_quasi_map.sam sample_data/sample_quasi_map_ph.sam
+
+override_dh_gencontrol:
+ dh_gencontrol -- -Vsimde:Built-Using="$(shell dpkg-query -f '$${source:Package} (= $${source:Version}), ' -W "libsimde-dev")"
View it on GitLab: https://salsa.debian.org/med-team/rapmap/-/commit/f6be793f5bbef20fabd1b046ad0c520840344334