[med-svn] [Git][med-team/simde][upstream] New upstream version 0.7.0
Michael R. Crusoe
gitlab at salsa.debian.org
Sun Dec 27 19:20:43 GMT 2020
Michael R. Crusoe pushed to branch upstream at Debian Med / simde
Commits:
dfcfc271 by Michael R. Crusoe at 2020-12-27T13:48:47+01:00
New upstream version 0.7.0
- - - - -
8 changed files:
- simde/arm/neon/ld3.h
- simde/arm/neon/ld4.h
- simde/arm/neon/orn.h
- simde/simde-common.h
- simde/simde-math.h
- simde/x86/avx512/permutex2var.h
- simde/x86/sse.h
- test/x86/avx512/permutex2var.c
Changes:
=====================================
simde/arm/neon/ld3.h
=====================================
@@ -33,7 +33,7 @@
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
-#if defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(10,0,0)
+#if defined(HEDLEY_GCC_VERSION)
SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
#endif
SIMDE_BEGIN_DECLS_
=====================================
simde/arm/neon/ld4.h
=====================================
@@ -32,6 +32,9 @@
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+#if defined(HEDLEY_GCC_VERSION)
+ SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
+#endif
SIMDE_BEGIN_DECLS_
#if !defined(SIMDE_BUG_INTEL_857088)
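
Both hunks adjust the same warning suppression: ld3.h previously disabled GCC's -Wmaybe-uninitialized only for GCC older than 10 and now disables it for all GCC versions, while ld4.h gains the identical guard. The macros follow simde's usual push/disable/pop scoping, so the suppression cannot leak past the header. A minimal sketch of the idiom, assuming GCC (the real macros from hedley.h and simde-diagnostic.h add compiler detection and no-op fallbacks for other compilers):

    /* Sketch of what the push/disable/pop macro pair amounts to on GCC. */
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"

    /* ... header body whose partial stores would otherwise warn ... */
    int simde_sketch_dummy;  /* placeholder so the sketch is a valid TU */

    #pragma GCC diagnostic pop
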
=====================================
simde/arm/neon/orn.h
=====================================
@@ -258,6 +258,8 @@ simde_vornq_s8(simde_int8x16_t a, simde_int8x16_t b) {
return vornq_s8(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
return vec_orc(a, b);
+ #elif defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_ternarylogic_epi32(a, b, a, 0xf3);
#else
simde_int8x16_private
a_ = simde_int8x16_to_private(a),
@@ -287,6 +289,8 @@ simde_vornq_s16(simde_int16x8_t a, simde_int16x8_t b) {
return vornq_s16(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
return vec_orc(a, b);
+ #elif defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_ternarylogic_epi32(a, b, a, 0xf3);
#else
simde_int16x8_private
a_ = simde_int16x8_to_private(a),
@@ -316,6 +320,8 @@ simde_vornq_s32(simde_int32x4_t a, simde_int32x4_t b) {
return vornq_s32(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
return vec_orc(a, b);
+ #elif defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_ternarylogic_epi32(a, b, a, 0xf3);
#else
simde_int32x4_private
a_ = simde_int32x4_to_private(a),
@@ -345,6 +351,8 @@ simde_vornq_s64(simde_int64x2_t a, simde_int64x2_t b) {
return vornq_s64(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
return vec_orc(a, b);
+ #elif defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_ternarylogic_epi64(a, b, a, 0xf3);
#else
simde_int64x2_private
a_ = simde_int64x2_to_private(a),
@@ -374,6 +382,8 @@ simde_vornq_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
return vornq_u8(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
return vec_orc(a, b);
+ #elif defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_ternarylogic_epi32(a, b, a, 0xf3);
#else
simde_uint8x16_private
a_ = simde_uint8x16_to_private(a),
@@ -403,6 +413,8 @@ simde_vornq_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
return vornq_u16(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
return vec_orc(a, b);
+ #elif defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_ternarylogic_epi32(a, b, a, 0xf3);
#else
simde_uint16x8_private
a_ = simde_uint16x8_to_private(a),
@@ -432,6 +444,8 @@ simde_vornq_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
return vornq_u32(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
return vec_orc(a, b);
+ #elif defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_ternarylogic_epi32(a, b, a, 0xf3);
#else
simde_uint32x4_private
a_ = simde_uint32x4_to_private(a),
@@ -461,6 +475,8 @@ simde_vornq_u64(simde_uint64x2_t a, simde_uint64x2_t b) {
return vornq_u64(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
return vec_orc(a, b);
+ #elif defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_ternarylogic_epi64(a, b, a, 0xf3);
#else
simde_uint64x2_private
a_ = simde_uint64x2_to_private(a),
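
All eight vornq_* hunks add the same AVX-512VL fast path, so the immediate is worth decoding once: _mm_ternarylogic_epi32(x, y, z, imm8) computes, per bit position, the imm8 bit at index (x << 2) | (y << 1) | z. With the operands passed as (a, b, a), the reachable rows of 0xf3 evaluate to a | ~b, which is exactly ORN; the _epi64 variant used for the 64-bit lanes is interchangeable here, since without masking the operation is purely bitwise. A scalar check of the immediate (illustrative, not simde code):

    #include <stdio.h>

    /* Scalar model of _mm_ternarylogic_*: per bit, the immediate's bit at
     * position (x<<2 | y<<1 | z) gives the result bit. */
    static unsigned ternlog_bit(unsigned x, unsigned y, unsigned z, unsigned imm8) {
      return (imm8 >> ((x << 2) | (y << 1) | z)) & 1;
    }

    int main(void) {
      /* The diff passes (a, b, a); check imm 0xf3 against a | ~b. */
      for (unsigned a = 0 ; a < 2 ; a++)
        for (unsigned b = 0 ; b < 2 ; b++)
          printf("a=%u b=%u ternlog=%u orn=%u\n",
                 a, b, ternlog_bit(a, b, a, 0xf3), a | (~b & 1u));
      return 0;
    }

Rows where the first and third inputs differ are unreachable when the same register is passed twice, so several immediates (0xf3 among them) encode ORN equally well.
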
=====================================
simde/simde-common.h
=====================================
@@ -30,7 +30,7 @@
#include "hedley.h"
#define SIMDE_VERSION_MAJOR 0
-#define SIMDE_VERSION_MINOR 5
+#define SIMDE_VERSION_MINOR 7
#define SIMDE_VERSION_MICRO 0
#define SIMDE_VERSION HEDLEY_VERSION_ENCODE(SIMDE_VERSION_MAJOR, SIMDE_VERSION_MINOR, SIMDE_VERSION_MICRO)
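
Because SIMDE_VERSION is built with HEDLEY_VERSION_ENCODE, downstream code can test for this release numerically at preprocessing time. A minimal sketch (the include path is an assumption about the consumer's setup):

    #include "simde/simde-common.h"  /* assumed install layout */

    #if SIMDE_VERSION >= HEDLEY_VERSION_ENCODE(0, 7, 0)
    /* 0.7.0 or newer: the simde_math_modf(f) wrappers and the widened
     * permutex2var coverage from this release are available. */
    #endif

    int simde_version_sketch;  /* keeps the sketch a valid translation unit */
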
@@ -777,23 +777,14 @@ HEDLEY_DIAGNOSTIC_POP
# define SIMDE_BUG_GCC_REV_247851
# endif
# if !HEDLEY_GCC_VERSION_CHECK(10,0,0)
-# define SIMDE_BUG_GCC_REV_274313
# define SIMDE_BUG_GCC_91341
# endif
-# if !HEDLEY_GCC_VERSION_CHECK(9,0,0) && defined(SIMDE_ARCH_AARCH64)
-# define SIMDE_BUG_GCC_ARM_SHIFT_SCALAR
-# endif
# if defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)
# define SIMDE_BUG_GCC_94482
# endif
# if (defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) || defined(SIMDE_ARCH_SYSTEMZ)
# define SIMDE_BUG_GCC_53784
# endif
-# if defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)
-# if HEDLEY_GCC_VERSION_CHECK(4,3,0) /* -Wsign-conversion */
-# define SIMDE_BUG_GCC_95144
-# endif
-# endif
# if !HEDLEY_GCC_VERSION_CHECK(9,4,0) && defined(SIMDE_ARCH_AARCH64)
# define SIMDE_BUG_GCC_94488
# endif
@@ -825,22 +816,13 @@ HEDLEY_DIAGNOSTIC_POP
# define SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT
# endif
# if defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)
-# if HEDLEY_HAS_WARNING("-Wsign-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(11,0,0)
-# define SIMDE_BUG_CLANG_45931
+# if HEDLEY_HAS_WARNING("-Wvector-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(11,0,0)
+# define SIMDE_BUG_CLANG_44589
# endif
# endif
-# define SIMDE_BUG_CLANG_45959
-# elif defined(HEDLEY_MSVC_VERSION)
-# if defined(SIMDE_ARCH_X86)
-# define SIMDE_BUG_MSVC_ROUND_EXTRACT
-# endif
# elif defined(HEDLEY_INTEL_VERSION)
# define SIMDE_BUG_INTEL_857088
# endif
-# if defined(HEDLEY_EMSCRIPTEN_VERSION)
-# define SIMDE_BUG_EMSCRIPTEN_MISSING_IMPL /* Placeholder for (as yet) unfiled issues. */
-# define SIMDE_BUG_EMSCRIPTEN_5242
-# endif
#endif
/* GCC and Clang both have the same issue:
=====================================
simde/simde-math.h
=====================================
@@ -1017,6 +1017,26 @@ SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
#endif
#endif
+#if !defined(simde_math_modf)
+ #if SIMDE_MATH_BUILTIN_LIBM(modf)
+ #define simde_math_modf(x, iptr) __builtin_modf(x, iptr)
+ #elif defined(SIMDE_MATH_HAVE_CMATH)
+ #define simde_math_modf(x, iptr) std::modf(x, iptr)
+ #elif defined(SIMDE_MATH_HAVE_MATH_H)
+ #define simde_math_modf(x, iptr) modf(x, iptr)
+ #endif
+#endif
+
+#if !defined(simde_math_modff)
+ #if SIMDE_MATH_BUILTIN_LIBM(modff)
+ #define simde_math_modff(x, iptr) __builtin_modff(x, iptr)
+ #elif defined(SIMDE_MATH_HAVE_CMATH)
+ #define simde_math_modff(x, iptr) std::modf(x, iptr)
+ #elif defined(SIMDE_MATH_HAVE_MATH_H)
+ #define simde_math_modff(x, iptr) modff(x, iptr)
+ #endif
+#endif
+
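
The new wrappers follow the C library contract: the fractional part is returned with the sign of the input and the integral part is stored through the pointer. Note that the float wrapper deliberately maps to std::modf under C++, where std::modf is overloaded for float, so that line is not a typo. A small usage sketch (include path assumed):

    #include <stdio.h>
    #include "simde/simde-math.h"  /* assumed install layout */

    int main(void) {
      double ipart;
      double frac = simde_math_modf(-3.75, &ipart);   /* frac = -0.75, ipart = -3.0 */
      float ipartf;
      float fracf = simde_math_modff(2.5f, &ipartf);  /* fracf = 0.5f, ipartf = 2.0f */
      printf("%g %g %g %g\n", frac, ipart, (double) fracf, (double) ipartf);
      return 0;
    }
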
#if !defined(simde_math_nearbyint)
#if SIMDE_MATH_BUILTIN_LIBM(nearbyint)
#define simde_math_nearbyint(v) __builtin_nearbyint(v)
=====================================
simde/x86/avx512/permutex2var.h
=====================================
@@ -43,6 +43,1086 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
+/* The following generic code avoids many nearly identical repetitions of fairly complex code.
+ * If the compiler optimizes well (in particular, hoisting loop-invariant code and folding
+ * the constants passed as arguments), it should not be significantly slower than
+ * code specialized for each element width.
+ * Note that when the original vector contains few elements, these implementations
+ * may not be faster than portable code.
+ */
+#if defined(SIMDE_X86_SSSE3_NATIVE) || defined(SIMDE_ARM_NEON_A64V8_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_WASM_SIMD128_NATIVE)
+ #define SIMDE_X_PERMUTEX2VAR_USE_GENERIC
+#endif
+
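
The two log2_* parameters pin down the whole family: log2_index_size selects the index lane width (0, 1, 2 for 8-, 16-, 32-bit) and log2_data_length the operation width in 128-bit blocks (0, 1, 2 for 128, 256, 512 bits). The idx_mask formula then comes out to one less than the number of lanes selectable across a and b combined; a scalar check of that arithmetic:

    #include <stdio.h>

    int main(void) {
      for (unsigned s = 0 ; s <= 2 ; s++)      /* log2_index_size  */
        for (unsigned d = 0 ; d <= 2 ; d++) {  /* log2_data_length */
          unsigned lane_bits  = 8u << s;
          unsigned total_bits = 128u << d;
          unsigned idx_mask   = (1u << (5 - s + d)) - 1;
          /* idx_mask + 1 always equals 2 * total_bits / lane_bits. */
          printf("%2u-bit lanes, %3u-bit op: idx_mask=0x%02x (%3u lanes)\n",
                 lane_bits, total_bits, idx_mask, idx_mask + 1);
        }
      return 0;
    }
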
+#if defined(SIMDE_X_PERMUTEX2VAR_USE_GENERIC)
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128i
+simde_x_permutex2var128 (const simde__m128i *a, const simde__m128i idx, const simde__m128i *b, const unsigned int log2_index_size, const unsigned int log2_data_length) {
+ const int idx_mask = (1 << (5 - log2_index_size + log2_data_length)) - 1;
+
+ #if defined(SIMDE_X86_SSE3_NATIVE)
+ __m128i ra, rb, t, test, select, index;
+ const __m128i sixteen = _mm_set1_epi8(16);
+
+ /* Avoid the mullo intrinsics which have high latency (and the 32-bit one requires SSE4.1) */
+ switch (log2_index_size) {
+ default: /* Avoid uninitialized variable warning/error */
+ case 0:
+ index = _mm_and_si128(idx, _mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, idx_mask)));
+ break;
+ case 1:
+ index = _mm_and_si128(idx, _mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, idx_mask)));
+ index = _mm_slli_epi32(index, 1);
+ t = _mm_slli_epi32(index, 8);
+ index = _mm_or_si128(index, t);
+ index = _mm_add_epi16(index, _mm_set1_epi16(0x0100));
+ break;
+ case 2:
+ index = _mm_and_si128(idx, _mm_set1_epi32(HEDLEY_STATIC_CAST(int32_t, idx_mask)));
+ index = _mm_slli_epi32(index, 2);
+ t = _mm_slli_epi32(index, 8);
+ index = _mm_or_si128(index, t);
+ t = _mm_slli_epi32(index, 16);
+ index = _mm_or_si128(index, t);
+ index = _mm_add_epi32(index, _mm_set1_epi32(0x03020100));
+ break;
+ }
+
+ test = index;
+ index = _mm_and_si128(index, _mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, (1 << (4 + log2_data_length)) - 1)));
+ test = _mm_cmpgt_epi8(test, index);
+
+ ra = _mm_shuffle_epi8(a[0], index);
+ rb = _mm_shuffle_epi8(b[0], index);
+
+ #if defined(SIMDE_X86_SSE4_1_NATIVE)
+ SIMDE_VECTORIZE
+ for (int i = 1 ; i < (1 << log2_data_length) ; i++) {
+ select = _mm_cmplt_epi8(index, sixteen);
+ index = _mm_sub_epi8(index, sixteen);
+ ra = _mm_blendv_epi8(_mm_shuffle_epi8(a[i], index), ra, select);
+ rb = _mm_blendv_epi8(_mm_shuffle_epi8(b[i], index), rb, select);
+ }
+
+ return _mm_blendv_epi8(ra, rb, test);
+ #else
+ SIMDE_VECTORIZE
+ for (int i = 1 ; i < (1 << log2_data_length) ; i++) {
+ select = _mm_cmplt_epi8(index, sixteen);
+ index = _mm_sub_epi8(index, sixteen);
+ ra = _mm_or_si128(_mm_andnot_si128(select, _mm_shuffle_epi8(a[i], index)), _mm_and_si128(select, ra));
+ rb = _mm_or_si128(_mm_andnot_si128(select, _mm_shuffle_epi8(b[i], index)), _mm_and_si128(select, rb));
+ }
+
+ return _mm_or_si128(_mm_andnot_si128(test, ra), _mm_and_si128(test, rb));
+ #endif
+ #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ uint8x16_t index, r;
+ uint16x8_t index16;
+ uint32x4_t index32;
+ uint8x16x2_t table2_a, table2_b;
+ uint8x16x4_t table4_a, table4_b;
+
+ switch (log2_index_size) {
+ case 0:
+ index = vandq_u8(simde__m128i_to_neon_u8(idx), vdupq_n_u8(HEDLEY_STATIC_CAST(int8_t, idx_mask)));
+ break;
+ case 1:
+ index16 = vandq_u16(simde__m128i_to_neon_u16(idx), vdupq_n_u16(HEDLEY_STATIC_CAST(int16_t, idx_mask)));
+ index16 = vmulq_n_u16(index16, 0x0202);
+ index16 = vaddq_u16(index16, vdupq_n_u16(0x0100));
+ index = vreinterpretq_u8_u16(index16);
+ break;
+ case 2:
+ index32 = vandq_u32(simde__m128i_to_neon_u32(idx), vdupq_n_u32(HEDLEY_STATIC_CAST(int32_t, idx_mask)));
+ index32 = vmulq_n_u32(index32, 0x04040404);
+ index32 = vaddq_u32(index32, vdupq_n_u32(0x03020100));
+ index = vreinterpretq_u8_u32(index32);
+ break;
+ }
+
+ uint8x16_t mask = vdupq_n_u8(HEDLEY_STATIC_CAST(int8_t, (1 << (4 + log2_data_length)) - 1));
+
+ switch (log2_data_length) {
+ case 0:
+ r = vqtbx1q_u8(vqtbl1q_u8(simde__m128i_to_neon_u8(b[0]), vandq_u8(index, mask)), simde__m128i_to_neon_u8(a[0]), index);
+ break;
+ case 1:
+ table2_a.val[0] = simde__m128i_to_neon_u8(a[0]);
+ table2_a.val[1] = simde__m128i_to_neon_u8(a[1]);
+ table2_b.val[0] = simde__m128i_to_neon_u8(b[0]);
+ table2_b.val[1] = simde__m128i_to_neon_u8(b[1]);
+ r = vqtbx2q_u8(vqtbl2q_u8(table2_b, vandq_u8(index, mask)), table2_a, index);
+ break;
+ case 2:
+ table4_a.val[0] = simde__m128i_to_neon_u8(a[0]);
+ table4_a.val[1] = simde__m128i_to_neon_u8(a[1]);
+ table4_a.val[2] = simde__m128i_to_neon_u8(a[2]);
+ table4_a.val[3] = simde__m128i_to_neon_u8(a[3]);
+ table4_b.val[0] = simde__m128i_to_neon_u8(b[0]);
+ table4_b.val[1] = simde__m128i_to_neon_u8(b[1]);
+ table4_b.val[2] = simde__m128i_to_neon_u8(b[2]);
+ table4_b.val[3] = simde__m128i_to_neon_u8(b[3]);
+ r = vqtbx4q_u8(vqtbl4q_u8(table4_b, vandq_u8(index, mask)), table4_a, index);
+ break;
+ }
+
+ return simde__m128i_from_neon_u8(r);
+ #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
+ SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) r, ra, rb, t, index, s, thirty_two = vec_splats(HEDLEY_STATIC_CAST(uint8_t, 32));
+ SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) index16;
+ SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) temp32, index32;
+ SIMDE_POWER_ALTIVEC_VECTOR(SIMDE_POWER_ALTIVEC_BOOL char) select, test;
+
+ switch (log2_index_size) {
+ default: /* Avoid uninitialized variable warning/error */
+ case 0:
+ index = vec_and(simde__m128i_to_altivec_u8(idx), vec_splats(HEDLEY_STATIC_CAST(uint8_t, idx_mask)));
+ break;
+ case 1:
+ index16 = simde__m128i_to_altivec_u16(idx);
+ index16 = vec_and(index16, vec_splats(HEDLEY_STATIC_CAST(uint16_t, idx_mask)));
+ index16 = vec_mladd(index16, vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0202)), vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0100)));
+ index = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), index16);
+ break;
+ case 2:
+ index32 = simde__m128i_to_altivec_u32(idx);
+ index32 = vec_and(index32, vec_splats(HEDLEY_STATIC_CAST(uint32_t, idx_mask)));
+
+ /* Multiply index32 by 0x04040404; unfortunately vec_mul isn't available so (mis)use 16-bit vec_mladd */
+ temp32 = vec_sl(index32, vec_splats(HEDLEY_STATIC_CAST(unsigned int, 16)));
+ index32 = vec_add(index32, temp32);
+ index32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int),
+ vec_mladd(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), index32),
+ vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0404)),
+ vec_splat_u16(0)));
+
+ index32 = vec_add(index32, vec_splats(HEDLEY_STATIC_CAST(unsigned int, 0x03020100)));
+ index = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), index32);
+ break;
+ }
+
+ if (log2_data_length == 0) {
+ r = vec_perm(simde__m128i_to_altivec_u8(a[0]), simde__m128i_to_altivec_u8(b[0]), HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), index));
+ }
+ else {
+ s = index;
+ index = vec_and(index, vec_splats(HEDLEY_STATIC_CAST(uint8_t, (1 << (4 + log2_data_length)) - 1)));
+ test = vec_cmpgt(s, index);
+
+ ra = vec_perm(simde__m128i_to_altivec_u8(a[0]), simde__m128i_to_altivec_u8(a[1]), index);
+ rb = vec_perm(simde__m128i_to_altivec_u8(b[0]), simde__m128i_to_altivec_u8(b[1]), index);
+
+ SIMDE_VECTORIZE
+ for (int i = 2 ; i < (1 << log2_data_length) ; i += 2) {
+ select = vec_cmplt(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), index),
+ HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), thirty_two));
+ index = vec_sub(index, thirty_two);
+ t = vec_perm(simde__m128i_to_altivec_u8(a[i]), simde__m128i_to_altivec_u8(a[i + 1]), index);
+ ra = vec_sel(t, ra, select);
+ t = vec_perm(simde__m128i_to_altivec_u8(b[i]), simde__m128i_to_altivec_u8(b[i + 1]), index);
+ rb = vec_sel(t, rb, select);
+ }
+
+ r = vec_sel(ra, rb, test);
+ }
+
+ return simde__m128i_from_altivec_u8(r);
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
+ const v128_t sixteen = wasm_i8x16_splat(16);
+
+ v128_t index = simde__m128i_to_wasm_v128(idx);
+
+ switch (log2_index_size) {
+ case 0:
+ index = wasm_v128_and(index, wasm_i8x16_splat(HEDLEY_STATIC_CAST(int8_t, idx_mask)));
+ break;
+ case 1:
+ index = wasm_v128_and(index, wasm_i16x8_splat(HEDLEY_STATIC_CAST(int16_t, idx_mask)));
+ index = wasm_i16x8_mul(index, wasm_i16x8_splat(0x0202));
+ index = wasm_i16x8_add(index, wasm_i16x8_splat(0x0100));
+ break;
+ case 2:
+ index = wasm_v128_and(index, wasm_i32x4_splat(HEDLEY_STATIC_CAST(int32_t, idx_mask)));
+ index = wasm_i32x4_mul(index, wasm_i32x4_splat(0x04040404));
+ index = wasm_i32x4_add(index, wasm_i32x4_splat(0x03020100));
+ break;
+ }
+
+ v128_t r = wasm_v8x16_swizzle(simde__m128i_to_wasm_v128(a[0]), index);
+
+ SIMDE_VECTORIZE
+ for (int i = 1 ; i < (1 << log2_data_length) ; i++) {
+ index = wasm_i8x16_sub(index, sixteen);
+ r = wasm_v128_or(r, wasm_v8x16_swizzle(simde__m128i_to_wasm_v128(a[i]), index));
+ }
+
+ SIMDE_VECTORIZE
+ for (int i = 0 ; i < (1 << log2_data_length) ; i++) {
+ index = wasm_i8x16_sub(index, sixteen);
+ r = wasm_v128_or(r, wasm_v8x16_swizzle(simde__m128i_to_wasm_v128(b[i]), index));
+ }
+
+ return simde__m128i_from_wasm_v128(r);
+ #endif
+}
+
+SIMDE_FUNCTION_ATTRIBUTES
+void
+simde_x_permutex2var (simde__m128i *r, const simde__m128i *a, const simde__m128i *idx, const simde__m128i *b, const unsigned int log2_index_size, const unsigned int log2_data_length) {
+ SIMDE_VECTORIZE
+ for (int i = 0 ; i < (1 << log2_data_length) ; i++) {
+ r[i] = simde_x_permutex2var128(a, idx[i], b, log2_index_size, log2_data_length);
+ }
+}
+#endif
+
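
A step common to all four backends above is expanding a lane index into per-byte shuffle indices. For 16-bit lanes the SSE path doubles the masked index, replicates it into both bytes, and adds 0x0100; the NEON and WASM paths get the same result in one multiply by 0x0202 plus the add, and the 32-bit case is the analogous multiply by 0x04040404 plus 0x03020100 (which the POWER path has to fake with vec_mladd). A scalar walk-through of the 16-bit case, assuming index 3:

    #include <stdio.h>
    #include <stdint.h>

    int main(void) {
      uint16_t v = 3;                    /* masked 16-bit lane index          */
      uint16_t x = (uint16_t) (v << 1);  /* 6: byte offset of the lane        */
      x = (uint16_t) (x | (x << 8));     /* 0x0606: offset in both bytes      */
      x = (uint16_t) (x + 0x0100);       /* 0x0706: high byte points past low */
      printf("0x%04x\n", x);             /* bytes of lane 3 are 6 and 7       */
      return 0;
    }

The multiply form works because v * 0x0202 equals (2v) | (2v << 8) whenever 2v fits in a byte, which the index mask guarantees.
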
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128i
+simde_mm_permutex2var_epi16 (simde__m128i a, simde__m128i idx, simde__m128i b) {
+ #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_permutex2var_epi16(a, idx, b);
+ #elif defined(SIMDE_X_PERMUTEX2VAR_USE_GENERIC)
+ simde__m128i r;
+
+ simde_x_permutex2var(&r, &a, &idx, &b, 1, 0);
+
+ return r;
+ #else
+ simde__m128i_private
+ a_ = simde__m128i_to_private(a),
+ idx_ = simde__m128i_to_private(idx),
+ b_ = simde__m128i_to_private(b),
+ r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
+ r_.i16[i] = ((idx_.i16[i] & 8) ? b_ : a_).i16[idx_.i16[i] & 7];
+ }
+
+ return simde__m128i_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_permutex2var_epi16
+ #define _mm_permutex2var_epi16(a, idx, b) simde_mm_permutex2var_epi16(a, idx, b)
+#endif
+
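
For reference, the 128-bit 16-bit variant selects with index bits 0-2 (lane) and bit 3 (a versus b), exactly as the portable loop spells out. A self-contained usage sketch (include path assumed; the set/storeu helpers come from simde's SSE2 layer):

    #include <stdio.h>
    #include "simde/x86/avx512/permutex2var.h"  /* assumed install layout */

    int main(void) {
      simde__m128i a   = simde_mm_set_epi16(107, 106, 105, 104, 103, 102, 101, 100);
      simde__m128i b   = simde_mm_set_epi16(207, 206, 205, 204, 203, 202, 201, 200);
      /* set_epi16 lists lane 7 first; 0..7 pick from a, 8..15 from b. */
      simde__m128i idx = simde_mm_set_epi16(15, 6, 13, 4, 11, 2, 9, 0);
      simde__m128i r   = simde_mm_permutex2var_epi16(a, idx, b);

      int16_t out[8];
      simde_mm_storeu_si128((simde__m128i *) out, r);
      for (int i = 0 ; i < 8 ; i++) printf("%d ", out[i]);
      printf("\n");  /* 100 201 102 203 104 205 106 207 */
      return 0;
    }
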
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128i
+simde_mm_mask_permutex2var_epi16 (simde__m128i a, simde__mmask8 k, simde__m128i idx, simde__m128i b) {
+ #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_mask_permutex2var_epi16(a, k, idx, b);
+ #else
+ return simde_mm_mask_mov_epi16(a, k, simde_mm_permutex2var_epi16(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_mask_permutex2var_epi16
+#define _mm_mask_permutex2var_epi16(a, k, idx, b) simde_mm_mask_permutex2var_epi16(a, k, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128i
+simde_mm_mask2_permutex2var_epi16 (simde__m128i a, simde__m128i idx, simde__mmask8 k, simde__m128i b) {
+ #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_mask2_permutex2var_epi16(a, idx, k, b);
+ #else
+ return simde_mm_mask_mov_epi16(idx, k, simde_mm_permutex2var_epi16(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_mask2_permutex2var_epi16
+#define _mm_mask2_permutex2var_epi16(a, idx, k, b) simde_mm_mask2_permutex2var_epi16(a, idx, k, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128i
+simde_mm_maskz_permutex2var_epi16 (simde__mmask8 k, simde__m128i a, simde__m128i idx, simde__m128i b) {
+ #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_maskz_permutex2var_epi16(k, a, idx, b);
+ #else
+ return simde_mm_maskz_mov_epi16(k, simde_mm_permutex2var_epi16(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_maskz_permutex2var_epi16
+#define _mm_maskz_permutex2var_epi16(k, a, idx, b) simde_mm_maskz_permutex2var_epi16(k, a, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128i
+simde_mm_permutex2var_epi32 (simde__m128i a, simde__m128i idx, simde__m128i b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_permutex2var_epi32(a, idx, b);
+ #elif defined(SIMDE_X_PERMUTEX2VAR_USE_GENERIC) /* This may not be faster than the portable version */
+ simde__m128i r;
+
+ simde_x_permutex2var(&r, &a, &idx, &b, 2, 0);
+
+ return r;
+ #else
+ simde__m128i_private
+ a_ = simde__m128i_to_private(a),
+ idx_ = simde__m128i_to_private(idx),
+ b_ = simde__m128i_to_private(b),
+ r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
+ r_.i32[i] = ((idx_.i32[i] & 4) ? b_ : a_).i32[idx_.i32[i] & 3];
+ }
+
+ return simde__m128i_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_permutex2var_epi32
+ #define _mm_permutex2var_epi32(a, idx, b) simde_mm_permutex2var_epi32(a, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128i
+simde_mm_mask_permutex2var_epi32 (simde__m128i a, simde__mmask8 k, simde__m128i idx, simde__m128i b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_mask_permutex2var_epi32(a, k, idx, b);
+ #else
+ return simde_mm_mask_mov_epi32(a, k, simde_mm_permutex2var_epi32(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_mask_permutex2var_epi32
+#define _mm_mask_permutex2var_epi32(a, k, idx, b) simde_mm_mask_permutex2var_epi32(a, k, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128i
+simde_mm_mask2_permutex2var_epi32 (simde__m128i a, simde__m128i idx, simde__mmask8 k, simde__m128i b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_mask2_permutex2var_epi32(a, idx, k, b);
+ #else
+ return simde_mm_mask_mov_epi32(idx, k, simde_mm_permutex2var_epi32(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_mask2_permutex2var_epi32
+#define _mm_mask2_permutex2var_epi32(a, idx, k, b) simde_mm_mask2_permutex2var_epi32(a, idx, k, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128i
+simde_mm_maskz_permutex2var_epi32 (simde__mmask8 k, simde__m128i a, simde__m128i idx, simde__m128i b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_maskz_permutex2var_epi32(k, a, idx, b);
+ #else
+ return simde_mm_maskz_mov_epi32(k, simde_mm_permutex2var_epi32(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_maskz_permutex2var_epi32
+#define _mm_maskz_permutex2var_epi32(k, a, idx, b) simde_mm_maskz_permutex2var_epi32(k, a, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128i
+simde_mm_permutex2var_epi64 (simde__m128i a, simde__m128i idx, simde__m128i b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_permutex2var_epi64(a, idx, b);
+ #else
+ simde__m128i_private
+ a_ = simde__m128i_to_private(a),
+ idx_ = simde__m128i_to_private(idx),
+ b_ = simde__m128i_to_private(b),
+ r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
+ r_.i64[i] = ((idx_.i64[i] & 2) ? b_ : a_).i64[idx_.i64[i] & 1];
+ }
+
+ return simde__m128i_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_permutex2var_epi64
+ #define _mm_permutex2var_epi64(a, idx, b) simde_mm_permutex2var_epi64(a, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128i
+simde_mm_mask_permutex2var_epi64 (simde__m128i a, simde__mmask8 k, simde__m128i idx, simde__m128i b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_mask_permutex2var_epi64(a, k, idx, b);
+ #else
+ return simde_mm_mask_mov_epi64(a, k, simde_mm_permutex2var_epi64(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_mask_permutex2var_epi64
+#define _mm_mask_permutex2var_epi64(a, k, idx, b) simde_mm_mask_permutex2var_epi64(a, k, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128i
+simde_mm_mask2_permutex2var_epi64 (simde__m128i a, simde__m128i idx, simde__mmask8 k, simde__m128i b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_mask2_permutex2var_epi64(a, idx, k, b);
+ #else
+ return simde_mm_mask_mov_epi64(idx, k, simde_mm_permutex2var_epi64(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_mask2_permutex2var_epi64
+#define _mm_mask2_permutex2var_epi64(a, idx, k, b) simde_mm_mask2_permutex2var_epi64(a, idx, k, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128i
+simde_mm_maskz_permutex2var_epi64 (simde__mmask8 k, simde__m128i a, simde__m128i idx, simde__m128i b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_maskz_permutex2var_epi64(k, a, idx, b);
+ #else
+ return simde_mm_maskz_mov_epi64(k, simde_mm_permutex2var_epi64(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_maskz_permutex2var_epi64
+#define _mm_maskz_permutex2var_epi64(k, a, idx, b) simde_mm_maskz_permutex2var_epi64(k, a, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128i
+simde_mm_permutex2var_epi8 (simde__m128i a, simde__m128i idx, simde__m128i b) {
+ #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_permutex2var_epi8(a, idx, b);
+ #elif defined(SIMDE_X86_AVX512F_NATIVE)
+ return _mm512_cvtepi32_epi8(_mm512_permutex2var_epi32(_mm512_cvtepu8_epi32(a), _mm512_cvtepu8_epi32(idx), _mm512_cvtepu8_epi32(b)));
+ #elif defined(SIMDE_X_PERMUTEX2VAR_USE_GENERIC)
+ simde__m128i r;
+
+ simde_x_permutex2var(&r, &a, &idx, &b, 0, 0);
+
+ return r;
+ #else
+ simde__m128i_private
+ a_ = simde__m128i_to_private(a),
+ idx_ = simde__m128i_to_private(idx),
+ b_ = simde__m128i_to_private(b),
+ r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
+ r_.i8[i] = ((idx_.i8[i] & 0x10) ? b_ : a_).i8[idx_.i8[i] & 0x0F];
+ }
+
+ return simde__m128i_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_permutex2var_epi8
+ #define _mm_permutex2var_epi8(a, idx, b) simde_mm_permutex2var_epi8(a, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128i
+simde_mm_mask_permutex2var_epi8 (simde__m128i a, simde__mmask16 k, simde__m128i idx, simde__m128i b) {
+ #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_mask_permutex2var_epi8(a, k, idx, b);
+ #else
+ return simde_mm_mask_mov_epi8(a, k, simde_mm_permutex2var_epi8(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_mask_permutex2var_epi8
+#define _mm_mask_permutex2var_epi8(a, k, idx, b) simde_mm_mask_permutex2var_epi8(a, k, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128i
+simde_mm_mask2_permutex2var_epi8 (simde__m128i a, simde__m128i idx, simde__mmask16 k, simde__m128i b) {
+ #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_mask2_permutex2var_epi8(a, idx, k, b);
+ #else
+ return simde_mm_mask_mov_epi8(idx, k, simde_mm_permutex2var_epi8(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_mask2_permutex2var_epi8
+#define _mm_mask2_permutex2var_epi8(a, idx, k, b) simde_mm_mask2_permutex2var_epi8(a, idx, k, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128i
+simde_mm_maskz_permutex2var_epi8 (simde__mmask16 k, simde__m128i a, simde__m128i idx, simde__m128i b) {
+ #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_maskz_permutex2var_epi8(k, a, idx, b);
+ #else
+ return simde_mm_maskz_mov_epi8(k, simde_mm_permutex2var_epi8(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_maskz_permutex2var_epi8
+#define _mm_maskz_permutex2var_epi8(k, a, idx, b) simde_mm_maskz_permutex2var_epi8(k, a, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128d
+simde_mm_permutex2var_pd (simde__m128d a, simde__m128i idx, simde__m128d b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_permutex2var_pd(a, idx, b);
+ #else
+ return simde_mm_castsi128_pd(simde_mm_permutex2var_epi64(simde_mm_castpd_si128(a), idx, simde_mm_castpd_si128(b)));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_permutex2var_pd
+ #define _mm_permutex2var_pd(a, idx, b) simde_mm_permutex2var_pd(a, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128d
+simde_mm_mask_permutex2var_pd (simde__m128d a, simde__mmask8 k, simde__m128i idx, simde__m128d b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_mask_permutex2var_pd(a, k, idx, b);
+ #else
+ return simde_mm_mask_mov_pd(a, k, simde_mm_permutex2var_pd(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_mask_permutex2var_pd
+#define _mm_mask_permutex2var_pd(a, k, idx, b) simde_mm_mask_permutex2var_pd(a, k, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128d
+simde_mm_mask2_permutex2var_pd (simde__m128d a, simde__m128i idx, simde__mmask8 k, simde__m128d b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_mask2_permutex2var_pd(a, idx, k, b);
+ #else
+ return simde_mm_mask_mov_pd(simde_mm_castsi128_pd(idx), k, simde_mm_permutex2var_pd(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_mask2_permutex2var_pd
+#define _mm_mask2_permutex2var_pd(a, idx, k, b) simde_mm_mask2_permutex2var_pd(a, idx, k, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128d
+simde_mm_maskz_permutex2var_pd (simde__mmask8 k, simde__m128d a, simde__m128i idx, simde__m128d b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_maskz_permutex2var_pd(k, a, idx, b);
+ #else
+ return simde_mm_maskz_mov_pd(k, simde_mm_permutex2var_pd(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_maskz_permutex2var_pd
+#define _mm_maskz_permutex2var_pd(k, a, idx, b) simde_mm_maskz_permutex2var_pd(k, a, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128
+simde_mm_permutex2var_ps (simde__m128 a, simde__m128i idx, simde__m128 b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_permutex2var_ps(a, idx, b);
+ #else
+ return simde_mm_castsi128_ps(simde_mm_permutex2var_epi32(simde_mm_castps_si128(a), idx, simde_mm_castps_si128(b)));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_permutex2var_ps
+ #define _mm_permutex2var_ps(a, idx, b) simde_mm_permutex2var_ps(a, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128
+simde_mm_mask_permutex2var_ps (simde__m128 a, simde__mmask8 k, simde__m128i idx, simde__m128 b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_mask_permutex2var_ps(a, k, idx, b);
+ #else
+ return simde_mm_mask_mov_ps(a, k, simde_mm_permutex2var_ps(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_mask_permutex2var_ps
+#define _mm_mask_permutex2var_ps(a, k, idx, b) simde_mm_mask_permutex2var_ps(a, k, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128
+simde_mm_mask2_permutex2var_ps (simde__m128 a, simde__m128i idx, simde__mmask8 k, simde__m128 b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_mask2_permutex2var_ps(a, idx, k, b);
+ #else
+ return simde_mm_mask_mov_ps(simde_mm_castsi128_ps(idx), k, simde_mm_permutex2var_ps(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_mask2_permutex2var_ps
+#define _mm_mask2_permutex2var_ps(a, idx, k, b) simde_mm_mask2_permutex2var_ps(a, idx, k, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128
+simde_mm_maskz_permutex2var_ps (simde__mmask8 k, simde__m128 a, simde__m128i idx, simde__m128 b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm_maskz_permutex2var_ps(k, a, idx, b);
+ #else
+ return simde_mm_maskz_mov_ps(k, simde_mm_permutex2var_ps(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm_maskz_permutex2var_ps
+#define _mm_maskz_permutex2var_ps(k, a, idx, b) simde_mm_maskz_permutex2var_ps(k, a, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256i
+simde_mm256_permutex2var_epi16 (simde__m256i a, simde__m256i idx, simde__m256i b) {
+ #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_permutex2var_epi16(a, idx, b);
+ #elif defined(SIMDE_X86_AVX2_NATIVE)
+ __m256i hilo, hilo2, hi, lo, idx2, ta, tb, select;
+ const __m256i ones = _mm256_set1_epi16(1);
+
+ idx2 = _mm256_srli_epi32(idx, 1);
+
+ ta = _mm256_permutevar8x32_epi32(a, idx2);
+ tb = _mm256_permutevar8x32_epi32(b, idx2);
+ select = _mm256_slli_epi32(idx2, 28);
+ hilo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(ta),
+ _mm256_castsi256_ps(tb),
+ _mm256_castsi256_ps(select)));
+ idx2 = _mm256_srli_epi32(idx2, 16);
+
+ ta = _mm256_permutevar8x32_epi32(a, idx2);
+ tb = _mm256_permutevar8x32_epi32(b, idx2);
+ select = _mm256_slli_epi32(idx2, 28);
+ hilo2 = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(ta),
+ _mm256_castsi256_ps(tb),
+ _mm256_castsi256_ps(select)));
+
+ lo = _mm256_blend_epi16(_mm256_slli_epi32(hilo2, 16), hilo, 0x55);
+ hi = _mm256_blend_epi16(hilo2, _mm256_srli_epi32(hilo, 16), 0x55);
+
+ select = _mm256_cmpeq_epi16(_mm256_and_si256(idx, ones), ones);
+ return _mm256_blendv_epi8(lo, hi, select);
+ #else
+ simde__m256i_private
+ a_ = simde__m256i_to_private(a),
+ idx_ = simde__m256i_to_private(idx),
+ b_ = simde__m256i_to_private(b),
+ r_;
+
+ #if defined(SIMDE_X_PERMUTEX2VAR_USE_GENERIC)
+ simde_x_permutex2var(r_.m128i, a_.m128i, idx_.m128i, b_.m128i, 1, 1);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
+ r_.i16[i] = ((idx_.i16[i] & 0x10) ? b_ : a_).i16[idx_.i16[i] & 0x0F];
+ }
+ #endif
+
+ return simde__m256i_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_permutex2var_epi16
+ #define _mm256_permutex2var_epi16(a, idx, b) simde_mm256_permutex2var_epi16(a, idx, b)
+#endif
+
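
The AVX2 path above compensates for the lack of a 16-bit cross-lane permute: it runs the 32-bit permutevar twice, once for the low word of each dword index and once for the high word, then re-interleaves with blend_epi16 and picks hi or lo from idx bit 0. The index algebra it relies on, as a scalar model (helper name hypothetical):

    #include <stdio.h>
    #include <stdint.h>

    /* Scalar model: a 16-bit permute over (a ++ b) built from a dword
     * permute on idx/2 plus a high/low half select on idx & 1. */
    static uint16_t pick16(const uint32_t src32[16], unsigned idx) {  /* idx in 0..31 */
      uint32_t d = src32[(idx >> 1) & 15];
      return (uint16_t) ((idx & 1) ? (d >> 16) : d);
    }

    int main(void) {
      uint32_t words[16];  /* a's eight dwords, then b's eight */
      for (unsigned i = 0 ; i < 16 ; i++)
        words[i] = ((2 * i + 1) << 16) | (2 * i);  /* word n holds value n */
      printf("%u %u\n", pick16(words, 5), pick16(words, 30));  /* 5 30 */
      return 0;
    }
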
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256i
+simde_mm256_mask_permutex2var_epi16 (simde__m256i a, simde__mmask16 k, simde__m256i idx, simde__m256i b) {
+ #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_mask_permutex2var_epi16(a, k, idx, b);
+ #else
+ return simde_mm256_mask_mov_epi16(a, k, simde_mm256_permutex2var_epi16(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_mask_permutex2var_epi16
+#define _mm256_mask_permutex2var_epi16(a, k, idx, b) simde_mm256_mask_permutex2var_epi16(a, k, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256i
+simde_mm256_mask2_permutex2var_epi16 (simde__m256i a, simde__m256i idx, simde__mmask16 k, simde__m256i b) {
+ #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_mask2_permutex2var_epi16(a, idx, k, b);
+ #else
+ return simde_mm256_mask_mov_epi16(idx, k, simde_mm256_permutex2var_epi16(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_mask2_permutex2var_epi16
+#define _mm256_mask2_permutex2var_epi16(a, idx, k, b) simde_mm256_mask2_permutex2var_epi16(a, idx, k, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256i
+simde_mm256_maskz_permutex2var_epi16 (simde__mmask16 k, simde__m256i a, simde__m256i idx, simde__m256i b) {
+ #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_maskz_permutex2var_epi16(k, a, idx, b);
+ #else
+ return simde_mm256_maskz_mov_epi16(k, simde_mm256_permutex2var_epi16(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_maskz_permutex2var_epi16
+#define _mm256_maskz_permutex2var_epi16(k, a, idx, b) simde_mm256_maskz_permutex2var_epi16(k, a, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256i
+simde_mm256_permutex2var_epi32 (simde__m256i a, simde__m256i idx, simde__m256i b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_permutex2var_epi32(a, idx, b);
+ #elif defined(SIMDE_X86_AVX2_NATIVE)
+ __m256i ta, tb, select;
+ ta = _mm256_permutevar8x32_epi32(a, idx);
+ tb = _mm256_permutevar8x32_epi32(b, idx);
+ select = _mm256_slli_epi32(idx, 28);
+ return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(ta),
+ _mm256_castsi256_ps(tb),
+ _mm256_castsi256_ps(select)));
+ #else
+ simde__m256i_private
+ a_ = simde__m256i_to_private(a),
+ idx_ = simde__m256i_to_private(idx),
+ b_ = simde__m256i_to_private(b),
+ r_;
+
+ #if defined(SIMDE_X_PERMUTEX2VAR_USE_GENERIC)
+ simde_x_permutex2var(r_.m128i, a_.m128i, idx_.m128i, b_.m128i, 2, 1);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
+ r_.i32[i] = ((idx_.i32[i] & 8) ? b_ : a_).i32[idx_.i32[i] & 7];
+ }
+ #endif
+
+ return simde__m256i_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_permutex2var_epi32
+ #define _mm256_permutex2var_epi32(a, idx, b) simde_mm256_permutex2var_epi32(a, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256i
+simde_mm256_mask_permutex2var_epi32 (simde__m256i a, simde__mmask8 k, simde__m256i idx, simde__m256i b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_mask_permutex2var_epi32(a, k, idx, b);
+ #else
+ return simde_mm256_mask_mov_epi32(a, k, simde_mm256_permutex2var_epi32(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_mask_permutex2var_epi32
+#define _mm256_mask_permutex2var_epi32(a, k, idx, b) simde_mm256_mask_permutex2var_epi32(a, k, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256i
+simde_mm256_mask2_permutex2var_epi32 (simde__m256i a, simde__m256i idx, simde__mmask8 k, simde__m256i b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_mask2_permutex2var_epi32(a, idx, k, b);
+ #else
+ return simde_mm256_mask_mov_epi32(idx, k, simde_mm256_permutex2var_epi32(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_mask2_permutex2var_epi32
+#define _mm256_mask2_permutex2var_epi32(a, idx, k, b) simde_mm256_mask2_permutex2var_epi32(a, idx, k, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256i
+simde_mm256_maskz_permutex2var_epi32 (simde__mmask8 k, simde__m256i a, simde__m256i idx, simde__m256i b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_maskz_permutex2var_epi32(k, a, idx, b);
+ #else
+ return simde_mm256_maskz_mov_epi32(k, simde_mm256_permutex2var_epi32(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_maskz_permutex2var_epi32
+#define _mm256_maskz_permutex2var_epi32(k, a, idx, b) simde_mm256_maskz_permutex2var_epi32(k, a, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256i
+simde_mm256_permutex2var_epi64 (simde__m256i a, simde__m256i idx, simde__m256i b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_permutex2var_epi64(a, idx, b);
+ #else
+ simde__m256i_private
+ a_ = simde__m256i_to_private(a),
+ idx_ = simde__m256i_to_private(idx),
+ b_ = simde__m256i_to_private(b),
+ r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
+ r_.i64[i] = ((idx_.i64[i] & 4) ? b_ : a_).i64[idx_.i64[i] & 3];
+ }
+
+ return simde__m256i_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_permutex2var_epi64
+ #define _mm256_permutex2var_epi64(a, idx, b) simde_mm256_permutex2var_epi64(a, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256i
+simde_mm256_mask_permutex2var_epi64 (simde__m256i a, simde__mmask8 k, simde__m256i idx, simde__m256i b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_mask_permutex2var_epi64(a, k, idx, b);
+ #else
+ return simde_mm256_mask_mov_epi64(a, k, simde_mm256_permutex2var_epi64(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_mask_permutex2var_epi64
+#define _mm256_mask_permutex2var_epi64(a, k, idx, b) simde_mm256_mask_permutex2var_epi64(a, k, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256i
+simde_mm256_mask2_permutex2var_epi64 (simde__m256i a, simde__m256i idx, simde__mmask8 k, simde__m256i b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_mask2_permutex2var_epi64(a, idx, k, b);
+ #else
+ return simde_mm256_mask_mov_epi64(idx, k, simde_mm256_permutex2var_epi64(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_mask2_permutex2var_epi64
+#define _mm256_mask2_permutex2var_epi64(a, idx, k, b) simde_mm256_mask2_permutex2var_epi64(a, idx, k, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256i
+simde_mm256_maskz_permutex2var_epi64 (simde__mmask8 k, simde__m256i a, simde__m256i idx, simde__m256i b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_maskz_permutex2var_epi64(k, a, idx, b);
+ #else
+ return simde_mm256_maskz_mov_epi64(k, simde_mm256_permutex2var_epi64(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_maskz_permutex2var_epi64
+#define _mm256_maskz_permutex2var_epi64(k, a, idx, b) simde_mm256_maskz_permutex2var_epi64(k, a, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256i
+simde_mm256_permutex2var_epi8 (simde__m256i a, simde__m256i idx, simde__m256i b) {
+ #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_permutex2var_epi8(a, idx, b);
+ #elif defined(SIMDE_X86_AVX512BW_NATIVE)
+ return _mm512_cvtepi16_epi8(_mm512_permutex2var_epi16(_mm512_cvtepu8_epi16(a), _mm512_cvtepu8_epi16(idx), _mm512_cvtepu8_epi16(b)));
+ #elif defined(SIMDE_X86_AVX2_NATIVE)
+ __m256i t0, t1, index, select0x10, select0x20, a01, b01;
+ const __m256i mask = _mm256_set1_epi8(0x3F);
+ const __m256i a0 = _mm256_permute4x64_epi64(a, (1 << 6) + (0 << 4) + (1 << 2) + (0 << 0));
+ const __m256i a1 = _mm256_permute4x64_epi64(a, (3 << 6) + (2 << 4) + (3 << 2) + (2 << 0));
+ const __m256i b0 = _mm256_permute4x64_epi64(b, (1 << 6) + (0 << 4) + (1 << 2) + (0 << 0));
+ const __m256i b1 = _mm256_permute4x64_epi64(b, (3 << 6) + (2 << 4) + (3 << 2) + (2 << 0));
+
+ index = _mm256_and_si256(idx, mask);
+ t0 = _mm256_shuffle_epi8(a0, index);
+ t1 = _mm256_shuffle_epi8(a1, index);
+ select0x10 = _mm256_slli_epi64(index, 3);
+ a01 = _mm256_blendv_epi8(t0, t1, select0x10);
+ t0 = _mm256_shuffle_epi8(b0, index);
+ t1 = _mm256_shuffle_epi8(b1, index);
+ b01 = _mm256_blendv_epi8(t0, t1, select0x10);
+ select0x20 = _mm256_slli_epi64(index, 2);
+ return _mm256_blendv_epi8(a01, b01, select0x20);
+ #else
+ simde__m256i_private
+ a_ = simde__m256i_to_private(a),
+ idx_ = simde__m256i_to_private(idx),
+ b_ = simde__m256i_to_private(b),
+ r_;
+
+ #if defined(SIMDE_X_PERMUTEX2VAR_USE_GENERIC)
+ simde_x_permutex2var(r_.m128i, a_.m128i, idx_.m128i, b_.m128i, 0, 1);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
+ r_.i8[i] = ((idx_.i8[i] & 0x20) ? b_ : a_).i8[idx_.i8[i] & 0x1F];
+ }
+ #endif
+
+ return simde__m256i_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_permutex2var_epi8
+ #define _mm256_permutex2var_epi8(a, idx, b) simde_mm256_permutex2var_epi8(a, idx, b)
+#endif
+
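
The AVX2 byte path leans on two tricks worth spelling out: permute4x64 first builds a0/b0 (low 128-bit half duplicated into both lanes) and a1/b1 (high half duplicated), so the in-lane shuffle_epi8 can reach every byte; and since blendv_epi8 keys on each byte's top bit, slli_epi64 moves the deciding index bit into bit 7 (bit 4, which half, via << 3; bit 5, a versus b, via << 2). A scalar check of those shift positions:

    #include <stdio.h>

    int main(void) {
      for (unsigned idx = 0 ; idx < 64 ; idx += 13) {
        unsigned half   = ((idx << 3) >> 7) & 1;  /* bit 4: which 128-bit half */
        unsigned a_or_b = ((idx << 2) >> 7) & 1;  /* bit 5: a (0) or b (1)     */
        printf("idx=%2u half=%u src=%s\n", idx, half, a_or_b ? "b" : "a");
      }
      return 0;
    }
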
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256i
+simde_mm256_mask_permutex2var_epi8 (simde__m256i a, simde__mmask32 k, simde__m256i idx, simde__m256i b) {
+ #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_mask_permutex2var_epi8(a, k, idx, b);
+ #else
+ return simde_mm256_mask_mov_epi8(a, k, simde_mm256_permutex2var_epi8(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_mask_permutex2var_epi8
+#define _mm256_mask_permutex2var_epi8(a, k, idx, b) simde_mm256_mask_permutex2var_epi8(a, k, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256i
+simde_mm256_mask2_permutex2var_epi8 (simde__m256i a, simde__m256i idx, simde__mmask32 k, simde__m256i b) {
+ #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_mask2_permutex2var_epi8(a, idx, k, b);
+ #else
+ return simde_mm256_mask_mov_epi8(idx, k, simde_mm256_permutex2var_epi8(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_mask2_permutex2var_epi8
+#define _mm256_mask2_permutex2var_epi8(a, idx, k, b) simde_mm256_mask2_permutex2var_epi8(a, idx, k, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256i
+simde_mm256_maskz_permutex2var_epi8 (simde__mmask32 k, simde__m256i a, simde__m256i idx, simde__m256i b) {
+ #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_maskz_permutex2var_epi8(k, a, idx, b);
+ #else
+ return simde_mm256_maskz_mov_epi8(k, simde_mm256_permutex2var_epi8(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_maskz_permutex2var_epi8
+#define _mm256_maskz_permutex2var_epi8(k, a, idx, b) simde_mm256_maskz_permutex2var_epi8(k, a, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256d
+simde_mm256_permutex2var_pd (simde__m256d a, simde__m256i idx, simde__m256d b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_permutex2var_pd(a, idx, b);
+ #else
+ return simde_mm256_castsi256_pd(simde_mm256_permutex2var_epi64(simde_mm256_castpd_si256(a), idx, simde_mm256_castpd_si256(b)));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_permutex2var_pd
+ #define _mm256_permutex2var_pd(a, idx, b) simde_mm256_permutex2var_pd(a, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256d
+simde_mm256_mask_permutex2var_pd (simde__m256d a, simde__mmask8 k, simde__m256i idx, simde__m256d b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_mask_permutex2var_pd(a, k, idx, b);
+ #else
+ return simde_mm256_mask_mov_pd(a, k, simde_mm256_permutex2var_pd(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_mask_permutex2var_pd
+#define _mm256_mask_permutex2var_pd(a, k, idx, b) simde_mm256_mask_permutex2var_pd(a, k, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256d
+simde_mm256_mask2_permutex2var_pd (simde__m256d a, simde__m256i idx, simde__mmask8 k, simde__m256d b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_mask2_permutex2var_pd(a, idx, k, b);
+ #else
+ return simde_mm256_mask_mov_pd(simde_mm256_castsi256_pd(idx), k, simde_mm256_permutex2var_pd(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_mask2_permutex2var_pd
+#define _mm256_mask2_permutex2var_pd(a, idx, k, b) simde_mm256_mask2_permutex2var_pd(a, idx, k, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256d
+simde_mm256_maskz_permutex2var_pd (simde__mmask8 k, simde__m256d a, simde__m256i idx, simde__m256d b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_maskz_permutex2var_pd(k, a, idx, b);
+ #else
+ return simde_mm256_maskz_mov_pd(k, simde_mm256_permutex2var_pd(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_maskz_permutex2var_pd
+#define _mm256_maskz_permutex2var_pd(k, a, idx, b) simde_mm256_maskz_permutex2var_pd(k, a, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256
+simde_mm256_permutex2var_ps (simde__m256 a, simde__m256i idx, simde__m256 b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_permutex2var_ps(a, idx, b);
+ #else
+ return simde_mm256_castsi256_ps(simde_mm256_permutex2var_epi32(simde_mm256_castps_si256(a), idx, simde_mm256_castps_si256(b)));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_permutex2var_ps
+ #define _mm256_permutex2var_ps(a, idx, b) simde_mm256_permutex2var_ps(a, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256
+simde_mm256_mask_permutex2var_ps (simde__m256 a, simde__mmask8 k, simde__m256i idx, simde__m256 b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_mask_permutex2var_ps(a, k, idx, b);
+ #else
+ return simde_mm256_mask_mov_ps(a, k, simde_mm256_permutex2var_ps(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_mask_permutex2var_ps
+#define _mm256_mask_permutex2var_ps(a, k, idx, b) simde_mm256_mask_permutex2var_ps(a, k, idx, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256
+simde_mm256_mask2_permutex2var_ps (simde__m256 a, simde__m256i idx, simde__mmask8 k, simde__m256 b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_mask2_permutex2var_ps(a, idx, k, b);
+ #else
+ return simde_mm256_mask_mov_ps(simde_mm256_castsi256_ps(idx), k, simde_mm256_permutex2var_ps(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_mask2_permutex2var_ps
+#define _mm256_mask2_permutex2var_ps(a, idx, k, b) simde_mm256_mask2_permutex2var_ps(a, idx, k, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m256
+simde_mm256_maskz_permutex2var_ps (simde__mmask8 k, simde__m256 a, simde__m256i idx, simde__m256 b) {
+ #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ return _mm256_maskz_permutex2var_ps(k, a, idx, b);
+ #else
+ return simde_mm256_maskz_mov_ps(k, simde_mm256_permutex2var_ps(a, idx, b));
+ #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+ #undef _mm256_maskz_permutex2var_ps
+#define _mm256_maskz_permutex2var_ps(k, a, idx, b) simde_mm256_maskz_permutex2var_ps(k, a, idx, b)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_permutex2var_epi16 (simde__m512i a, simde__m512i idx, simde__m512i b) {
@@ -55,27 +1135,57 @@ simde_mm512_permutex2var_epi16 (simde__m512i a, simde__m512i idx, simde__m512i b
b_ = simde__m512i_to_private(b),
r_;
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
- uint8x16x4_t table_a = { { a_.m128i_private[0].neon_u8,
- a_.m128i_private[1].neon_u8,
- a_.m128i_private[2].neon_u8,
- a_.m128i_private[3].neon_u8 } };
- uint8x16x4_t table_b = { { b_.m128i_private[0].neon_u8,
- b_.m128i_private[1].neon_u8,
- b_.m128i_private[2].neon_u8,
- b_.m128i_private[3].neon_u8 } };
- uint8x16_t mask8 = vdupq_n_u8(0x3F);
- uint16x8_t mask16 = vdupq_n_u16(0x003F);
- uint16x8_t byte_index16 = vdupq_n_u16(0x0100);
+ #if defined(SIMDE_X86_AVX2_NATIVE)
+ __m256i hilo, hilo1, hilo2, hi, lo, idx1, idx2, ta, tb, select;
+ const __m256i ones = _mm256_set1_epi16(1);
SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
- uint16x8_t index16 = vandq_u16(idx_.m128i_private[i].neon_u16, mask16);
- index16 = vmulq_n_u16(index16, 0x0202);
- index16 = vaddq_u16(index16, byte_index16);
- uint8x16_t index8 = vreinterpretq_u8_u16(index16);
- r_.m128i_private[i].neon_u8 = vqtbx4q_u8(vqtbl4q_u8(table_b, vandq_u8(index8, mask8)), table_a, index8);
+ for (size_t i = 0 ; i < (sizeof(r_.m256i_private) / sizeof(r_.m256i_private[0])) ; i++) {
+ idx1 = idx_.m256i[i];
+ idx2 = _mm256_srli_epi32(idx1, 1);
+
+ select = _mm256_slli_epi32(idx2, 27);
+ ta = _mm256_permutevar8x32_epi32(a_.m256i[0], idx2);
+ tb = _mm256_permutevar8x32_epi32(b_.m256i[0], idx2);
+ hilo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(ta),
+ _mm256_castsi256_ps(tb),
+ _mm256_castsi256_ps(select)));
+ ta = _mm256_permutevar8x32_epi32(a_.m256i[1], idx2);
+ tb = _mm256_permutevar8x32_epi32(b_.m256i[1], idx2);
+ hilo1 = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(ta),
+ _mm256_castsi256_ps(tb),
+ _mm256_castsi256_ps(select)));
+ select = _mm256_add_epi32(select, select);
+ hilo1 = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(hilo),
+ _mm256_castsi256_ps(hilo1),
+ _mm256_castsi256_ps(select)));
+
+ idx2 = _mm256_srli_epi32(idx2, 16);
+
+ select = _mm256_slli_epi32(idx2, 27);
+ ta = _mm256_permutevar8x32_epi32(a_.m256i[0], idx2);
+ tb = _mm256_permutevar8x32_epi32(b_.m256i[0], idx2);
+ hilo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(ta),
+ _mm256_castsi256_ps(tb),
+ _mm256_castsi256_ps(select)));
+ ta = _mm256_permutevar8x32_epi32(a_.m256i[1], idx2);
+ tb = _mm256_permutevar8x32_epi32(b_.m256i[1], idx2);
+ hilo2 = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(ta),
+ _mm256_castsi256_ps(tb),
+ _mm256_castsi256_ps(select)));
+ select = _mm256_add_epi32(select, select);
+ hilo2 = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(hilo),
+ _mm256_castsi256_ps(hilo2),
+ _mm256_castsi256_ps(select)));
+
+ lo = _mm256_blend_epi16(_mm256_slli_epi32(hilo2, 16), hilo1, 0x55);
+ hi = _mm256_blend_epi16(hilo2, _mm256_srli_epi32(hilo1, 16), 0x55);
+
+ select = _mm256_cmpeq_epi16(_mm256_and_si256(idx1, ones), ones);
+ r_.m256i[i] = _mm256_blendv_epi8(lo, hi, select);
}
+ #elif defined(SIMDE_X_PERMUTEX2VAR_USE_GENERIC)
+ simde_x_permutex2var(r_.m128i, a_.m128i, idx_.m128i, b_.m128i, 1, 2);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
@@ -86,7 +1196,7 @@ simde_mm512_permutex2var_epi16 (simde__m512i a, simde__m512i idx, simde__m512i b
return simde__m512i_from_private(r_);
#endif
}
-#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
+#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
#undef _mm512_permutex2var_epi16
#define _mm512_permutex2var_epi16(a, idx, b) simde_mm512_permutex2var_epi16(a, idx, b)
#endif
@@ -136,7 +1246,7 @@ simde_mm512_maskz_permutex2var_epi16 (simde__mmask32 k, simde__m512i a, simde__m
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_permutex2var_epi32 (simde__m512i a, simde__m512i idx, simde__m512i b) {
- #if defined(SIMDE_X86_AVX512BW_NATIVE)
+ #if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_permutex2var_epi32(a, idx, b);
#else
simde__m512i_private
@@ -145,27 +1255,29 @@ simde_mm512_permutex2var_epi32 (simde__m512i a, simde__m512i idx, simde__m512i b
b_ = simde__m512i_to_private(b),
r_;
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
- uint8x16x4_t table_a = { { a_.m128i_private[0].neon_u8,
- a_.m128i_private[1].neon_u8,
- a_.m128i_private[2].neon_u8,
- a_.m128i_private[3].neon_u8 } };
- uint8x16x4_t table_b = { { b_.m128i_private[0].neon_u8,
- b_.m128i_private[1].neon_u8,
- b_.m128i_private[2].neon_u8,
- b_.m128i_private[3].neon_u8 } };
- uint8x16_t mask8 = vdupq_n_u8(0x3F);
- uint32x4_t mask32 = vdupq_n_u32(0x0000001F);
- uint32x4_t byte_index32 = vdupq_n_u32(0x03020100);
-
+ #if defined(SIMDE_X86_AVX2_NATIVE)
+ __m256i index, t0, t1, a01, b01, select;
SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
- uint32x4_t index32 = vandq_u32(idx_.m128i_private[i].neon_u32, mask32);
- index32 = vmulq_n_u32(index32, 0x04040404);
- index32 = vaddq_u32(index32, byte_index32);
- uint8x16_t index8 = vreinterpretq_u8_u32(index32);
- r_.m128i_private[i].neon_u8 = vqtbx4q_u8(vqtbl4q_u8(table_b, vandq_u8(index8, mask8)), table_a, index8);
+ for (size_t i = 0 ; i < (sizeof(r_.m256i_private) / sizeof(r_.m256i_private[0])) ; i++) {
+ index = idx_.m256i[i];
+ t0 = _mm256_permutevar8x32_epi32(a_.m256i[0], index);
+ t1 = _mm256_permutevar8x32_epi32(a_.m256i[1], index);
+ select = _mm256_slli_epi32(index, 28);
+ a01 = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(t0),
+ _mm256_castsi256_ps(t1),
+ _mm256_castsi256_ps(select)));
+ t0 = _mm256_permutevar8x32_epi32(b_.m256i[0], index);
+ t1 = _mm256_permutevar8x32_epi32(b_.m256i[1], index);
+ b01 = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(t0),
+ _mm256_castsi256_ps(t1),
+ _mm256_castsi256_ps(select)));
+ select = _mm256_slli_epi32(index, 27);
+ r_.m256i[i] = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a01),
+ _mm256_castsi256_ps(b01),
+ _mm256_castsi256_ps(select)));
}
+ #elif defined(SIMDE_X_PERMUTEX2VAR_USE_GENERIC)
+ simde_x_permutex2var(r_.m128i, a_.m128i, idx_.m128i, b_.m128i, 2, 2);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
@@ -176,7 +1288,7 @@ simde_mm512_permutex2var_epi32 (simde__m512i a, simde__m512i idx, simde__m512i b
return simde__m512i_from_private(r_);
#endif
}
-#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_permutex2var_epi32
#define _mm512_permutex2var_epi32(a, idx, b) simde_mm512_permutex2var_epi32(a, idx, b)
#endif
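
_mm256_blendv_ps selects each 32-bit lane by the sign bit of the corresponding mask lane, which is why the index is shifted rather than masked: a left shift by 28 parks index bit 3 (which 256-bit half of each 512-bit source) in bit 31, and a shift by 27 parks bit 4 (a versus b) there. A minimal sketch of the idiom, assuming an AVX2 target (the function name is illustrative):

    #include <immintrin.h>

    /* Per-lane select between x and y, keyed on bit 4 of idx:
       shift bit 4 into bit 31, then let blendv read the sign. */
    static __m256i select_by_bit4(__m256i x, __m256i y, __m256i idx) {
      __m256i select = _mm256_slli_epi32(idx, 27);
      return _mm256_castps_si256(
        _mm256_blendv_ps(_mm256_castsi256_ps(x),
                         _mm256_castsi256_ps(y),
                         _mm256_castsi256_ps(select)));
    }
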
@@ -184,13 +1296,13 @@ simde_mm512_permutex2var_epi32 (simde__m512i a, simde__m512i idx, simde__m512i b
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_mask_permutex2var_epi32 (simde__m512i a, simde__mmask16 k, simde__m512i idx, simde__m512i b) {
- #if defined(SIMDE_X86_AVX512BW_NATIVE)
+ #if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_mask_permutex2var_epi32(a, k, idx, b);
#else
return simde_mm512_mask_mov_epi32(a, k, simde_mm512_permutex2var_epi32(a, idx, b));
#endif
}
-#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_mask_permutex2var_epi32
#define _mm512_mask_permutex2var_epi32(a, k, idx, b) simde_mm512_mask_permutex2var_epi32(a, k, idx, b)
#endif
@@ -198,13 +1310,13 @@ simde_mm512_mask_permutex2var_epi32 (simde__m512i a, simde__mmask16 k, simde__m5
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_mask2_permutex2var_epi32 (simde__m512i a, simde__m512i idx, simde__mmask16 k, simde__m512i b) {
- #if defined(SIMDE_X86_AVX512BW_NATIVE)
+ #if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_mask2_permutex2var_epi32(a, idx, k, b);
#else
return simde_mm512_mask_mov_epi32(idx, k, simde_mm512_permutex2var_epi32(a, idx, b));
#endif
}
-#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_mask2_permutex2var_epi32
#define _mm512_mask2_permutex2var_epi32(a, idx, k, b) simde_mm512_mask2_permutex2var_epi32(a, idx, k, b)
#endif
@@ -212,13 +1324,13 @@ simde_mm512_mask2_permutex2var_epi32 (simde__m512i a, simde__m512i idx, simde__m
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_maskz_permutex2var_epi32 (simde__mmask16 k, simde__m512i a, simde__m512i idx, simde__m512i b) {
- #if defined(SIMDE_X86_AVX512BW_NATIVE)
+ #if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_maskz_permutex2var_epi32(k, a, idx, b);
#else
return simde_mm512_maskz_mov_epi32(k, simde_mm512_permutex2var_epi32(a, idx, b));
#endif
}
-#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_maskz_permutex2var_epi32
#define _mm512_maskz_permutex2var_epi32(k, a, idx, b) simde_mm512_maskz_permutex2var_epi32(k, a, idx, b)
#endif
@@ -226,7 +1338,7 @@ simde_mm512_maskz_permutex2var_epi32 (simde__mmask16 k, simde__m512i a, simde__m
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_permutex2var_epi64 (simde__m512i a, simde__m512i idx, simde__m512i b) {
- #if defined(SIMDE_X86_AVX512BW_NATIVE)
+ #if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_permutex2var_epi64(a, idx, b);
#else
simde__m512i_private
@@ -237,13 +1349,13 @@ simde_mm512_permutex2var_epi64 (simde__m512i a, simde__m512i idx, simde__m512i b
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
- r_.i64[i] = ((idx_.i64[i] & 0x08) ? b_ : a_).i64[idx_.i64[i] & 0x07];
+ r_.i64[i] = ((idx_.i64[i] & 8) ? b_ : a_).i64[idx_.i64[i] & 7];
}
return simde__m512i_from_private(r_);
#endif
}
-#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_permutex2var_epi64
#define _mm512_permutex2var_epi64(a, idx, b) simde_mm512_permutex2var_epi64(a, idx, b)
#endif
@@ -251,13 +1363,13 @@ simde_mm512_permutex2var_epi64 (simde__m512i a, simde__m512i idx, simde__m512i b
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_mask_permutex2var_epi64 (simde__m512i a, simde__mmask8 k, simde__m512i idx, simde__m512i b) {
- #if defined(SIMDE_X86_AVX512BW_NATIVE)
+ #if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_mask_permutex2var_epi64(a, k, idx, b);
#else
return simde_mm512_mask_mov_epi64(a, k, simde_mm512_permutex2var_epi64(a, idx, b));
#endif
}
-#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_mask_permutex2var_epi64
#define _mm512_mask_permutex2var_epi64(a, k, idx, b) simde_mm512_mask_permutex2var_epi64(a, k, idx, b)
#endif
@@ -265,13 +1377,13 @@ simde_mm512_mask_permutex2var_epi64 (simde__m512i a, simde__mmask8 k, simde__m51
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_mask2_permutex2var_epi64 (simde__m512i a, simde__m512i idx, simde__mmask8 k, simde__m512i b) {
- #if defined(SIMDE_X86_AVX512BW_NATIVE)
+ #if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_mask2_permutex2var_epi64(a, idx, k, b);
#else
return simde_mm512_mask_mov_epi64(idx, k, simde_mm512_permutex2var_epi64(a, idx, b));
#endif
}
-#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_mask2_permutex2var_epi64
#define _mm512_mask2_permutex2var_epi64(a, idx, k, b) simde_mm512_mask2_permutex2var_epi64(a, idx, k, b)
#endif
@@ -279,13 +1391,13 @@ simde_mm512_mask2_permutex2var_epi64 (simde__m512i a, simde__m512i idx, simde__m
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_maskz_permutex2var_epi64 (simde__mmask8 k, simde__m512i a, simde__m512i idx, simde__m512i b) {
- #if defined(SIMDE_X86_AVX512BW_NATIVE)
+ #if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_maskz_permutex2var_epi64(k, a, idx, b);
#else
return simde_mm512_maskz_mov_epi64(k, simde_mm512_permutex2var_epi64(a, idx, b));
#endif
}
-#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_maskz_permutex2var_epi64
#define _mm512_maskz_permutex2var_epi64(k, a, idx, b) simde_mm512_maskz_permutex2var_epi64(k, a, idx, b)
#endif
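
The epi64 loop above is the whole story at 64-bit granularity: index bit 3 picks the table, bits 0-2 the lane, and the mask, mask2 and maskz variants merely wrap the unmasked permute in the matching mov, as their fallbacks show. A scalar model with a worked case (illustration only): for idx[i] = 9, bit 3 is set and 9 & 7 == 1, so lane i receives b[1].

    #include <stdint.h>

    /* Scalar model of 512-bit permutex2var on 64-bit lanes. */
    static void permutex2var_epi64_model(int64_t r[8], const int64_t a[8],
                                         const int64_t idx[8], const int64_t b[8]) {
      for (int i = 0 ; i < 8 ; i++)
        r[i] = ((idx[i] & 8) ? b : a)[idx[i] & 7];
    }
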
@@ -296,25 +1408,25 @@ simde_mm512_permutex2var_epi8 (simde__m512i a, simde__m512i idx, simde__m512i b)
#if defined(SIMDE_X86_AVX512VBMI_NATIVE)
return _mm512_permutex2var_epi8(a, idx, b);
#elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
- simde__m512i hilo, hi, lo, hi2, lo2, idx2;
- simde__m512i ones = simde_mm512_set1_epi8(1);
- simde__m512i low_bytes = simde_mm512_set1_epi16(0x00FF);
+ __m512i hilo, hi, lo, hi2, lo2, idx2;
+ const __m512i ones = _mm512_set1_epi8(1);
+ const __m512i low_bytes = _mm512_set1_epi16(0x00FF);
- idx2 = simde_mm512_srli_epi16(idx, 1);
- hilo = simde_mm512_permutex2var_epi16(a, idx2, b);
- simde__mmask64 mask = simde_mm512_test_epi8_mask(idx, ones);
- lo = simde_mm512_and_si512(hilo, low_bytes);
- hi = simde_mm512_srli_epi16(hilo, 8);
+ idx2 = _mm512_srli_epi16(idx, 1);
+ hilo = _mm512_permutex2var_epi16(a, idx2, b);
+ __mmask64 mask = _mm512_test_epi8_mask(idx, ones);
+ lo = _mm512_and_si512(hilo, low_bytes);
+ hi = _mm512_srli_epi16(hilo, 8);
- idx2 = simde_mm512_srli_epi16(idx, 9);
- hilo = simde_mm512_permutex2var_epi16(a, idx2, b);
- lo2 = simde_mm512_slli_epi16(hilo, 8);
- hi2 = simde_mm512_andnot_si512(low_bytes, hilo);
+ idx2 = _mm512_srli_epi16(idx, 9);
+ hilo = _mm512_permutex2var_epi16(a, idx2, b);
+ lo2 = _mm512_slli_epi16(hilo, 8);
+ hi2 = _mm512_andnot_si512(low_bytes, hilo);
- lo = simde_mm512_or_si512(lo, lo2);
- hi = simde_mm512_or_si512(hi, hi2);
+ lo = _mm512_or_si512(lo, lo2);
+ hi = _mm512_or_si512(hi, hi2);
- return simde_mm512_mask_blend_epi8(mask, lo, hi);
+ return _mm512_mask_blend_epi8(mask, lo, hi);
#else
simde__m512i_private
a_ = simde__m512i_to_private(a),
@@ -322,22 +1434,42 @@ simde_mm512_permutex2var_epi8 (simde__m512i a, simde__m512i idx, simde__m512i b)
b_ = simde__m512i_to_private(b),
r_;
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
- uint8x16x4_t table_a = { { a_.m128i_private[0].neon_u8,
- a_.m128i_private[1].neon_u8,
- a_.m128i_private[2].neon_u8,
- a_.m128i_private[3].neon_u8 } };
- uint8x16x4_t table_b = { { b_.m128i_private[0].neon_u8,
- b_.m128i_private[1].neon_u8,
- b_.m128i_private[2].neon_u8,
- b_.m128i_private[3].neon_u8 } };
- uint8x16_t mask_7F = vdupq_n_u8(0x7F), mask_3F = vdupq_n_u8(0x3F);
+ #if defined(SIMDE_X86_AVX2_NATIVE)
+ __m256i t0, t1, index, select0x10, select0x20, select0x40, t01, t23, a0123, b0123;
+ const __m256i mask = _mm256_set1_epi8(0x7F);
+ const __m256i a0 = _mm256_permute4x64_epi64(a_.m256i[0], (1 << 6) + (0 << 4) + (1 << 2) + (0 << 0));
+ const __m256i a1 = _mm256_permute4x64_epi64(a_.m256i[0], (3 << 6) + (2 << 4) + (3 << 2) + (2 << 0));
+ const __m256i a2 = _mm256_permute4x64_epi64(a_.m256i[1], (1 << 6) + (0 << 4) + (1 << 2) + (0 << 0));
+ const __m256i a3 = _mm256_permute4x64_epi64(a_.m256i[1], (3 << 6) + (2 << 4) + (3 << 2) + (2 << 0));
+ const __m256i b0 = _mm256_permute4x64_epi64(b_.m256i[0], (1 << 6) + (0 << 4) + (1 << 2) + (0 << 0));
+ const __m256i b1 = _mm256_permute4x64_epi64(b_.m256i[0], (3 << 6) + (2 << 4) + (3 << 2) + (2 << 0));
+ const __m256i b2 = _mm256_permute4x64_epi64(b_.m256i[1], (1 << 6) + (0 << 4) + (1 << 2) + (0 << 0));
+ const __m256i b3 = _mm256_permute4x64_epi64(b_.m256i[1], (3 << 6) + (2 << 4) + (3 << 2) + (2 << 0));
SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
- uint8x16_t index = vandq_u8(idx_.m128i_private[i].neon_u8, mask_7F);
- r_.m128i_private[i].neon_u8 = vqtbx4q_u8(vqtbl4q_u8(table_b, vandq_u8(index, mask_3F)), table_a, index);
+ for (size_t i = 0 ; i < (sizeof(r_.m256i_private) / sizeof(r_.m256i_private[0])) ; i++) {
+ index = _mm256_and_si256(idx_.m256i[i], mask);
+ t0 = _mm256_shuffle_epi8(a0, index);
+ t1 = _mm256_shuffle_epi8(a1, index);
+ select0x10 = _mm256_slli_epi64(index, 3);
+ t01 = _mm256_blendv_epi8(t0, t1, select0x10);
+ t0 = _mm256_shuffle_epi8(a2, index);
+ t1 = _mm256_shuffle_epi8(a3, index);
+ t23 = _mm256_blendv_epi8(t0, t1, select0x10);
+ select0x20 = _mm256_slli_epi64(index, 2);
+ a0123 = _mm256_blendv_epi8(t01, t23, select0x20);
+ t0 = _mm256_shuffle_epi8(b0, index);
+ t1 = _mm256_shuffle_epi8(b1, index);
+ t01 = _mm256_blendv_epi8(t0, t1, select0x10);
+ t0 = _mm256_shuffle_epi8(b2, index);
+ t1 = _mm256_shuffle_epi8(b3, index);
+ t23 = _mm256_blendv_epi8(t0, t1, select0x10);
+ b0123 = _mm256_blendv_epi8(t01, t23, select0x20);
+ select0x40 = _mm256_slli_epi64(index, 1);
+ r_.m256i[i] = _mm256_blendv_epi8(a0123, b0123, select0x40);
}
+ #elif defined(SIMDE_X_PERMUTEX2VAR_USE_GENERIC)
+ simde_x_permutex2var(r_.m128i, a_.m128i, idx_.m128i, b_.m128i, 0, 2);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
@@ -401,18 +1533,7 @@ simde_mm512_permutex2var_pd (simde__m512d a, simde__m512i idx, simde__m512d b) {
#if defined(SIMDE_X86_AVX512BW_NATIVE)
return _mm512_permutex2var_pd(a, idx, b);
#else
- simde__m512i_private idx_ = simde__m512i_to_private(idx);
- simde__m512d_private
- a_ = simde__m512d_to_private(a),
- b_ = simde__m512d_to_private(b),
- r_;
-
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
- r_.f64[i] = ((idx_.i64[i] & 0x08) ? b_ : a_).f64[idx_.i64[i] & 0x07];
- }
-
- return simde__m512d_from_private(r_);
+ return simde_mm512_castsi512_pd(simde_mm512_permutex2var_epi64(simde_mm512_castpd_si512(a), idx, simde_mm512_castpd_si512(b)));
#endif
}
#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
@@ -423,13 +1544,13 @@ simde_mm512_permutex2var_pd (simde__m512d a, simde__m512i idx, simde__m512d b) {
SIMDE_FUNCTION_ATTRIBUTES
simde__m512d
simde_mm512_mask_permutex2var_pd (simde__m512d a, simde__mmask8 k, simde__m512i idx, simde__m512d b) {
- #if defined(SIMDE_X86_AVX512BW_NATIVE)
+ #if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_mask_permutex2var_pd(a, k, idx, b);
#else
return simde_mm512_mask_mov_pd(a, k, simde_mm512_permutex2var_pd(a, idx, b));
#endif
}
-#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_mask_permutex2var_pd
#define _mm512_mask_permutex2var_pd(a, k, idx, b) simde_mm512_mask_permutex2var_pd(a, k, idx, b)
#endif
@@ -437,13 +1558,13 @@ simde_mm512_mask_permutex2var_pd (simde__m512d a, simde__mmask8 k, simde__m512i
SIMDE_FUNCTION_ATTRIBUTES
simde__m512d
simde_mm512_mask2_permutex2var_pd (simde__m512d a, simde__m512i idx, simde__mmask8 k, simde__m512d b) {
- #if defined(SIMDE_X86_AVX512BW_NATIVE)
+ #if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_mask2_permutex2var_pd(a, idx, k, b);
#else
return simde_mm512_mask_mov_pd(simde_mm512_castsi512_pd(idx), k, simde_mm512_permutex2var_pd(a, idx, b));
#endif
}
-#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_mask2_permutex2var_pd
#define _mm512_mask2_permutex2var_pd(a, idx, k, b) simde_mm512_mask2_permutex2var_pd(a, idx, k, b)
#endif
@@ -451,13 +1572,13 @@ simde_mm512_mask2_permutex2var_pd (simde__m512d a, simde__m512i idx, simde__mmas
SIMDE_FUNCTION_ATTRIBUTES
simde__m512d
simde_mm512_maskz_permutex2var_pd (simde__mmask8 k, simde__m512d a, simde__m512i idx, simde__m512d b) {
- #if defined(SIMDE_X86_AVX512BW_NATIVE)
+ #if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_maskz_permutex2var_pd(k, a, idx, b);
#else
return simde_mm512_maskz_mov_pd(k, simde_mm512_permutex2var_pd(a, idx, b));
#endif
}
-#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_maskz_permutex2var_pd
#define _mm512_maskz_permutex2var_pd(k, a, idx, b) simde_mm512_maskz_permutex2var_pd(k, a, idx, b)
#endif
@@ -465,24 +1586,13 @@ simde_mm512_maskz_permutex2var_pd (simde__mmask8 k, simde__m512d a, simde__m512i
SIMDE_FUNCTION_ATTRIBUTES
simde__m512
simde_mm512_permutex2var_ps (simde__m512 a, simde__m512i idx, simde__m512 b) {
- #if defined(SIMDE_X86_AVX512BW_NATIVE)
+ #if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_permutex2var_ps(a, idx, b);
#else
- simde__m512i_private idx_ = simde__m512i_to_private(idx);
- simde__m512_private
- a_ = simde__m512_to_private(a),
- b_ = simde__m512_to_private(b),
- r_;
-
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
- r_.f32[i] = ((idx_.i32[i] & 0x10) ? b_ : a_).f32[idx_.i32[i] & 0x0F];
- }
-
- return simde__m512_from_private(r_);
+ return simde_mm512_castsi512_ps(simde_mm512_permutex2var_epi32(simde_mm512_castps_si512(a), idx, simde_mm512_castps_si512(b)));
#endif
}
-#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_permutex2var_ps
#define _mm512_permutex2var_ps(a, idx, b) simde_mm512_permutex2var_ps(a, idx, b)
#endif
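
permutex2var_pd (above) and permutex2var_ps are now one-liners that bit-cast to the integer lane type, reuse the epi64/epi32 implementations, and cast back. That is sound because a cross-lane permute only relocates lane bit patterns and never interprets them numerically, so the floating-point view survives the round trip bit-for-bit, NaN payloads included. A scalar sketch of why the reinterpretation is lossless:

    #include <stdint.h>
    #include <string.h>

    /* One f64 lane of the permute, moved as raw bits and then
       viewed as a double again; no arithmetic touches the lane. */
    static double permutex2var_f64_model(const double a[8], const double b[8],
                                         int64_t idx) {
      const double *src = (idx & 8) ? b : a;
      uint64_t bits;
      memcpy(&bits, &src[idx & 7], sizeof bits);
      double r;
      memcpy(&r, &bits, sizeof r);
      return r;
    }
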
@@ -490,13 +1600,13 @@ simde_mm512_permutex2var_ps (simde__m512 a, simde__m512i idx, simde__m512 b) {
SIMDE_FUNCTION_ATTRIBUTES
simde__m512
simde_mm512_mask_permutex2var_ps (simde__m512 a, simde__mmask16 k, simde__m512i idx, simde__m512 b) {
- #if defined(SIMDE_X86_AVX512BW_NATIVE)
+ #if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_mask_permutex2var_ps(a, k, idx, b);
#else
return simde_mm512_mask_mov_ps(a, k, simde_mm512_permutex2var_ps(a, idx, b));
#endif
}
-#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_mask_permutex2var_ps
#define _mm512_mask_permutex2var_ps(a, k, idx, b) simde_mm512_mask_permutex2var_ps(a, k, idx, b)
#endif
@@ -504,13 +1614,13 @@ simde_mm512_mask_permutex2var_ps (simde__m512 a, simde__mmask16 k, simde__m512i
SIMDE_FUNCTION_ATTRIBUTES
simde__m512
simde_mm512_mask2_permutex2var_ps (simde__m512 a, simde__m512i idx, simde__mmask16 k, simde__m512 b) {
- #if defined(SIMDE_X86_AVX512BW_NATIVE)
+ #if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_mask2_permutex2var_ps(a, idx, k, b);
#else
return simde_mm512_mask_mov_ps(simde_mm512_castsi512_ps(idx), k, simde_mm512_permutex2var_ps(a, idx, b));
#endif
}
-#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_mask2_permutex2var_ps
#define _mm512_mask2_permutex2var_ps(a, idx, k, b) simde_mm512_mask2_permutex2var_ps(a, idx, k, b)
#endif
@@ -518,13 +1628,13 @@ simde_mm512_mask2_permutex2var_ps (simde__m512 a, simde__m512i idx, simde__mmask
SIMDE_FUNCTION_ATTRIBUTES
simde__m512
simde_mm512_maskz_permutex2var_ps (simde__mmask16 k, simde__m512 a, simde__m512i idx, simde__m512 b) {
- #if defined(SIMDE_X86_AVX512BW_NATIVE)
+ #if defined(SIMDE_X86_AVX512F_NATIVE)
return _mm512_maskz_permutex2var_ps(k, a, idx, b);
#else
return simde_mm512_maskz_mov_ps(k, simde_mm512_permutex2var_ps(a, idx, b));
#endif
}
-#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_maskz_permutex2var_ps
#define _mm512_maskz_permutex2var_ps(k, a, idx, b) simde_mm512_maskz_permutex2var_ps(k, a, idx, b)
#endif
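
Stepping back to the two epi8 fallbacks earlier in this file: the AVX512BW+VL branch halves each byte index into a 16-bit word index, fetches the containing word with the 16-bit permute (one pass for even byte positions, one for odd), and lets bit 0 of the original index choose the low or high byte of each fetched word; the AVX2 branch broadcasts each 128-bit quarter of a and b into both halves of a 256-bit register so the per-128-bit-lane _mm256_shuffle_epi8 can address it, then resolves index bits 4-6 with a _mm256_blendv_epi8 tree, each bit shifted into its byte's sign position. Both reduce, per lane, to this scalar model (illustration only):

    #include <stdint.h>

    /* 512-bit permutex2var on 8-bit lanes: 128 source lanes in the
       concatenation of a and b; index bit 6 picks the table, bits
       0-5 pick the byte within it. */
    static void permutex2var_epi8_model(int8_t r[64], const int8_t a[64],
                                        const int8_t idx[64], const int8_t b[64]) {
      for (int i = 0 ; i < 64 ; i++)
        r[i] = ((idx[i] & 0x40) ? b : a)[idx[i] & 0x3F];
    }
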
=====================================
simde/x86/sse.h
=====================================
@@ -2331,8 +2331,7 @@ simde_mm_extract_pi16 (simde__m64 a, const int imm8)
return a_.i16[imm8];
}
#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(HEDLEY_PGI_VERSION)
-# if HEDLEY_HAS_WARNING("-Wvector-conversion")
- /* https://bugs.llvm.org/show_bug.cgi?id=44589 */
+# if defined(SIMDE_BUG_CLANG_44589)
# define simde_mm_extract_pi16(a, imm8) ( \
HEDLEY_DIAGNOSTIC_PUSH \
_Pragma("clang diagnostic ignored \"-Wvector-conversion\"") \
@@ -2365,8 +2364,7 @@ simde_mm_insert_pi16 (simde__m64 a, int16_t i, const int imm8)
return simde__m64_from_private(r_);
}
#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
-# if HEDLEY_HAS_WARNING("-Wvector-conversion")
- /* https://bugs.llvm.org/show_bug.cgi?id=44589 */
+# if defined(SIMDE_BUG_CLANG_44589)
# define simde_mm_insert_pi16(a, i, imm8) ( \
HEDLEY_DIAGNOSTIC_PUSH \
_Pragma("clang diagnostic ignored \"-Wvector-conversion\"") \
@@ -3092,14 +3090,25 @@ simde_mm_mulhi_pu16 (simde__m64 a, simde__m64 b) {
# define _m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b)
#endif
-#define SIMDE_MM_HINT_NTA 0
-#define SIMDE_MM_HINT_T0 1
-#define SIMDE_MM_HINT_T1 2
-#define SIMDE_MM_HINT_T2 3
-#define SIMDE_MM_HINT_ENTA 4
-#define SIMDE_MM_HINT_ET0 5
-#define SIMDE_MM_HINT_ET1 6
-#define SIMDE_MM_HINT_ET2 7
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(HEDLEY_GCC_VERSION)
+ #define SIMDE_MM_HINT_NTA HEDLEY_STATIC_CAST(enum _mm_hint, 0)
+ #define SIMDE_MM_HINT_T0 HEDLEY_STATIC_CAST(enum _mm_hint, 1)
+ #define SIMDE_MM_HINT_T1 HEDLEY_STATIC_CAST(enum _mm_hint, 2)
+ #define SIMDE_MM_HINT_T2 HEDLEY_STATIC_CAST(enum _mm_hint, 3)
+ #define SIMDE_MM_HINT_ENTA HEDLEY_STATIC_CAST(enum _mm_hint, 4)
+ #define SIMDE_MM_HINT_ET0 HEDLEY_STATIC_CAST(enum _mm_hint, 5)
+ #define SIMDE_MM_HINT_ET1 HEDLEY_STATIC_CAST(enum _mm_hint, 6)
+ #define SIMDE_MM_HINT_ET2 HEDLEY_STATIC_CAST(enum _mm_hint, 7)
+#else
+ #define SIMDE_MM_HINT_NTA 0
+ #define SIMDE_MM_HINT_T0 1
+ #define SIMDE_MM_HINT_T1 2
+ #define SIMDE_MM_HINT_T2 3
+ #define SIMDE_MM_HINT_ENTA 4
+ #define SIMDE_MM_HINT_ET0 5
+ #define SIMDE_MM_HINT_ET1 6
+ #define SIMDE_MM_HINT_ET2 7
+#endif
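
The cast matters because GCC's xmmintrin.h declares the prefetch hint parameter as enum _mm_hint (at least in optimized builds), and C++ will not implicitly convert a bare int such as 1 to that enum, so the plain numeric definitions fail to compile against the native intrinsic. A usage sketch, assuming simde/x86/sse.h is included and that simde_mm_prefetch, SIMDe's portable counterpart of _mm_prefetch, takes the hint as an ordinary integer:

    #include "simde/x86/sse.h"

    /* With the GCC-native branch above, SIMDE_MM_HINT_T0 expands to
       a cast of 1 to enum _mm_hint, which converts cleanly in both
       C and C++; elsewhere it is just the integer 1. */
    static void warm_cache(const char *p) {
      simde_mm_prefetch(p, SIMDE_MM_HINT_T0);
    }
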
#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
HEDLEY_DIAGNOSTIC_PUSH
=====================================
test/x86/avx512/permutex2var.c
=====================================
The diff for this file was not included because it is too large.
View it on GitLab: https://salsa.debian.org/med-team/simde/-/commit/dfcfc2716e08dc1788b60d5c843c6e8cdc6d5cb5