[med-svn] [Git][med-team/simde][upstream] New upstream version 0.0.0.git.20200407
Michael R. Crusoe
gitlab at salsa.debian.org
Tue Apr 7 14:48:00 BST 2020
Michael R. Crusoe pushed to branch upstream at Debian Med / simde
Commits:
cc9ed7fb by Michael R. Crusoe at 2020-04-07T13:43:48+02:00
New upstream version 0.0.0.git.20200407
- - - - -
17 changed files:
- .drone.yml
- README.md
- + meson.build
- simde/simde-common.h
- simde/x86/avx2.h
- simde/x86/avx512bw.h
- simde/x86/avx512f.h
- simde/x86/sse2.h
- + test/arm/meson.build
- + test/arm/neon/meson.build
- + test/meson.build
- test/x86/avx512bw.c
- test/x86/avx512f.c
- + test/x86/meson.build
- test/x86/mmx.c
- test/x86/skel.c
- test/x86/test-avx512.h
Changes:
=====================================
.drone.yml
=====================================
@@ -6,27 +6,25 @@ platform:
os: linux
arch: arm
steps:
- - name: test
- image: ubuntu:bionic
- environment:
- CC: clang-9
- CXX: clang++-9
- commands:
- - uname -m
- - cat /proc/cpuinfo
- - apt-get -yq update
- - apt-get -yq install clang-9 cmake git-core
- - git submodule update --init --recursive
- - mkdir test/build
- - cd test/build
- - cmake .. -DCMAKE_C_FLAGS='-march=armv7a -mfpu=neon' -DCMAKE_CXX_FLAGS='-march=armv7a -mfpu=neon'
- - make -j VERBOSE=1
- - ./run-tests
-trigger:
- branch:
- exclude:
- - wip/*
-
+- name: test
+ image: ubuntu:bionic
+ environment:
+ CC: clang-9
+ CXX: clang++-9
+ ARCH_FLAGS: -march=armv7a -mfpu=neon
+ commands:
+ - uname -m
+ - cat /proc/cpuinfo
+ - apt-get -yq update
+ - apt-get -yq install clang-9 ninja-build git-core python3-pip
+ - pip3 install meson
+ - git submodule update --init --recursive
+ - mkdir -p build
+ - cd build
+ - CFLAGS="$ARCH_FLAGS" CXXFLAGS="$ARCH_FLAGS" meson ..
+ - ninja -v
+ - ./test/run-tests
+
---
kind: pipeline
type: docker
@@ -35,26 +33,24 @@ platform:
os: linux
arch: arm
steps:
- - name: test
- image: ubuntu:bionic
- environment:
- CC: clang-9
- CXX: clang++-9
- commands:
- - uname -m
- - cat /proc/cpuinfo
- - apt-get -yq update
- - apt-get -yq install clang-9 cmake git-core
- - git submodule update --init --recursive
- - mkdir test/build
- - cd test/build
- - cmake .. -DCMAKE_C_FLAGS='-march=armv8a' -DCMAKE_CXX_FLAGS='-march=armv8a'
- - make -j VERBOSE=1
- - ./run-tests
-trigger:
- branch:
- exclude:
- - wip/*
+- name: test
+ image: ubuntu:bionic
+ environment:
+ CC: clang-9
+ CXX: clang++-9
+ ARCH_FLAGS: -march=armv8a -mfpu=neon
+ commands:
+ - uname -m
+ - cat /proc/cpuinfo
+ - apt-get -yq update
+ - apt-get -yq install clang-9 ninja-build git-core python3-pip
+ - pip3 install meson
+ - git submodule update --init --recursive
+ - mkdir -p build
+ - cd build
+ - CFLAGS="$ARCH_FLAGS" CXXFLAGS="$ARCH_FLAGS" meson ..
+ - ninja -v
+ - ./test/run-tests
---
kind: pipeline
@@ -64,26 +60,24 @@ platform:
os: linux
arch: arm
steps:
- - name: test
- image: ubuntu:bionic
- environment:
- CC: gcc-8
- CXX: g++-8
- commands:
- - uname -m
- - cat /proc/cpuinfo
- - apt-get -yq update
- - apt-get -yq install gcc-8 g++-8 cmake git-core
- - git submodule update --init --recursive
- - mkdir test/build
- - cd test/build
- - cmake .. -DCMAKE_C_FLAGS='-march=armv7-a -mfpu=neon' -DCMAKE_CXX_FLAGS='-march=armv7-a -mfpu=neon'
- - make -j VERBOSE=1
- - ./run-tests
-trigger:
- branch:
- exclude:
- - wip/*
+- name: test
+ image: ubuntu:bionic
+ environment:
+ CC: gcc-8
+ CXX: g++-8
+ ARCH_FLAGS: -march=armv7-a -mfpu=neon
+ commands:
+ - uname -m
+ - cat /proc/cpuinfo
+ - apt-get -yq update
+ - apt-get -yq install gcc-8 g++-8 ninja-build git-core python3-pip
+ - pip3 install meson
+ - git submodule update --init --recursive
+ - mkdir -p build
+ - cd build
+ - CFLAGS="$ARCH_FLAGS" CXXFLAGS="$ARCH_FLAGS" meson ..
+ - ninja -v
+ - ./test/run-tests
---
kind: pipeline
@@ -93,139 +87,152 @@ platform:
os: linux
arch: arm
steps:
- - name: test
- image: ubuntu:bionic
- environment:
- CC: gcc-8
- CXX: g++-8
- commands:
- - uname -m
- - cat /proc/cpuinfo
- - apt-get -yq update
- - apt-get -yq install gcc-8 g++-8 cmake git-core
- - git submodule update --init --recursive
- - mkdir test/build
- - cd test/build
- - cmake .. -DCMAKE_C_FLAGS='-march=armv8-a -mfpu=neon' -DCMAKE_CXX_FLAGS='-march=armv8-a -mfpu=neon'
- - make -j VERBOSE=1
- - ./run-tests
-trigger:
- branch:
- exclude:
- - wip/*
+- name: test
+ image: ubuntu:bionic
+ environment:
+ CC: gcc-8
+ CXX: g++-8
+ ARCH_FLAGS: -march=armv8-a -mfpu=neon
+ commands:
+ - uname -m
+ - cat /proc/cpuinfo
+ - apt-get -yq update
+ - apt-get -yq install gcc-8 g++-8 ninja-build git-core python3-pip
+ - pip3 install meson
+ - git submodule update --init --recursive
+ - mkdir -p build
+ - cd build
+ - CFLAGS="$ARCH_FLAGS" CXXFLAGS="$ARCH_FLAGS" meson ..
+ - ninja -v
+ - ./test/run-tests
---
kind: pipeline
type: docker
-name: "gcc-7 armv7"
+name: "clang-7 armv7"
platform:
os: linux
arch: arm
steps:
- - name: test
- image: ubuntu:bionic
- environment:
- CC: gcc-7
- CXX: g++-7
- commands:
- - uname -m
- - cat /proc/cpuinfo
- - apt-get -yq update
- - apt-get -yq install gcc-7 g++-7 cmake git-core
- - git submodule update --init --recursive
- - mkdir test/build
- - cd test/build
- - cmake .. -DCMAKE_C_FLAGS='-march=armv7-a -mfpu=neon' -DCMAKE_CXX_FLAGS='-march=armv7-a -mfpu=neon'
- - make -j VERBOSE=1
- - ./run-tests
-trigger:
- branch:
- exclude:
- - wip/*
-
+- name: test
+ image: ubuntu:bionic
+ environment:
+ CC: clang-7
+ CXX: clang++-7
+ ARCH_FLAGS: -march=armv7a -mfpu=neon
+ commands:
+ - uname -m
+ - cat /proc/cpuinfo
+ - apt-get -yq update
+ - apt-get -yq install clang-7 ninja-build git-core python3-pip
+ - pip3 install meson
+ - git submodule update --init --recursive
+ - mkdir -p build
+ - cd build
+ - CFLAGS="$ARCH_FLAGS" CXXFLAGS="$ARCH_FLAGS" meson ..
+ - ninja -v
+ - ./test/run-tests
+
---
kind: pipeline
type: docker
-name: "gcc-7 armv8"
+name: "clang-7 armv8"
platform:
os: linux
arch: arm
steps:
- - name: test
- image: ubuntu:bionic
- environment:
- CC: gcc-7
- CXX: g++-7
- commands:
- - uname -m
- - cat /proc/cpuinfo
- - apt-get -yq update
- - apt-get -yq install gcc-7 g++-7 cmake git-core
- - git submodule update --init --recursive
- - mkdir test/build
- - cd test/build
- - cmake .. -DCMAKE_C_FLAGS='-march=armv8-a -mfpu=neon' -DCMAKE_CXX_FLAGS='-march=armv8-a -mfpu=neon'
- - make -j VERBOSE=1
- - ./run-tests
-trigger:
- branch:
- exclude:
- - wip/*
+- name: test
+ image: ubuntu:bionic
+ environment:
+ CC: clang-7
+ CXX: clang++-7
+ ARCH_FLAGS: -march=armv8a -mfpu=neon
+ commands:
+ - uname -m
+ - cat /proc/cpuinfo
+ - apt-get -yq update
+ - apt-get -yq install clang-7 ninja-build git-core python3-pip
+ - pip3 install meson
+ - git submodule update --init --recursive
+ - mkdir -p build
+ - cd build
+ - CFLAGS="$ARCH_FLAGS" CXXFLAGS="$ARCH_FLAGS" meson ..
+ - ninja -v
+ - ./test/run-tests
---
kind: pipeline
type: docker
-name: "clang-7 armv7"
+name: "gcc-7 armv7"
platform:
os: linux
arch: arm
steps:
- - name: test
- image: ubuntu:bionic
- environment:
- CC: clang-7
- CXX: clang++-7
- commands:
- - uname -m
- - cat /proc/cpuinfo
- - apt-get -yq update
- - apt-get -yq install clang-7 cmake git-core
- - git submodule update --init --recursive
- - mkdir test/build
- - cd test/build
- - cmake .. -DCMAKE_C_FLAGS='-march=armv7a' -DCMAKE_CXX_FLAGS='-march=armv7a'
- - make -j VERBOSE=1
- - ./run-tests
-trigger:
- branch:
- exclude:
- - wip/*
+- name: test
+ image: ubuntu:bionic
+ environment:
+ CC: gcc-7
+ CXX: g++-7
+ ARCH_FLAGS: -march=armv7-a -mfpu=neon
+ commands:
+ - uname -m
+ - cat /proc/cpuinfo
+ - apt-get -yq update
+ - apt-get -yq install gcc-7 g++-7 ninja-build git-core python3-pip
+ - pip3 install meson
+ - git submodule update --init --recursive
+ - mkdir -p build
+ - cd build
+ - CFLAGS="$ARCH_FLAGS" CXXFLAGS="$ARCH_FLAGS" meson ..
+ - ninja -v
+ - ./test/run-tests
---
kind: pipeline
type: docker
-name: "clang-7 armv8"
+name: "gcc-7 armv8"
platform:
os: linux
arch: arm
steps:
- - name: test
- image: ubuntu:bionic
- environment:
- CC: clang-7
- CXX: clang++-7
- commands:
- - uname -m
- - cat /proc/cpuinfo
- - apt-get -yq update
- - apt-get -yq install clang-7 cmake git-core
- - git submodule update --init --recursive
- - mkdir test/build
- - cd test/build
- - cmake .. -DCMAKE_C_FLAGS='-march=armv8a' -DCMAKE_CXX_FLAGS='-march=armv8a'
- - make -j VERBOSE=1
- - ./run-tests
-trigger:
- branch:
- exclude:
- - wip/*
+- name: test
+ image: ubuntu:bionic
+ environment:
+ CC: gcc-7
+ CXX: g++-7
+ ARCH_FLAGS: -march=armv8-a -mfpu=neon
+ commands:
+ - uname -m
+ - cat /proc/cpuinfo
+ - apt-get -yq update
+ - apt-get -yq install gcc-7 g++-7 ninja-build git-core python3-pip
+ - pip3 install meson
+ - git submodule update --init --recursive
+ - mkdir -p build
+ - cd build
+ - CFLAGS="$ARCH_FLAGS" CXXFLAGS="$ARCH_FLAGS" meson ..
+ - ninja -v
+ - ./test/run-tests
+
+# ---
+# kind: pipeline
+# type: docker
+# name: "fedora"
+# steps:
+# - name: test
+# image: fedora:latest
+# environment:
+# CC: gcc
+# CXX: g++
+# ARCH_FLAGS: -march=native
+# commands:
+# - uname -m
+# - cat /proc/cpuinfo
+# - dnf install -y gcc gcc-c++ ninja-build git-core python3-pip
+# - pip3 install meson
+# - git submodule update --init --recursive
+# - mkdir -p build
+# - cd build
+# - CFLAGS="$ARCH_FLAGS" CXXFLAGS="$ARCH_FLAGS" meson ..
+# - ninja -v
+# - ./test/run-tests
=====================================
README.md
=====================================
@@ -90,6 +90,7 @@ make sense since they will always be green, but here are the links:
* [AppVeyor](https://ci.appveyor.com/project/quixdb/simde)
* [GitHub Actions](https://github.com/nemequ/simde/actions)
* [Azure Pipelines](https://dev.azure.com/simd-everywhere/SIMDe/_build)
+* [Drone CI](https://cloud.drone.io/nemequ/simde)
## Contributing
@@ -303,6 +304,8 @@ tremendously grateful for their support:
numerous platforms.
* [AppVeyor](https://www.appveyor.com/) — provides CI testing on
Windows.
+ * [Drone CI](https://drone.io/) — provides CI testing on 32-bit ARM
+ platforms, among others.
* [IntegriCloud](https://integricloud.com/) — provides access to a very
fast POWER9 server for developing AltiVec/VMX support.
* [GCC Compile Farm](https://gcc.gnu.org/wiki/CompileFarm) — provides
=====================================
meson.build
=====================================
@@ -0,0 +1,32 @@
+project('SIMDe', 'c', 'cpp',
+ default_options: ['c_std=c99'],
+ license: 'MIT',
+ version: '0.5.0')
+
+cc = meson.get_compiler('c')
+cxx = meson.get_compiler('cpp')
+
+subdir('test')
+
+install_headers(
+ [
+ 'simde/hedley.h',
+ 'simde/check.h',
+ 'simde/debug-trap.h',
+ 'simde/simde-arch.h',
+ 'simde/simde-common.h',
+
+ 'simde/x86/avx2.h',
+ 'simde/x86/avx512bw.h',
+ 'simde/x86/avx512f.h',
+ 'simde/x86/fma.h',
+ 'simde/x86/mmx.h',
+ 'simde/x86/sse.h',
+ 'simde/x86/sse2.h',
+ 'simde/x86/sse3.h',
+ 'simde/x86/sse4.1.h',
+ 'simde/x86/sse4.2.h',
+ 'simde/x86/ssse3.h',
+ 'simde/x86/svml.h',
+ ],
+ subdir: 'simde')
=====================================
simde/simde-common.h
=====================================
@@ -25,6 +25,12 @@
#define SIMDE_COMMON_H
#include "hedley.h"
+
+/* Library version as separate components, plus a single combined value
+ * built with HEDLEY_VERSION_ENCODE (from hedley.h, included above).
+ * NOTE(review): meson.build declares the same version string ('0.5.0');
+ * presumably the two are meant to be kept in sync -- confirm. */
+#define SIMDE_VERSION_MAJOR 0
+#define SIMDE_VERSION_MINOR 5
+#define SIMDE_VERSION_MICRO 0
+#define SIMDE_VERSION HEDLEY_VERSION_ENCODE(SIMDE_VERSION_MAJOR, SIMDE_VERSION_MINOR, SIMDE_VERSION_MICRO)
+
#include "simde-arch.h"
#include <stddef.h>
@@ -160,12 +166,16 @@
but the code needs to be refactored a bit to take advantage. */
# if !defined(SIMDE_NO_CONVERT_VECTOR) && defined(SIMDE_VECTOR_SUBSCRIPT)
# if HEDLEY_HAS_BUILTIN(__builtin_convertvector) || HEDLEY_GCC_VERSION_CHECK(9,0,0)
-/* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93557 */
-# define SIMDE__CONVERT_VECTOR(to, from) ((to) = (__extension__({ \
- __typeof__(from) from_ = (from); \
- ((void) from_); \
- __builtin_convertvector(from_, __typeof__(to)); \
- })))
+# if HEDLEY_GCC_VERSION_CHECK(9,0,0) && !HEDLEY_GCC_VERSION_CHECK(9,3,0)
+ /* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93557 */
+# define SIMDE__CONVERT_VECTOR(to, from) ((to) = (__extension__({ \
+ __typeof__(from) from_ = (from); \
+ ((void) from_); \
+ __builtin_convertvector(from_, __typeof__(to)); \
+ })))
+# else
+# define SIMDE__CONVERT_VECTOR(to, from) ((to) = __builtin_convertvector((from), __typeof__(to)))
+# endif
# endif
# endif
#endif
@@ -433,7 +443,7 @@ HEDLEY_STATIC_ASSERT(sizeof(simde_float64) == 8, "Unable to find 64-bit floating
#if \
HEDLEY_HAS_WARNING("-Wtautological-compare") || \
- HEDLEY_GCC_VERSION_CHECK(8,0,0)
+ HEDLEY_GCC_VERSION_CHECK(7,0,0)
# if defined(__cplusplus)
# if (__cplusplus >= 201402L)
# define SIMDE_TAUTOLOGICAL_COMPARE_(expr) \
@@ -670,6 +680,9 @@ HEDLEY_STATIC_ASSERT(sizeof(simde_float64) == 8, "Unable to find 64-bit floating
# if defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)
# define SIMDE_BUG_GCC_94482
# endif
+# if defined(SIMDE_ARCH_AARCH64)
+# define SIMDE_BUG_GCC_94488
+# endif
# endif
# if defined(HEDLEY_EMSCRIPTEN_VERSION)
# define SIMDE_BUG_EMSCRIPTEN_MISSING_IMPL /* Placeholder for (as yet) unfiled issues. */
=====================================
simde/x86/avx2.h
=====================================
@@ -219,8 +219,8 @@ simde_mm256_alignr_epi8 (simde__m256i a, simde__m256i b, int count) {
#elif defined(SIMDE_ARCH_X86_SSSE3)
# define simde_mm256_alignr_epi8(a, b, count) \
simde_mm256_set_m128i( \
- simde_mm_alignr_epi8(simde__m256i_to_private(a).m128i[1], simde__m256i_to_private(b).m128i[1], (count)), \
- simde_mm_alignr_epi8(simde__m256i_to_private(a).m128i[0], simde__m256i_to_private(b).m128i[0], (count)))
+ simde_mm_alignr_epi8(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (count)), \
+ simde_mm_alignr_epi8(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (count)))
#endif
#if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
# define _mm256_alignr_epi8(a, b, count) simde_mm256_alignr_epi8(a, b, (count))
@@ -1547,8 +1547,8 @@ simde_mm256_shuffle_epi32 (simde__m256i a, const int imm8) {
#elif defined(SIMDE_ARCH_X86_SSE2) && !defined(__PGI)
# define simde_mm256_shuffle_epi32(a, imm8) \
simde_mm256_set_m128i( \
- simde_mm_shuffle_epi32(simde__m256i_to_private(a).m128i[1], (imm8)), \
- simde_mm_shuffle_epi32(simde__m256i_to_private(a).m128i[0], (imm8)))
+ simde_mm_shuffle_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
+ simde_mm_shuffle_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
#elif defined(SIMDE__SHUFFLE_VECTOR)
# define simde_mm256_shuffle_epi32(a, imm8) (__extension__ ({ \
const simde__m256i_private simde__tmp_a_ = simde__m256i_to_private(a); \
@@ -1574,8 +1574,8 @@ simde_mm256_shuffle_epi32 (simde__m256i a, const int imm8) {
#elif defined(SIMDE_ARCH_X86_SSE2)
# define simde_mm256_shufflelo_epi16(a, imm8) \
simde_mm256_set_m128i( \
- simde_mm_shufflelo_epi16(simde__m256i_to_private(a).m128i[1], (imm8)), \
- simde_mm_shufflelo_epi16(simde__m256i_to_private(a).m128i[0], (imm8)))
+ simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
+ simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
#elif defined(SIMDE__SHUFFLE_VECTOR)
# define simde_mm256_shufflelo_epi16(a, imm8) (__extension__ ({ \
const simde__m256i_private simde__tmp_a_ = simde__m256i_to_private(a); \
@@ -1596,8 +1596,8 @@ simde_mm256_shuffle_epi32 (simde__m256i a, const int imm8) {
#else
# define simde_mm256_shufflelo_epi16(a, imm8) \
simde_mm256_set_m128i( \
- simde_mm_shufflelo_epi16(simde__m256i_to_private(a).m128i[1], imm8), \
- simde_mm_shufflelo_epi16(simde__m256i_to_private(a).m128i[0], imm8))
+ simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 1), imm8), \
+ simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 0), imm8))
#endif
#if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
# define _mm256_shufflelo_epi16(a, imm8) simde_mm256_shufflelo_epi16(a, imm8)
@@ -1631,8 +1631,8 @@ simde_mm256_slli_epi16 (simde__m256i a, const int imm8)
#elif defined(SIMDE_ARCH_X86_SSE2)
# define simde_mm256_slli_epi16(a, imm8) \
simde_mm256_set_m128i( \
- simde_mm_slli_epi16(simde__m256i_to_private(a).m128i[1], (imm8)), \
- simde_mm_slli_epi16(simde__m256i_to_private(a).m128i[0], (imm8)))
+ simde_mm_slli_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
+ simde_mm_slli_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
#endif
#if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
# define _mm256_slli_epi16(a, imm8) simde_mm256_slli_epi16(a, imm8)
@@ -1662,8 +1662,8 @@ simde_mm256_slli_epi32 (simde__m256i a, const int imm8)
#elif defined(SIMDE_ARCH_X86_SSE2)
# define simde_mm256_slli_epi32(a, imm8) \
simde_mm256_set_m128i( \
- simde_mm_slli_epi32(simde__m256i_to_private(a).m128i[1], (imm8)), \
- simde_mm_slli_epi32(simde__m256i_to_private(a).m128i[0], (imm8)))
+ simde_mm_slli_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
+ simde_mm_slli_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
#endif
#if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
# define _mm256_slli_epi32(a, imm8) simde_mm256_slli_epi32(a, imm8)
@@ -1693,8 +1693,8 @@ simde_mm256_slli_epi64 (simde__m256i a, const int imm8)
#elif defined(SIMDE_ARCH_X86_SSE2)
# define simde_mm256_slli_epi64(a, imm8) \
simde_mm256_set_m128i( \
- simde_mm_slli_epi64(simde__m256i_to_private(a).m128i[1], (imm8)), \
- simde_mm_slli_epi64(simde__m256i_to_private(a).m128i[0], (imm8)))
+ simde_mm_slli_epi64(simde_mm256_extracti128_si256(a, 1), (imm8)), \
+ simde_mm_slli_epi64(simde_mm256_extracti128_si256(a, 0), (imm8)))
#endif
#if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
# define _mm256_slli_epi64(a, imm8) simde_mm256_slli_epi64(a, imm8)
@@ -1843,8 +1843,8 @@ simde_mm256_srli_epi64 (simde__m256i a, const int imm8) {
#elif defined(SIMDE_ARCH_X86_SSE2)
# define simde_mm256_srli_epi64(a, imm8) \
simde_mm256_set_m128i( \
- simde_mm_srli_epi64(simde__m256i_to_private(a).m128i[1], (imm8)), \
- simde_mm_srli_epi64(simde__m256i_to_private(a).m128i[0], (imm8)))
+ simde_mm_srli_epi64(simde_mm256_extracti128_si256(a, 1), (imm8)), \
+ simde_mm_srli_epi64(simde_mm256_extracti128_si256(a, 0), (imm8)))
#endif
#if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
# define _mm256_srli_epi64(a, imm8) simde_mm256_srli_epi64(a, imm8)
@@ -1872,13 +1872,13 @@ simde_mm256_srli_si256 (simde__m256i a, const int imm8) {
#elif defined(SIMDE_ARCH_X86_SSE2) && !defined(__PGI)
# define simde_mm256_srli_si256(a, imm8) \
simde_mm256_set_m128i( \
- simde_mm_srli_si128(simde__m256i_to_private(a).m128i[1], (imm8)), \
- simde_mm_srli_si128(simde__m256i_to_private(a).m128i[0], (imm8)))
+ simde_mm_srli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \
+ simde_mm_srli_si128(simde_mm256_extracti128_si256(a, 0), (imm8)))
#elif defined(SIMDE_SSE2_NEON)
# define simde_mm256_srli_si256(a, imm8) \
simde_mm256_set_m128i( \
- simde_mm_bsrli_si128(simde__m256i_to_private(a).m128i[1], (imm8)), \
- simde_mm_bsrli_si128(simde__m256i_to_private(a).m128i[0], (imm8)))
+ simde_mm_bsrli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \
+ simde_mm_bsrli_si128(simde_mm256_extracti128_si256(a, 0), (imm8)))
#endif
#if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
/* Native-name alias: forward to the 256-bit implementation defined above. */
# define _mm256_srli_si256(a, imm8) simde_mm256_srli_si256(a, imm8)
@@ -2150,8 +2150,8 @@ simde_mm256_srli_epi32 (simde__m256i a, const int imm8) {
#elif defined(SIMDE_ARCH_X86_SSE2)
# define simde_mm256_srli_epi32(a, imm8) \
simde_mm256_set_m128i( \
- simde_mm_srli_epi32(simde__m256i_to_private(a).m128i[1], (imm8)), \
- simde_mm_srli_epi32(simde__m256i_to_private(a).m128i[0], (imm8)))
+ simde_mm_srli_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
+ simde_mm_srli_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
#endif
#if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
# define _mm256_srli_epi32(a, imm8) simde_mm256_srli_epi32(a, imm8)
=====================================
simde/x86/avx512bw.h
=====================================
@@ -306,6 +306,56 @@ simde_mm512_cmpeq_epi8_mask (simde__m512i a, simde__m512i b) {
# define _mm512_cmpeq_epi8_mask(a, b) simde_mm512_cmpeq_epi8_mask(a, b)
#endif
+/* Convert each 16-bit integer element of the 512-bit vector `a` to an
+ * 8-bit element, returning the results as a 256-bit vector.
+ *
+ * Uses the native intrinsic when SIMDE_AVX512BW_NATIVE is defined.
+ * Otherwise the conversion is done with SIMDE__CONVERT_VECTOR when that
+ * macro is available, or with a per-element cast to int8_t (no
+ * saturation is applied). */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m256i
+simde_mm512_cvtepi16_epi8 (simde__m512i a) {
+#if defined(SIMDE_AVX512BW_NATIVE)
+ return _mm512_cvtepi16_epi8(a);
+#else
+ simde__m256i_private r_;
+ simde__m512i_private a_ = simde__m512i_to_private(a);
+
+#if defined(SIMDE__CONVERT_VECTOR)
+ SIMDE__CONVERT_VECTOR(r_.i8, a_.i16);
+#else
+ /* One output element per input element; the count comes from r_.i8. */
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
+ r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, a_.i16[i]);
+ }
+#endif
+
+ return simde__m256i_from_private(r_);
+#endif
+}
+#if defined(SIMDE_AVX512BW_ENABLE_NATIVE_ALIASES)
+# define _mm512_cvtepi16_epi8(a) simde_mm512_cvtepi16_epi8(a)
+#endif
+
+/* Convert each signed 8-bit element of the 256-bit vector `a` to a
+ * signed 16-bit element, returning the results as a 512-bit vector.
+ *
+ * Uses the native intrinsic when SIMDE_AVX512BW_NATIVE is defined.
+ * Otherwise the conversion is done with SIMDE__CONVERT_VECTOR when that
+ * macro is available, or with a per-element assignment from the i8 lane
+ * to the i16 lane. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_cvtepi8_epi16 (simde__m256i a) {
+#if defined(SIMDE_AVX512BW_NATIVE)
+ return _mm512_cvtepi8_epi16(a);
+#else
+ simde__m512i_private r_;
+ simde__m256i_private a_ = simde__m256i_to_private(a);
+
+#if defined(SIMDE__CONVERT_VECTOR)
+ SIMDE__CONVERT_VECTOR(r_.i16, a_.i8);
+#else
+ /* One output element per input element; the count comes from r_.i16. */
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
+ r_.i16[i] = a_.i8[i];
+ }
+#endif
+
+ return simde__m512i_from_private(r_);
+#endif
+}
+#if defined(SIMDE_AVX512BW_ENABLE_NATIVE_ALIASES)
+# define _mm512_cvtepi8_epi16(a) simde_mm512_cvtepi8_epi16(a)
+#endif
+
SIMDE__END_DECLS
HEDLEY_DIAGNOSTIC_POP
=====================================
simde/x86/avx512f.h
=====================================
@@ -341,6 +341,32 @@ simde__m512d_to_private(simde__m512d v) {
return r;
}
+/* Return a 512-bit vector in which every 128-bit lane is a copy of `a`.
+ *
+ * Uses the native intrinsic when SIMDE_AVX512F_NATIVE is defined.
+ * Otherwise the lanes of the private representation are filled either
+ * as two 256-bit halves (AVX2), as four direct 128-bit assignments
+ * (SSE2), or with a generic loop over the m128i members. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_broadcast_i32x4 (simde__m128i a) {
+ #if defined(SIMDE_AVX512F_NATIVE)
+ return _mm512_broadcast_i32x4(a);
+ #else
+ simde__m512i_private r_;
+
+ #if defined(SIMDE_ARCH_X86_AVX2)
+ r_.m256i[1] = r_.m256i[0] = simde_mm256_broadcastsi128_si256(a);
+ #elif defined(SIMDE_ARCH_X86_SSE2)
+ r_.m128i[3] = r_.m128i[2] = r_.m128i[1] = r_.m128i[0] = a;
+ #else
+ SIMDE__VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
+ r_.m128i[i] = a;
+ }
+ #endif
+
+ return simde__m512i_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_broadcast_i32x4(a) simde_mm512_broadcast_i32x4(a)
+#endif
+
SIMDE__FUNCTION_ATTRIBUTES
simde__m512
simde_mm512_castpd_ps (simde__m512d a) {
@@ -605,6 +631,40 @@ simde_mm512_castsi512_si256 (simde__m512i a) {
# define _mm512_castsi512_si256(a) simde_mm512_castsi512_si256(a)
#endif
+/* Load a 512-bit integer vector from `mem_addr`.
+ *
+ * `mem_addr` must be 64-byte aligned; this is checked up front with
+ * simde_assert_aligned.  Returns the loaded vector.  Use
+ * simde_mm512_loadu_si512 for addresses without that alignment. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_load_si512 (simde__m512i const * mem_addr) {
+  simde_assert_aligned(64, mem_addr);
+
+  #if defined(SIMDE_AVX512F_NATIVE)
+    return _mm512_load_si512((__m512i const*) mem_addr);
+  #elif defined(SIMDE_ARCH_AARCH64) && (defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(8,0,0))
+    /* GCC older than 8 on AArch64 gets a byte copy instead of a direct
+     * dereference (NOTE(review): presumably a compiler workaround --
+     * confirm which bug).  Go through the simde_memcpy wrapper, like
+     * simde_mm512_loadu_si512 does, instead of calling memcpy directly. */
+    simde__m512i r;
+    simde_memcpy(&r, mem_addr, sizeof(r));
+    return r;
+  #else
+    return *mem_addr;
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_load_si512(a) simde_mm512_load_si512(a)
+#endif
+
+/* Load a 512-bit integer vector from `mem_addr`.
+ *
+ * Unlike simde_mm512_load_si512, no alignment assertion is made here.
+ * Without the native intrinsic, the value is read by copying sizeof(r)
+ * bytes from `mem_addr` with simde_memcpy. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_loadu_si512 (simde__m512i const * mem_addr) {
+ #if defined(SIMDE_AVX512F_NATIVE)
+ return _mm512_loadu_si512((__m512i const*) mem_addr);
+ #else
+ simde__m512i r;
+ simde_memcpy(&r, mem_addr, sizeof(r));
+ return r;
+ #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_loadu_si512(a) simde_mm512_loadu_si512(a)
+#endif
+
SIMDE__FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_set_epi8 (int8_t e63, int8_t e62, int8_t e61, int8_t e60, int8_t e59, int8_t e58, int8_t e57, int8_t e56,
@@ -1174,6 +1234,298 @@ simde_mm512_set1_pd (simde_float64 a) {
# define _mm512_set1_pd(a) simde_mm512_set1_pd(a)
#endif
+/* Build a vector of sixteen 32-bit integers from a repeating group of
+ * four values.  Within each group the LAST argument comes first:
+ * element 0 is `a`, element 1 is `b`, element 2 is `c`, element 3 is
+ * `d`, and the same order repeats for elements 4..15.
+ *
+ * This implementation always fills the private representation element
+ * by element; there is no native-intrinsic branch. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_set4_epi32 (int32_t d, int32_t c, int32_t b, int32_t a) {
+ simde__m512i_private r_;
+
+ r_.i32[ 0] = a;
+ r_.i32[ 1] = b;
+ r_.i32[ 2] = c;
+ r_.i32[ 3] = d;
+ r_.i32[ 4] = a;
+ r_.i32[ 5] = b;
+ r_.i32[ 6] = c;
+ r_.i32[ 7] = d;
+ r_.i32[ 8] = a;
+ r_.i32[ 9] = b;
+ r_.i32[10] = c;
+ r_.i32[11] = d;
+ r_.i32[12] = a;
+ r_.i32[13] = b;
+ r_.i32[14] = c;
+ r_.i32[15] = d;
+
+ return simde__m512i_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_set4_epi32(d,c,b,a) simde_mm512_set4_epi32(d,c,b,a)
+#endif
+
+/* Build a vector of eight 64-bit integers from a repeating group of
+ * four values.  The LAST argument comes first: elements 0..3 are
+ * `a`, `b`, `c`, `d`, and elements 4..7 repeat the same order.
+ *
+ * Always filled element by element; there is no native-intrinsic
+ * branch. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_set4_epi64 (int64_t d, int64_t c, int64_t b, int64_t a) {
+ simde__m512i_private r_;
+
+ r_.i64[0] = a;
+ r_.i64[1] = b;
+ r_.i64[2] = c;
+ r_.i64[3] = d;
+ r_.i64[4] = a;
+ r_.i64[5] = b;
+ r_.i64[6] = c;
+ r_.i64[7] = d;
+
+ return simde__m512i_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_set4_epi64(d,c,b,a) simde_mm512_set4_epi64(d,c,b,a)
+#endif
+
+/* Build a vector of sixteen 32-bit floats from a repeating group of
+ * four values.  The LAST argument comes first: element 0 is `a`,
+ * element 1 is `b`, element 2 is `c`, element 3 is `d`, and the same
+ * order repeats for elements 4..15.
+ *
+ * Always filled element by element; there is no native-intrinsic
+ * branch. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512
+simde_mm512_set4_ps (simde_float32 d, simde_float32 c, simde_float32 b, simde_float32 a) {
+ simde__m512_private r_;
+
+ r_.f32[ 0] = a;
+ r_.f32[ 1] = b;
+ r_.f32[ 2] = c;
+ r_.f32[ 3] = d;
+ r_.f32[ 4] = a;
+ r_.f32[ 5] = b;
+ r_.f32[ 6] = c;
+ r_.f32[ 7] = d;
+ r_.f32[ 8] = a;
+ r_.f32[ 9] = b;
+ r_.f32[10] = c;
+ r_.f32[11] = d;
+ r_.f32[12] = a;
+ r_.f32[13] = b;
+ r_.f32[14] = c;
+ r_.f32[15] = d;
+
+ return simde__m512_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_set4_ps(d,c,b,a) simde_mm512_set4_ps(d,c,b,a)
+#endif
+
+/* Build a vector of eight 64-bit floats from a repeating group of four
+ * values.  The LAST argument comes first: elements 0..3 are `a`, `b`,
+ * `c`, `d`, and elements 4..7 repeat the same order.
+ *
+ * Always filled element by element; there is no native-intrinsic
+ * branch. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512d
+simde_mm512_set4_pd (simde_float64 d, simde_float64 c, simde_float64 b, simde_float64 a) {
+ simde__m512d_private r_;
+
+ r_.f64[0] = a;
+ r_.f64[1] = b;
+ r_.f64[2] = c;
+ r_.f64[3] = d;
+ r_.f64[4] = a;
+ r_.f64[5] = b;
+ r_.f64[6] = c;
+ r_.f64[7] = d;
+
+ return simde__m512d_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_set4_pd(d,c,b,a) simde_mm512_set4_pd(d,c,b,a)
+#endif
+
+/* Build a vector of sixteen 32-bit integers with the arguments stored
+ * in the order they are written: the first argument (`e15`) goes to
+ * element 0 and the last argument (`e0`) goes to element 15.  Note that
+ * the parameter names therefore run opposite to the element indices.
+ *
+ * Always filled element by element; there is no native-intrinsic
+ * branch. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_setr_epi32 (int32_t e15, int32_t e14, int32_t e13, int32_t e12, int32_t e11, int32_t e10, int32_t e9, int32_t e8,
+ int32_t e7, int32_t e6, int32_t e5, int32_t e4, int32_t e3, int32_t e2, int32_t e1, int32_t e0) {
+ simde__m512i_private r_;
+
+ r_.i32[ 0] = e15;
+ r_.i32[ 1] = e14;
+ r_.i32[ 2] = e13;
+ r_.i32[ 3] = e12;
+ r_.i32[ 4] = e11;
+ r_.i32[ 5] = e10;
+ r_.i32[ 6] = e9;
+ r_.i32[ 7] = e8;
+ r_.i32[ 8] = e7;
+ r_.i32[ 9] = e6;
+ r_.i32[10] = e5;
+ r_.i32[11] = e4;
+ r_.i32[12] = e3;
+ r_.i32[13] = e2;
+ r_.i32[14] = e1;
+ r_.i32[15] = e0;
+
+ return simde__m512i_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_setr_epi32(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm512_setr_epi32(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
+#endif
+
+/* Build a vector of eight 64-bit integers with the arguments stored in
+ * the order they are written: the first argument (`e7`) goes to
+ * element 0 and the last argument (`e0`) goes to element 7.
+ *
+ * Always filled element by element; there is no native-intrinsic
+ * branch. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_setr_epi64 (int64_t e7, int64_t e6, int64_t e5, int64_t e4, int64_t e3, int64_t e2, int64_t e1, int64_t e0) {
+ simde__m512i_private r_;
+
+ r_.i64[0] = e7;
+ r_.i64[1] = e6;
+ r_.i64[2] = e5;
+ r_.i64[3] = e4;
+ r_.i64[4] = e3;
+ r_.i64[5] = e2;
+ r_.i64[6] = e1;
+ r_.i64[7] = e0;
+
+ return simde__m512i_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_setr_epi64(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm512_setr_epi64(e7, e6, e5, e4, e3, e2, e1, e0)
+#endif
+
+/* Build a vector of sixteen 32-bit floats with the arguments stored in
+ * the order they are written: the first argument (`e15`) goes to
+ * element 0 and the last argument (`e0`) goes to element 15.
+ *
+ * Always filled element by element; there is no native-intrinsic
+ * branch. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512
+simde_mm512_setr_ps (simde_float32 e15, simde_float32 e14, simde_float32 e13, simde_float32 e12,
+ simde_float32 e11, simde_float32 e10, simde_float32 e9, simde_float32 e8,
+ simde_float32 e7, simde_float32 e6, simde_float32 e5, simde_float32 e4,
+ simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) {
+ simde__m512_private r_;
+
+ r_.f32[ 0] = e15;
+ r_.f32[ 1] = e14;
+ r_.f32[ 2] = e13;
+ r_.f32[ 3] = e12;
+ r_.f32[ 4] = e11;
+ r_.f32[ 5] = e10;
+ r_.f32[ 6] = e9;
+ r_.f32[ 7] = e8;
+ r_.f32[ 8] = e7;
+ r_.f32[ 9] = e6;
+ r_.f32[10] = e5;
+ r_.f32[11] = e4;
+ r_.f32[12] = e3;
+ r_.f32[13] = e2;
+ r_.f32[14] = e1;
+ r_.f32[15] = e0;
+
+ return simde__m512_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_setr_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm512_setr_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
+#endif
+
+/* Build a vector of eight 64-bit floats with the arguments stored in
+ * the order they are written: the first argument (`e7`) goes to
+ * element 0 and the last argument (`e0`) goes to element 7.
+ *
+ * Always filled element by element; there is no native-intrinsic
+ * branch. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512d
+simde_mm512_setr_pd (simde_float64 e7, simde_float64 e6, simde_float64 e5, simde_float64 e4, simde_float64 e3, simde_float64 e2, simde_float64 e1, simde_float64 e0) {
+ simde__m512d_private r_;
+
+ r_.f64[0] = e7;
+ r_.f64[1] = e6;
+ r_.f64[2] = e5;
+ r_.f64[3] = e4;
+ r_.f64[4] = e3;
+ r_.f64[5] = e2;
+ r_.f64[6] = e1;
+ r_.f64[7] = e0;
+
+ return simde__m512d_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0)
+#endif
+
+/* Build a vector of sixteen 32-bit integers from a repeating group of
+ * four values, stored in argument order: element 0 is `d`, element 1
+ * is `c`, element 2 is `b`, element 3 is `a`, and the same order
+ * repeats for elements 4..15.  This is the reverse of the per-group
+ * order used by simde_mm512_set4_epi32.
+ *
+ * Always filled element by element; there is no native-intrinsic
+ * branch. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_setr4_epi32 (int32_t d, int32_t c, int32_t b, int32_t a) {
+ simde__m512i_private r_;
+
+ r_.i32[ 0] = d;
+ r_.i32[ 1] = c;
+ r_.i32[ 2] = b;
+ r_.i32[ 3] = a;
+ r_.i32[ 4] = d;
+ r_.i32[ 5] = c;
+ r_.i32[ 6] = b;
+ r_.i32[ 7] = a;
+ r_.i32[ 8] = d;
+ r_.i32[ 9] = c;
+ r_.i32[10] = b;
+ r_.i32[11] = a;
+ r_.i32[12] = d;
+ r_.i32[13] = c;
+ r_.i32[14] = b;
+ r_.i32[15] = a;
+
+ return simde__m512i_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_setr4_epi32(d,c,b,a) simde_mm512_setr4_epi32(d,c,b,a)
+#endif
+
+/* Build a vector of eight 64-bit integers from a repeating group of
+ * four values, stored in argument order: elements 0..3 are `d`, `c`,
+ * `b`, `a`, and elements 4..7 repeat the same order.
+ *
+ * Always filled element by element; there is no native-intrinsic
+ * branch. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_setr4_epi64 (int64_t d, int64_t c, int64_t b, int64_t a) {
+ simde__m512i_private r_;
+
+ r_.i64[0] = d;
+ r_.i64[1] = c;
+ r_.i64[2] = b;
+ r_.i64[3] = a;
+ r_.i64[4] = d;
+ r_.i64[5] = c;
+ r_.i64[6] = b;
+ r_.i64[7] = a;
+
+ return simde__m512i_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_setr4_epi64(d,c,b,a) simde_mm512_setr4_epi64(d,c,b,a)
+#endif
+
+/* Build a vector of sixteen 32-bit floats from a repeating group of
+ * four values, stored in argument order: element 0 is `d`, element 1
+ * is `c`, element 2 is `b`, element 3 is `a`, and the same order
+ * repeats for elements 4..15.
+ *
+ * Always filled element by element; there is no native-intrinsic
+ * branch. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512
+simde_mm512_setr4_ps (simde_float32 d, simde_float32 c, simde_float32 b, simde_float32 a) {
+ simde__m512_private r_;
+
+ r_.f32[ 0] = d;
+ r_.f32[ 1] = c;
+ r_.f32[ 2] = b;
+ r_.f32[ 3] = a;
+ r_.f32[ 4] = d;
+ r_.f32[ 5] = c;
+ r_.f32[ 6] = b;
+ r_.f32[ 7] = a;
+ r_.f32[ 8] = d;
+ r_.f32[ 9] = c;
+ r_.f32[10] = b;
+ r_.f32[11] = a;
+ r_.f32[12] = d;
+ r_.f32[13] = c;
+ r_.f32[14] = b;
+ r_.f32[15] = a;
+
+ return simde__m512_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_setr4_ps(d,c,b,a) simde_mm512_setr4_ps(d,c,b,a)
+#endif
+
+/* Build a vector of eight 64-bit floats from a repeating group of four
+ * values, stored in argument order: elements 0..3 are `d`, `c`, `b`,
+ * `a`, and elements 4..7 repeat the same order.
+ *
+ * Always filled element by element; there is no native-intrinsic
+ * branch. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512d
+simde_mm512_setr4_pd (simde_float64 d, simde_float64 c, simde_float64 b, simde_float64 a) {
+ simde__m512d_private r_;
+
+ r_.f64[0] = d;
+ r_.f64[1] = c;
+ r_.f64[2] = b;
+ r_.f64[3] = a;
+ r_.f64[4] = d;
+ r_.f64[5] = c;
+ r_.f64[6] = b;
+ r_.f64[7] = a;
+
+ return simde__m512d_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_setr4_pd(d,c,b,a) simde_mm512_setr4_pd(d,c,b,a)
+#endif
+
SIMDE__FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_setzero_si512(void) {
@@ -1243,6 +1595,84 @@ simde_mm512_setone_pd(void) {
return simde_mm512_castsi512_pd(simde_mm512_setone_si512());
}
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_srli_epi64 (simde__m512i a, unsigned int imm8) {
+  /* Right-shift every unsigned 64-bit lane of a by imm8 bits. */
+  #if !defined(SIMDE_AVX512F_NATIVE)
+    simde__m512i_private
+      src_ = simde__m512i_to_private(a),
+      dst_;
+
+    #if defined(SIMDE_ARCH_X86_AVX2)
+      /* Delegate to the 256-bit shift, one call per m256i element. */
+      dst_.m256i[0] = simde_mm256_srli_epi64(src_.m256i[0], imm8);
+      dst_.m256i[1] = simde_mm256_srli_epi64(src_.m256i[1], imm8);
+    #elif defined(SIMDE_ARCH_X86_SSE2)
+      /* Delegate to the 128-bit shift, one call per m128i element. */
+      dst_.m128i[0] = simde_mm_srli_epi64(src_.m128i[0], imm8);
+      dst_.m128i[1] = simde_mm_srli_epi64(src_.m128i[1], imm8);
+      dst_.m128i[2] = simde_mm_srli_epi64(src_.m128i[2], imm8);
+      dst_.m128i[3] = simde_mm_srli_epi64(src_.m128i[3], imm8);
+    #else
+      /* According to the Intel Intrinsics Guide only the 8 least
+       * significant bits of imm8 are used, which would call for
+       * "imm8 &= 0xff" here.  In practice, however, all bits are used. */
+      if (imm8 <= 63) {
+        #if !defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+          SIMDE__VECTORIZE
+          for (size_t lane = 0 ; lane < (sizeof(dst_.u64) / sizeof(dst_.u64[0])) ; lane++) {
+            dst_.u64[lane] = src_.u64[lane] >> imm8;
+          }
+        #else
+          dst_.u64 = src_.u64 >> imm8;
+        #endif
+      } else {
+        /* Shift count of 64 or more: the result is all zero bits. */
+        simde_memset(&dst_, 0, sizeof(dst_));
+      }
+    #endif
+
+    return simde__m512i_from_private(dst_);
+  #else
+    return _mm512_srli_epi64(a, imm8);
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_srli_epi64(a, imm8) simde_mm512_srli_epi64(a, imm8)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_xor_si512 (simde__m512i a, simde__m512i b) {
+  /* Bitwise XOR of a and b. */
+  #if defined(SIMDE_AVX512F_NATIVE)
+    return _mm512_xor_si512(a, b);
+  #else
+    simde__m512i_private
+      r_,
+      a_ = simde__m512i_to_private(a),
+      b_ = simde__m512i_to_private(b);
+
+    #if defined(SIMDE_ARCH_X86_AVX2)
+      r_.m256i[0] = simde_mm256_xor_si256(a_.m256i[0], b_.m256i[0]);
+      r_.m256i[1] = simde_mm256_xor_si256(a_.m256i[1], b_.m256i[1]);
+    #elif defined(SIMDE_ARCH_X86_SSE2)
+      r_.m128i[0] = simde_mm_xor_si128(a_.m128i[0], b_.m128i[0]);
+      r_.m128i[1] = simde_mm_xor_si128(a_.m128i[1], b_.m128i[1]);
+      r_.m128i[2] = simde_mm_xor_si128(a_.m128i[2], b_.m128i[2]);
+      r_.m128i[3] = simde_mm_xor_si128(a_.m128i[3], b_.m128i[3]);
+    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+      r_.i32f = a_.i32f ^ b_.i32f;
+    #else
+      /* The bound must come from the member that is indexed: the element
+       * count of i32f need not equal that of i64, and bounding by i64
+       * would then leave part of the result unwritten. */
+      SIMDE__VECTORIZE
+      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
+        r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
+      }
+    #endif
+
+    return simde__m512i_from_private(r_);
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_xor_si512(a, b) simde_mm512_xor_si512(a, b)
+#endif
+
SIMDE__FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_mask_mov_epi32(simde__m512i src, simde__mmask16 k, simde__m512i a) {
@@ -1669,11 +2099,119 @@ simde_mm512_add_pd (simde__m512d a, simde__m512d b) {
# define _mm512_add_pd(a, b) simde_mm512_add_pd(a, b)
#endif
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_sub_epi32 (simde__m512i a, simde__m512i b) {
+  /* Lane-wise 32-bit integer subtraction: a - b. */
+  #if !defined(SIMDE_AVX512F_NATIVE)
+    simde__m512i_private
+      lhs_ = simde__m512i_to_private(a),
+      rhs_ = simde__m512i_to_private(b),
+      out_;
+
+    #if !defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+      /* No vector operators available: subtract one m256i element at a time. */
+      SIMDE__VECTORIZE
+      for (size_t part = 0 ; part < (sizeof(out_.m256i) / sizeof(out_.m256i[0])) ; part++) {
+        out_.m256i[part] = simde_mm256_sub_epi32(lhs_.m256i[part], rhs_.m256i[part]);
+      }
+    #else
+      out_.i32 = lhs_.i32 - rhs_.i32;
+    #endif
+
+    return simde__m512i_from_private(out_);
+  #else
+    return _mm512_sub_epi32(a, b);
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_sub_epi32(a, b) simde_mm512_sub_epi32(a, b)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_sub_epi64 (simde__m512i a, simde__m512i b) {
+  /* Lane-wise 64-bit integer subtraction: a - b. */
+  #if !defined(SIMDE_AVX512F_NATIVE)
+    simde__m512i_private
+      lhs_ = simde__m512i_to_private(a),
+      rhs_ = simde__m512i_to_private(b),
+      out_;
+
+    #if !defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+      /* No vector operators available: subtract one m256i element at a time. */
+      SIMDE__VECTORIZE
+      for (size_t part = 0 ; part < (sizeof(out_.m256i) / sizeof(out_.m256i[0])) ; part++) {
+        out_.m256i[part] = simde_mm256_sub_epi64(lhs_.m256i[part], rhs_.m256i[part]);
+      }
+    #else
+      out_.i64 = lhs_.i64 - rhs_.i64;
+    #endif
+
+    return simde__m512i_from_private(out_);
+  #else
+    return _mm512_sub_epi64(a, b);
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_sub_epi64(a, b) simde_mm512_sub_epi64(a, b)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512
+simde_mm512_sub_ps (simde__m512 a, simde__m512 b) {
+  /* Lane-wise single-precision subtraction: a - b. */
+  #if !defined(SIMDE_AVX512F_NATIVE)
+    simde__m512_private
+      lhs_ = simde__m512_to_private(a),
+      rhs_ = simde__m512_to_private(b),
+      out_;
+
+    #if !defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+      /* No vector operators available: subtract one m256 element at a time. */
+      SIMDE__VECTORIZE
+      for (size_t part = 0 ; part < (sizeof(out_.m256) / sizeof(out_.m256[0])) ; part++) {
+        out_.m256[part] = simde_mm256_sub_ps(lhs_.m256[part], rhs_.m256[part]);
+      }
+    #else
+      out_.f32 = lhs_.f32 - rhs_.f32;
+    #endif
+
+    return simde__m512_from_private(out_);
+  #else
+    return _mm512_sub_ps(a, b);
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_sub_ps(a, b) simde_mm512_sub_ps(a, b)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512d
+simde_mm512_sub_pd (simde__m512d a, simde__m512d b) {
+  /* Lane-wise double-precision subtraction: a - b. */
+  #if !defined(SIMDE_AVX512F_NATIVE)
+    simde__m512d_private
+      lhs_ = simde__m512d_to_private(a),
+      rhs_ = simde__m512d_to_private(b),
+      out_;
+
+    #if !defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+      /* No vector operators available: subtract one m256d element at a time. */
+      SIMDE__VECTORIZE
+      for (size_t part = 0 ; part < (sizeof(out_.m256d) / sizeof(out_.m256d[0])) ; part++) {
+        out_.m256d[part] = simde_mm256_sub_pd(lhs_.m256d[part], rhs_.m256d[part]);
+      }
+    #else
+      out_.f64 = lhs_.f64 - rhs_.f64;
+    #endif
+
+    return simde__m512d_from_private(out_);
+  #else
+    return _mm512_sub_pd(a, b);
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_sub_pd(a, b) simde_mm512_sub_pd(a, b)
+#endif
+
SIMDE__FUNCTION_ATTRIBUTES
simde__mmask16
-simde_mm512_mask_cmpeq_epi32_mask (simde__mmask16 k1, simde__m512i a, simde__m512i b) {
+simde_mm512_cmpeq_epi32_mask (simde__m512i a, simde__m512i b) {
#if defined(SIMDE_AVX512F_NATIVE)
- return _mm512_mask_cmpeq_epi32_mask(k1, a, b);
+ return _mm512_cmpeq_epi32_mask(a, b);
#else
simde__m512i_private
r_,
@@ -1684,18 +2222,66 @@ simde_mm512_mask_cmpeq_epi32_mask (simde__mmask16 k1, simde__m512i a, simde__m51
r_.m256i[i] = simde_mm256_cmpeq_epi32(a_.m256i[i], b_.m256i[i]);
}
- return simde__m512i_private_to_mmask16(r_) & k1;
+ return simde__m512i_private_to_mmask16(r_);
+ #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_cmpeq_epi32_mask(a, b) simde_mm512_cmpeq_epi32_mask(a, b)
+#endif
+
+/* Masked 32-bit equality compare.  The portable path ANDs the result of
+ * simde_mm512_cmpeq_epi32_mask(a, b) with k1, so every bit that is clear
+ * in k1 is also clear in the returned mask. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__mmask16
+simde_mm512_mask_cmpeq_epi32_mask (simde__mmask16 k1, simde__m512i a, simde__m512i b) {
+ #if defined(SIMDE_AVX512F_NATIVE)
+ return _mm512_mask_cmpeq_epi32_mask(k1, a, b);
+ #else
+ return simde_mm512_cmpeq_epi32_mask(a, b) & k1;
#endif
}
#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
# define _mm512_mask_cmpeq_epi32_mask(k1, a, b) simde_mm512_mask_cmpeq_epi32_mask(k1, a, b)
#endif
+SIMDE__FUNCTION_ATTRIBUTES
+simde__mmask8
+simde_mm512_cmpeq_epi64_mask (simde__m512i a, simde__m512i b) {
+  /* Compare the 64-bit lanes of a and b for equality and return the
+   * outcome as an 8-bit mask. */
+  #if !defined(SIMDE_AVX512F_NATIVE)
+    simde__m512i_private
+      lhs_ = simde__m512i_to_private(a),
+      rhs_ = simde__m512i_to_private(b),
+      eq_;
+
+    /* Run the 256-bit compare on each m256i element, then let the helper
+     * turn the resulting vector into a mask. */
+    for (size_t part = 0 ; part < (sizeof(eq_.m256i) / sizeof(eq_.m256i[0])) ; part++) {
+      eq_.m256i[part] = simde_mm256_cmpeq_epi64(lhs_.m256i[part], rhs_.m256i[part]);
+    }
+
+    return simde__m512i_private_to_mmask8(eq_);
+  #else
+    return _mm512_cmpeq_epi64_mask(a, b);
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_cmpeq_epi64_mask(a, b) simde_mm512_cmpeq_epi64_mask(a, b)
+#endif
+
SIMDE__FUNCTION_ATTRIBUTES
simde__mmask8
simde_mm512_mask_cmpeq_epi64_mask (simde__mmask8 k1, simde__m512i a, simde__m512i b) {
#if defined(SIMDE_AVX512F_NATIVE)
return _mm512_mask_cmpeq_epi64_mask(k1, a, b);
+ #else
+ /* Portable path: reuse the unmasked compare and AND it with k1, so
+  * every bit that is clear in k1 is also clear in the result. */
+ return simde_mm512_cmpeq_epi64_mask(a, b) & k1;
+ #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_mask_cmpeq_epi64_mask(k1, a, b) simde_mm512_mask_cmpeq_epi64_mask(k1, a, b)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__mmask16
+simde_mm512_mask_cmpgt_epi32_mask (simde__mmask16 k1, simde__m512i a, simde__m512i b) {
+ #if defined(SIMDE_AVX512F_NATIVE)
+ return _mm512_mask_cmpgt_epi32_mask(k1, a, b);
#else
simde__m512i_private
r_,
@@ -1703,14 +2289,36 @@ simde_mm512_mask_cmpeq_epi64_mask (simde__mmask8 k1, simde__m512i a, simde__m512
b_ = simde__m512i_to_private(b);
for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) {
- r_.m256i[i] = simde_mm256_cmpeq_epi64(a_.m256i[i], b_.m256i[i]);
+ r_.m256i[i] = simde_mm256_cmpgt_epi32(a_.m256i[i], b_.m256i[i]);
+ }
+
+ return simde__m512i_private_to_mmask16(r_) & k1;
+ #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_mask_cmpgt_epi32_mask(k1, a, b) simde_mm512_mask_cmpgt_epi32_mask(k1, a, b)
+#endif
+
+/* Masked 64-bit greater-than compare (a > b).  The portable path runs
+ * simde_mm256_cmpgt_epi64 on each m256i element, converts the result to
+ * an 8-bit mask and ANDs it with k1. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__mmask8
+simde_mm512_mask_cmpgt_epi64_mask (simde__mmask8 k1, simde__m512i a, simde__m512i b) {
+ #if defined(SIMDE_AVX512F_NATIVE)
+ return _mm512_mask_cmpgt_epi64_mask(k1, a, b);
+ #else
+ simde__m512i_private
+ r_,
+ a_ = simde__m512i_to_private(a),
+ b_ = simde__m512i_to_private(b);
+
+ for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) {
+ r_.m256i[i] = simde_mm256_cmpgt_epi64(a_.m256i[i], b_.m256i[i]);
}
return simde__m512i_private_to_mmask8(r_) & k1;
#endif
}
#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
-# define _mm512_mask_cmpeq_epi64_mask(k1, a, b) simde_mm512_mask_cmpeq_epi64_mask(k1, a, b)
+# define _mm512_mask_cmpgt_epi64_mask(k1, a, b) simde_mm512_mask_cmpgt_epi64_mask(k1, a, b)
#endif
SIMDE__FUNCTION_ATTRIBUTES
@@ -1788,6 +2396,31 @@ simde_mm512_cvtepi32_epi8 (simde__m512i a) {
# define _mm512_cvtepi32_epi8(a) simde_mm512_cvtepi32_epi8(a)
#endif
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m256i
+simde_mm512_cvtepi32_epi16 (simde__m512i a) {
+  /* Narrow every 32-bit lane of a to 16 bits with a plain integer cast. */
+  #if !defined(SIMDE_AVX512F_NATIVE)
+    simde__m256i_private narrow_;
+    simde__m512i_private wide_ = simde__m512i_to_private(a);
+
+    #if !defined(SIMDE__CONVERT_VECTOR)
+      SIMDE__VECTORIZE
+      for (size_t lane = 0 ; lane < (sizeof(wide_.i32) / sizeof(wide_.i32[0])) ; lane++) {
+        narrow_.i16[lane] = HEDLEY_STATIC_CAST(int16_t, wide_.i32[lane]);
+      }
+    #else
+      SIMDE__CONVERT_VECTOR(narrow_.i16, wide_.i32);
+    #endif
+
+    return simde__m256i_from_private(narrow_);
+  #else
+    return _mm512_cvtepi32_epi16(a);
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_cvtepi32_epi16(a) simde_mm512_cvtepi32_epi16(a)
+#endif
+
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i
simde_mm512_cvtepi64_epi8 (simde__m512i a) {
@@ -1813,6 +2446,169 @@ simde_mm512_cvtepi64_epi8 (simde__m512i a) {
# define _mm512_cvtepi64_epi8(a) simde_mm512_cvtepi64_epi8(a)
#endif
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i
+simde_mm512_cvtepi64_epi16 (simde__m512i a) {
+  /* Narrow every 64-bit lane of a to 16 bits with a plain integer cast. */
+  #if !defined(SIMDE_AVX512F_NATIVE)
+    /* The result starts out as an all-zero vector before the converted
+     * lanes are stored into it. */
+    simde__m128i_private narrow_ = simde__m128i_to_private(simde_mm_setzero_si128());
+    simde__m512i_private wide_ = simde__m512i_to_private(a);
+
+    #if !defined(SIMDE__CONVERT_VECTOR)
+      SIMDE__VECTORIZE
+      for (size_t lane = 0 ; lane < (sizeof(wide_.i64) / sizeof(wide_.i64[0])) ; lane++) {
+        narrow_.i16[lane] = HEDLEY_STATIC_CAST(int16_t, wide_.i64[lane]);
+      }
+    #else
+      SIMDE__CONVERT_VECTOR(narrow_.i16, wide_.i64);
+    #endif
+
+    return simde__m128i_from_private(narrow_);
+  #else
+    return _mm512_cvtepi64_epi16(a);
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_cvtepi64_epi16(a) simde_mm512_cvtepi64_epi16(a)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512
+simde_mm512_div_ps (simde__m512 a, simde__m512 b) {
+  /* Lane-wise single-precision division: a / b. */
+  #if !defined(SIMDE_AVX512F_NATIVE)
+    simde__m512_private
+      num_ = simde__m512_to_private(a),
+      den_ = simde__m512_to_private(b),
+      out_;
+
+    #if !defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+      /* No vector operators available: divide one m256 element at a time. */
+      SIMDE__VECTORIZE
+      for (size_t part = 0 ; part < (sizeof(out_.m256) / sizeof(out_.m256[0])) ; part++) {
+        out_.m256[part] = simde_mm256_div_ps(num_.m256[part], den_.m256[part]);
+      }
+    #else
+      out_.f32 = num_.f32 / den_.f32;
+    #endif
+
+    return simde__m512_from_private(out_);
+  #else
+    return _mm512_div_ps(a, b);
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_div_ps(a, b) simde_mm512_div_ps(a, b)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512d
+simde_mm512_div_pd (simde__m512d a, simde__m512d b) {
+  /* Lane-wise double-precision division: a / b. */
+  #if !defined(SIMDE_AVX512F_NATIVE)
+    simde__m512d_private
+      num_ = simde__m512d_to_private(a),
+      den_ = simde__m512d_to_private(b),
+      out_;
+
+    #if !defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+      /* No vector operators available: divide one m256d element at a time. */
+      SIMDE__VECTORIZE
+      for (size_t part = 0 ; part < (sizeof(out_.m256d) / sizeof(out_.m256d[0])) ; part++) {
+        out_.m256d[part] = simde_mm256_div_pd(num_.m256d[part], den_.m256d[part]);
+      }
+    #else
+      out_.f64 = num_.f64 / den_.f64;
+    #endif
+
+    return simde__m512d_from_private(out_);
+  #else
+    return _mm512_div_pd(a, b);
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_div_pd(a, b) simde_mm512_div_pd(a, b)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512
+simde_mm512_mul_ps (simde__m512 a, simde__m512 b) {
+  /* Lane-wise single-precision multiplication: a * b. */
+  #if !defined(SIMDE_AVX512F_NATIVE)
+    simde__m512_private
+      lhs_ = simde__m512_to_private(a),
+      rhs_ = simde__m512_to_private(b),
+      out_;
+
+    #if !defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+      /* No vector operators available: multiply one m256 element at a time. */
+      SIMDE__VECTORIZE
+      for (size_t part = 0 ; part < (sizeof(out_.m256) / sizeof(out_.m256[0])) ; part++) {
+        out_.m256[part] = simde_mm256_mul_ps(lhs_.m256[part], rhs_.m256[part]);
+      }
+    #else
+      out_.f32 = lhs_.f32 * rhs_.f32;
+    #endif
+
+    return simde__m512_from_private(out_);
+  #else
+    return _mm512_mul_ps(a, b);
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_mul_ps(a, b) simde_mm512_mul_ps(a, b)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512d
+simde_mm512_mul_pd (simde__m512d a, simde__m512d b) {
+  /* Lane-wise double-precision multiplication: a * b. */
+  #if !defined(SIMDE_AVX512F_NATIVE)
+    simde__m512d_private
+      lhs_ = simde__m512d_to_private(a),
+      rhs_ = simde__m512d_to_private(b),
+      out_;
+
+    #if !defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+      /* No vector operators available: multiply one m256d element at a time. */
+      SIMDE__VECTORIZE
+      for (size_t part = 0 ; part < (sizeof(out_.m256d) / sizeof(out_.m256d[0])) ; part++) {
+        out_.m256d[part] = simde_mm256_mul_pd(lhs_.m256d[part], rhs_.m256d[part]);
+      }
+    #else
+      out_.f64 = lhs_.f64 * rhs_.f64;
+    #endif
+
+    return simde__m512d_from_private(out_);
+  #else
+    return _mm512_mul_pd(a, b);
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_mul_pd(a, b) simde_mm512_mul_pd(a, b)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_or_si512 (simde__m512i a, simde__m512i b) {
+  /* Bitwise OR of a and b. */
+  #if defined(SIMDE_AVX512F_NATIVE)
+    return _mm512_or_si512(a, b);
+  #else
+    simde__m512i_private
+      r_,
+      a_ = simde__m512i_to_private(a),
+      b_ = simde__m512i_to_private(b);
+
+    #if defined(SIMDE_ARCH_X86_AVX2)
+      r_.m256i[0] = simde_mm256_or_si256(a_.m256i[0], b_.m256i[0]);
+      r_.m256i[1] = simde_mm256_or_si256(a_.m256i[1], b_.m256i[1]);
+    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+      r_.i32f = a_.i32f | b_.i32f;
+    #else
+      /* The bound must come from the member that is indexed: the element
+       * count of i32f need not equal that of i32, and bounding by i32
+       * would then index i32f past its end. */
+      SIMDE__VECTORIZE
+      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
+        r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
+      }
+    #endif
+
+    return simde__m512i_from_private(r_);
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+# define _mm512_or_si512(a, b) simde_mm512_or_si512(a, b)
+#endif
+
SIMDE__END_DECLS
HEDLEY_DIAGNOSTIC_POP
=====================================
simde/x86/sse2.h
=====================================
@@ -4405,15 +4405,13 @@ simde_mm_sll_epi64 (simde__m128i a, simde__m128i count) {
if (count_.u64[0] > 63)
return simde_mm_setzero_si128();
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_ARM_SHIFT_SCALAR)
- /* GCC ≤ 7 on AArch64 generates an ICE here */
- r_.u64 = (a_.u64 << count_.u64[0]);
-#else
- SIMDE__VECTORIZE
+ const int_fast16_t s = HEDLEY_STATIC_CAST(int_fast16_t, count_.u64[0]);
+ #if !defined(SIMDE_BUG_GCC_94488)
+ SIMDE__VECTORIZE
+ #endif
for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
- r_.u64[i] = a_.u64[i] << count_.u64[0];
+ r_.u64[i] = a_.u64[i] << s;
}
-#endif
return simde__m128i_from_private(r_);
#endif
@@ -4542,11 +4540,13 @@ simde_mm_srl_epi64 (simde__m128i a, simde__m128i count) {
a_ = simde__m128i_to_private(a),
count_ = simde__m128i_to_private(count);
- if (count_.u64[0] > 31)
+ if (count_.u64[0] > 63)
return simde_mm_setzero_si128();
- const int s = (int) (count_.u64[0]);
- SIMDE__VECTORIZE
+ const int_fast16_t s = HEDLEY_STATIC_CAST(int_fast16_t, count_.u64[0]);
+ #if !defined(SIMDE_BUG_GCC_94488)
+ SIMDE__VECTORIZE
+ #endif
for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
r_.u64[i] = a_.u64[i] >> s;
}
=====================================
test/arm/meson.build
=====================================
@@ -0,0 +1,4 @@
+# The neon subdirectory defines simde_tests_arm_neon, which is linked below.
+subdir('neon')
+
+# Static library built from test-arm.c and linked with the NEON tests.
+simde_tests_arm = static_library('simde-tests-arm', 'test-arm.c',
+ link_with: [simde_tests_arm_neon])
=====================================
test/arm/neon/meson.build
=====================================
@@ -0,0 +1,23 @@
+simde_test_arm_neon_sources = [
+  'vadd.c',
+  'vdup_n.c',
+  'vmul.c',
+  'vsub.c'
+]
+
+# Copy every C source to a matching ".cpp" file (src + 'pp') and add the
+# copy to the source list, so each test is also built as C++.
+foreach src : simde_test_arm_neon_sources
+  simde_test_arm_neon_sources += configure_file(input: src, output: src + 'pp', copy: true)
+endforeach
+
+# Emulated build: SIMDE_NO_NATIVE is defined for both languages.
+# cpp_args uses the C++ argument/define lists so that options probed with
+# the C compiler are not handed to the C++ compiler.
+simde_tests_arm_neon_emul = static_library('simde-tests-arm-neon-emul', simde_test_arm_neon_sources,
+  c_args: simde_c_args + simde_c_defs + simde_native_c_flags + ['-DSIMDE_NO_NATIVE'],
+  cpp_args: simde_cxx_args + simde_cxx_defs + simde_native_cxx_flags + ['-DSIMDE_NO_NATIVE'],
+  include_directories: simde_include_dir)
+
+# Native build: same sources without SIMDE_NO_NATIVE.
+simde_tests_arm_neon_native = static_library('simde-tests-arm-neon-native', simde_test_arm_neon_sources,
+  c_args: simde_c_args + simde_c_defs + simde_native_c_flags,
+  cpp_args: simde_cxx_args + simde_cxx_defs + simde_native_cxx_flags,
+  include_directories: simde_include_dir)
+
+# Library built from test-neon.c, linked with both variants above.
+simde_tests_arm_neon = static_library('simde-tests-neon', 'test-neon.c',
+  link_with: [simde_tests_arm_neon_emul, simde_tests_arm_neon_native])
=====================================
test/meson.build
=====================================
@@ -0,0 +1,64 @@
+# Dependencies and preprocessor defines for the bundled munit test framework.
+munit_deps = []
+munit_defs = []
+if cc.get_id() == 'pgi'
+  munit_deps += dependency('openmp')
+endif
+if cc.has_function('clock_gettime')
+  # Link librt when it can be found; otherwise define PSNIP_CLOCK_NO_LIBRT
+  # for the munit build.
+  librt_dep = cc.find_library('rt', required: false)
+  if not librt_dep.found()
+    munit_defs += '-DPSNIP_CLOCK_NO_LIBRT'
+  endif
+  munit_deps += librt_dep
+endif
+# munit_defs has to be passed as c_args; without that the define collected
+# above never reaches the compiler.
+munit = static_library('munit', 'munit/munit.c',
+  c_args: munit_defs,
+  dependencies: munit_deps)
+
+# Per-language compiler arguments, defines and dependencies used by the
+# test targets below and in the subdirectories.
+simde_c_defs = []
+simde_cxx_defs = []
+simde_c_args = []
+simde_cxx_args = []
+simde_deps = []
+
+# Probe each compiler for an OpenMP SIMD flag.  The first spelling a
+# compiler accepts is added to its arguments together with
+# -DSIMDE_ENABLE_OPENMP; later spellings are then skipped for that compiler.
+c_openmp_simd = false
+cxx_openmp_simd = false
+foreach omp_arg : ['-fopenmp-simd', '-qopenmp-simd']
+ if (not c_openmp_simd) and cc.has_argument(omp_arg)
+ simde_c_args += omp_arg
+ simde_c_defs += '-DSIMDE_ENABLE_OPENMP'
+ c_openmp_simd = true
+ endif
+
+ if (not cxx_openmp_simd) and cxx.has_argument(omp_arg)
+ simde_cxx_args += omp_arg
+ simde_cxx_defs += '-DSIMDE_ENABLE_OPENMP'
+ cxx_openmp_simd = true
+ endif
+endforeach
+
+# The -march=native probes are commented out, so both native flag lists
+# stay empty.
+simde_native_c_flags = []
+# if cc.has_argument('-march=native+simd')
+# simde_native_c_flags += '-march=native+simd'
+# elif cc.has_argument('-march=native')
+# simde_native_c_flags += '-march=native'
+# endif
+
+simde_native_cxx_flags = []
+# if cxx.has_argument('-march=native+simd')
+# simde_native_cxx_flags += '-march=native+simd'
+# elif cxx.has_argument('-march=native')
+# simde_native_cxx_flags += '-march=native'
+# endif
+
+# No OpenMP SIMD flag for the C compiler: add an optional dependency on
+# full OpenMP instead.
+if not c_openmp_simd
+ simde_deps += dependency('openmp', required: false)
+endif
+
+simde_include_dir = include_directories('..')
+
+# The subdirectories are expected to define simde_tests_x86 and
+# simde_tests_arm, which the test runner links against.
+subdir('x86')
+subdir('arm')
+
+executable('run-tests', 'run-tests.c',
+ link_with: [munit, simde_tests_x86, simde_tests_arm],
+ dependencies: simde_deps,
+ c_args: simde_c_args + simde_c_defs + simde_native_c_flags,
+ cpp_args: simde_cxx_args + simde_cxx_defs + simde_native_cxx_flags)
=====================================
test/x86/avx512bw.c
=====================================
@@ -2179,6 +2179,300 @@ test_simde_mm512_cmpeq_epi8_mask(const MunitParameter params[], void* data) {
return MUNIT_OK;
}
+/* Checks simde_mm512_cvtepi16_epi8 against eight fixed input/expected
+ * pairs.  The expected bytes appear to be the 16-bit inputs truncated to
+ * their low 8 bits (e.g. 14423 = 0x3857 -> 0x57 = 87). */
+static MunitResult
+test_simde_mm512_cvtepi16_epi8(const MunitParameter params[], void* data) {
+ (void) params;
+ (void) data;
+
+ const struct {
+ simde__m512i a;
+ simde__m256i r;
+ } test_vec[8] = {
+ { simde_mm512_set_epi16(INT16_C( 14423), INT16_C( 3775), INT16_C( 16156), INT16_C( 17811),
+ INT16_C(-14881), INT16_C(-30283), INT16_C( 27295), INT16_C(-12290),
+ INT16_C( 12394), INT16_C( 32764), INT16_C( 8681), INT16_C( 21255),
+ INT16_C(-21785), INT16_C(-24065), INT16_C(-28005), INT16_C( 15206),
+ INT16_C( 6131), INT16_C(-29323), INT16_C( -9530), INT16_C( -6655),
+ INT16_C( 14785), INT16_C( -9158), INT16_C( 7009), INT16_C( 4834),
+ INT16_C(-15579), INT16_C( 5296), INT16_C( 20054), INT16_C( 12832),
+ INT16_C( 15724), INT16_C( 5918), INT16_C( 25398), INT16_C( 14084)),
+ simde_mm256_set_epi8(INT8_C( 87), INT8_C( -65), INT8_C( 28), INT8_C(-109),
+ INT8_C( -33), INT8_C( -75), INT8_C( -97), INT8_C( -2),
+ INT8_C( 106), INT8_C( -4), INT8_C( -23), INT8_C( 7),
+ INT8_C( -25), INT8_C( -1), INT8_C(-101), INT8_C( 102),
+ INT8_C( -13), INT8_C( 117), INT8_C( -58), INT8_C( 1),
+ INT8_C( -63), INT8_C( 58), INT8_C( 97), INT8_C( -30),
+ INT8_C( 37), INT8_C( -80), INT8_C( 86), INT8_C( 32),
+ INT8_C( 108), INT8_C( 30), INT8_C( 54), INT8_C( 4)) },
+ { simde_mm512_set_epi16(INT16_C( 8455), INT16_C( 1140), INT16_C(-23383), INT16_C( 22825),
+ INT16_C(-21438), INT16_C( 8713), INT16_C(-25940), INT16_C(-31180),
+ INT16_C(-13214), INT16_C( 10200), INT16_C(-21253), INT16_C( 2612),
+ INT16_C(-27891), INT16_C( 14031), INT16_C( -9014), INT16_C( 10287),
+ INT16_C(-11660), INT16_C( 26858), INT16_C(-19518), INT16_C( 2472),
+ INT16_C( 27637), INT16_C( 14857), INT16_C( 30034), INT16_C(-24153),
+ INT16_C( 31935), INT16_C( -6397), INT16_C( -2502), INT16_C( 31062),
+ INT16_C( 30236), INT16_C( 5156), INT16_C( 18439), INT16_C(-13074)),
+ simde_mm256_set_epi8(INT8_C( 7), INT8_C( 116), INT8_C( -87), INT8_C( 41),
+ INT8_C( 66), INT8_C( 9), INT8_C( -84), INT8_C( 52),
+ INT8_C( 98), INT8_C( -40), INT8_C( -5), INT8_C( 52),
+ INT8_C( 13), INT8_C( -49), INT8_C( -54), INT8_C( 47),
+ INT8_C( 116), INT8_C( -22), INT8_C( -62), INT8_C( -88),
+ INT8_C( -11), INT8_C( 9), INT8_C( 82), INT8_C( -89),
+ INT8_C( -65), INT8_C( 3), INT8_C( 58), INT8_C( 86),
+ INT8_C( 28), INT8_C( 36), INT8_C( 7), INT8_C( -18)) },
+ { simde_mm512_set_epi16(INT16_C( 18175), INT16_C( -3760), INT16_C( 10318), INT16_C(-31849),
+ INT16_C(-32429), INT16_C(-26500), INT16_C( 24084), INT16_C(-23946),
+ INT16_C( 2525), INT16_C( 2478), INT16_C(-15141), INT16_C(-27410),
+ INT16_C( 30961), INT16_C(-31554), INT16_C( -9533), INT16_C(-20012),
+ INT16_C(-21820), INT16_C( 11767), INT16_C(-17849), INT16_C( 24518),
+ INT16_C(-22206), INT16_C(-24996), INT16_C(-19566), INT16_C( 17826),
+ INT16_C( 25765), INT16_C( 29123), INT16_C( 28065), INT16_C( 1432),
+ INT16_C(-24949), INT16_C( 30580), INT16_C( 20499), INT16_C(-29164)),
+ simde_mm256_set_epi8(INT8_C( -1), INT8_C( 80), INT8_C( 78), INT8_C(-105),
+ INT8_C( 83), INT8_C( 124), INT8_C( 20), INT8_C( 118),
+ INT8_C( -35), INT8_C( -82), INT8_C( -37), INT8_C( -18),
+ INT8_C( -15), INT8_C( -66), INT8_C( -61), INT8_C( -44),
+ INT8_C( -60), INT8_C( -9), INT8_C( 71), INT8_C( -58),
+ INT8_C( 66), INT8_C( 92), INT8_C(-110), INT8_C( -94),
+ INT8_C( -91), INT8_C( -61), INT8_C( -95), INT8_C(-104),
+ INT8_C(-117), INT8_C( 116), INT8_C( 19), INT8_C( 20)) },
+ { simde_mm512_set_epi16(INT16_C( 10816), INT16_C( 16713), INT16_C( 29707), INT16_C( 15186),
+ INT16_C( 31860), INT16_C(-28520), INT16_C( 18947), INT16_C(-27460),
+ INT16_C( 10883), INT16_C( 310), INT16_C( 8277), INT16_C(-28768),
+ INT16_C( -4553), INT16_C( 23273), INT16_C(-27696), INT16_C(-20678),
+ INT16_C( 13089), INT16_C( -6620), INT16_C( 31575), INT16_C(-20169),
+ INT16_C( 14440), INT16_C( -9264), INT16_C(-26919), INT16_C(-25720),
+ INT16_C(-18371), INT16_C( 25765), INT16_C(-13162), INT16_C(-16808),
+ INT16_C( 5695), INT16_C(-25080), INT16_C( 19142), INT16_C( 3825)),
+ simde_mm256_set_epi8(INT8_C( 64), INT8_C( 73), INT8_C( 11), INT8_C( 82),
+ INT8_C( 116), INT8_C(-104), INT8_C( 3), INT8_C( -68),
+ INT8_C(-125), INT8_C( 54), INT8_C( 85), INT8_C( -96),
+ INT8_C( 55), INT8_C( -23), INT8_C( -48), INT8_C( 58),
+ INT8_C( 33), INT8_C( 36), INT8_C( 87), INT8_C( 55),
+ INT8_C( 104), INT8_C( -48), INT8_C( -39), INT8_C(-120),
+ INT8_C( 61), INT8_C( -91), INT8_C(-106), INT8_C( 88),
+ INT8_C( 63), INT8_C( 8), INT8_C( -58), INT8_C( -15)) },
+ { simde_mm512_set_epi16(INT16_C( 5079), INT16_C(-24746), INT16_C( 23487), INT16_C(-22087),
+ INT16_C( -8346), INT16_C( 29848), INT16_C( 14241), INT16_C( 18254),
+ INT16_C( -3124), INT16_C(-16186), INT16_C(-13364), INT16_C( 10652),
+ INT16_C( 31028), INT16_C( 21346), INT16_C( 1443), INT16_C(-20222),
+ INT16_C(-17028), INT16_C(-21899), INT16_C( 18933), INT16_C( 6935),
+ INT16_C( 24619), INT16_C( 1737), INT16_C( 12596), INT16_C( 31606),
+ INT16_C(-32691), INT16_C( 11392), INT16_C( 32126), INT16_C(-32712),
+ INT16_C( 20927), INT16_C(-27859), INT16_C( 22640), INT16_C( 8969)),
+ simde_mm256_set_epi8(INT8_C( -41), INT8_C( 86), INT8_C( -65), INT8_C( -71),
+ INT8_C( 102), INT8_C(-104), INT8_C( -95), INT8_C( 78),
+ INT8_C( -52), INT8_C( -58), INT8_C( -52), INT8_C(-100),
+ INT8_C( 52), INT8_C( 98), INT8_C( -93), INT8_C( 2),
+ INT8_C( 124), INT8_C( 117), INT8_C( -11), INT8_C( 23),
+ INT8_C( 43), INT8_C( -55), INT8_C( 52), INT8_C( 118),
+ INT8_C( 77), INT8_C(-128), INT8_C( 126), INT8_C( 56),
+ INT8_C( -65), INT8_C( 45), INT8_C( 112), INT8_C( 9)) },
+ { simde_mm512_set_epi16(INT16_C( 6901), INT16_C(-23435), INT16_C(-26040), INT16_C(-11295),
+ INT16_C( 623), INT16_C(-23058), INT16_C( 17549), INT16_C(-23291),
+ INT16_C( 17215), INT16_C( -4892), INT16_C( -849), INT16_C( 21086),
+ INT16_C(-13056), INT16_C( 19549), INT16_C( 16492), INT16_C(-22767),
+ INT16_C(-24079), INT16_C( 6429), INT16_C( 15302), INT16_C( -9175),
+ INT16_C( 17671), INT16_C(-29856), INT16_C(-12718), INT16_C(-22914),
+ INT16_C(-19613), INT16_C( 14088), INT16_C(-10443), INT16_C( 31757),
+ INT16_C( 24994), INT16_C( 24174), INT16_C( -9596), INT16_C(-22481)),
+ simde_mm256_set_epi8(INT8_C( -11), INT8_C( 117), INT8_C( 72), INT8_C( -31),
+ INT8_C( 111), INT8_C( -18), INT8_C(-115), INT8_C( 5),
+ INT8_C( 63), INT8_C( -28), INT8_C( -81), INT8_C( 94),
+ INT8_C( 0), INT8_C( 93), INT8_C( 108), INT8_C( 17),
+ INT8_C( -15), INT8_C( 29), INT8_C( -58), INT8_C( 41),
+ INT8_C( 7), INT8_C( 96), INT8_C( 82), INT8_C( 126),
+ INT8_C( 99), INT8_C( 8), INT8_C( 53), INT8_C( 13),
+ INT8_C( -94), INT8_C( 110), INT8_C(-124), INT8_C( 47)) },
+ { simde_mm512_set_epi16(INT16_C( 15520), INT16_C( 15679), INT16_C( 8541), INT16_C(-20376),
+ INT16_C( 8861), INT16_C( 12926), INT16_C( 25712), INT16_C( -8433),
+ INT16_C( -7066), INT16_C(-23691), INT16_C(-20251), INT16_C( 18056),
+ INT16_C( 5498), INT16_C(-18751), INT16_C(-26321), INT16_C( 7918),
+ INT16_C( 1647), INT16_C( 21774), INT16_C( 5430), INT16_C(-19512),
+ INT16_C(-14894), INT16_C( 12466), INT16_C( -9612), INT16_C(-23130),
+ INT16_C( 18357), INT16_C( 32349), INT16_C(-25760), INT16_C( -6559),
+ INT16_C(-24198), INT16_C( 13614), INT16_C( 13473), INT16_C(-25578)),
+ simde_mm256_set_epi8(INT8_C( -96), INT8_C( 63), INT8_C( 93), INT8_C( 104),
+ INT8_C( -99), INT8_C( 126), INT8_C( 112), INT8_C( 15),
+ INT8_C( 102), INT8_C( 117), INT8_C( -27), INT8_C(-120),
+ INT8_C( 122), INT8_C( -63), INT8_C( 47), INT8_C( -18),
+ INT8_C( 111), INT8_C( 14), INT8_C( 54), INT8_C( -56),
+ INT8_C( -46), INT8_C( -78), INT8_C( 116), INT8_C( -90),
+ INT8_C( -75), INT8_C( 93), INT8_C( 96), INT8_C( 97),
+ INT8_C( 122), INT8_C( 46), INT8_C( -95), INT8_C( 22)) },
+ { simde_mm512_set_epi16(INT16_C(-13944), INT16_C( 30422), INT16_C( 10523), INT16_C( 28986),
+ INT16_C(-23789), INT16_C(-20754), INT16_C( 29282), INT16_C(-10845),
+ INT16_C( 10721), INT16_C( 2777), INT16_C(-18838), INT16_C( 8324),
+ INT16_C( 19192), INT16_C( 114), INT16_C( -9073), INT16_C( 2615),
+ INT16_C( 21008), INT16_C( 12652), INT16_C(-14859), INT16_C( 5734),
+ INT16_C( -5598), INT16_C(-10707), INT16_C( 2170), INT16_C( 23903),
+ INT16_C( 29988), INT16_C( 24405), INT16_C( 5383), INT16_C(-29994),
+ INT16_C( 7143), INT16_C( 22270), INT16_C( -1480), INT16_C( 15491)),
+ simde_mm256_set_epi8(INT8_C(-120), INT8_C( -42), INT8_C( 27), INT8_C( 58),
+ INT8_C( 19), INT8_C( -18), INT8_C( 98), INT8_C( -93),
+ INT8_C( -31), INT8_C( -39), INT8_C( 106), INT8_C(-124),
+ INT8_C( -8), INT8_C( 114), INT8_C(-113), INT8_C( 55),
+ INT8_C( 16), INT8_C( 108), INT8_C( -11), INT8_C( 102),
+ INT8_C( 34), INT8_C( 45), INT8_C( 122), INT8_C( 95),
+ INT8_C( 36), INT8_C( 85), INT8_C( 7), INT8_C( -42),
+ INT8_C( -25), INT8_C( -2), INT8_C( 56), INT8_C(-125)) }
+ };
+
+ /* Run the conversion on every vector and compare the result to the
+  * expected value as 8-bit lanes. */
+ for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])); i++) {
+ simde__m256i r = simde_mm512_cvtepi16_epi8(test_vec[i].a);
+ simde_assert_m256i_i8(r, ==, test_vec[i].r);
+ }
+
+ return MUNIT_OK;
+}
+
+static MunitResult
+test_simde_mm512_cvtepi8_epi16(const MunitParameter params[], void* data) {
+ (void) params;
+ (void) data;
+
+ const struct {
+ simde__m256i a;
+ simde__m512i r;
+ } test_vec[8] = {
+ { simde_mm256_set_epi8(INT8_C( 7), INT8_C( 68), INT8_C( -86), INT8_C( -36),
+ INT8_C( -19), INT8_C( 73), INT8_C( 92), INT8_C( -27),
+ INT8_C( 55), INT8_C( -65), INT8_C( -50), INT8_C( 19),
+ INT8_C(-111), INT8_C( -79), INT8_C( -16), INT8_C( 70),
+ INT8_C( 27), INT8_C( -28), INT8_C( 116), INT8_C( 42),
+ INT8_C( -4), INT8_C( 78), INT8_C( 31), INT8_C( 51),
+ INT8_C( 92), INT8_C( 39), INT8_C(-125), INT8_C( 94),
+ INT8_C( -78), INT8_C( 67), INT8_C( -43), INT8_C( -71)),
+ simde_mm512_set_epi16(INT16_C( 7), INT16_C( 68), INT16_C( -86), INT16_C( -36),
+ INT16_C( -19), INT16_C( 73), INT16_C( 92), INT16_C( -27),
+ INT16_C( 55), INT16_C( -65), INT16_C( -50), INT16_C( 19),
+ INT16_C( -111), INT16_C( -79), INT16_C( -16), INT16_C( 70),
+ INT16_C( 27), INT16_C( -28), INT16_C( 116), INT16_C( 42),
+ INT16_C( -4), INT16_C( 78), INT16_C( 31), INT16_C( 51),
+ INT16_C( 92), INT16_C( 39), INT16_C( -125), INT16_C( 94),
+ INT16_C( -78), INT16_C( 67), INT16_C( -43), INT16_C( -71)) },
+ { simde_mm256_set_epi8(INT8_C( 29), INT8_C( -37), INT8_C( 27), INT8_C( 10),
+ INT8_C( -22), INT8_C( -9), INT8_C(-125), INT8_C( -3),
+ INT8_C( -53), INT8_C( 92), INT8_C( 103), INT8_C( 92),
+ INT8_C( 123), INT8_C( 74), INT8_C( 36), INT8_C( 59),
+ INT8_C( 46), INT8_C( -29), INT8_C(-103), INT8_C( -4),
+ INT8_C( 109), INT8_C( -54), INT8_C( 41), INT8_C( 79),
+ INT8_C( 15), INT8_C( -92), INT8_C( 102), INT8_C( 116),
+ INT8_C( -42), INT8_C( 52), INT8_C( -61), INT8_C( -99)),
+ simde_mm512_set_epi16(INT16_C( 29), INT16_C( -37), INT16_C( 27), INT16_C( 10),
+ INT16_C( -22), INT16_C( -9), INT16_C( -125), INT16_C( -3),
+ INT16_C( -53), INT16_C( 92), INT16_C( 103), INT16_C( 92),
+ INT16_C( 123), INT16_C( 74), INT16_C( 36), INT16_C( 59),
+ INT16_C( 46), INT16_C( -29), INT16_C( -103), INT16_C( -4),
+ INT16_C( 109), INT16_C( -54), INT16_C( 41), INT16_C( 79),
+ INT16_C( 15), INT16_C( -92), INT16_C( 102), INT16_C( 116),
+ INT16_C( -42), INT16_C( 52), INT16_C( -61), INT16_C( -99)) },
+ { simde_mm256_set_epi8(INT8_C( -9), INT8_C( -47), INT8_C( 107), INT8_C( -74),
+ INT8_C(-126), INT8_C( 34), INT8_C( 64), INT8_C( 115),
+ INT8_C( -65), INT8_C(-124), INT8_C( 54), INT8_C( 27),
+ INT8_C( 41), INT8_C( 112), INT8_C( 61), INT8_C( 6),
+ INT8_C( 7), INT8_C( 39), INT8_C(-109), INT8_C( -99),
+ INT8_C( 63), INT8_C( -35), INT8_C(-111), INT8_C( -72),
+ INT8_C( 109), INT8_C( -39), INT8_C( -99), INT8_C( 26),
+ INT8_C( 66), INT8_C( -78), INT8_C( 30), INT8_C( 38)),
+ simde_mm512_set_epi16(INT16_C( -9), INT16_C( -47), INT16_C( 107), INT16_C( -74),
+ INT16_C( -126), INT16_C( 34), INT16_C( 64), INT16_C( 115),
+ INT16_C( -65), INT16_C( -124), INT16_C( 54), INT16_C( 27),
+ INT16_C( 41), INT16_C( 112), INT16_C( 61), INT16_C( 6),
+ INT16_C( 7), INT16_C( 39), INT16_C( -109), INT16_C( -99),
+ INT16_C( 63), INT16_C( -35), INT16_C( -111), INT16_C( -72),
+ INT16_C( 109), INT16_C( -39), INT16_C( -99), INT16_C( 26),
+ INT16_C( 66), INT16_C( -78), INT16_C( 30), INT16_C( 38)) },
+ { simde_mm256_set_epi8(INT8_C( -72), INT8_C( -80), INT8_C( 101), INT8_C( 81),
+ INT8_C( 23), INT8_C( -68), INT8_C( -57), INT8_C(-111),
+ INT8_C( -3), INT8_C( 21), INT8_C( 121), INT8_C( -22),
+ INT8_C(-104), INT8_C( -10), INT8_C( -37), INT8_C( 66),
+ INT8_C( -93), INT8_C( -80), INT8_C( 34), INT8_C( 104),
+ INT8_C( -39), INT8_C( -99), INT8_C( 18), INT8_C( 110),
+ INT8_C(-118), INT8_C( 38), INT8_C( 112), INT8_C( -67),
+ INT8_C( 60), INT8_C( 47), INT8_C( 32), INT8_C( 33)),
+ simde_mm512_set_epi16(INT16_C( -72), INT16_C( -80), INT16_C( 101), INT16_C( 81),
+ INT16_C( 23), INT16_C( -68), INT16_C( -57), INT16_C( -111),
+ INT16_C( -3), INT16_C( 21), INT16_C( 121), INT16_C( -22),
+ INT16_C( -104), INT16_C( -10), INT16_C( -37), INT16_C( 66),
+ INT16_C( -93), INT16_C( -80), INT16_C( 34), INT16_C( 104),
+ INT16_C( -39), INT16_C( -99), INT16_C( 18), INT16_C( 110),
+ INT16_C( -118), INT16_C( 38), INT16_C( 112), INT16_C( -67),
+ INT16_C( 60), INT16_C( 47), INT16_C( 32), INT16_C( 33)) },
+ { simde_mm256_set_epi8(INT8_C( 120), INT8_C( -90), INT8_C(-101), INT8_C(-106),
+ INT8_C( 70), INT8_C( -49), INT8_C( 29), INT8_C( -43),
+ INT8_C( -42), INT8_C( 38), INT8_C( 16), INT8_C( -43),
+ INT8_C( -40), INT8_C( -76), INT8_C( -67), INT8_C( 53),
+ INT8_C( -73), INT8_C( -17), INT8_C( 66), INT8_C( 57),
+ INT8_C( -65), INT8_C( -63), INT8_C( 17), INT8_C( -9),
+ INT8_C( 95), INT8_C( -50), INT8_C(-118), INT8_C( 114),
+ INT8_C( 58), INT8_C( -28), INT8_C( -81), INT8_C( -37)),
+ simde_mm512_set_epi16(INT16_C( 120), INT16_C( -90), INT16_C( -101), INT16_C( -106),
+ INT16_C( 70), INT16_C( -49), INT16_C( 29), INT16_C( -43),
+ INT16_C( -42), INT16_C( 38), INT16_C( 16), INT16_C( -43),
+ INT16_C( -40), INT16_C( -76), INT16_C( -67), INT16_C( 53),
+ INT16_C( -73), INT16_C( -17), INT16_C( 66), INT16_C( 57),
+ INT16_C( -65), INT16_C( -63), INT16_C( 17), INT16_C( -9),
+ INT16_C( 95), INT16_C( -50), INT16_C( -118), INT16_C( 114),
+ INT16_C( 58), INT16_C( -28), INT16_C( -81), INT16_C( -37)) },
+ { simde_mm256_set_epi8(INT8_C( -97), INT8_C( 10), INT8_C( -75), INT8_C(-120),
+ INT8_C( -32), INT8_C(-105), INT8_C( -75), INT8_C(-101),
+ INT8_C( 71), INT8_C(-122), INT8_C(-112), INT8_C( -2),
+ INT8_C( 60), INT8_C( -71), INT8_C( 101), INT8_C( -1),
+ INT8_C( 95), INT8_C( -58), INT8_C( -70), INT8_C( 102),
+ INT8_C( 115), INT8_C( -68), INT8_C(-110), INT8_C( -36),
+ INT8_C( 6), INT8_C( 58), INT8_C( 73), INT8_C( 97),
+ INT8_C( -51), INT8_C( -4), INT8_C( 58), INT8_C( 31)),
+ simde_mm512_set_epi16(INT16_C( -97), INT16_C( 10), INT16_C( -75), INT16_C( -120),
+ INT16_C( -32), INT16_C( -105), INT16_C( -75), INT16_C( -101),
+ INT16_C( 71), INT16_C( -122), INT16_C( -112), INT16_C( -2),
+ INT16_C( 60), INT16_C( -71), INT16_C( 101), INT16_C( -1),
+ INT16_C( 95), INT16_C( -58), INT16_C( -70), INT16_C( 102),
+ INT16_C( 115), INT16_C( -68), INT16_C( -110), INT16_C( -36),
+ INT16_C( 6), INT16_C( 58), INT16_C( 73), INT16_C( 97),
+ INT16_C( -51), INT16_C( -4), INT16_C( 58), INT16_C( 31)) },
+ { simde_mm256_set_epi8(INT8_C( -73), INT8_C(-123), INT8_C( -11), INT8_C( 62),
+ INT8_C( -96), INT8_C(-103), INT8_C( 85), INT8_C( 88),
+ INT8_C( -19), INT8_C( 28), INT8_C(-107), INT8_C( -81),
+ INT8_C(-125), INT8_C( 88), INT8_C( 84), INT8_C( 115),
+ INT8_C( 105), INT8_C( -47), INT8_C( 68), INT8_C(-124),
+ INT8_C( 32), INT8_C(-100), INT8_C( 10), INT8_C( -69),
+ INT8_C( 124), INT8_C( -51), INT8_C( -89), INT8_C( -72),
+ INT8_C( -92), INT8_C( -5), INT8_C( -46), INT8_C( 115)),
+ simde_mm512_set_epi16(INT16_C( -73), INT16_C( -123), INT16_C( -11), INT16_C( 62),
+ INT16_C( -96), INT16_C( -103), INT16_C( 85), INT16_C( 88),
+ INT16_C( -19), INT16_C( 28), INT16_C( -107), INT16_C( -81),
+ INT16_C( -125), INT16_C( 88), INT16_C( 84), INT16_C( 115),
+ INT16_C( 105), INT16_C( -47), INT16_C( 68), INT16_C( -124),
+ INT16_C( 32), INT16_C( -100), INT16_C( 10), INT16_C( -69),
+ INT16_C( 124), INT16_C( -51), INT16_C( -89), INT16_C( -72),
+ INT16_C( -92), INT16_C( -5), INT16_C( -46), INT16_C( 115)) },
+ { simde_mm256_set_epi8(INT8_C( 104), INT8_C( 66), INT8_C( 51), INT8_C( 81),
+ INT8_C( -69), INT8_C( 104), INT8_C( 126), INT8_C( -43),
+ INT8_C( -40), INT8_C( 23), INT8_C(-124), INT8_C( 98),
+ INT8_C(-125), INT8_C( 95), INT8_C( -36), INT8_C( 46),
+ INT8_C(-115), INT8_C( -93), INT8_C( 2), INT8_C( -77),
+ INT8_C( 80), INT8_C(-116), INT8_C( 61), INT8_C( -89),
+ INT8_C( -37), INT8_C( 9), INT8_C( 84), INT8_C( -64),
+ INT8_C( 94), INT8_C( 67), INT8_C( -53), INT8_C( 111)),
+ simde_mm512_set_epi16(INT16_C( 104), INT16_C( 66), INT16_C( 51), INT16_C( 81),
+ INT16_C( -69), INT16_C( 104), INT16_C( 126), INT16_C( -43),
+ INT16_C( -40), INT16_C( 23), INT16_C( -124), INT16_C( 98),
+ INT16_C( -125), INT16_C( 95), INT16_C( -36), INT16_C( 46),
+ INT16_C( -115), INT16_C( -93), INT16_C( 2), INT16_C( -77),
+ INT16_C( 80), INT16_C( -116), INT16_C( 61), INT16_C( -89),
+ INT16_C( -37), INT16_C( 9), INT16_C( 84), INT16_C( -64),
+ INT16_C( 94), INT16_C( 67), INT16_C( -53), INT16_C( 111)) }
+ };
+
+ for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])); i++) {
+ simde__m512i r = simde_mm512_cvtepi8_epi16(test_vec[i].a);
+ simde_assert_m512i_i16(r, ==, test_vec[i].r);
+ }
+
+ return MUNIT_OK;
+}
+
#endif /* defined(SIMDE_avx512bw_NATIVE) || defined(SIMDE_NO_NATIVE) || defined(SIMDE_ALWAYS_BUILD_NATIVE_TESTS) */
HEDLEY_DIAGNOSTIC_PUSH
@@ -2197,6 +2491,10 @@ static MunitTest test_suite_tests[] = {
SIMDE_TESTS_DEFINE_TEST(mm512_shuffle_epi8),
SIMDE_TESTS_DEFINE_TEST(mm512_cmpeq_epi8_mask),
+ SIMDE_TESTS_DEFINE_TEST(mm512_cvtepi16_epi8),
+
+ SIMDE_TESTS_DEFINE_TEST(mm512_cvtepi8_epi16),
+
#endif /* defined(SIMDE_AVX512bw_NATIVE) || defined(SIMDE_NO_NATIVE) || defined(SIMDE_ALWAYS_BUILD_NATIVE_TESTS) */
{ NULL, NULL, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL }
};
=====================================
test/x86/avx512f.c
=====================================
The diff for this file was not included because it is too large.
=====================================
test/x86/meson.build
=====================================
@@ -0,0 +1,32 @@
+simde_test_x86_sources = [
+ 'mmx.c',
+ 'sse.c',
+ 'sse2.c',
+ 'sse3.c',
+ 'ssse3.c',
+ 'sse4.1.c',
+ 'sse4.2.c',
+ 'avx.c',
+ 'avx2.c',
+ 'fma.c',
+ 'avx512f.c',
+ 'avx512bw.c',
+ 'svml.c'
+]
+
+foreach src : simde_test_x86_sources
+ simde_test_x86_sources += configure_file(input: src, output: src + 'pp', copy: true)
+endforeach
+
+simde_tests_x86_emul = static_library('simde-tests-x86-emul', simde_test_x86_sources,
+ c_args: simde_c_args + simde_c_defs + simde_native_c_flags + ['-DSIMDE_NO_NATIVE'],
+ cpp_args: simde_cxx_args + simde_cxx_defs + simde_native_cxx_flags + ['-DSIMDE_NO_NATIVE'],
+ include_directories: simde_include_dir)
+
+simde_tests_x86_native = static_library('simde-tests-x86-native', simde_test_x86_sources,
+ c_args: simde_c_args + simde_c_defs + simde_native_c_flags,
+ cpp_args: simde_cxx_args + simde_cxx_defs + simde_native_cxx_flags,
+ include_directories: simde_include_dir)
+
+simde_tests_x86 = static_library('simde-tests-x86', 'test-x86.c',
+ link_with: [simde_tests_x86_emul, simde_tests_x86_native])
=====================================
test/x86/mmx.c
=====================================
@@ -25,7 +25,6 @@
#include <test/x86/test-mmx.h>
#if defined(SIMDE_MMX_NATIVE) || defined(SIMDE_NO_NATIVE) || defined(SIMDE_ALWAYS_BUILD_NATIVE_TESTS)
-#endif /* defined(SIMDE_MMX_NATIVE) || defined(SIMDE_NO_NATIVE) || defined(SIMDE_ALWAYS_BUILD_NATIVE_TESTS) */
static MunitResult
test_simde_mm_set1_pi8(const MunitParameter params[], void* data) {
@@ -2786,7 +2785,6 @@ test_simde_m_to_int64(const MunitParameter params[], void* data) {
return MUNIT_OK;
}
-#if defined(SIMDE_MMX_NATIVE) || defined(SIMDE_NO_NATIVE) || defined(SIMDE_ALWAYS_BUILD_NATIVE_TESTS)
#endif /* defined(SIMDE_MMX_NATIVE) || defined(SIMDE_NO_NATIVE) || defined(SIMDE_ALWAYS_BUILD_NATIVE_TESTS) */
HEDLEY_DIAGNOSTIC_PUSH
=====================================
test/x86/skel.c
=====================================
@@ -2804,7 +2804,7 @@ test_simde_mm512_mask_xxx_epi32_mask(const MunitParameter params[], void* data)
r = simde_mm512_mask_xxx_epi32_mask(k, simde__m512i_from_private(a), simde__m512i_from_private(b));
- printf(" { UINT16_C(%5" PRId16 "),\n", k);
+ printf(" { UINT16_C(%5" PRIu16 "),\n", k);
printf(" simde_mm512_set_epi32(INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 "),\n"
" INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 "),\n"
" INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 "),\n"
@@ -2817,13 +2817,13 @@ test_simde_mm512_mask_xxx_epi32_mask(const MunitParameter params[], void* data)
" INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 ")),\n",
b.i32[15], b.i32[14], b.i32[13], b.i32[12], b.i32[11], b.i32[10], b.i32[ 9], b.i32[ 8],
b.i32[ 7], b.i32[ 6], b.i32[ 5], b.i32[ 4], b.i32[ 3], b.i32[ 2], b.i32[ 1], b.i32[ 0]);
- printf(" UINT16_C(%5" PRId16 ") },\n", r);
+ printf(" UINT16_C(%5" PRIu16 ") },\n", r);
}
return MUNIT_FAIL;
for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])); i++) {
simde__mmask16 r = simde_mm512_mask_xxx_epi32_mask(test_vec[i].k, test_vec[i].a, test_vec[i].b);
- munit_assert_uint16(r, ==, test_vec[i].r);
+ simde_assert_mmask16(r, ==, test_vec[i].r);
}
return MUNIT_OK;
@@ -2873,7 +2873,7 @@ test_simde_mm512_mask_xxx_epi64_mask(const MunitParameter params[], void* data)
for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])); i++) {
simde__mmask8 r = simde_mm512_mask_xxx_epi64_mask(test_vec[i].k, test_vec[i].a, test_vec[i].b);
- munit_assert_uint8(r, ==, test_vec[i].r);
+ simde_assert_mmask8(r, ==, test_vec[i].r);
}
return MUNIT_OK;
=====================================
test/x86/test-avx512.h
=====================================
@@ -13,4 +13,13 @@ SIMDE_TEST_DEFINE_ASSERT_VEC_CLOSE(m512d, f64)
#define simde_assert_m512d_close(a, b, precision) \
simde_assert_m512d_f64_close_ex(__LINE__, __FILE__, a, b, precision)
+#define simde_assert_mmask8(a, op, b) \
+ munit_assert_uint8(HEDLEY_STATIC_CAST(uint8_t, a), op, HEDLEY_STATIC_CAST(uint8_t, b))
+#define simde_assert_mmask16(a, op, b) \
+ munit_assert_uint16(HEDLEY_STATIC_CAST(uint16_t, a), op, HEDLEY_STATIC_CAST(uint16_t, b))
+#define simde_assert_mmask32(a, op, b) \
+ munit_assert_uint32(HEDLEY_STATIC_CAST(uint32_t, a), op, HEDLEY_STATIC_CAST(uint32_t, b))
+#define simde_assert_mmask64(a, op, b) \
+ munit_assert_uint64(HEDLEY_STATIC_CAST(uint64_t, a), op, HEDLEY_STATIC_CAST(uint64_t, b))
+
#endif /* !defined(SIMDE_TEST_X86_AVX512) */
View it on GitLab: https://salsa.debian.org/med-team/simde/-/commit/cc9ed7fbec3c8e9ba2f95a63fdea42255bfe2a39
--
View it on GitLab: https://salsa.debian.org/med-team/simde/-/commit/cc9ed7fbec3c8e9ba2f95a63fdea42255bfe2a39
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200407/b38026d8/attachment-0001.html>
More information about the debian-med-commit mailing list