[med-svn] [Git][med-team/simde][upstream] New upstream version 0.0.0.git.20200407

Michael R. Crusoe gitlab at salsa.debian.org
Tue Apr 7 14:48:00 BST 2020



Michael R. Crusoe pushed to branch upstream at Debian Med / simde


Commits:
cc9ed7fb by Michael R. Crusoe at 2020-04-07T13:43:48+02:00
New upstream version 0.0.0.git.20200407
- - - - -


17 changed files:

- .drone.yml
- README.md
- + meson.build
- simde/simde-common.h
- simde/x86/avx2.h
- simde/x86/avx512bw.h
- simde/x86/avx512f.h
- simde/x86/sse2.h
- + test/arm/meson.build
- + test/arm/neon/meson.build
- + test/meson.build
- test/x86/avx512bw.c
- test/x86/avx512f.c
- + test/x86/meson.build
- test/x86/mmx.c
- test/x86/skel.c
- test/x86/test-avx512.h


Changes:

=====================================
.drone.yml
=====================================
@@ -6,27 +6,25 @@ platform:
   os: linux
   arch: arm
 steps:
-  - name: test
-    image: ubuntu:bionic
-    environment:
-      CC: clang-9
-      CXX: clang++-9
-    commands:
-    - uname -m
-    - cat /proc/cpuinfo
-    - apt-get -yq update
-    - apt-get -yq install clang-9 cmake git-core
-    - git submodule update --init --recursive
-    - mkdir test/build
-    - cd test/build
-    - cmake .. -DCMAKE_C_FLAGS='-march=armv7a -mfpu=neon' -DCMAKE_CXX_FLAGS='-march=armv7a -mfpu=neon'
-    - make -j VERBOSE=1
-    - ./run-tests
-trigger:
-  branch:
-    exclude:
-    - wip/*
-
+- name: test
+  image: ubuntu:bionic
+  environment:
+    CC: clang-9
+    CXX: clang++-9
+    ARCH_FLAGS: -march=armv7a -mfpu=neon
+  commands:
+  - uname -m
+  - cat /proc/cpuinfo
+  - apt-get -yq update
+  - apt-get -yq install clang-9 ninja-build git-core python3-pip
+  - pip3 install meson
+  - git submodule update --init --recursive
+  - mkdir -p build
+  - cd build
+  - CFLAGS="$ARCH_FLAGS" CXXFLAGS="$ARCH_FLAGS" meson ..
+  - ninja -v
+  - ./test/run-tests
+  
 ---
 kind: pipeline
 type: docker
@@ -35,26 +33,24 @@ platform:
   os: linux
   arch: arm
 steps:
-  - name: test
-    image: ubuntu:bionic
-    environment:
-      CC: clang-9
-      CXX: clang++-9
-    commands:
-    - uname -m
-    - cat /proc/cpuinfo
-    - apt-get -yq update
-    - apt-get -yq install clang-9 cmake git-core
-    - git submodule update --init --recursive
-    - mkdir test/build
-    - cd test/build
-    - cmake .. -DCMAKE_C_FLAGS='-march=armv8a' -DCMAKE_CXX_FLAGS='-march=armv8a'
-    - make -j VERBOSE=1
-    - ./run-tests
-trigger:
-  branch:
-    exclude:
-    - wip/*
+- name: test
+  image: ubuntu:bionic
+  environment:
+    CC: clang-9
+    CXX: clang++-9
+    ARCH_FLAGS: -march=armv8a -mfpu=neon
+  commands:
+  - uname -m
+  - cat /proc/cpuinfo
+  - apt-get -yq update
+  - apt-get -yq install clang-9 ninja-build git-core python3-pip
+  - pip3 install meson
+  - git submodule update --init --recursive
+  - mkdir -p build
+  - cd build
+  - CFLAGS="$ARCH_FLAGS" CXXFLAGS="$ARCH_FLAGS" meson ..
+  - ninja -v
+  - ./test/run-tests
 
 ---
 kind: pipeline
@@ -64,26 +60,24 @@ platform:
   os: linux
   arch: arm
 steps:
-  - name: test
-    image: ubuntu:bionic
-    environment:
-      CC: gcc-8
-      CXX: g++-8
-    commands:
-    - uname -m
-    - cat /proc/cpuinfo
-    - apt-get -yq update
-    - apt-get -yq install gcc-8 g++-8 cmake git-core
-    - git submodule update --init --recursive
-    - mkdir test/build
-    - cd test/build
-    - cmake .. -DCMAKE_C_FLAGS='-march=armv7-a -mfpu=neon' -DCMAKE_CXX_FLAGS='-march=armv7-a -mfpu=neon'
-    - make -j VERBOSE=1
-    - ./run-tests
-trigger:
-  branch:
-    exclude:
-    - wip/*
+- name: test
+  image: ubuntu:bionic
+  environment:
+    CC: gcc-8
+    CXX: g++-8
+    ARCH_FLAGS: -march=armv7-a -mfpu=neon
+  commands:
+  - uname -m
+  - cat /proc/cpuinfo
+  - apt-get -yq update
+  - apt-get -yq install gcc-8 g++-8 ninja-build git-core python3-pip
+  - pip3 install meson
+  - git submodule update --init --recursive
+  - mkdir -p build
+  - cd build
+  - CFLAGS="$ARCH_FLAGS" CXXFLAGS="$ARCH_FLAGS" meson ..
+  - ninja -v
+  - ./test/run-tests
 
 ---
 kind: pipeline
@@ -93,139 +87,152 @@ platform:
   os: linux
   arch: arm
 steps:
-  - name: test
-    image: ubuntu:bionic
-    environment:
-      CC: gcc-8
-      CXX: g++-8
-    commands:
-    - uname -m
-    - cat /proc/cpuinfo
-    - apt-get -yq update
-    - apt-get -yq install gcc-8 g++-8 cmake git-core
-    - git submodule update --init --recursive
-    - mkdir test/build
-    - cd test/build
-    - cmake .. -DCMAKE_C_FLAGS='-march=armv8-a -mfpu=neon' -DCMAKE_CXX_FLAGS='-march=armv8-a -mfpu=neon'
-    - make -j VERBOSE=1
-    - ./run-tests
-trigger:
-  branch:
-    exclude:
-    - wip/*
+- name: test
+  image: ubuntu:bionic
+  environment:
+    CC: gcc-8
+    CXX: g++-8
+    ARCH_FLAGS: -march=armv8-a -mfpu=neon
+  commands:
+  - uname -m
+  - cat /proc/cpuinfo
+  - apt-get -yq update
+  - apt-get -yq install gcc-8 g++-8 ninja-build git-core python3-pip
+  - pip3 install meson
+  - git submodule update --init --recursive
+  - mkdir -p build
+  - cd build
+  - CFLAGS="$ARCH_FLAGS" CXXFLAGS="$ARCH_FLAGS" meson ..
+  - ninja -v
+  - ./test/run-tests
 
 ---
 kind: pipeline
 type: docker
-name: "gcc-7 armv7"
+name: "clang-7 armv7"
 platform:
   os: linux
   arch: arm
 steps:
-  - name: test
-    image: ubuntu:bionic
-    environment:
-      CC: gcc-7
-      CXX: g++-7
-    commands:
-    - uname -m
-    - cat /proc/cpuinfo
-    - apt-get -yq update
-    - apt-get -yq install gcc-7 g++-7 cmake git-core
-    - git submodule update --init --recursive
-    - mkdir test/build
-    - cd test/build
-    - cmake .. -DCMAKE_C_FLAGS='-march=armv7-a -mfpu=neon' -DCMAKE_CXX_FLAGS='-march=armv7-a -mfpu=neon'
-    - make -j VERBOSE=1
-    - ./run-tests
-trigger:
-  branch:
-    exclude:
-    - wip/*
-
+- name: test
+  image: ubuntu:bionic
+  environment:
+    CC: clang-7
+    CXX: clang++-7
+    ARCH_FLAGS: -march=armv7a -mfpu=neon
+  commands:
+  - uname -m
+  - cat /proc/cpuinfo
+  - apt-get -yq update
+  - apt-get -yq install clang-7 ninja-build git-core python3-pip
+  - pip3 install meson
+  - git submodule update --init --recursive
+  - mkdir -p build
+  - cd build
+  - CFLAGS="$ARCH_FLAGS" CXXFLAGS="$ARCH_FLAGS" meson ..
+  - ninja -v
+  - ./test/run-tests
+  
 ---
 kind: pipeline
 type: docker
-name: "gcc-7 armv8"
+name: "clang-7 armv8"
 platform:
   os: linux
   arch: arm
 steps:
-  - name: test
-    image: ubuntu:bionic
-    environment:
-      CC: gcc-7
-      CXX: g++-7
-    commands:
-    - uname -m
-    - cat /proc/cpuinfo
-    - apt-get -yq update
-    - apt-get -yq install gcc-7 g++-7 cmake git-core
-    - git submodule update --init --recursive
-    - mkdir test/build
-    - cd test/build
-    - cmake .. -DCMAKE_C_FLAGS='-march=armv8-a -mfpu=neon' -DCMAKE_CXX_FLAGS='-march=armv8-a -mfpu=neon'
-    - make -j VERBOSE=1
-    - ./run-tests
-trigger:
-  branch:
-    exclude:
-    - wip/*
+- name: test
+  image: ubuntu:bionic
+  environment:
+    CC: clang-7
+    CXX: clang++-7
+    ARCH_FLAGS: -march=armv8a -mfpu=neon
+  commands:
+  - uname -m
+  - cat /proc/cpuinfo
+  - apt-get -yq update
+  - apt-get -yq install clang-7 ninja-build git-core python3-pip
+  - pip3 install meson
+  - git submodule update --init --recursive
+  - mkdir -p build
+  - cd build
+  - CFLAGS="$ARCH_FLAGS" CXXFLAGS="$ARCH_FLAGS" meson ..
+  - ninja -v
+  - ./test/run-tests
 
 ---
 kind: pipeline
 type: docker
-name: "clang-7 armv7"
+name: "gcc-7 armv7"
 platform:
   os: linux
   arch: arm
 steps:
-  - name: test
-    image: ubuntu:bionic
-    environment:
-      CC: clang-7
-      CXX: clang++-7
-    commands:
-    - uname -m
-    - cat /proc/cpuinfo
-    - apt-get -yq update
-    - apt-get -yq install clang-7 cmake git-core
-    - git submodule update --init --recursive
-    - mkdir test/build
-    - cd test/build
-    - cmake .. -DCMAKE_C_FLAGS='-march=armv7a' -DCMAKE_CXX_FLAGS='-march=armv7a'
-    - make -j VERBOSE=1
-    - ./run-tests
-trigger:
-  branch:
-    exclude:
-    - wip/*
+- name: test
+  image: ubuntu:bionic
+  environment:
+    CC: gcc-7
+    CXX: g++-7
+    ARCH_FLAGS: -march=armv7-a -mfpu=neon
+  commands:
+  - uname -m
+  - cat /proc/cpuinfo
+  - apt-get -yq update
+  - apt-get -yq install gcc-7 g++-7 ninja-build git-core python3-pip
+  - pip3 install meson
+  - git submodule update --init --recursive
+  - mkdir -p build
+  - cd build
+  - CFLAGS="$ARCH_FLAGS" CXXFLAGS="$ARCH_FLAGS" meson ..
+  - ninja -v
+  - ./test/run-tests
 
 ---
 kind: pipeline
 type: docker
-name: "clang-7 armv8"
+name: "gcc-7 armv8"
 platform:
   os: linux
   arch: arm
 steps:
-  - name: test
-    image: ubuntu:bionic
-    environment:
-      CC: clang-7
-      CXX: clang++-7
-    commands:
-    - uname -m
-    - cat /proc/cpuinfo
-    - apt-get -yq update
-    - apt-get -yq install clang-7 cmake git-core
-    - git submodule update --init --recursive
-    - mkdir test/build
-    - cd test/build
-    - cmake .. -DCMAKE_C_FLAGS='-march=armv8a' -DCMAKE_CXX_FLAGS='-march=armv8a'
-    - make -j VERBOSE=1
-    - ./run-tests
-trigger:
-  branch:
-    exclude:
-    - wip/*
+- name: test
+  image: ubuntu:bionic
+  environment:
+    CC: gcc-7
+    CXX: g++-7
+    ARCH_FLAGS: -march=armv8-a -mfpu=neon
+  commands:
+  - uname -m
+  - cat /proc/cpuinfo
+  - apt-get -yq update
+  - apt-get -yq install gcc-7 g++-7 ninja-build git-core python3-pip
+  - pip3 install meson
+  - git submodule update --init --recursive
+  - mkdir -p build
+  - cd build
+  - CFLAGS="$ARCH_FLAGS" CXXFLAGS="$ARCH_FLAGS" meson ..
+  - ninja -v
+  - ./test/run-tests
+
+# ---
+# kind: pipeline
+# type: docker
+# name: "fedora"
+# steps:
+# - name: test
+#   image: fedora:latest
+#   environment:
+#     CC: gcc
+#     CXX: g++
+#     ARCH_FLAGS: -march=native
+#   commands:
+#   - uname -m
+#   - cat /proc/cpuinfo
+#   - dnf install -y gcc gcc-c++ ninja-build git-core python3-pip
+#   - pip3 install meson
+#   - git submodule update --init --recursive
+#   - mkdir -p build
+#   - cd build
+#   - CFLAGS="$ARCH_FLAGS" CXXFLAGS="$ARCH_FLAGS" meson ..
+#   - ninja -v
+#   - ./test/run-tests


=====================================
README.md
=====================================
@@ -90,6 +90,7 @@ make sense since they will always be green, but here are the links:
 * [AppVeyor](https://ci.appveyor.com/project/quixdb/simde)
 * [GitHub Actions](https://github.com/nemequ/simde/actions)
 * [Azure Pipelines](https://dev.azure.com/simd-everywhere/SIMDe/_build)
+* [Drone CI](https://cloud.drone.io/nemequ/simde)
 
 ## Contributing
 
@@ -303,6 +304,8 @@ tremendously grateful for their support:
    numerous platforms.
  * [AppVeyor](https://www.appveyor.com/) — provides CI testing on
    Windows.
+ * [Drone CI](https://drone.io/) — provides CI testing on 32-bit ARM
+   platforms, among others.
  * [IntegriCloud](https://integricloud.com/) — provides access to a very
    fast POWER9 server for developing AltiVec/VMX support.
  * [GCC Compile Farm](https://gcc.gnu.org/wiki/CompileFarm) — provides


=====================================
meson.build
=====================================
@@ -0,0 +1,32 @@
+project('SIMDe', 'c', 'cpp',
+        default_options: ['c_std=c99'],
+        license: 'MIT',
+        version: '0.5.0')  # same value as SIMDE_VERSION_MAJOR/MINOR/MICRO in simde/simde-common.h
+
+cc = meson.get_compiler('c')
+cxx = meson.get_compiler('cpp')
+
+subdir('test')
+
+install_headers(
+    [
+		'simde/hedley.h',
+		'simde/check.h',
+		'simde/debug-trap.h',
+		'simde/simde-arch.h',
+		'simde/simde-common.h',
+
+		'simde/x86/avx2.h',
+		'simde/x86/avx512bw.h',
+		'simde/x86/avx512f.h',
+		'simde/x86/fma.h',
+		'simde/x86/mmx.h',
+		'simde/x86/sse.h',
+		'simde/x86/sse2.h',
+		'simde/x86/sse3.h',
+		'simde/x86/sse4.1.h',
+		'simde/x86/sse4.2.h',
+		'simde/x86/ssse3.h',
+		'simde/x86/svml.h',
+    ],
+    subdir: 'simde')


=====================================
simde/simde-common.h
=====================================
@@ -25,6 +25,12 @@
 #define SIMDE_COMMON_H
 
 #include "hedley.h"
+
+#define SIMDE_VERSION_MAJOR 0
+#define SIMDE_VERSION_MINOR 5
+#define SIMDE_VERSION_MICRO 0
+#define SIMDE_VERSION HEDLEY_VERSION_ENCODE(SIMDE_VERSION_MAJOR, SIMDE_VERSION_MINOR, SIMDE_VERSION_MICRO)
+
 #include "simde-arch.h"
 
 #include <stddef.h>
@@ -160,12 +166,16 @@
    but the code needs to be refactored a bit to take advantage. */
 #  if !defined(SIMDE_NO_CONVERT_VECTOR) && defined(SIMDE_VECTOR_SUBSCRIPT)
 #    if HEDLEY_HAS_BUILTIN(__builtin_convertvector) || HEDLEY_GCC_VERSION_CHECK(9,0,0)
-/* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93557 */
-#      define SIMDE__CONVERT_VECTOR(to, from) ((to) = (__extension__({ \
-           __typeof__(from) from_ = (from); \
-           ((void) from_); \
-           __builtin_convertvector(from_, __typeof__(to)); \
-         })))
+#      if HEDLEY_GCC_VERSION_CHECK(9,0,0) && !HEDLEY_GCC_VERSION_CHECK(9,3,0)
+         /* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93557 */
+#        define SIMDE__CONVERT_VECTOR(to, from) ((to) = (__extension__({ \
+             __typeof__(from) from_ = (from); \
+             ((void) from_); \
+             __builtin_convertvector(from_, __typeof__(to)); \
+           })))
+#      else
+#        define SIMDE__CONVERT_VECTOR(to, from) ((to) = __builtin_convertvector((from), __typeof__(to)))
+#      endif
 #    endif
 #  endif
 #endif
@@ -433,7 +443,7 @@ HEDLEY_STATIC_ASSERT(sizeof(simde_float64) == 8, "Unable to find 64-bit floating
 
 #if \
   HEDLEY_HAS_WARNING("-Wtautological-compare") || \
-  HEDLEY_GCC_VERSION_CHECK(8,0,0)
+  HEDLEY_GCC_VERSION_CHECK(7,0,0)
 #  if defined(__cplusplus)
 #    if (__cplusplus >= 201402L)
 #      define SIMDE_TAUTOLOGICAL_COMPARE_(expr) \
@@ -670,6 +680,9 @@ HEDLEY_STATIC_ASSERT(sizeof(simde_float64) == 8, "Unable to find 64-bit floating
 #    if defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)
 #      define SIMDE_BUG_GCC_94482
 #    endif
+#    if defined(SIMDE_ARCH_AARCH64)
+#      define SIMDE_BUG_GCC_94488
+#    endif
 #  endif
 #  if defined(HEDLEY_EMSCRIPTEN_VERSION)
 #    define SIMDE_BUG_EMSCRIPTEN_MISSING_IMPL /* Placeholder for (as yet) unfiled issues. */


=====================================
simde/x86/avx2.h
=====================================
@@ -219,8 +219,8 @@ simde_mm256_alignr_epi8 (simde__m256i a, simde__m256i b, int count) {
 #elif defined(SIMDE_ARCH_X86_SSSE3)
 #  define simde_mm256_alignr_epi8(a, b, count) \
       simde_mm256_set_m128i( \
-          simde_mm_alignr_epi8(simde__m256i_to_private(a).m128i[1], simde__m256i_to_private(b).m128i[1], (count)), \
-          simde_mm_alignr_epi8(simde__m256i_to_private(a).m128i[0], simde__m256i_to_private(b).m128i[0], (count)))
+          simde_mm_alignr_epi8(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (count)), \
+          simde_mm_alignr_epi8(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (count)))
 #endif
 #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
 #  define _mm256_alignr_epi8(a, b, count) simde_mm256_alignr_epi8(a, b, (count))
@@ -1547,8 +1547,8 @@ simde_mm256_shuffle_epi32 (simde__m256i a, const int imm8) {
 #elif defined(SIMDE_ARCH_X86_SSE2) && !defined(__PGI)
 #  define simde_mm256_shuffle_epi32(a, imm8) \
      simde_mm256_set_m128i( \
-       simde_mm_shuffle_epi32(simde__m256i_to_private(a).m128i[1], (imm8)), \
-       simde_mm_shuffle_epi32(simde__m256i_to_private(a).m128i[0], (imm8)))
+       simde_mm_shuffle_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
+       simde_mm_shuffle_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
 #elif defined(SIMDE__SHUFFLE_VECTOR)
 #  define simde_mm256_shuffle_epi32(a, imm8) (__extension__ ({ \
       const simde__m256i_private simde__tmp_a_ = simde__m256i_to_private(a); \
@@ -1574,8 +1574,8 @@ simde_mm256_shuffle_epi32 (simde__m256i a, const int imm8) {
 #elif defined(SIMDE_ARCH_X86_SSE2)
 #  define simde_mm256_shufflelo_epi16(a, imm8) \
      simde_mm256_set_m128i( \
-       simde_mm_shufflelo_epi16(simde__m256i_to_private(a).m128i[1], (imm8)), \
-       simde_mm_shufflelo_epi16(simde__m256i_to_private(a).m128i[0], (imm8)))
+       simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
+       simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
 #elif defined(SIMDE__SHUFFLE_VECTOR)
 #  define simde_mm256_shufflelo_epi16(a, imm8) (__extension__ ({ \
       const simde__m256i_private simde__tmp_a_ = simde__m256i_to_private(a); \
@@ -1596,8 +1596,8 @@ simde_mm256_shuffle_epi32 (simde__m256i a, const int imm8) {
 #else
 #  define simde_mm256_shufflelo_epi16(a, imm8) \
      simde_mm256_set_m128i( \
-       simde_mm_shufflelo_epi16(simde__m256i_to_private(a).m128i[1], imm8), \
-       simde_mm_shufflelo_epi16(simde__m256i_to_private(a).m128i[0], imm8))
+       simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 1), imm8), \
+       simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 0), imm8))
 #endif
 #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
 #  define _mm256_shufflelo_epi16(a, imm8) simde_mm256_shufflelo_epi16(a, imm8)
@@ -1631,8 +1631,8 @@ simde_mm256_slli_epi16 (simde__m256i a, const int imm8)
 #elif defined(SIMDE_ARCH_X86_SSE2)
 #  define simde_mm256_slli_epi16(a, imm8) \
      simde_mm256_set_m128i( \
-         simde_mm_slli_epi16(simde__m256i_to_private(a).m128i[1], (imm8)), \
-         simde_mm_slli_epi16(simde__m256i_to_private(a).m128i[0], (imm8)))
+         simde_mm_slli_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
+         simde_mm_slli_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
 #endif
 #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
 #  define _mm256_slli_epi16(a, imm8) simde_mm256_slli_epi16(a, imm8)
@@ -1662,8 +1662,8 @@ simde_mm256_slli_epi32 (simde__m256i a, const int imm8)
 #elif defined(SIMDE_ARCH_X86_SSE2)
 #  define simde_mm256_slli_epi32(a, imm8) \
      simde_mm256_set_m128i( \
-         simde_mm_slli_epi32(simde__m256i_to_private(a).m128i[1], (imm8)), \
-         simde_mm_slli_epi32(simde__m256i_to_private(a).m128i[0], (imm8)))
+         simde_mm_slli_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
+         simde_mm_slli_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
 #endif
 #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
 #  define _mm256_slli_epi32(a, imm8) simde_mm256_slli_epi32(a, imm8)
@@ -1693,8 +1693,8 @@ simde_mm256_slli_epi64 (simde__m256i a, const int imm8)
 #elif defined(SIMDE_ARCH_X86_SSE2)
 #  define simde_mm256_slli_epi64(a, imm8) \
      simde_mm256_set_m128i( \
-         simde_mm_slli_epi64(simde__m256i_to_private(a).m128i[1], (imm8)), \
-         simde_mm_slli_epi64(simde__m256i_to_private(a).m128i[0], (imm8)))
+         simde_mm_slli_epi64(simde_mm256_extracti128_si256(a, 1), (imm8)), \
+         simde_mm_slli_epi64(simde_mm256_extracti128_si256(a, 0), (imm8)))
 #endif
 #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
 #  define _mm256_slli_epi64(a, imm8) simde_mm256_slli_epi64(a, imm8)
@@ -1843,8 +1843,8 @@ simde_mm256_srli_epi64 (simde__m256i a, const int imm8) {
 #elif defined(SIMDE_ARCH_X86_SSE2)
 #  define simde_mm256_srli_epi64(a, imm8) \
      simde_mm256_set_m128i( \
-         simde_mm_srli_epi64(simde__m256i_to_private(a).m128i[1], (imm8)), \
-         simde_mm_srli_epi64(simde__m256i_to_private(a).m128i[0], (imm8)))
+         simde_mm_srli_epi64(simde_mm256_extracti128_si256(a, 1), (imm8)), \
+         simde_mm_srli_epi64(simde_mm256_extracti128_si256(a, 0), (imm8)))
 #endif
 #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
 #  define _mm256_srli_epi64(a, imm8) simde_mm256_srli_epi64(a, imm8)
@@ -1872,13 +1872,13 @@ simde_mm256_srli_si256 (simde__m256i a, const int imm8) {
 #elif defined(SIMDE_ARCH_X86_SSE2) && !defined(__PGI)
 #  define simde_mm256_srli_si256(a, imm8) \
      simde_mm256_set_m128i( \
-         simde_mm_srli_si128(simde__m256i_to_private(a).m128i[1], (imm8)), \
-         simde_mm_srli_si128(simde__m256i_to_private(a).m128i[0], (imm8)))
+         simde_mm_srli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \
+         simde_mm_srli_si128(simde_mm256_extracti128_si256(a, 0), (imm8)))
 #elif defined(SIMDE_SSE2_NEON)
 #  define simde_mm256_srli_si256(a, imm8) \
      simde_mm256_set_m128i( \
-       simde_mm_bsrli_si128(simde__m256i_to_private(a).m128i[1], (imm8)), \
-       simde_mm_bsrli_si128(simde__m256i_to_private(a).m128i[0], (imm8)))
+       simde_mm_bsrli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \
+       simde_mm_bsrli_si128(simde_mm256_extracti128_si256(a, 0), (imm8)))
 #endif
 #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
 #  define _mm256_srli_si256(a, imm8) simde_mm_srli_si256(a, imm8)
@@ -2150,8 +2150,8 @@ simde_mm256_srli_epi32 (simde__m256i a, const int imm8) {
 #elif defined(SIMDE_ARCH_X86_SSE2)
 #  define simde_mm256_srli_epi32(a, imm8) \
      simde_mm256_set_m128i( \
-         simde_mm_srli_epi32(simde__m256i_to_private(a).m128i[1], (imm8)), \
-         simde_mm_srli_epi32(simde__m256i_to_private(a).m128i[0], (imm8)))
+         simde_mm_srli_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
+         simde_mm_srli_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
 #endif
 #if defined(SIMDE_AVX2_ENABLE_NATIVE_ALIASES)
 #  define _mm256_srli_epi32(a, imm8) simde_mm256_srli_epi32(a, imm8)


=====================================
simde/x86/avx512bw.h
=====================================
@@ -306,6 +306,56 @@ simde_mm512_cmpeq_epi8_mask (simde__m512i a, simde__m512i b) {
 #  define _mm512_cmpeq_epi8_mask(a, b) simde_mm512_cmpeq_epi8_mask(a, b)
 #endif
 
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m256i
+simde_mm512_cvtepi16_epi8 (simde__m512i a) {
+#if defined(SIMDE_AVX512BW_NATIVE)
+  return _mm512_cvtepi16_epi8(a);
+#else
+  simde__m256i_private r_;
+  simde__m512i_private a_ = simde__m512i_to_private(a);
+  /* Narrow each 16-bit lane of a to an 8-bit lane of the result; the scalar fallback is a plain cast with no saturation. */
+#if defined(SIMDE__CONVERT_VECTOR)
+  SIMDE__CONVERT_VECTOR(r_.i8, a_.i16);
+#else
+  SIMDE__VECTORIZE
+  for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
+     r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, a_.i16[i]);
+  }
+#endif
+
+  return simde__m256i_from_private(r_);
+#endif
+}
+#if defined(SIMDE_AVX512BW_ENABLE_NATIVE_ALIASES)
+#  define _mm512_cvtepi16_epi8(a) simde_mm512_cvtepi16_epi8(a)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_cvtepi8_epi16 (simde__m256i a) {
+#if defined(SIMDE_AVX512BW_NATIVE)
+  return _mm512_cvtepi8_epi16(a);
+#else
+  simde__m512i_private r_;
+  simde__m256i_private a_ = simde__m256i_to_private(a);
+  /* Widen each 8-bit lane of a to a 16-bit lane of the result (sign-extending, assuming the i8 lanes are int8_t - TODO confirm). */
+#if defined(SIMDE__CONVERT_VECTOR)
+  SIMDE__CONVERT_VECTOR(r_.i16, a_.i8);
+#else
+  SIMDE__VECTORIZE
+  for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
+    r_.i16[i] = a_.i8[i];
+  }
+#endif
+
+  return simde__m512i_from_private(r_);
+#endif
+}
+#if defined(SIMDE_AVX512BW_ENABLE_NATIVE_ALIASES)
+#  define _mm512_cvtepi8_epi16(a) simde_mm512_cvtepi8_epi16(a)
+#endif
+
 SIMDE__END_DECLS
 
 HEDLEY_DIAGNOSTIC_POP


=====================================
simde/x86/avx512f.h
=====================================
@@ -341,6 +341,32 @@ simde__m512d_to_private(simde__m512d v) {
   return r;
 }
 
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_broadcast_i32x4 (simde__m128i a) {
+  #if defined(SIMDE_AVX512F_NATIVE)
+    return _mm512_broadcast_i32x4(a);
+  #else
+    simde__m512i_private r_;
+    /* Copy the 128-bit value a into every 128-bit lane of the result, using the widest sub-vector type available. */
+    #if defined(SIMDE_ARCH_X86_AVX2)
+      r_.m256i[1] = r_.m256i[0] = simde_mm256_broadcastsi128_si256(a);
+    #elif defined(SIMDE_ARCH_X86_SSE2)
+      r_.m128i[3] = r_.m128i[2] = r_.m128i[1] = r_.m128i[0] = a;
+    #else
+      SIMDE__VECTORIZE
+      for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
+        r_.m128i[i] = a;
+      }
+    #endif
+
+    return simde__m512i_from_private(r_);
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_broadcast_i32x4(a) simde_mm512_broadcast_i32x4(a)
+#endif
+
 SIMDE__FUNCTION_ATTRIBUTES
 simde__m512
 simde_mm512_castpd_ps (simde__m512d a) {
@@ -605,6 +631,40 @@ simde_mm512_castsi512_si256 (simde__m512i a) {
 #  define _mm512_castsi512_si256(a) simde_mm512_castsi512_si256(a)
 #endif
 
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_load_si512 (simde__m512i const * mem_addr) {
+  simde_assert_aligned(64, mem_addr);
+  /* Aligned 512-bit load: returns the vector stored at mem_addr, which is asserted above to be 64-byte aligned. */
+  #if defined(SIMDE_AVX512F_NATIVE)
+    return _mm512_load_si512((__m512i const*) mem_addr);
+  #elif defined(SIMDE_ARCH_AARCH64) && (defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(8,0,0))
+    simde__m512i r; /* NOTE(review): presumably a workaround for direct dereference on GCC < 8 / AArch64 - confirm */
+    simde_memcpy(&r, mem_addr, sizeof(r)); /* project wrapper, as in simde_mm512_loadu_si512; raw memcpy would need <string.h> in scope */
+    return r;
+  #else
+    return *mem_addr;
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_load_si512(a) simde_mm512_load_si512(a)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_loadu_si512 (simde__m512i const * mem_addr) {
+  #if defined(SIMDE_AVX512F_NATIVE)
+    return _mm512_loadu_si512((__m512i const*) mem_addr);
+  #else
+    simde__m512i r; /* 512-bit load; unlike simde_mm512_load_si512, no alignment is asserted on mem_addr */
+    simde_memcpy(&r, mem_addr, sizeof(r)); /* copies sizeof(r) bytes from mem_addr into r */
+    return r;
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_loadu_si512(a) simde_mm512_loadu_si512(a)
+#endif
+
 SIMDE__FUNCTION_ATTRIBUTES
 simde__m512i
 simde_mm512_set_epi8 (int8_t e63, int8_t e62, int8_t e61, int8_t e60, int8_t e59, int8_t e58, int8_t e57, int8_t e56,
@@ -1174,6 +1234,298 @@ simde_mm512_set1_pd (simde_float64 a) {
 #  define _mm512_set1_pd(a) simde_mm512_set1_pd(a)
 #endif
 
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_set4_epi32 (int32_t d, int32_t c, int32_t b, int32_t a) {
+  simde__m512i_private r_;
+  /* Repeat the pattern a, b, c, d across all 16 32-bit lanes; a (the LAST argument) lands in lane 0. */
+  r_.i32[ 0] = a;
+  r_.i32[ 1] = b;
+  r_.i32[ 2] = c;
+  r_.i32[ 3] = d;
+  r_.i32[ 4] = a;
+  r_.i32[ 5] = b;
+  r_.i32[ 6] = c;
+  r_.i32[ 7] = d;
+  r_.i32[ 8] = a;
+  r_.i32[ 9] = b;
+  r_.i32[10] = c;
+  r_.i32[11] = d;
+  r_.i32[12] = a;
+  r_.i32[13] = b;
+  r_.i32[14] = c;
+  r_.i32[15] = d;
+
+  return simde__m512i_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_set4_epi32(d,c,b,a) simde_mm512_set4_epi32(d,c,b,a)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_set4_epi64 (int64_t d, int64_t c, int64_t b, int64_t a) {
+  simde__m512i_private r_;
+  /* Repeat the pattern a, b, c, d across all 8 64-bit lanes; a (the LAST argument) lands in lane 0. */
+  r_.i64[0] = a;
+  r_.i64[1] = b;
+  r_.i64[2] = c;
+  r_.i64[3] = d;
+  r_.i64[4] = a;
+  r_.i64[5] = b;
+  r_.i64[6] = c;
+  r_.i64[7] = d;
+
+  return simde__m512i_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_set4_epi64(d,c,b,a) simde_mm512_set4_epi64(d,c,b,a)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512
+simde_mm512_set4_ps (simde_float32 d, simde_float32 c, simde_float32 b, simde_float32 a) {
+  simde__m512_private r_;
+  /* Repeat the pattern a, b, c, d across all 16 float lanes; a (the LAST argument) lands in lane 0. */
+  r_.f32[ 0] = a;
+  r_.f32[ 1] = b;
+  r_.f32[ 2] = c;
+  r_.f32[ 3] = d;
+  r_.f32[ 4] = a;
+  r_.f32[ 5] = b;
+  r_.f32[ 6] = c;
+  r_.f32[ 7] = d;
+  r_.f32[ 8] = a;
+  r_.f32[ 9] = b;
+  r_.f32[10] = c;
+  r_.f32[11] = d;
+  r_.f32[12] = a;
+  r_.f32[13] = b;
+  r_.f32[14] = c;
+  r_.f32[15] = d;
+
+  return simde__m512_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_set4_ps(d,c,b,a) simde_mm512_set4_ps(d,c,b,a)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512d
+simde_mm512_set4_pd (simde_float64 d, simde_float64 c, simde_float64 b, simde_float64 a) {
+  simde__m512d_private r_;
+  /* Repeat the pattern a, b, c, d across all 8 double lanes; a (the LAST argument) lands in lane 0. */
+  r_.f64[0] = a;
+  r_.f64[1] = b;
+  r_.f64[2] = c;
+  r_.f64[3] = d;
+  r_.f64[4] = a;
+  r_.f64[5] = b;
+  r_.f64[6] = c;
+  r_.f64[7] = d;
+
+  return simde__m512d_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_set4_pd(d,c,b,a) simde_mm512_set4_pd(d,c,b,a)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_setr_epi32 (int32_t e15, int32_t e14, int32_t e13, int32_t e12, int32_t e11, int32_t e10, int32_t  e9, int32_t  e8,
+                       int32_t  e7, int32_t  e6, int32_t  e5, int32_t  e4, int32_t  e3, int32_t  e2, int32_t  e1, int32_t  e0) {
+  simde__m512i_private r_;
+  /* "Reversed" set: arguments are stored in call order, so the FIRST argument (named e15 here) lands in lane 0. */
+  r_.i32[ 0] = e15;
+  r_.i32[ 1] = e14;
+  r_.i32[ 2] = e13;
+  r_.i32[ 3] = e12;
+  r_.i32[ 4] = e11;
+  r_.i32[ 5] = e10;
+  r_.i32[ 6] = e9;
+  r_.i32[ 7] = e8;
+  r_.i32[ 8] = e7;
+  r_.i32[ 9] = e6;
+  r_.i32[10] = e5;
+  r_.i32[11] = e4;
+  r_.i32[12] = e3;
+  r_.i32[13] = e2;
+  r_.i32[14] = e1;
+  r_.i32[15] = e0;
+
+  return simde__m512i_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_setr_epi32(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm512_setr_epi32(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_setr_epi64 (int64_t e7, int64_t e6, int64_t e5, int64_t e4, int64_t e3, int64_t e2, int64_t e1, int64_t e0) {
+  simde__m512i_private r_;
+  /* "Reversed" set: arguments are stored in call order, so the FIRST argument (named e7 here) lands in lane 0. */
+  r_.i64[0] = e7;
+  r_.i64[1] = e6;
+  r_.i64[2] = e5;
+  r_.i64[3] = e4;
+  r_.i64[4] = e3;
+  r_.i64[5] = e2;
+  r_.i64[6] = e1;
+  r_.i64[7] = e0;
+
+  return simde__m512i_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_setr_epi64(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm512_setr_epi64(e7, e6, e5, e4, e3, e2, e1, e0)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512
+simde_mm512_setr_ps (simde_float32 e15, simde_float32 e14, simde_float32 e13, simde_float32 e12,
+                    simde_float32 e11, simde_float32 e10, simde_float32  e9, simde_float32  e8,
+                    simde_float32  e7, simde_float32  e6, simde_float32  e5, simde_float32  e4,
+                    simde_float32  e3, simde_float32  e2, simde_float32  e1, simde_float32  e0) {
+  simde__m512_private r_;
+  /* "Reversed" set: arguments are stored in call order, so the FIRST argument (named e15 here) lands in lane 0. */
+  r_.f32[ 0] = e15;
+  r_.f32[ 1] = e14;
+  r_.f32[ 2] = e13;
+  r_.f32[ 3] = e12;
+  r_.f32[ 4] = e11;
+  r_.f32[ 5] = e10;
+  r_.f32[ 6] = e9;
+  r_.f32[ 7] = e8;
+  r_.f32[ 8] = e7;
+  r_.f32[ 9] = e6;
+  r_.f32[10] = e5;
+  r_.f32[11] = e4;
+  r_.f32[12] = e3;
+  r_.f32[13] = e2;
+  r_.f32[14] = e1;
+  r_.f32[15] = e0;
+
+  return simde__m512_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_setr_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm512_setr_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512d
+simde_mm512_setr_pd (simde_float64 e7, simde_float64 e6, simde_float64 e5, simde_float64 e4, simde_float64 e3, simde_float64 e2, simde_float64 e1, simde_float64 e0) {
+  simde__m512d_private r_;
+  /* "Reversed" set: arguments are stored in call order, so the FIRST argument (named e7 here) lands in lane 0. */
+  r_.f64[0] = e7;
+  r_.f64[1] = e6;
+  r_.f64[2] = e5;
+  r_.f64[3] = e4;
+  r_.f64[4] = e3;
+  r_.f64[5] = e2;
+  r_.f64[6] = e1;
+  r_.f64[7] = e0;
+
+  return simde__m512d_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_setr4_epi32 (int32_t d, int32_t c, int32_t b, int32_t a) {
+  simde__m512i_private r_;
+  /* Repeat the arguments in call order (d, c, b, a) across all 16 32-bit lanes; d (the FIRST argument) lands in lane 0. */
+  r_.i32[ 0] = d;
+  r_.i32[ 1] = c;
+  r_.i32[ 2] = b;
+  r_.i32[ 3] = a;
+  r_.i32[ 4] = d;
+  r_.i32[ 5] = c;
+  r_.i32[ 6] = b;
+  r_.i32[ 7] = a;
+  r_.i32[ 8] = d;
+  r_.i32[ 9] = c;
+  r_.i32[10] = b;
+  r_.i32[11] = a;
+  r_.i32[12] = d;
+  r_.i32[13] = c;
+  r_.i32[14] = b;
+  r_.i32[15] = a;
+
+  return simde__m512i_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_setr4_epi32(d,c,b,a) simde_mm512_setr4_epi32(d,c,b,a)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_setr4_epi64 (int64_t d, int64_t c, int64_t b, int64_t a) {
+  simde__m512i_private r_;
+  /* Repeat the arguments in call order (d, c, b, a) across all 8 64-bit lanes; d (the FIRST argument) lands in lane 0. */
+  r_.i64[0] = d;
+  r_.i64[1] = c;
+  r_.i64[2] = b;
+  r_.i64[3] = a;
+  r_.i64[4] = d;
+  r_.i64[5] = c;
+  r_.i64[6] = b;
+  r_.i64[7] = a;
+
+  return simde__m512i_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_setr4_epi64(d,c,b,a) simde_mm512_setr4_epi64(d,c,b,a)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512
+simde_mm512_setr4_ps (simde_float32 d, simde_float32 c, simde_float32 b, simde_float32 a) {
+  simde__m512_private r_;
+  /* Repeat the arguments in call order (d, c, b, a) across all 16 float lanes; d (the FIRST argument) lands in lane 0. */
+  r_.f32[ 0] = d;
+  r_.f32[ 1] = c;
+  r_.f32[ 2] = b;
+  r_.f32[ 3] = a;
+  r_.f32[ 4] = d;
+  r_.f32[ 5] = c;
+  r_.f32[ 6] = b;
+  r_.f32[ 7] = a;
+  r_.f32[ 8] = d;
+  r_.f32[ 9] = c;
+  r_.f32[10] = b;
+  r_.f32[11] = a;
+  r_.f32[12] = d;
+  r_.f32[13] = c;
+  r_.f32[14] = b;
+  r_.f32[15] = a;
+
+  return simde__m512_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_setr4_ps(d,c,b,a) simde_mm512_setr4_ps(d,c,b,a)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512d
+simde_mm512_setr4_pd (simde_float64 d, simde_float64 c, simde_float64 b, simde_float64 a) {
+  simde__m512d_private r_;
+  /* Repeat the arguments in call order (d, c, b, a) across all 8 double lanes; d (the FIRST argument) lands in lane 0. */
+  r_.f64[0] = d;
+  r_.f64[1] = c;
+  r_.f64[2] = b;
+  r_.f64[3] = a;
+  r_.f64[4] = d;
+  r_.f64[5] = c;
+  r_.f64[6] = b;
+  r_.f64[7] = a;
+
+  return simde__m512d_from_private(r_);
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_setr4_pd(d,c,b,a) simde_mm512_setr4_pd(d,c,b,a)
+#endif
+
 SIMDE__FUNCTION_ATTRIBUTES
 simde__m512i
 simde_mm512_setzero_si512(void) {
@@ -1243,6 +1595,84 @@ simde_mm512_setone_pd(void) {
   return simde_mm512_castsi512_pd(simde_mm512_setone_si512());
 }
 
+/* Shift every 64-bit lane of `a` right by `imm8` bits.
+ *
+ * With native AVX-512F the work is done by _mm512_srli_epi64.  Otherwise
+ * the vector is processed as two 256-bit halves (AVX2), four 128-bit
+ * quarters (SSE2), or lane by lane on the u64 view.  In the lane-by-lane
+ * path a shift count above 63 produces an all-zero result. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_srli_epi64 (simde__m512i a, unsigned int imm8) {
+  #if defined(SIMDE_AVX512F_NATIVE)
+    return _mm512_srli_epi64(a, imm8);
+  #else
+    simde__m512i_private src = simde__m512i_to_private(a);
+    simde__m512i_private dst;
+
+    #if defined(SIMDE_ARCH_X86_AVX2)
+      for (size_t half = 0 ; half < 2 ; half++) {
+        dst.m256i[half] = simde_mm256_srli_epi64(src.m256i[half], imm8);
+      }
+    #elif defined(SIMDE_ARCH_X86_SSE2)
+      for (size_t quarter = 0 ; quarter < 4 ; quarter++) {
+        dst.m128i[quarter] = simde_mm_srli_epi64(src.m128i[quarter], imm8);
+      }
+    #else
+      /* The Intel Intrinsics Guide says that only the 8 least-significant
+       * bits of imm8 are used, which would call for "imm8 &= 0xff" here.
+       * In practice, however, all bits are used. */
+      if (imm8 <= 63) {
+        #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+          dst.u64 = src.u64 >> imm8;
+        #else
+          SIMDE__VECTORIZE
+          for (size_t lane = 0 ; lane < (sizeof(dst.u64) / sizeof(dst.u64[0])) ; lane++) {
+            dst.u64[lane] = src.u64[lane] >> imm8;
+          }
+        #endif
+      } else {
+        /* Shifting by the full lane width or more clears every bit. */
+        simde_memset(&dst, 0, sizeof(dst));
+      }
+    #endif
+
+    return simde__m512i_from_private(dst);
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_srli_epi64(a, imm8) simde_mm512_srli_epi64(a, imm8)
+#endif
+
+/* Bitwise XOR of two 512-bit integer vectors.
+ *
+ * Uses _mm512_xor_si512 when AVX-512F is native.  Otherwise the vectors
+ * are combined as two 256-bit halves (AVX2), four 128-bit quarters (SSE2),
+ * with a whole-vector operator on the i32f view, or lane by lane on that
+ * same view.  Returns a ^ b. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_xor_si512 (simde__m512i a, simde__m512i b) {
+  #if defined(SIMDE_AVX512F_NATIVE)
+    return _mm512_xor_si512(a, b);
+  #else
+    simde__m512i_private
+      r_,
+      a_ = simde__m512i_to_private(a),
+      b_ = simde__m512i_to_private(b);
+
+    #if defined(SIMDE_ARCH_X86_AVX2)
+      r_.m256i[0] = simde_mm256_xor_si256(a_.m256i[0], b_.m256i[0]);
+      r_.m256i[1] = simde_mm256_xor_si256(a_.m256i[1], b_.m256i[1]);
+    #elif defined(SIMDE_ARCH_X86_SSE2)
+      r_.m128i[0] = simde_mm_xor_si128(a_.m128i[0], b_.m128i[0]);
+      r_.m128i[1] = simde_mm_xor_si128(a_.m128i[1], b_.m128i[1]);
+      r_.m128i[2] = simde_mm_xor_si128(a_.m128i[2], b_.m128i[2]);
+      r_.m128i[3] = simde_mm_xor_si128(a_.m128i[3], b_.m128i[3]);
+    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+      r_.i32f = a_.i32f ^ b_.i32f;
+    #else
+      /* The bound must come from the member that is indexed.  The i32f
+       * lanes are presumably int_fast32_t, whose width (and therefore lane
+       * count) is platform-dependent; counting i64 lanes instead would
+       * leave part of the result unwritten wherever the two differ. */
+      SIMDE__VECTORIZE
+      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
+        r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
+      }
+    #endif
+
+    return simde__m512i_from_private(r_);
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_xor_si512(a, b) simde_mm512_xor_si512(a, b)
+#endif
+
 SIMDE__FUNCTION_ATTRIBUTES
 simde__m512i
 simde_mm512_mask_mov_epi32(simde__m512i src, simde__mmask16 k, simde__m512i a) {
@@ -1669,11 +2099,119 @@ simde_mm512_add_pd (simde__m512d a, simde__m512d b) {
 #  define _mm512_add_pd(a, b) simde_mm512_add_pd(a, b)
 #endif
 
+/* Lane-wise subtraction a - b on the 32-bit integer view of two 512-bit
+ * vectors.  Without native AVX-512F the difference is computed either with
+ * a whole-vector operator or by delegating each 256-bit half to
+ * simde_mm256_sub_epi32. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_sub_epi32 (simde__m512i a, simde__m512i b) {
+#if !defined(SIMDE_AVX512F_NATIVE)
+  simde__m512i_private lhs = simde__m512i_to_private(a);
+  simde__m512i_private rhs = simde__m512i_to_private(b);
+  simde__m512i_private diff;
+
+#if !defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+  SIMDE__VECTORIZE
+  for (size_t half = 0 ; half < (sizeof(diff.m256i) / sizeof(diff.m256i[0])) ; half++) {
+    diff.m256i[half] = simde_mm256_sub_epi32(lhs.m256i[half], rhs.m256i[half]);
+  }
+#else
+  diff.i32 = lhs.i32 - rhs.i32;
+#endif
+
+  return simde__m512i_from_private(diff);
+#else
+  return _mm512_sub_epi32(a, b);
+#endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_sub_epi32(a, b) simde_mm512_sub_epi32(a, b)
+#endif
+
+/* Lane-wise subtraction a - b on the 64-bit integer view of two 512-bit
+ * vectors.  Without native AVX-512F the difference is computed either with
+ * a whole-vector operator or by delegating each 256-bit half to
+ * simde_mm256_sub_epi64. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_sub_epi64 (simde__m512i a, simde__m512i b) {
+#if !defined(SIMDE_AVX512F_NATIVE)
+  simde__m512i_private lhs = simde__m512i_to_private(a);
+  simde__m512i_private rhs = simde__m512i_to_private(b);
+  simde__m512i_private diff;
+
+#if !defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+  SIMDE__VECTORIZE
+  for (size_t half = 0 ; half < (sizeof(diff.m256i) / sizeof(diff.m256i[0])) ; half++) {
+    diff.m256i[half] = simde_mm256_sub_epi64(lhs.m256i[half], rhs.m256i[half]);
+  }
+#else
+  diff.i64 = lhs.i64 - rhs.i64;
+#endif
+
+  return simde__m512i_from_private(diff);
+#else
+  return _mm512_sub_epi64(a, b);
+#endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_sub_epi64(a, b) simde_mm512_sub_epi64(a, b)
+#endif
+
+/* Lane-wise subtraction a - b on the single-precision view of two 512-bit
+ * vectors.  Without native AVX-512F the difference is computed either with
+ * a whole-vector operator or by delegating each 256-bit half to
+ * simde_mm256_sub_ps. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512
+simde_mm512_sub_ps (simde__m512 a, simde__m512 b) {
+#if !defined(SIMDE_AVX512F_NATIVE)
+  simde__m512_private lhs = simde__m512_to_private(a);
+  simde__m512_private rhs = simde__m512_to_private(b);
+  simde__m512_private diff;
+
+#if !defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+  SIMDE__VECTORIZE
+  for (size_t half = 0 ; half < (sizeof(diff.m256) / sizeof(diff.m256[0])) ; half++) {
+    diff.m256[half] = simde_mm256_sub_ps(lhs.m256[half], rhs.m256[half]);
+  }
+#else
+  diff.f32 = lhs.f32 - rhs.f32;
+#endif
+
+  return simde__m512_from_private(diff);
+#else
+  return _mm512_sub_ps(a, b);
+#endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_sub_ps(a, b) simde_mm512_sub_ps(a, b)
+#endif
+
+/* Lane-wise subtraction a - b on the double-precision view of two 512-bit
+ * vectors.  Without native AVX-512F the difference is computed either with
+ * a whole-vector operator or by delegating each 256-bit half to
+ * simde_mm256_sub_pd. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512d
+simde_mm512_sub_pd (simde__m512d a, simde__m512d b) {
+#if !defined(SIMDE_AVX512F_NATIVE)
+  simde__m512d_private lhs = simde__m512d_to_private(a);
+  simde__m512d_private rhs = simde__m512d_to_private(b);
+  simde__m512d_private diff;
+
+#if !defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+  SIMDE__VECTORIZE
+  for (size_t half = 0 ; half < (sizeof(diff.m256d) / sizeof(diff.m256d[0])) ; half++) {
+    diff.m256d[half] = simde_mm256_sub_pd(lhs.m256d[half], rhs.m256d[half]);
+  }
+#else
+  diff.f64 = lhs.f64 - rhs.f64;
+#endif
+
+  return simde__m512d_from_private(diff);
+#else
+  return _mm512_sub_pd(a, b);
+#endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_sub_pd(a, b) simde_mm512_sub_pd(a, b)
+#endif
+
 SIMDE__FUNCTION_ATTRIBUTES
 simde__mmask16
-simde_mm512_mask_cmpeq_epi32_mask (simde__mmask16 k1, simde__m512i a, simde__m512i b) {
+simde_mm512_cmpeq_epi32_mask (simde__m512i a, simde__m512i b) {
   #if defined(SIMDE_AVX512F_NATIVE)
-    return _mm512_mask_cmpeq_epi32_mask(k1, a, b);
+    return _mm512_cmpeq_epi32_mask(a, b);
   #else
     simde__m512i_private
       r_,
@@ -1684,18 +2222,66 @@ simde_mm512_mask_cmpeq_epi32_mask (simde__mmask16 k1, simde__m512i a, simde__m51
       r_.m256i[i] = simde_mm256_cmpeq_epi32(a_.m256i[i], b_.m256i[i]);
     }
 
-    return simde__m512i_private_to_mmask16(r_) & k1;
+    return simde__m512i_private_to_mmask16(r_);
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_cmpeq_epi32_mask(a, b) simde_mm512_cmpeq_epi32_mask(a, b)
+#endif
+
+/* Masked variant of simde_mm512_cmpeq_epi32_mask: compares a and b and
+ * keeps only the result bits that are also set in k1. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__mmask16
+simde_mm512_mask_cmpeq_epi32_mask (simde__mmask16 k1, simde__m512i a, simde__m512i b) {
+  #if defined(SIMDE_AVX512F_NATIVE)
+    return _mm512_mask_cmpeq_epi32_mask(k1, a, b);
+  #else
+    /* Reuse the unmasked comparison and AND its mask with k1. */
+    return simde_mm512_cmpeq_epi32_mask(a, b) & k1;
   #endif
 }
 #if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
 #  define _mm512_mask_cmpeq_epi32_mask(k1, a, b) simde_mm512_mask_cmpeq_epi32_mask(k1, a, b)
 #endif
 
+/* Compare two 512-bit vectors for equality and return the outcome as an
+ * 8-bit mask.  Without native AVX-512F each 256-bit half is compared with
+ * simde_mm256_cmpeq_epi64 and the combined vector result is reduced to a
+ * mask by simde__m512i_private_to_mmask8. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__mmask8
+simde_mm512_cmpeq_epi64_mask (simde__m512i a, simde__m512i b) {
+  #if !defined(SIMDE_AVX512F_NATIVE)
+    simde__m512i_private lhs = simde__m512i_to_private(a);
+    simde__m512i_private rhs = simde__m512i_to_private(b);
+    simde__m512i_private equal;
+
+    for (size_t half = 0 ; half < (sizeof(equal.m256i) / sizeof(equal.m256i[0])) ; half++) {
+      equal.m256i[half] = simde_mm256_cmpeq_epi64(lhs.m256i[half], rhs.m256i[half]);
+    }
+
+    return simde__m512i_private_to_mmask8(equal);
+  #else
+    return _mm512_cmpeq_epi64_mask(a, b);
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_cmpeq_epi64_mask(a, b) simde_mm512_cmpeq_epi64_mask(a, b)
+#endif
+
+/* Masked variant of simde_mm512_cmpeq_epi64_mask: compares a and b and
+ * keeps only the result bits that are also set in k1. */
 SIMDE__FUNCTION_ATTRIBUTES
 simde__mmask8
 simde_mm512_mask_cmpeq_epi64_mask (simde__mmask8 k1, simde__m512i a, simde__m512i b) {
   #if defined(SIMDE_AVX512F_NATIVE)
     return _mm512_mask_cmpeq_epi64_mask(k1, a, b);
+  #else
+    /* Reuse the unmasked comparison and AND its mask with k1. */
+    return simde_mm512_cmpeq_epi64_mask(a, b) & k1;
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_mask_cmpeq_epi64_mask(k1, a, b) simde_mm512_mask_cmpeq_epi64_mask(k1, a, b)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__mmask16
+simde_mm512_mask_cmpgt_epi32_mask (simde__mmask16 k1, simde__m512i a, simde__m512i b) {
+  #if defined(SIMDE_AVX512F_NATIVE)
+    return _mm512_mask_cmpgt_epi32_mask(k1, a, b);
   #else
     simde__m512i_private
       r_,
@@ -1703,14 +2289,36 @@ simde_mm512_mask_cmpeq_epi64_mask (simde__mmask8 k1, simde__m512i a, simde__m512
       b_ = simde__m512i_to_private(b);
 
     for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) {
-      r_.m256i[i] = simde_mm256_cmpeq_epi64(a_.m256i[i], b_.m256i[i]);
+      r_.m256i[i] = simde_mm256_cmpgt_epi32(a_.m256i[i], b_.m256i[i]);
+    }
+
+    return simde__m512i_private_to_mmask16(r_) & k1;
+  #endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_mask_cmpgt_epi32_mask(k1, a, b) simde_mm512_mask_cmpgt_epi32_mask(k1, a, b)
+#endif
+
+/* Masked greater-than comparison of two 512-bit vectors, returned as an
+ * 8-bit mask restricted to the bits set in k1.  Without native AVX-512F
+ * each 256-bit half is compared with simde_mm256_cmpgt_epi64. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__mmask8
+simde_mm512_mask_cmpgt_epi64_mask (simde__mmask8 k1, simde__m512i a, simde__m512i b) {
+  #if defined(SIMDE_AVX512F_NATIVE)
+    return _mm512_mask_cmpgt_epi64_mask(k1, a, b);
+  #else
+    simde__m512i_private
+      r_,
+      a_ = simde__m512i_to_private(a),
+      b_ = simde__m512i_to_private(b);
+
+    /* Compare one 256-bit half per iteration. */
+    for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) {
+      r_.m256i[i] = simde_mm256_cmpgt_epi64(a_.m256i[i], b_.m256i[i]);
     }
 
     return simde__m512i_private_to_mmask8(r_) & k1;
   #endif
 }
 #if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
-#  define _mm512_mask_cmpeq_epi64_mask(k1, a, b) simde_mm512_mask_cmpeq_epi64_mask(k1, a, b)
+#  define _mm512_mask_cmpgt_epi64_mask(k1, a, b) simde_mm512_mask_cmpgt_epi64_mask(k1, a, b)
 #endif
 
 SIMDE__FUNCTION_ATTRIBUTES
@@ -1788,6 +2396,31 @@ simde_mm512_cvtepi32_epi8 (simde__m512i a) {
 #  define _mm512_cvtepi32_epi8(a) simde_mm512_cvtepi32_epi8(a)
 #endif
 
+/* Narrow every 32-bit integer lane of a 512-bit vector to 16 bits and
+ * return the results packed in a 256-bit vector.  The portable path
+ * converts each lane with a plain cast to int16_t. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m256i
+simde_mm512_cvtepi32_epi16 (simde__m512i a) {
+#if !defined(SIMDE_AVX512F_NATIVE)
+  simde__m512i_private wide = simde__m512i_to_private(a);
+  simde__m256i_private narrow;
+
+#if !defined(SIMDE__CONVERT_VECTOR)
+  SIMDE__VECTORIZE
+  for (size_t lane = 0 ; lane < (sizeof(wide.i32) / sizeof(wide.i32[0])) ; lane++) {
+    narrow.i16[lane] = HEDLEY_STATIC_CAST(int16_t, wide.i32[lane]);
+  }
+#else
+  SIMDE__CONVERT_VECTOR(narrow.i16, wide.i32);
+#endif
+
+  return simde__m256i_from_private(narrow);
+#else
+  return _mm512_cvtepi32_epi16(a);
+#endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_cvtepi32_epi16(a) simde_mm512_cvtepi32_epi16(a)
+#endif
+
 SIMDE__FUNCTION_ATTRIBUTES
 simde__m128i
 simde_mm512_cvtepi64_epi8 (simde__m512i a) {
@@ -1813,6 +2446,169 @@ simde_mm512_cvtepi64_epi8 (simde__m512i a) {
 #  define _mm512_cvtepi64_epi8(a) simde_mm512_cvtepi64_epi8(a)
 #endif
 
+/* Narrow every 64-bit integer lane of a 512-bit vector to 16 bits and
+ * return the results in a 128-bit vector.  The result starts out zeroed
+ * and the portable path converts each lane with a plain cast to int16_t. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i
+simde_mm512_cvtepi64_epi16 (simde__m512i a) {
+#if !defined(SIMDE_AVX512F_NATIVE)
+  simde__m512i_private wide = simde__m512i_to_private(a);
+  simde__m128i_private narrow = simde__m128i_to_private(simde_mm_setzero_si128());
+
+#if !defined(SIMDE__CONVERT_VECTOR)
+  SIMDE__VECTORIZE
+  for (size_t lane = 0 ; lane < (sizeof(wide.i64) / sizeof(wide.i64[0])) ; lane++) {
+    narrow.i16[lane] = HEDLEY_STATIC_CAST(int16_t, wide.i64[lane]);
+  }
+#else
+  SIMDE__CONVERT_VECTOR(narrow.i16, wide.i64);
+#endif
+
+  return simde__m128i_from_private(narrow);
+#else
+  return _mm512_cvtepi64_epi16(a);
+#endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_cvtepi64_epi16(a) simde_mm512_cvtepi64_epi16(a)
+#endif
+
+/* Lane-wise division a / b on the single-precision view of two 512-bit
+ * vectors.  Without native AVX-512F the quotient is computed either with a
+ * whole-vector operator or by delegating each 256-bit half to
+ * simde_mm256_div_ps. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512
+simde_mm512_div_ps (simde__m512 a, simde__m512 b) {
+#if !defined(SIMDE_AVX512F_NATIVE)
+  simde__m512_private numer = simde__m512_to_private(a);
+  simde__m512_private denom = simde__m512_to_private(b);
+  simde__m512_private quot;
+
+#if !defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+  SIMDE__VECTORIZE
+  for (size_t half = 0 ; half < (sizeof(quot.m256) / sizeof(quot.m256[0])) ; half++) {
+    quot.m256[half] = simde_mm256_div_ps(numer.m256[half], denom.m256[half]);
+  }
+#else
+  quot.f32 = numer.f32 / denom.f32;
+#endif
+
+  return simde__m512_from_private(quot);
+#else
+  return _mm512_div_ps(a, b);
+#endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_div_ps(a, b) simde_mm512_div_ps(a, b)
+#endif
+
+/* Lane-wise division a / b on the double-precision view of two 512-bit
+ * vectors.  Without native AVX-512F the quotient is computed either with a
+ * whole-vector operator or by delegating each 256-bit half to
+ * simde_mm256_div_pd. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512d
+simde_mm512_div_pd (simde__m512d a, simde__m512d b) {
+#if !defined(SIMDE_AVX512F_NATIVE)
+  simde__m512d_private numer = simde__m512d_to_private(a);
+  simde__m512d_private denom = simde__m512d_to_private(b);
+  simde__m512d_private quot;
+
+#if !defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+  SIMDE__VECTORIZE
+  for (size_t half = 0 ; half < (sizeof(quot.m256d) / sizeof(quot.m256d[0])) ; half++) {
+    quot.m256d[half] = simde_mm256_div_pd(numer.m256d[half], denom.m256d[half]);
+  }
+#else
+  quot.f64 = numer.f64 / denom.f64;
+#endif
+
+  return simde__m512d_from_private(quot);
+#else
+  return _mm512_div_pd(a, b);
+#endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_div_pd(a, b) simde_mm512_div_pd(a, b)
+#endif
+
+/* Lane-wise multiplication a * b on the single-precision view of two
+ * 512-bit vectors.  Without native AVX-512F the product is computed either
+ * with a whole-vector operator or by delegating each 256-bit half to
+ * simde_mm256_mul_ps. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512
+simde_mm512_mul_ps (simde__m512 a, simde__m512 b) {
+#if !defined(SIMDE_AVX512F_NATIVE)
+  simde__m512_private lhs = simde__m512_to_private(a);
+  simde__m512_private rhs = simde__m512_to_private(b);
+  simde__m512_private prod;
+
+#if !defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+  SIMDE__VECTORIZE
+  for (size_t half = 0 ; half < (sizeof(prod.m256) / sizeof(prod.m256[0])) ; half++) {
+    prod.m256[half] = simde_mm256_mul_ps(lhs.m256[half], rhs.m256[half]);
+  }
+#else
+  prod.f32 = lhs.f32 * rhs.f32;
+#endif
+
+  return simde__m512_from_private(prod);
+#else
+  return _mm512_mul_ps(a, b);
+#endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_mul_ps(a, b) simde_mm512_mul_ps(a, b)
+#endif
+
+/* Lane-wise multiplication a * b on the double-precision view of two
+ * 512-bit vectors.  Without native AVX-512F the product is computed either
+ * with a whole-vector operator or by delegating each 256-bit half to
+ * simde_mm256_mul_pd. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512d
+simde_mm512_mul_pd (simde__m512d a, simde__m512d b) {
+#if !defined(SIMDE_AVX512F_NATIVE)
+  simde__m512d_private lhs = simde__m512d_to_private(a);
+  simde__m512d_private rhs = simde__m512d_to_private(b);
+  simde__m512d_private prod;
+
+#if !defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+  SIMDE__VECTORIZE
+  for (size_t half = 0 ; half < (sizeof(prod.m256d) / sizeof(prod.m256d[0])) ; half++) {
+    prod.m256d[half] = simde_mm256_mul_pd(lhs.m256d[half], rhs.m256d[half]);
+  }
+#else
+  prod.f64 = lhs.f64 * rhs.f64;
+#endif
+
+  return simde__m512d_from_private(prod);
+#else
+  return _mm512_mul_pd(a, b);
+#endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_mul_pd(a, b) simde_mm512_mul_pd(a, b)
+#endif
+
+/* Bitwise OR of two 512-bit integer vectors.
+ *
+ * Uses _mm512_or_si512 when AVX-512F is native.  Otherwise the vectors are
+ * combined as two 256-bit halves (AVX2), with a whole-vector operator on
+ * the i32f view, or lane by lane on that same view.  Returns a | b. */
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_or_si512 (simde__m512i a, simde__m512i b) {
+#if defined(SIMDE_AVX512F_NATIVE)
+  return _mm512_or_si512(a, b);
+#else
+  simde__m512i_private
+    r_,
+    a_ = simde__m512i_to_private(a),
+    b_ = simde__m512i_to_private(b);
+
+#if defined(SIMDE_ARCH_X86_AVX2)
+  r_.m256i[0] = simde_mm256_or_si256(a_.m256i[0], b_.m256i[0]);
+  r_.m256i[1] = simde_mm256_or_si256(a_.m256i[1], b_.m256i[1]);
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+  r_.i32f = a_.i32f | b_.i32f;
+#else
+  /* The bound must come from the member that is indexed.  The i32f lanes
+   * are presumably int_fast32_t, which may be wider than 32 bits; counting
+   * i32 lanes instead would then index past the end of i32f. */
+  SIMDE__VECTORIZE
+  for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
+    r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
+  }
+#endif
+
+  return simde__m512i_from_private(r_);
+#endif
+}
+#if defined(SIMDE_AVX512F_ENABLE_NATIVE_ALIASES)
+#  define _mm512_or_si512(a, b) simde_mm512_or_si512(a, b)
+#endif
+
 SIMDE__END_DECLS
 
 HEDLEY_DIAGNOSTIC_POP


=====================================
simde/x86/sse2.h
=====================================
@@ -4405,15 +4405,13 @@ simde_mm_sll_epi64 (simde__m128i a, simde__m128i count) {
   if (count_.u64[0] > 63)
     return simde_mm_setzero_si128();
 
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_ARM_SHIFT_SCALAR)
-  /* GCC ≤ 7 on AArch64 generates an ICE here */
-  r_.u64 = (a_.u64 << count_.u64[0]);
-#else
-  SIMDE__VECTORIZE
+  const int_fast16_t s = HEDLEY_STATIC_CAST(int_fast16_t, count_.u64[0]);
+  #if !defined(SIMDE_BUG_GCC_94488)
+    SIMDE__VECTORIZE
+  #endif
   for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
-    r_.u64[i] = a_.u64[i] << count_.u64[0];
+    r_.u64[i] = a_.u64[i] << s;
   }
-#endif
 
   return simde__m128i_from_private(r_);
 #endif
@@ -4542,11 +4540,13 @@ simde_mm_srl_epi64 (simde__m128i a, simde__m128i count) {
     a_ = simde__m128i_to_private(a),
     count_ = simde__m128i_to_private(count);
 
-  if (count_.u64[0] > 31)
+  if (count_.u64[0] > 63)
     return simde_mm_setzero_si128();
-  const int s = (int) (count_.u64[0]);
 
-  SIMDE__VECTORIZE
+  const int_fast16_t s = HEDLEY_STATIC_CAST(int_fast16_t, count_.u64[0]);
+  #if !defined(SIMDE_BUG_GCC_94488)
+    SIMDE__VECTORIZE
+  #endif
   for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
     r_.u64[i] = a_.u64[i] >> s;
   }


=====================================
test/arm/meson.build
=====================================
@@ -0,0 +1,4 @@
+# Process the NEON subdirectory first; it defines simde_tests_arm_neon,
+# which is linked in below.
+subdir('neon')
+
+# Aggregate library for all ARM tests, built from test-arm.c.
+simde_tests_arm = static_library('simde-tests-arm', 'test-arm.c',
+  link_with: [simde_tests_arm_neon])


=====================================
test/arm/neon/meson.build
=====================================
@@ -0,0 +1,23 @@
+# NEON test sources.  Each file is listed once here as C; the loop below
+# adds a C++ copy of every entry.
+simde_test_arm_neon_sources = [
+  'vadd.c',
+  'vdup_n.c',
+  'vmul.c',
+  'vsub.c'
+]
+
+# Copy every 'foo.c' to 'foo.cpp' so the same test code is also compiled
+# as C++.
+foreach src : simde_test_arm_neon_sources
+  simde_test_arm_neon_sources += configure_file(input: src, output: src + 'pp', copy: true)
+endforeach
+
+# Build with -DSIMDE_NO_NATIVE added.  C sources get the C argument lists
+# and C++ sources get the C++ ones, since the two compilers may accept
+# different flags.
+simde_tests_arm_neon_emul = static_library('simde-tests-arm-neon-emul', simde_test_arm_neon_sources,
+  c_args: simde_c_args + simde_c_defs + simde_native_c_flags + ['-DSIMDE_NO_NATIVE'],
+  cpp_args: simde_cxx_args + simde_cxx_defs + simde_native_cxx_flags + ['-DSIMDE_NO_NATIVE'],
+  include_directories: simde_include_dir)
+
+# Same sources built without -DSIMDE_NO_NATIVE.
+simde_tests_arm_neon_native = static_library('simde-tests-arm-neon-native', simde_test_arm_neon_sources,
+  c_args: simde_c_args + simde_c_defs + simde_native_c_flags,
+  cpp_args: simde_cxx_args + simde_cxx_defs + simde_native_cxx_flags,
+  include_directories: simde_include_dir)
+
+# Library built from test-neon.c that links both variants together.
+simde_tests_arm_neon = static_library('simde-tests-neon', 'test-neon.c',
+  link_with: [simde_tests_arm_neon_emul, simde_tests_arm_neon_native])


=====================================
test/meson.build
=====================================
@@ -0,0 +1,64 @@
+# Dependencies and preprocessor definitions for the bundled munit test
+# framework.
+munit_deps = []
+munit_defs = []
+# The PGI compiler needs OpenMP as a dependency of munit.
+if cc.get_id() == 'pgi'
+  munit_deps += dependency('openmp')
+endif
+# When clock_gettime is available, look for librt; if it is not found,
+# define PSNIP_CLOCK_NO_LIBRT for the munit build.
+if cc.has_function('clock_gettime')
+  librt_dep = cc.find_library('rt', required: false)
+  if not librt_dep.found()
+    munit_defs += '-DPSNIP_CLOCK_NO_LIBRT'
+  endif
+  munit_deps += librt_dep
+endif
+# munit_defs must be passed as c_args, otherwise the definition collected
+# above never reaches the compiler.
+munit = static_library('munit', 'munit/munit.c',
+  c_args: munit_defs,
+  dependencies: munit_deps)
+
+# Compiler definitions, arguments and dependencies shared by the SIMDe
+# test targets, kept separately for C and C++.
+simde_c_defs = []
+simde_cxx_defs = []
+simde_c_args = []
+simde_cxx_args = []
+simde_deps = []
+
+# Probe each compiler for an OpenMP-SIMD flag.  The first flag a compiler
+# accepts is used for it, together with -DSIMDE_ENABLE_OPENMP.
+c_openmp_simd = false
+cxx_openmp_simd = false
+foreach omp_arg : ['-fopenmp-simd', '-qopenmp-simd']
+  if (not c_openmp_simd) and cc.has_argument(omp_arg)
+    simde_c_args += omp_arg
+    simde_c_defs += '-DSIMDE_ENABLE_OPENMP'
+    c_openmp_simd = true
+  endif
+
+  if (not cxx_openmp_simd) and cxx.has_argument(omp_arg)
+    simde_cxx_args += omp_arg
+    simde_cxx_defs += '-DSIMDE_ENABLE_OPENMP'
+    cxx_openmp_simd = true
+  endif
+endforeach
+
+# Flags for native builds.  Detection of -march=native is currently
+# disabled, so both lists stay empty.
+simde_native_c_flags = []
+# if cc.has_argument('-march=native+simd')
+#   simde_native_c_flags += '-march=native+simd'
+# elif cc.has_argument('-march=native')
+#   simde_native_c_flags += '-march=native'
+# endif
+
+simde_native_cxx_flags = []
+# if cxx.has_argument('-march=native+simd')
+#   simde_native_cxx_flags += '-march=native+simd'
+# elif cxx.has_argument('-march=native')
+#   simde_native_cxx_flags += '-march=native'
+# endif
+
+# If the C compiler accepted no OpenMP-SIMD flag, add the full OpenMP
+# dependency instead (optional).
+if not c_openmp_simd
+  simde_deps += dependency('openmp', required: false)
+endif
+
+# Include path is the parent of this directory.
+simde_include_dir = include_directories('..')
+
+# The per-architecture subdirectories define simde_tests_x86 and
+# simde_tests_arm, which are linked into the test runner below.
+subdir('x86')
+subdir('arm')
+
+# Test runner executable.
+executable('run-tests', 'run-tests.c',
+           link_with: [munit, simde_tests_x86, simde_tests_arm],
+           dependencies: simde_deps,
+           c_args: simde_c_args + simde_c_defs + simde_native_c_flags,
+           cpp_args: simde_cxx_args + simde_cxx_defs + simde_native_cxx_flags)


=====================================
test/x86/avx512bw.c
=====================================
@@ -2179,6 +2179,300 @@ test_simde_mm512_cmpeq_epi8_mask(const MunitParameter params[], void* data) {
   return MUNIT_OK;
 }
 
+static MunitResult
+test_simde_mm512_cvtepi16_epi8(const MunitParameter params[], void* data) { /* Runs simde_mm512_cvtepi16_epi8() on 8 fixed inputs and asserts each result equals the stored expectation. */
+  (void) params; /* unused */
+  (void) data; /* unused */
+
+  const struct { /* one test case */
+    simde__m512i a; /* input, initialised below from 32 int16 values */
+    simde__m256i r; /* expected result, initialised below from 32 int8 values */
+  } test_vec[8] = { /* e.g. first entry: 14423 (0x3857) -> 87 (0x57), i.e. the low byte of each 16-bit value is kept */
+    { simde_mm512_set_epi16(INT16_C( 14423), INT16_C(  3775), INT16_C( 16156), INT16_C( 17811),
+                            INT16_C(-14881), INT16_C(-30283), INT16_C( 27295), INT16_C(-12290),
+                            INT16_C( 12394), INT16_C( 32764), INT16_C(  8681), INT16_C( 21255),
+                            INT16_C(-21785), INT16_C(-24065), INT16_C(-28005), INT16_C( 15206),
+                            INT16_C(  6131), INT16_C(-29323), INT16_C( -9530), INT16_C( -6655),
+                            INT16_C( 14785), INT16_C( -9158), INT16_C(  7009), INT16_C(  4834),
+                            INT16_C(-15579), INT16_C(  5296), INT16_C( 20054), INT16_C( 12832),
+                            INT16_C( 15724), INT16_C(  5918), INT16_C( 25398), INT16_C( 14084)),
+      simde_mm256_set_epi8(INT8_C(  87), INT8_C( -65), INT8_C(  28), INT8_C(-109),
+                           INT8_C( -33), INT8_C( -75), INT8_C( -97), INT8_C(  -2),
+                           INT8_C( 106), INT8_C(  -4), INT8_C( -23), INT8_C(   7),
+                           INT8_C( -25), INT8_C(  -1), INT8_C(-101), INT8_C( 102),
+                           INT8_C( -13), INT8_C( 117), INT8_C( -58), INT8_C(   1),
+                           INT8_C( -63), INT8_C(  58), INT8_C(  97), INT8_C( -30),
+                           INT8_C(  37), INT8_C( -80), INT8_C(  86), INT8_C(  32),
+                           INT8_C( 108), INT8_C(  30), INT8_C(  54), INT8_C(   4)) },
+    { simde_mm512_set_epi16(INT16_C(  8455), INT16_C(  1140), INT16_C(-23383), INT16_C( 22825),
+                            INT16_C(-21438), INT16_C(  8713), INT16_C(-25940), INT16_C(-31180),
+                            INT16_C(-13214), INT16_C( 10200), INT16_C(-21253), INT16_C(  2612),
+                            INT16_C(-27891), INT16_C( 14031), INT16_C( -9014), INT16_C( 10287),
+                            INT16_C(-11660), INT16_C( 26858), INT16_C(-19518), INT16_C(  2472),
+                            INT16_C( 27637), INT16_C( 14857), INT16_C( 30034), INT16_C(-24153),
+                            INT16_C( 31935), INT16_C( -6397), INT16_C( -2502), INT16_C( 31062),
+                            INT16_C( 30236), INT16_C(  5156), INT16_C( 18439), INT16_C(-13074)),
+      simde_mm256_set_epi8(INT8_C(   7), INT8_C( 116), INT8_C( -87), INT8_C(  41),
+                           INT8_C(  66), INT8_C(   9), INT8_C( -84), INT8_C(  52),
+                           INT8_C(  98), INT8_C( -40), INT8_C(  -5), INT8_C(  52),
+                           INT8_C(  13), INT8_C( -49), INT8_C( -54), INT8_C(  47),
+                           INT8_C( 116), INT8_C( -22), INT8_C( -62), INT8_C( -88),
+                           INT8_C( -11), INT8_C(   9), INT8_C(  82), INT8_C( -89),
+                           INT8_C( -65), INT8_C(   3), INT8_C(  58), INT8_C(  86),
+                           INT8_C(  28), INT8_C(  36), INT8_C(   7), INT8_C( -18)) },
+    { simde_mm512_set_epi16(INT16_C( 18175), INT16_C( -3760), INT16_C( 10318), INT16_C(-31849),
+                            INT16_C(-32429), INT16_C(-26500), INT16_C( 24084), INT16_C(-23946),
+                            INT16_C(  2525), INT16_C(  2478), INT16_C(-15141), INT16_C(-27410),
+                            INT16_C( 30961), INT16_C(-31554), INT16_C( -9533), INT16_C(-20012),
+                            INT16_C(-21820), INT16_C( 11767), INT16_C(-17849), INT16_C( 24518),
+                            INT16_C(-22206), INT16_C(-24996), INT16_C(-19566), INT16_C( 17826),
+                            INT16_C( 25765), INT16_C( 29123), INT16_C( 28065), INT16_C(  1432),
+                            INT16_C(-24949), INT16_C( 30580), INT16_C( 20499), INT16_C(-29164)),
+      simde_mm256_set_epi8(INT8_C(  -1), INT8_C(  80), INT8_C(  78), INT8_C(-105),
+                           INT8_C(  83), INT8_C( 124), INT8_C(  20), INT8_C( 118),
+                           INT8_C( -35), INT8_C( -82), INT8_C( -37), INT8_C( -18),
+                           INT8_C( -15), INT8_C( -66), INT8_C( -61), INT8_C( -44),
+                           INT8_C( -60), INT8_C(  -9), INT8_C(  71), INT8_C( -58),
+                           INT8_C(  66), INT8_C(  92), INT8_C(-110), INT8_C( -94),
+                           INT8_C( -91), INT8_C( -61), INT8_C( -95), INT8_C(-104),
+                           INT8_C(-117), INT8_C( 116), INT8_C(  19), INT8_C(  20)) },
+    { simde_mm512_set_epi16(INT16_C( 10816), INT16_C( 16713), INT16_C( 29707), INT16_C( 15186),
+                            INT16_C( 31860), INT16_C(-28520), INT16_C( 18947), INT16_C(-27460),
+                            INT16_C( 10883), INT16_C(   310), INT16_C(  8277), INT16_C(-28768),
+                            INT16_C( -4553), INT16_C( 23273), INT16_C(-27696), INT16_C(-20678),
+                            INT16_C( 13089), INT16_C( -6620), INT16_C( 31575), INT16_C(-20169),
+                            INT16_C( 14440), INT16_C( -9264), INT16_C(-26919), INT16_C(-25720),
+                            INT16_C(-18371), INT16_C( 25765), INT16_C(-13162), INT16_C(-16808),
+                            INT16_C(  5695), INT16_C(-25080), INT16_C( 19142), INT16_C(  3825)),
+      simde_mm256_set_epi8(INT8_C(  64), INT8_C(  73), INT8_C(  11), INT8_C(  82),
+                           INT8_C( 116), INT8_C(-104), INT8_C(   3), INT8_C( -68),
+                           INT8_C(-125), INT8_C(  54), INT8_C(  85), INT8_C( -96),
+                           INT8_C(  55), INT8_C( -23), INT8_C( -48), INT8_C(  58),
+                           INT8_C(  33), INT8_C(  36), INT8_C(  87), INT8_C(  55),
+                           INT8_C( 104), INT8_C( -48), INT8_C( -39), INT8_C(-120),
+                           INT8_C(  61), INT8_C( -91), INT8_C(-106), INT8_C(  88),
+                           INT8_C(  63), INT8_C(   8), INT8_C( -58), INT8_C( -15)) },
+    { simde_mm512_set_epi16(INT16_C(  5079), INT16_C(-24746), INT16_C( 23487), INT16_C(-22087),
+                            INT16_C( -8346), INT16_C( 29848), INT16_C( 14241), INT16_C( 18254),
+                            INT16_C( -3124), INT16_C(-16186), INT16_C(-13364), INT16_C( 10652),
+                            INT16_C( 31028), INT16_C( 21346), INT16_C(  1443), INT16_C(-20222),
+                            INT16_C(-17028), INT16_C(-21899), INT16_C( 18933), INT16_C(  6935),
+                            INT16_C( 24619), INT16_C(  1737), INT16_C( 12596), INT16_C( 31606),
+                            INT16_C(-32691), INT16_C( 11392), INT16_C( 32126), INT16_C(-32712),
+                            INT16_C( 20927), INT16_C(-27859), INT16_C( 22640), INT16_C(  8969)),
+      simde_mm256_set_epi8(INT8_C( -41), INT8_C(  86), INT8_C( -65), INT8_C( -71),
+                           INT8_C( 102), INT8_C(-104), INT8_C( -95), INT8_C(  78),
+                           INT8_C( -52), INT8_C( -58), INT8_C( -52), INT8_C(-100),
+                           INT8_C(  52), INT8_C(  98), INT8_C( -93), INT8_C(   2),
+                           INT8_C( 124), INT8_C( 117), INT8_C( -11), INT8_C(  23),
+                           INT8_C(  43), INT8_C( -55), INT8_C(  52), INT8_C( 118),
+                           INT8_C(  77), INT8_C(-128), INT8_C( 126), INT8_C(  56),
+                           INT8_C( -65), INT8_C(  45), INT8_C( 112), INT8_C(   9)) },
+    { simde_mm512_set_epi16(INT16_C(  6901), INT16_C(-23435), INT16_C(-26040), INT16_C(-11295),
+                            INT16_C(   623), INT16_C(-23058), INT16_C( 17549), INT16_C(-23291),
+                            INT16_C( 17215), INT16_C( -4892), INT16_C(  -849), INT16_C( 21086),
+                            INT16_C(-13056), INT16_C( 19549), INT16_C( 16492), INT16_C(-22767),
+                            INT16_C(-24079), INT16_C(  6429), INT16_C( 15302), INT16_C( -9175),
+                            INT16_C( 17671), INT16_C(-29856), INT16_C(-12718), INT16_C(-22914),
+                            INT16_C(-19613), INT16_C( 14088), INT16_C(-10443), INT16_C( 31757),
+                            INT16_C( 24994), INT16_C( 24174), INT16_C( -9596), INT16_C(-22481)),
+      simde_mm256_set_epi8(INT8_C( -11), INT8_C( 117), INT8_C(  72), INT8_C( -31),
+                           INT8_C( 111), INT8_C( -18), INT8_C(-115), INT8_C(   5),
+                           INT8_C(  63), INT8_C( -28), INT8_C( -81), INT8_C(  94),
+                           INT8_C(   0), INT8_C(  93), INT8_C( 108), INT8_C(  17),
+                           INT8_C( -15), INT8_C(  29), INT8_C( -58), INT8_C(  41),
+                           INT8_C(   7), INT8_C(  96), INT8_C(  82), INT8_C( 126),
+                           INT8_C(  99), INT8_C(   8), INT8_C(  53), INT8_C(  13),
+                           INT8_C( -94), INT8_C( 110), INT8_C(-124), INT8_C(  47)) },
+    { simde_mm512_set_epi16(INT16_C( 15520), INT16_C( 15679), INT16_C(  8541), INT16_C(-20376),
+                            INT16_C(  8861), INT16_C( 12926), INT16_C( 25712), INT16_C( -8433),
+                            INT16_C( -7066), INT16_C(-23691), INT16_C(-20251), INT16_C( 18056),
+                            INT16_C(  5498), INT16_C(-18751), INT16_C(-26321), INT16_C(  7918),
+                            INT16_C(  1647), INT16_C( 21774), INT16_C(  5430), INT16_C(-19512),
+                            INT16_C(-14894), INT16_C( 12466), INT16_C( -9612), INT16_C(-23130),
+                            INT16_C( 18357), INT16_C( 32349), INT16_C(-25760), INT16_C( -6559),
+                            INT16_C(-24198), INT16_C( 13614), INT16_C( 13473), INT16_C(-25578)),
+      simde_mm256_set_epi8(INT8_C( -96), INT8_C(  63), INT8_C(  93), INT8_C( 104),
+                           INT8_C( -99), INT8_C( 126), INT8_C( 112), INT8_C(  15),
+                           INT8_C( 102), INT8_C( 117), INT8_C( -27), INT8_C(-120),
+                           INT8_C( 122), INT8_C( -63), INT8_C(  47), INT8_C( -18),
+                           INT8_C( 111), INT8_C(  14), INT8_C(  54), INT8_C( -56),
+                           INT8_C( -46), INT8_C( -78), INT8_C( 116), INT8_C( -90),
+                           INT8_C( -75), INT8_C(  93), INT8_C(  96), INT8_C(  97),
+                           INT8_C( 122), INT8_C(  46), INT8_C( -95), INT8_C(  22)) },
+    { simde_mm512_set_epi16(INT16_C(-13944), INT16_C( 30422), INT16_C( 10523), INT16_C( 28986),
+                            INT16_C(-23789), INT16_C(-20754), INT16_C( 29282), INT16_C(-10845),
+                            INT16_C( 10721), INT16_C(  2777), INT16_C(-18838), INT16_C(  8324),
+                            INT16_C( 19192), INT16_C(   114), INT16_C( -9073), INT16_C(  2615),
+                            INT16_C( 21008), INT16_C( 12652), INT16_C(-14859), INT16_C(  5734),
+                            INT16_C( -5598), INT16_C(-10707), INT16_C(  2170), INT16_C( 23903),
+                            INT16_C( 29988), INT16_C( 24405), INT16_C(  5383), INT16_C(-29994),
+                            INT16_C(  7143), INT16_C( 22270), INT16_C( -1480), INT16_C( 15491)),
+      simde_mm256_set_epi8(INT8_C(-120), INT8_C( -42), INT8_C(  27), INT8_C(  58),
+                           INT8_C(  19), INT8_C( -18), INT8_C(  98), INT8_C( -93),
+                           INT8_C( -31), INT8_C( -39), INT8_C( 106), INT8_C(-124),
+                           INT8_C(  -8), INT8_C( 114), INT8_C(-113), INT8_C(  55),
+                           INT8_C(  16), INT8_C( 108), INT8_C( -11), INT8_C( 102),
+                           INT8_C(  34), INT8_C(  45), INT8_C( 122), INT8_C(  95),
+                           INT8_C(  36), INT8_C(  85), INT8_C(   7), INT8_C( -42),
+                           INT8_C( -25), INT8_C(  -2), INT8_C(  56), INT8_C(-125)) }
+  };
+
+  for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])); i++) { /* visit every entry of test_vec */
+    simde__m256i r = simde_mm512_cvtepi16_epi8(test_vec[i].a);
+    simde_assert_m256i_i8(r, ==, test_vec[i].r); /* presumably an element-wise int8 comparison; the macro is defined outside this hunk */
+  }
+
+  return MUNIT_OK;
+}
+
+static MunitResult
+test_simde_mm512_cvtepi8_epi16(const MunitParameter params[], void* data) { /* Runs simde_mm512_cvtepi8_epi16() on 8 fixed inputs and asserts each result equals the stored expectation. */
+  (void) params; /* unused */
+  (void) data; /* unused */
+
+  const struct { /* one test case */
+    simde__m256i a; /* input, initialised below from 32 int8 values */
+    simde__m512i r; /* expected result, initialised below from 32 int16 values */
+  } test_vec[8] = { /* e.g. first entry: 7 -> 7 and -86 -> -86, i.e. each value is preserved (consistent with sign extension) */
+    { simde_mm256_set_epi8(INT8_C(   7), INT8_C(  68), INT8_C( -86), INT8_C( -36),
+                           INT8_C( -19), INT8_C(  73), INT8_C(  92), INT8_C( -27),
+                           INT8_C(  55), INT8_C( -65), INT8_C( -50), INT8_C(  19),
+                           INT8_C(-111), INT8_C( -79), INT8_C( -16), INT8_C(  70),
+                           INT8_C(  27), INT8_C( -28), INT8_C( 116), INT8_C(  42),
+                           INT8_C(  -4), INT8_C(  78), INT8_C(  31), INT8_C(  51),
+                           INT8_C(  92), INT8_C(  39), INT8_C(-125), INT8_C(  94),
+                           INT8_C( -78), INT8_C(  67), INT8_C( -43), INT8_C( -71)),
+      simde_mm512_set_epi16(INT16_C(     7), INT16_C(    68), INT16_C(   -86), INT16_C(   -36),
+                            INT16_C(   -19), INT16_C(    73), INT16_C(    92), INT16_C(   -27),
+                            INT16_C(    55), INT16_C(   -65), INT16_C(   -50), INT16_C(    19),
+                            INT16_C(  -111), INT16_C(   -79), INT16_C(   -16), INT16_C(    70),
+                            INT16_C(    27), INT16_C(   -28), INT16_C(   116), INT16_C(    42),
+                            INT16_C(    -4), INT16_C(    78), INT16_C(    31), INT16_C(    51),
+                            INT16_C(    92), INT16_C(    39), INT16_C(  -125), INT16_C(    94),
+                            INT16_C(   -78), INT16_C(    67), INT16_C(   -43), INT16_C(   -71)) },
+    { simde_mm256_set_epi8(INT8_C(  29), INT8_C( -37), INT8_C(  27), INT8_C(  10),
+                           INT8_C( -22), INT8_C(  -9), INT8_C(-125), INT8_C(  -3),
+                           INT8_C( -53), INT8_C(  92), INT8_C( 103), INT8_C(  92),
+                           INT8_C( 123), INT8_C(  74), INT8_C(  36), INT8_C(  59),
+                           INT8_C(  46), INT8_C( -29), INT8_C(-103), INT8_C(  -4),
+                           INT8_C( 109), INT8_C( -54), INT8_C(  41), INT8_C(  79),
+                           INT8_C(  15), INT8_C( -92), INT8_C( 102), INT8_C( 116),
+                           INT8_C( -42), INT8_C(  52), INT8_C( -61), INT8_C( -99)),
+      simde_mm512_set_epi16(INT16_C(    29), INT16_C(   -37), INT16_C(    27), INT16_C(    10),
+                            INT16_C(   -22), INT16_C(    -9), INT16_C(  -125), INT16_C(    -3),
+                            INT16_C(   -53), INT16_C(    92), INT16_C(   103), INT16_C(    92),
+                            INT16_C(   123), INT16_C(    74), INT16_C(    36), INT16_C(    59),
+                            INT16_C(    46), INT16_C(   -29), INT16_C(  -103), INT16_C(    -4),
+                            INT16_C(   109), INT16_C(   -54), INT16_C(    41), INT16_C(    79),
+                            INT16_C(    15), INT16_C(   -92), INT16_C(   102), INT16_C(   116),
+                            INT16_C(   -42), INT16_C(    52), INT16_C(   -61), INT16_C(   -99)) },
+    { simde_mm256_set_epi8(INT8_C(  -9), INT8_C( -47), INT8_C( 107), INT8_C( -74),
+                           INT8_C(-126), INT8_C(  34), INT8_C(  64), INT8_C( 115),
+                           INT8_C( -65), INT8_C(-124), INT8_C(  54), INT8_C(  27),
+                           INT8_C(  41), INT8_C( 112), INT8_C(  61), INT8_C(   6),
+                           INT8_C(   7), INT8_C(  39), INT8_C(-109), INT8_C( -99),
+                           INT8_C(  63), INT8_C( -35), INT8_C(-111), INT8_C( -72),
+                           INT8_C( 109), INT8_C( -39), INT8_C( -99), INT8_C(  26),
+                           INT8_C(  66), INT8_C( -78), INT8_C(  30), INT8_C(  38)),
+      simde_mm512_set_epi16(INT16_C(    -9), INT16_C(   -47), INT16_C(   107), INT16_C(   -74),
+                            INT16_C(  -126), INT16_C(    34), INT16_C(    64), INT16_C(   115),
+                            INT16_C(   -65), INT16_C(  -124), INT16_C(    54), INT16_C(    27),
+                            INT16_C(    41), INT16_C(   112), INT16_C(    61), INT16_C(     6),
+                            INT16_C(     7), INT16_C(    39), INT16_C(  -109), INT16_C(   -99),
+                            INT16_C(    63), INT16_C(   -35), INT16_C(  -111), INT16_C(   -72),
+                            INT16_C(   109), INT16_C(   -39), INT16_C(   -99), INT16_C(    26),
+                            INT16_C(    66), INT16_C(   -78), INT16_C(    30), INT16_C(    38)) },
+    { simde_mm256_set_epi8(INT8_C( -72), INT8_C( -80), INT8_C( 101), INT8_C(  81),
+                           INT8_C(  23), INT8_C( -68), INT8_C( -57), INT8_C(-111),
+                           INT8_C(  -3), INT8_C(  21), INT8_C( 121), INT8_C( -22),
+                           INT8_C(-104), INT8_C( -10), INT8_C( -37), INT8_C(  66),
+                           INT8_C( -93), INT8_C( -80), INT8_C(  34), INT8_C( 104),
+                           INT8_C( -39), INT8_C( -99), INT8_C(  18), INT8_C( 110),
+                           INT8_C(-118), INT8_C(  38), INT8_C( 112), INT8_C( -67),
+                           INT8_C(  60), INT8_C(  47), INT8_C(  32), INT8_C(  33)),
+      simde_mm512_set_epi16(INT16_C(   -72), INT16_C(   -80), INT16_C(   101), INT16_C(    81),
+                            INT16_C(    23), INT16_C(   -68), INT16_C(   -57), INT16_C(  -111),
+                            INT16_C(    -3), INT16_C(    21), INT16_C(   121), INT16_C(   -22),
+                            INT16_C(  -104), INT16_C(   -10), INT16_C(   -37), INT16_C(    66),
+                            INT16_C(   -93), INT16_C(   -80), INT16_C(    34), INT16_C(   104),
+                            INT16_C(   -39), INT16_C(   -99), INT16_C(    18), INT16_C(   110),
+                            INT16_C(  -118), INT16_C(    38), INT16_C(   112), INT16_C(   -67),
+                            INT16_C(    60), INT16_C(    47), INT16_C(    32), INT16_C(    33)) },
+    { simde_mm256_set_epi8(INT8_C( 120), INT8_C( -90), INT8_C(-101), INT8_C(-106),
+                           INT8_C(  70), INT8_C( -49), INT8_C(  29), INT8_C( -43),
+                           INT8_C( -42), INT8_C(  38), INT8_C(  16), INT8_C( -43),
+                           INT8_C( -40), INT8_C( -76), INT8_C( -67), INT8_C(  53),
+                           INT8_C( -73), INT8_C( -17), INT8_C(  66), INT8_C(  57),
+                           INT8_C( -65), INT8_C( -63), INT8_C(  17), INT8_C(  -9),
+                           INT8_C(  95), INT8_C( -50), INT8_C(-118), INT8_C( 114),
+                           INT8_C(  58), INT8_C( -28), INT8_C( -81), INT8_C( -37)),
+      simde_mm512_set_epi16(INT16_C(   120), INT16_C(   -90), INT16_C(  -101), INT16_C(  -106),
+                            INT16_C(    70), INT16_C(   -49), INT16_C(    29), INT16_C(   -43),
+                            INT16_C(   -42), INT16_C(    38), INT16_C(    16), INT16_C(   -43),
+                            INT16_C(   -40), INT16_C(   -76), INT16_C(   -67), INT16_C(    53),
+                            INT16_C(   -73), INT16_C(   -17), INT16_C(    66), INT16_C(    57),
+                            INT16_C(   -65), INT16_C(   -63), INT16_C(    17), INT16_C(    -9),
+                            INT16_C(    95), INT16_C(   -50), INT16_C(  -118), INT16_C(   114),
+                            INT16_C(    58), INT16_C(   -28), INT16_C(   -81), INT16_C(   -37)) },
+    { simde_mm256_set_epi8(INT8_C( -97), INT8_C(  10), INT8_C( -75), INT8_C(-120),
+                           INT8_C( -32), INT8_C(-105), INT8_C( -75), INT8_C(-101),
+                           INT8_C(  71), INT8_C(-122), INT8_C(-112), INT8_C(  -2),
+                           INT8_C(  60), INT8_C( -71), INT8_C( 101), INT8_C(  -1),
+                           INT8_C(  95), INT8_C( -58), INT8_C( -70), INT8_C( 102),
+                           INT8_C( 115), INT8_C( -68), INT8_C(-110), INT8_C( -36),
+                           INT8_C(   6), INT8_C(  58), INT8_C(  73), INT8_C(  97),
+                           INT8_C( -51), INT8_C(  -4), INT8_C(  58), INT8_C(  31)),
+      simde_mm512_set_epi16(INT16_C(   -97), INT16_C(    10), INT16_C(   -75), INT16_C(  -120),
+                            INT16_C(   -32), INT16_C(  -105), INT16_C(   -75), INT16_C(  -101),
+                            INT16_C(    71), INT16_C(  -122), INT16_C(  -112), INT16_C(    -2),
+                            INT16_C(    60), INT16_C(   -71), INT16_C(   101), INT16_C(    -1),
+                            INT16_C(    95), INT16_C(   -58), INT16_C(   -70), INT16_C(   102),
+                            INT16_C(   115), INT16_C(   -68), INT16_C(  -110), INT16_C(   -36),
+                            INT16_C(     6), INT16_C(    58), INT16_C(    73), INT16_C(    97),
+                            INT16_C(   -51), INT16_C(    -4), INT16_C(    58), INT16_C(    31)) },
+    { simde_mm256_set_epi8(INT8_C( -73), INT8_C(-123), INT8_C( -11), INT8_C(  62),
+                           INT8_C( -96), INT8_C(-103), INT8_C(  85), INT8_C(  88),
+                           INT8_C( -19), INT8_C(  28), INT8_C(-107), INT8_C( -81),
+                           INT8_C(-125), INT8_C(  88), INT8_C(  84), INT8_C( 115),
+                           INT8_C( 105), INT8_C( -47), INT8_C(  68), INT8_C(-124),
+                           INT8_C(  32), INT8_C(-100), INT8_C(  10), INT8_C( -69),
+                           INT8_C( 124), INT8_C( -51), INT8_C( -89), INT8_C( -72),
+                           INT8_C( -92), INT8_C(  -5), INT8_C( -46), INT8_C( 115)),
+      simde_mm512_set_epi16(INT16_C(   -73), INT16_C(  -123), INT16_C(   -11), INT16_C(    62),
+                            INT16_C(   -96), INT16_C(  -103), INT16_C(    85), INT16_C(    88),
+                            INT16_C(   -19), INT16_C(    28), INT16_C(  -107), INT16_C(   -81),
+                            INT16_C(  -125), INT16_C(    88), INT16_C(    84), INT16_C(   115),
+                            INT16_C(   105), INT16_C(   -47), INT16_C(    68), INT16_C(  -124),
+                            INT16_C(    32), INT16_C(  -100), INT16_C(    10), INT16_C(   -69),
+                            INT16_C(   124), INT16_C(   -51), INT16_C(   -89), INT16_C(   -72),
+                            INT16_C(   -92), INT16_C(    -5), INT16_C(   -46), INT16_C(   115)) },
+    { simde_mm256_set_epi8(INT8_C( 104), INT8_C(  66), INT8_C(  51), INT8_C(  81),
+                           INT8_C( -69), INT8_C( 104), INT8_C( 126), INT8_C( -43),
+                           INT8_C( -40), INT8_C(  23), INT8_C(-124), INT8_C(  98),
+                           INT8_C(-125), INT8_C(  95), INT8_C( -36), INT8_C(  46),
+                           INT8_C(-115), INT8_C( -93), INT8_C(   2), INT8_C( -77),
+                           INT8_C(  80), INT8_C(-116), INT8_C(  61), INT8_C( -89),
+                           INT8_C( -37), INT8_C(   9), INT8_C(  84), INT8_C( -64),
+                           INT8_C(  94), INT8_C(  67), INT8_C( -53), INT8_C( 111)),
+      simde_mm512_set_epi16(INT16_C(   104), INT16_C(    66), INT16_C(    51), INT16_C(    81),
+                            INT16_C(   -69), INT16_C(   104), INT16_C(   126), INT16_C(   -43),
+                            INT16_C(   -40), INT16_C(    23), INT16_C(  -124), INT16_C(    98),
+                            INT16_C(  -125), INT16_C(    95), INT16_C(   -36), INT16_C(    46),
+                            INT16_C(  -115), INT16_C(   -93), INT16_C(     2), INT16_C(   -77),
+                            INT16_C(    80), INT16_C(  -116), INT16_C(    61), INT16_C(   -89),
+                            INT16_C(   -37), INT16_C(     9), INT16_C(    84), INT16_C(   -64),
+                            INT16_C(    94), INT16_C(    67), INT16_C(   -53), INT16_C(   111)) }
+  };
+
+  for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])); i++) { /* visit every entry of test_vec */
+    simde__m512i r = simde_mm512_cvtepi8_epi16(test_vec[i].a);
+    simde_assert_m512i_i16(r, ==, test_vec[i].r); /* presumably an element-wise int16 comparison; the macro is defined outside this hunk */
+  }
+
+  return MUNIT_OK;
+}
+
 #endif /* defined(SIMDE_avx512bw_NATIVE) || defined(SIMDE_NO_NATIVE) || defined(SIMDE_ALWAYS_BUILD_NATIVE_TESTS) */
 
 HEDLEY_DIAGNOSTIC_PUSH
@@ -2197,6 +2491,10 @@ static MunitTest test_suite_tests[] = {
   SIMDE_TESTS_DEFINE_TEST(mm512_shuffle_epi8),
   SIMDE_TESTS_DEFINE_TEST(mm512_cmpeq_epi8_mask),
 
+  SIMDE_TESTS_DEFINE_TEST(mm512_cvtepi16_epi8),
+  
+  SIMDE_TESTS_DEFINE_TEST(mm512_cvtepi8_epi16),
+
 #endif /* defined(SIMDE_AVX512bw_NATIVE) || defined(SIMDE_NO_NATIVE) || defined(SIMDE_ALWAYS_BUILD_NATIVE_TESTS) */
   { NULL, NULL, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL }
 };


=====================================
test/x86/avx512f.c
=====================================
The diff for this file was not included because it is too large.

=====================================
test/x86/meson.build
=====================================
@@ -0,0 +1,32 @@
+simde_test_x86_sources = [  # C sources of the x86 tests
+  'mmx.c',
+  'sse.c',
+  'sse2.c',
+  'sse3.c',
+  'ssse3.c',
+  'sse4.1.c',
+  'sse4.2.c',
+  'avx.c',
+  'avx2.c',
+  'fma.c',
+  'avx512f.c',
+  'avx512bw.c',
+  'svml.c'
+]
+
+foreach src : simde_test_x86_sources  # append a copy of every source named '<src>pp' (e.g. 'mmx.c' -> 'mmx.cpp')
+  simde_test_x86_sources += configure_file(input: src, output: src + 'pp', copy: true)
+endforeach  # NOTE(review): the '.cpp' copies are presumably there so the tests also build as C++ - confirm
+
+simde_tests_x86_emul = static_library('simde-tests-x86-emul', simde_test_x86_sources,  # built with -DSIMDE_NO_NATIVE added for both languages
+  c_args: simde_c_args + simde_c_defs + simde_native_c_flags + ['-DSIMDE_NO_NATIVE'],
+  cpp_args: simde_cxx_args + simde_cxx_defs + simde_native_cxx_flags + ['-DSIMDE_NO_NATIVE'],
+  include_directories: simde_include_dir)
+
+simde_tests_x86_native = static_library('simde-tests-x86-native', simde_test_x86_sources,  # same sources and flags, without -DSIMDE_NO_NATIVE
+  c_args: simde_c_args + simde_c_defs + simde_native_c_flags,
+  cpp_args: simde_cxx_args + simde_cxx_defs + simde_native_cxx_flags,
+  include_directories: simde_include_dir)
+
+simde_tests_x86 = static_library('simde-tests-x86', 'test-x86.c',  # test-x86.c linked with both the emul and native libraries
+  link_with: [simde_tests_x86_emul, simde_tests_x86_native])


=====================================
test/x86/mmx.c
=====================================
@@ -25,7 +25,6 @@
 #include <test/x86/test-mmx.h>
 
 #if defined(SIMDE_MMX_NATIVE) || defined(SIMDE_NO_NATIVE) || defined(SIMDE_ALWAYS_BUILD_NATIVE_TESTS)
-#endif /* defined(SIMDE_MMX_NATIVE) || defined(SIMDE_NO_NATIVE) || defined(SIMDE_ALWAYS_BUILD_NATIVE_TESTS) */
 
 static MunitResult
 test_simde_mm_set1_pi8(const MunitParameter params[], void* data) {
@@ -2786,7 +2785,6 @@ test_simde_m_to_int64(const MunitParameter params[], void* data) {
   return MUNIT_OK;
 }
 
-#if defined(SIMDE_MMX_NATIVE) || defined(SIMDE_NO_NATIVE) || defined(SIMDE_ALWAYS_BUILD_NATIVE_TESTS)
 #endif /* defined(SIMDE_MMX_NATIVE) || defined(SIMDE_NO_NATIVE) || defined(SIMDE_ALWAYS_BUILD_NATIVE_TESTS) */
 
 HEDLEY_DIAGNOSTIC_PUSH


=====================================
test/x86/skel.c
=====================================
@@ -2804,7 +2804,7 @@ test_simde_mm512_mask_xxx_epi32_mask(const MunitParameter params[], void* data)
 
     r = simde_mm512_mask_xxx_epi32_mask(k, simde__m512i_from_private(a), simde__m512i_from_private(b));
 
-    printf("    { UINT16_C(%5" PRId16 "),\n", k);
+    printf("    { UINT16_C(%5" PRIu16 "),\n", k);
     printf("      simde_mm512_set_epi32(INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 "),\n"
            "                            INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 "),\n"
            "                            INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 "),\n"
@@ -2817,13 +2817,13 @@ test_simde_mm512_mask_xxx_epi32_mask(const MunitParameter params[], void* data)
            "                            INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 "), INT32_C(%11" PRId32 ")),\n",
            b.i32[15], b.i32[14], b.i32[13], b.i32[12], b.i32[11], b.i32[10], b.i32[ 9], b.i32[ 8],
            b.i32[ 7], b.i32[ 6], b.i32[ 5], b.i32[ 4], b.i32[ 3], b.i32[ 2], b.i32[ 1], b.i32[ 0]);
-    printf("      UINT16_C(%5" PRId16 ") },\n", r);
+    printf("      UINT16_C(%5" PRIu16 ") },\n", r);
   }
   return MUNIT_FAIL;
 
   for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])); i++) {
     simde__mmask16 r = simde_mm512_mask_xxx_epi32_mask(test_vec[i].k, test_vec[i].a, test_vec[i].b);
-    munit_assert_uint16(r, ==, test_vec[i].r);
+    simde_assert_mmask16(r, ==, test_vec[i].r);
   }
 
   return MUNIT_OK;
@@ -2873,7 +2873,7 @@ test_simde_mm512_mask_xxx_epi64_mask(const MunitParameter params[], void* data)
 
   for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])); i++) {
     simde__mmask8 r = simde_mm512_mask_xxx_epi64_mask(test_vec[i].k, test_vec[i].a, test_vec[i].b);
-    munit_assert_uint8(r, ==, test_vec[i].r);
+    simde_assert_mmask8(r, ==, test_vec[i].r);
   }
 
   return MUNIT_OK;


=====================================
test/x86/test-avx512.h
=====================================
@@ -13,4 +13,13 @@ SIMDE_TEST_DEFINE_ASSERT_VEC_CLOSE(m512d, f64)
 #define simde_assert_m512d_close(a, b, precision) \
   simde_assert_m512d_f64_close_ex(__LINE__, __FILE__, a, b, precision)
 
+#define simde_assert_mmask8(a, op, b) \
+  munit_assert_uint8(HEDLEY_STATIC_CAST(uint8_t, a), op, HEDLEY_STATIC_CAST(uint8_t, b)) /* assert `a op b` with both operands cast to uint8_t */
+#define simde_assert_mmask16(a, op, b) \
+  munit_assert_uint16(HEDLEY_STATIC_CAST(uint16_t, a), op, HEDLEY_STATIC_CAST(uint16_t, b)) /* assert `a op b` with both operands cast to uint16_t */
+#define simde_assert_mmask32(a, op, b) \
+  munit_assert_uint32(HEDLEY_STATIC_CAST(uint32_t, a), op, HEDLEY_STATIC_CAST(uint32_t, b)) /* assert `a op b` with both operands cast to uint32_t */
+#define simde_assert_mmask64(a, op, b) \
+  munit_assert_uint64(HEDLEY_STATIC_CAST(uint64_t, a), op, HEDLEY_STATIC_CAST(uint64_t, b)) /* assert `a op b` with both operands cast to uint64_t */
+
 #endif /* !defined(SIMDE_TEST_X86_AVX512) */



View it on GitLab: https://salsa.debian.org/med-team/simde/-/commit/cc9ed7fbec3c8e9ba2f95a63fdea42255bfe2a39

-- 
View it on GitLab: https://salsa.debian.org/med-team/simde/-/commit/cc9ed7fbec3c8e9ba2f95a63fdea42255bfe2a39
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200407/b38026d8/attachment-0001.html>


More information about the debian-med-commit mailing list