[med-svn] [Git][med-team/htscodecs][upstream] New upstream version 1.6.0

Étienne Mollier (@emollier) gitlab at salsa.debian.org
Fri Dec 15 11:54:33 GMT 2023



Étienne Mollier pushed to branch upstream at Debian Med / htscodecs


Commits:
027895ec by Étienne Mollier at 2023-12-14T14:00:26+01:00
New upstream version 1.6.0
- - - - -


19 changed files:

- .gitignore
- NEWS.md
- README.md
- configure.ac
- htscodecs/Makefile.am
- htscodecs/fqzcomp_qual.c
- htscodecs/htscodecs.h
- htscodecs/rANS_static16_int.h
- htscodecs/rANS_static32x16pr_avx2.c
- htscodecs/rANS_static32x16pr_avx512.c
- htscodecs/rANS_static32x16pr_sse4.c
- htscodecs/rANS_static4x16pr.c
- htscodecs/tokenise_name3.c
- m4/hts_check_compile_flags_needed.m4
- tests/Makefile.am
- tests/entropy.c
- + tests/entropy_fuzz.c
- + tests/fqzcomp_qual_fuzzrt.c
- + tests/tokenise_name3_fuzzrt.c


Changes:

=====================================
.gitignore
=====================================
@@ -31,11 +31,17 @@ tests/*.trs
 tests/.deps/
 tests/.libs/
 tests/arith_dynamic
+tests/arith_dynamic_fuzz
 tests/entropy
+tests/entropy_fuzz
 tests/fqzcomp_qual
+tests/fqzcomp_qual_fuzz
 tests/Makefile
 tests/rans4x16pr
+tests/rans4x16pr_fuzz
 tests/rans4x8
+tests/rans4x8_fuzz
 tests/test.out/
 tests/tokenise_name3
+tests/tokenise_name3_fuzz
 tests/varint


=====================================
NEWS.md
=====================================
@@ -1,3 +1,54 @@
+Release 1.6.0: 7th December 2023
+--------------------------------
+
+This release primarily consists of bug fixes, mostly spotted through
+improved fuzz testing.
+
+One big change, however, is that the SIMD rANS codecs are now performant
+on Intel CPUs with the DownFall mitigation microcode applied.
+
+
+Changes
+
+- Replaced the rANS codec SIMD gathers with simulated gathers via scalar
+  memory fetches.  This helps AMD Zen4, but importantly it also fixes a
+  disastrous performance regression caused by Intel's DownFall microcode fix.
+
+  There is an impact on pre-DownFall speeds, but we should focus on patched
+  CPUs as a priority.
+
+- A small speed up to the rans_F_to_s3 function used by order-0 rans decode.
+
+- Small speed up to SIMD rans32x16 order-1 encoder by reducing cache misses.
+  Also sped up the rans4x8 order-1 encoder, particularly on AMD Zen4.
+
+- Now supports building with "zig cc"
+  (Issue #109, reported by David Jackson)
+
+
+Bug fixes
+
+- Improved robustness of the name tokeniser when given non-7-bit ASCII data
+  and on machines where "char" defaults to unsigned.
+  (Issue #105, reported by Shubham Chandak)
+
+- Also fixed a 1 byte buffer read-overrun in name tokeniser.
+
+- Fix name tokeniser encoder failure with some duplicated streams.
+
+- Fixed rans_set_cpu to work multiple times, as well as reinstating the
+  ability to change decode and encode side independently (accidentally lost in
+  commit 958032c).  No effect on usage, but it improves the test coverage.
+
+- Added a round-trip fuzz tester to test the ability to encode.  The old fuzz
+  testing covered decode streams only.
+
+- Fixed bounds checking in rans_uncompress_O0_32x16_avx2, preventing buffer
+  read overruns.
+
+- Removed undefined behaviour in transpose_and_copy(), fixing zig cc builds.
+
+
 Release 1.5.2: 6th October 2023
 -------------------------------
 

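To illustrate the gather change above: a "simulated gather" replaces the
hardware vector-gather instruction with independent scalar loads whose
results are packed back into a vector.  A minimal scalar sketch of the idea
(gather8_sim is an illustrative name; the real AVX2/AVX512 routines further
down in this diff do the packing with SSE/AVX intrinsics):

    #include <stdint.h>

    /* Sketch only: emulate out[i] = table[idx[i]] for an 8-lane 32-bit
     * gather using plain scalar fetches, which are unaffected by the
     * Gather Data Sampling (Downfall) microcode mitigation. */
    static inline void gather8_sim(uint32_t out[8], const uint32_t *table,
                                   const uint32_t idx[8]) {
        int i;
        for (i = 0; i < 8; i++)
            out[i] = table[idx[i]];  /* eight independent scalar loads */
    }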

=====================================
README.md
=====================================
@@ -32,6 +32,32 @@ separate entity.  If you are attempting to make use of these codecs
 within your own library, such as we do within Staden io_lib, it may be
 useful to configure this with `--disable-shared --with-pic'.
 
+Intel GDS/Downfall
+------------------
+
+The new Intel microcode to patch the Gather Data Sampling (GDS) bug
+has severely impacted the performance of the AVX2 and AVX512 gather
+SIMD intrinsics.  So much so that simulating vector gather
+instructions using traditional scalar code is now more performant.
+Due to the slower AVX512 gathers on AMD Zen4, the simulated gather
+code is faster there too, even without a Downfall mitigation.
+
+Unfortunately this simulated code is slower than the original gather
+code when run on an unpatched CPU.  Without the GDS microcode patch
+on Intel CPUs, the simulated gathers slow the rate of decode by
+10-30% and the rate of encode for AVX512 by up to 50%.  The GDS
+microcode patch, however, can slow AVX2 decode and AVX512 encode by
+2-3x.
+
+Ideally we would auto-detect and select the optimal algorithm, but the
+easy solution is simply to pick the method with the most uniform
+performance and no bad worst-case rates.
+
+Hence the simulated gather implementation is now the default.  The old
+code can be re-enabled by building with:
+
+    make CPPFLAGS=-DUSE_GATHER
+
 
 Testing
 -------

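The -DUSE_GATHER switch mentioned above works because the AVX2/AVX512
sources choose the gather implementation at compile time.  A condensed view
of the pattern, matching the rANS_static32x16pr_avx2.c hunk later in this
diff (the simulated body is shown here in its older, simpler form; the
shipped version packs the scalar loads with SSE moves instead):

    #include <immintrin.h>

    #ifndef USE_GATHER
    /* Default build: simulate the gather with scalar loads. */
    static inline __m256i _mm256_i32gather_epi32x(int *b, __m256i idx,
                                                  int size) {
        int c[8] __attribute__((aligned(32)));
        _mm256_store_si256((__m256i *)c, idx);
        return _mm256_set_epi32(b[c[7]], b[c[6]], b[c[5]], b[c[4]],
                                b[c[3]], b[c[2]], b[c[1]], b[c[0]]);
    }
    #else
    /* make CPPFLAGS=-DUSE_GATHER: use the hardware gather intrinsic. */
    #define _mm256_i32gather_epi32x _mm256_i32gather_epi32
    #endif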

=====================================
configure.ac
=====================================
@@ -1,5 +1,5 @@
 dnl Process this file with autoconf to produce a configure script.
-AC_INIT(htscodecs, 1.5.2)
+AC_INIT(htscodecs, 1.6.0)
 
 # Some functions benefit from -O3 optimisation, so if the user didn't
 # explicitly set any compiler flags, we'll plump for O3.
@@ -61,7 +61,7 @@ AM_EXTRA_RECURSIVE_TARGETS([fuzz])
 #       libhtscodecs.so.1.1.0
 
 VERS_CURRENT=3
-VERS_REVISION=4
+VERS_REVISION=5
 VERS_AGE=1
 AC_SUBST(VERS_CURRENT)
 AC_SUBST(VERS_REVISION)
@@ -129,68 +129,33 @@ dnl AC_CHECK_LIB([bsc], [bsc_compress], [
 dnl 	LIBS="-lbsc $LIBS"
 dnl 	AC_DEFINE([HAVE_LIBBSC],1,[Define to 1 if you have the libbsc library.])])
 
-dnl Count parts needed to build rANS_static32x16pr_sse4.c
-sse4_prerequisites=""
-
-dnl Check if we can use our SSSE3 implementations of rANS 32x16 codec.
-HTS_CHECK_COMPILE_FLAGS_NEEDED([ssse3], [-mssse3], [AC_LANG_PROGRAM([[
-	  #ifdef __x86_64__
-	  #include "x86intrin.h"
-	  #endif
-	]],[[
-	  #ifdef __x86_64__
-	  __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1);
-	  __m128i c = _mm_shuffle_epi8(a, b);
-	  return *((char *) &c);
-	  #endif
-	]])], [
-        MSSSE3="$flags_needed"
-	sse4_prerequisites="o$sse4_prerequisites"
-	AC_SUBST([MSSSE3])
-        AC_DEFINE([HAVE_SSSE3],1,[Defined to 1 if rANS source using SSSE3 can be compiled.])
-])
-
-dnl Check if we can use popcnt instructions
-HTS_CHECK_COMPILE_FLAGS_NEEDED([popcnt], [-mpopcnt], [AC_LANG_PROGRAM([[
-	  #ifdef __x86_64__
-	  #include "x86intrin.h"
-	  #endif
-	]],[[
-	  #ifdef __x86_64__
-	  unsigned int i = _mm_popcnt_u32(1);
-	  return i != 1;
-	  #endif
-	]])], [
-        MPOPCNT="$flags_needed"
-	sse4_prerequisites="o$sse4_prerequisites"
-	AC_SUBST([MPOPCNT])
-        AC_DEFINE([HAVE_POPCNT],1,[Defined to 1 if rANS source using popcnt can be compiled.])
-])
-
-dnl Check if we can use our SSE4.1 too.  This *may* always imply SSSE3?
-dnl It may be easier just to target an old era of cpu than -mssse3 -msse4.1
-dnl -mpopcnt.  Eg -march=nehalem.  I don't know how wide spread that is.
-HTS_CHECK_COMPILE_FLAGS_NEEDED([sse4.1], [-msse4.1], [AC_LANG_PROGRAM([[
+dnl Check if we can use our SSE4.1 too.
+dnl Our SSE4 codec uses SSE4.1, SSSE3 (shuffle) and POPCNT, so we check all 3
+dnl together.  This helps Zig builds which don't work well if we test each
+dnl individually.
+HTS_CHECK_COMPILE_FLAGS_NEEDED([sse4.1], [-msse4.1 -mssse3 -mpopcnt], [AC_LANG_PROGRAM([[
 	  #ifdef __x86_64__
 	  #include "x86intrin.h"
 	  #endif
 	]],[[
 	  #ifdef __x86_64__
 	  __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1);
-	  __m128i c = _mm_max_epu32(a, b);
-	  return *((char *) &c);
+	  __m128i c = _mm_shuffle_epi8(_mm_max_epu32(a, b), b);
+	  return _mm_popcnt_u32(*((char *) &c));
 	  #endif
 	]])], [
         MSSE4_1="$flags_needed"
-	sse4_prerequisites="o$sse4_prerequisites"
+	build_rans_sse4=yes
 	AC_SUBST([MSSE4_1])
         AC_DEFINE([HAVE_SSE4_1],1,[Defined to 1 if rANS source using SSE4.1 can be compiled.])
+        AC_DEFINE([HAVE_SSSE3],1,[Defined to 1 if rANS source using SSSE3 can be compiled.])
+        AC_DEFINE([HAVE_POPCNT],1,[Defined to 1 if rANS source using popcnt can be compiled.])
 ])
-AM_CONDITIONAL([RANS_32x16_SSE4],[test "x$sse4_prerequisites" = "xooo"])
+AM_CONDITIONAL([RANS_32x16_SSE4],[test "$build_rans_sse4" = yes])
 
 dnl Check if we can use our AVX2 implementations.
 build_rans_avx2=no
-HTS_CHECK_COMPILE_FLAGS_NEEDED([avx2], [-mavx2], [AC_LANG_PROGRAM([[
+HTS_CHECK_COMPILE_FLAGS_NEEDED([avx2], [-mavx2 -mpopcnt], [AC_LANG_PROGRAM([[
 	  #ifdef __x86_64__
 	  #include "x86intrin.h"
 	  #endif
@@ -199,19 +164,20 @@ HTS_CHECK_COMPILE_FLAGS_NEEDED([avx2], [-mavx2], [AC_LANG_PROGRAM([[
 	  __m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
 	  __m256i b = _mm256_add_epi32(a, a);
 	  long long c = _mm256_extract_epi64(b, 0);
-	  return (int) c;
+	  return _mm_popcnt_u32((int)c);
 	  #endif
 	]])], [
         MAVX2="$flags_needed"
 	build_rans_avx2=yes
 	AC_SUBST([MAVX2])
         AC_DEFINE([HAVE_AVX2],1,[Defined to 1 if rANS source using AVX2 can be compiled.])
+        AC_DEFINE([HAVE_POPCNT],1,[Defined to 1 if rANS source using popcnt can be compiled.])
 ])
 AM_CONDITIONAL([RANS_32x16_AVX2],[test "$build_rans_avx2" = yes])
 
-dnl Check also if we have AVX512.  If so this overrides AVX2
+dnl Check also if we have AVX512.
 build_rans_avx512=no
-HTS_CHECK_COMPILE_FLAGS_NEEDED([avx512f], [-mavx512f], [AC_LANG_PROGRAM([[
+HTS_CHECK_COMPILE_FLAGS_NEEDED([avx512f], [-mavx512f -mpopcnt], [AC_LANG_PROGRAM([[
 	  #ifdef __x86_64__
 	  #include "x86intrin.h"
 	  #endif
@@ -219,7 +185,7 @@ HTS_CHECK_COMPILE_FLAGS_NEEDED([avx512f], [-mavx512f], [AC_LANG_PROGRAM([[
 	  #ifdef __x86_64__
 	  __m512i a = _mm512_set1_epi32(1);
 	  __m512i b = _mm512_add_epi32(a, a);
-	  return *((char *) &b);
+	  return _mm_popcnt_u32(*((char *) &b));
 	  #endif
 	]])], [
         MAVX512="$flags_needed"
@@ -227,6 +193,7 @@ HTS_CHECK_COMPILE_FLAGS_NEEDED([avx512f], [-mavx512f], [AC_LANG_PROGRAM([[
 	AC_SUBST([MAVX512])
         AC_DEFINE([HAVE_AVX512],1,[Defined to 1 if rANS source using AVX512F can be compiled.])
 ])
+        AC_DEFINE([HAVE_POPCNT],1,[Defined to 1 if rANS source using popcnt can be compiled.])
 AM_CONDITIONAL([RANS_32x16_AVX512],[test "$build_rans_avx512" = yes])
 
 AC_SUBST([HTSCODECS_SIMD_SRC])

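Since all three AC_DEFINEs now sit in the success action of a single probe,
a C consumer can treat HAVE_SSE4_1 as implying the other two.  A hedged,
illustrative guard only (have_all_sse4_parts is a hypothetical name and
config.h is assumed to be the usual generated autoconf header; the library
itself gates the SSE4 codec through the RANS_32x16_SSE4 automake
conditional instead):

    #include "config.h"
    #if defined(HAVE_SSE4_1) && defined(HAVE_SSSE3) && defined(HAVE_POPCNT)
    #include <x86intrin.h>
    /* Same intrinsics as the configure probe: SSSE3 shuffle, SSE4.1 max and
     * POPCNT, compiled with the flags configure placed in @MSSE4_1@. */
    static inline int have_all_sse4_parts(void) {
        __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1);
        __m128i c = _mm_shuffle_epi8(_mm_max_epu32(a, b), b);
        return _mm_popcnt_u32((unsigned)_mm_cvtsi128_si32(c));
    }
    #endif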

=====================================
htscodecs/Makefile.am
=====================================
@@ -72,7 +72,7 @@ noinst_LTLIBRARIES =
 if RANS_32x16_SSE4
 noinst_LTLIBRARIES += librANS_static32x16pr_sse4.la
 librANS_static32x16pr_sse4_la_SOURCES = rANS_static32x16pr_sse4.c
-librANS_static32x16pr_sse4_la_CFLAGS = @MSSE4_1@ @MSSSE3@ @MPOPCNT@
+librANS_static32x16pr_sse4_la_CFLAGS = @MSSE4_1@
 libhtscodecs_la_LIBADD += librANS_static32x16pr_sse4.la
 endif
 if RANS_32x16_AVX2


=====================================
htscodecs/fqzcomp_qual.c
=====================================
@@ -911,7 +911,7 @@ int fqz_pick_parameters(fqz_gparams *gp,
         // NB: stab is already all zero
     }
 
-    if (gp->max_sel) {
+    if (gp->max_sel && s->num_records) {
         int max = 0;
         for (i = 0; i < s->num_records; i++) {
             if (max < (s->flags[i] >> 16))
@@ -1117,6 +1117,12 @@ unsigned char *compress_block_fqz2f(int vers,
 
     for (i = 0; i < in_size; i++) {
         if (state.p == 0) {
+            if (state.rec >= s->num_records || s->len[state.rec] <= 0) {
+                free(comp);
+                comp = NULL;
+                goto err;
+            }
+
             if (compress_new_read(s, &state, gp, pm, &model, &rc,
                                   in, &i, /*&rec,*/ &last))
                 continue;


=====================================
htscodecs/htscodecs.h
=====================================
@@ -43,7 +43,7 @@
  * Note currently this needs manually editing as it isn't automatically
  * updated by autoconf.
  */
-#define HTSCODECS_VERSION 100502
+#define HTSCODECS_VERSION 100600
 
 /*
  * A const string form of the HTSCODECS_VERSION define.

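For downstream code, the numeric define appears to encode the version as
major*100000 + minor*100 + patch (100502 for 1.5.2, 100600 for 1.6.0), so a
compile-time minimum-version gate is just an integer comparison.  A hedged
sketch, assuming the header installs as htscodecs/htscodecs.h:

    #include "htscodecs/htscodecs.h"

    /* Require at least the 1.6.0 fixes; the encoding is inferred from the
     * 100502 -> 1.5.2 and 100600 -> 1.6.0 values above. */
    #if HTSCODECS_VERSION < 100600
    #  error "htscodecs >= 1.6.0 required"
    #endif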

=====================================
htscodecs/rANS_static16_int.h
=====================================
@@ -537,15 +537,13 @@ static inline int decode_freq1(uint8_t *cp, uint8_t *cp_end, int shift,
 
 // Build s3 symbol lookup table.
 // This is 12 bit freq, 12 bit bias and 8 bit symbol.
-static inline int rans_F_to_s3(uint32_t *F, int shift, uint32_t *s3) {
-    int j, x, y;
+static inline int rans_F_to_s3(const uint32_t *F, int shift, uint32_t *s3) {
+    int j, x;
     for (j = x = 0; j < 256; j++) {
-        if (F[j]) {
-            if (F[j] > (1<<shift) - x)
-                return 1;
-            for (y = 0; y < F[j]; y++)
-                s3[y+x] = (((uint32_t)F[j])<<(shift+8))|(y<<8)|j;
-            x += F[j];
+        if (F[j] && F[j] <= (1<<shift) - x) {
+            uint32_t base = (((uint32_t)F[j])<<(shift+8))|j, y;
+            for (y = 0; y < F[j]; y++, x++)
+                s3[x] = base + (y<<8);
         }
     }
 

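For context on the rans_F_to_s3 change: each s3[] entry packs frequency,
bias and symbol into one 32-bit word (freq << (shift+8) | bias << 8 |
symbol), so the order-0 decoder recovers all three with shifts and masks.
A scalar sketch of how one entry is consumed (rans_decode_step is an
illustrative name; renormalisation from the byte stream is omitted):

    #include <stdint.h>

    static inline uint8_t rans_decode_step(uint32_t *R, const uint32_t *s3,
                                           int shift) {
        uint32_t mask = (1u << shift) - 1;
        uint32_t S    = s3[*R & mask];
        uint8_t  sym  = S & 0xff;            /* 8-bit symbol */
        uint32_t bias = (S >> 8) & mask;     /* offset within symbol's range */
        uint32_t freq = S >> (shift + 8);    /* symbol frequency */
        *R = freq * (*R >> shift) + bias;    /* rANS state advance */
        return sym;
    }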

=====================================
htscodecs/rANS_static32x16pr_avx2.c
=====================================
@@ -115,14 +115,41 @@ static inline __m256i _mm256_mulhi_epu32(__m256i a, __m256i b) {
 }
 #endif
 
-#if 0
-// Simulated gather.  This is sometimes faster as it can run on other ports.
+#ifndef USE_GATHER
+// Simulated gather.  This is sometimes faster if gathers are slow, either
+// due to the particular implementation (maybe on Zen4) or because of
+// a microcode patch such as Intel's Downfall fix.
 static inline __m256i _mm256_i32gather_epi32x(int *b, __m256i idx, int size) {
+    volatile // force the store to happen, hence forcing scalar loads
     int c[8] __attribute__((aligned(32)));
     _mm256_store_si256((__m256i *)c, idx);
-    return _mm256_set_epi32(b[c[7]], b[c[6]], b[c[5]], b[c[4]],
-                            b[c[3]], b[c[2]], b[c[1]], b[c[0]]);
+
+    // Fast with modern gccs, and no change with clang.
+    // Equivalent to:
+    //     return _mm256_set_epi32(b[c[7]], b[c[6]], b[c[5]], b[c[4]],
+    //                             b[c[3]], b[c[2]], b[c[1]], b[c[0]]);
+    register int bc1 = b[c[1]];
+    register int bc3 = b[c[3]];
+    register int bc5 = b[c[5]];
+    register int bc7 = b[c[7]];
+
+    __m128i x0a = _mm_cvtsi32_si128(b[c[0]]);
+    __m128i x1a = _mm_cvtsi32_si128(b[c[2]]);
+    __m128i x2a = _mm_cvtsi32_si128(b[c[4]]);
+    __m128i x3a = _mm_cvtsi32_si128(b[c[6]]);
+
+    __m128i x0 = _mm_insert_epi32(x0a, bc1, 1);
+    __m128i x1 = _mm_insert_epi32(x1a, bc3, 1);
+    __m128i x2 = _mm_insert_epi32(x2a, bc5, 1);
+    __m128i x3 = _mm_insert_epi32(x3a, bc7, 1);
+
+    __m128i x01 = _mm_unpacklo_epi64(x0, x1);
+    __m128i x23 = _mm_unpacklo_epi64(x2, x3);
+
+    __m256i y =_mm256_castsi128_si256(x01);
+    return _mm256_inserti128_si256(y, x23, 1);
 }
+
 #else
 #define _mm256_i32gather_epi32x _mm256_i32gather_epi32
 #endif
@@ -501,11 +528,15 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in,
         //for (z = 0; z < NX; z++)
         //  m[z] = R[z] & mask;
         __m256i masked1 = _mm256_and_si256(Rv1, maskv);
-        __m256i masked2 = _mm256_and_si256(Rv2, maskv);
+	__m256i masked2 = _mm256_and_si256(Rv2, maskv);
+        __m256i masked3 = _mm256_and_si256(Rv3, maskv);
+        __m256i masked4 = _mm256_and_si256(Rv4, maskv);
 
-        //  S[z] = s3[m[z]];
-        __m256i Sv1 = _mm256_i32gather_epi32x((int *)s3, masked1, sizeof(*s3));
+	//  S[z] = s3[m[z]];
+	__m256i Sv1 = _mm256_i32gather_epi32x((int *)s3, masked1, sizeof(*s3));
         __m256i Sv2 = _mm256_i32gather_epi32x((int *)s3, masked2, sizeof(*s3));
+        __m256i Sv3 = _mm256_i32gather_epi32x((int *)s3, masked3, sizeof(*s3));
+        __m256i Sv4 = _mm256_i32gather_epi32x((int *)s3, masked4, sizeof(*s3));
 
         //  f[z] = S[z]>>(TF_SHIFT+8);
         __m256i fv1 = _mm256_srli_epi32(Sv1, TF_SHIFT+8);
@@ -526,15 +557,7 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in,
         Rv2 = _mm256_add_epi32(
                   _mm256_mullo_epi32(
                       _mm256_srli_epi32(Rv2,TF_SHIFT), fv2), bv2);
-#ifdef __clang__
-        // Protect against running off the end of in buffer.
-        // We copy it to a worst-case local buffer when near the end.
-        if ((uint8_t *)sp > cp_end) {
-            memmove(overflow, sp, cp_end+64 - (uint8_t *)sp);
-            sp = (uint16_t *)overflow;
-            cp_end = overflow + sizeof(overflow) - 64;
-        }
-#endif
+
         // Tricky one:  out[i+z] = s[z];
         //             ---h---g ---f---e  ---d---c ---b---a
         //             ---p---o ---n---m  ---l---k ---j---i
@@ -543,6 +566,15 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in,
         // packs_epi16 ponmlkji ponmlkji  hgfedcba hgfedcba
         sv1 = _mm256_packus_epi32(sv1, sv2);
         sv1 = _mm256_permute4x64_epi64(sv1, 0xd8);
+
+        // Protect against running off the end of in buffer.
+        // We copy it to a worst-case local buffer when near the end.
+        if ((uint8_t *)sp > cp_end) {
+            memmove(overflow, sp, cp_end+64 - (uint8_t *)sp);
+            sp = (uint16_t *)overflow;
+            cp_end = overflow + sizeof(overflow) - 64;
+        }
+
         __m256i Vv1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
         sv1 = _mm256_packus_epi16(sv1, sv1);
 
@@ -583,18 +615,6 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in,
         Rv2 = _mm256_blendv_epi8(Rv2, Yv2, renorm_mask2);
 
         // ------------------------------------------------------------
-
-        //  m[z] = R[z] & mask;
-        //  S[z] = s3[m[z]];
-        __m256i masked3 = _mm256_and_si256(Rv3, maskv);
-        __m256i Sv3 = _mm256_i32gather_epi32x((int *)s3, masked3, sizeof(*s3));
-
-        *(uint64_t *)&out[i+0] = _mm256_extract_epi64(sv1, 0);
-        *(uint64_t *)&out[i+8] = _mm256_extract_epi64(sv1, 2);
-
-        __m256i masked4 = _mm256_and_si256(Rv4, maskv);
-        __m256i Sv4 = _mm256_i32gather_epi32x((int *)s3, masked4, sizeof(*s3));
-
         //  f[z] = S[z]>>(TF_SHIFT+8);
         __m256i fv3 = _mm256_srli_epi32(Sv3, TF_SHIFT+8);
         __m256i fv4 = _mm256_srli_epi32(Sv4, TF_SHIFT+8);
@@ -625,12 +645,15 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in,
         renorm_mask3 = _mm256_cmplt_epu32_imm(Rv3, RANS_BYTE_L);
         sv3 = _mm256_packus_epi16(sv3, sv3);
         renorm_mask4 = _mm256_cmplt_epu32_imm(Rv4, RANS_BYTE_L);
-        
-        *(uint64_t *)&out[i+16] = _mm256_extract_epi64(sv3, 0);
-        *(uint64_t *)&out[i+24] = _mm256_extract_epi64(sv3, 2);
 
         // y = (R[z] << 16) | V[z];
         __m256i Vv3 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
+
+        *(uint64_t *)&out[i+0]  = _mm256_extract_epi64(sv1, 0);
+        *(uint64_t *)&out[i+8]  = _mm256_extract_epi64(sv1, 2);
+        *(uint64_t *)&out[i+16] = _mm256_extract_epi64(sv3, 0);
+        *(uint64_t *)&out[i+24] = _mm256_extract_epi64(sv3, 2);
+
         __m256i Yv3 = _mm256_slli_epi32(Rv3, 16);
         unsigned int imask3 = _mm256_movemask_ps((__m256)renorm_mask3);
         __m256i idx3 = _mm256_load_si256((const __m256i*)permute[imask3]);
@@ -650,19 +673,6 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in,
         Yv4 = _mm256_or_si256(Yv4, Vv4);
         sp += _mm_popcnt_u32(imask4);
 
-#ifndef __clang__
-        // 26% faster here than above for gcc10, but former location is
-        // better on clang.
-
-        // Protect against running off the end of in buffer.
-        // We copy it to a worst-case local buffer when near the end.
-        if ((uint8_t *)sp > cp_end) {
-            memmove(overflow, sp, cp_end+64 - (uint8_t *)sp);
-            sp = (uint16_t *)overflow;
-            cp_end = overflow + sizeof(overflow) - 64;
-        }
-#endif
-
         // R[z] = c ? Y[z] : R[z];
         Rv3 = _mm256_blendv_epi8(Rv3, Yv3, renorm_mask3);
         Rv4 = _mm256_blendv_epi8(Rv4, Yv4, renorm_mask4);
@@ -749,6 +759,10 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
 
     uint16_t *ptr16 = (uint16_t *)ptr;
 
+//       clang16      clang10      gcc7         gcc13
+//       587 435 381  588 438 403  504 386 415  527 381 394
+// simT  611 432 402  475 401 367  472 422 386  486 353 324
+
     LOAD(Rv, ransN);
 
     for (; iN[0] >= 0; ) {
@@ -782,72 +796,72 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
         // [2] B2......
         // [3] ......B3  OR to get B2B1B0B3 and shuffle to B3B2B1B0
 
-        __m256i sh[16];
-        for (z = 0; z < 16; z+=4) {
-            int Z = z*2;
+        __m256i xmaxv[4];
+        __m256i rfv[4];
+        __m256i SDv[4];
+        __m256i biasv[4];
+
+        const __m256i xA = _mm256_set_epi32(0,0,0,-1, 0,0,0,-1);
+        const __m256i xB = _mm256_set_epi32(0,0,-1,0, 0,0,-1,0);
+        const __m256i xC = _mm256_set_epi32(0,-1,0,0, 0,-1,0,0);
+        const __m256i xD = _mm256_set_epi32(-1,0,0,0, -1,0,0,0);
 
+        for (z = 0; z < 32; z += 8) {
 #define m128_to_256 _mm256_castsi128_si256
-            __m256i t0, t1, t2, t3;
-            __m128i *s0, *s1, *s2, *s3;
-            s0 = (__m128i *)(&syms[in[iN[Z+0]]][lN[Z+0]]);
-            s1 = (__m128i *)(&syms[in[iN[Z+4]]][lN[Z+4]]);
-            s2 = (__m128i *)(&syms[in[iN[Z+1]]][lN[Z+1]]);
-            s3 = (__m128i *)(&syms[in[iN[Z+5]]][lN[Z+5]]);
+            __m128i *s0 = (__m128i *)(&syms[in[iN[z+0]]][lN[z+0]]);
+            __m128i *s1 = (__m128i *)(&syms[in[iN[z+4]]][lN[z+4]]);
+            __m128i *s2 = (__m128i *)(&syms[in[iN[z+1]]][lN[z+1]]);
+            __m128i *s3 = (__m128i *)(&syms[in[iN[z+5]]][lN[z+5]]);
 
+            __m256i t0, t1, t2, t3;
             t0 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s0)), 0xE4);
             t1 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s1)), 0xE4);
             t2 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s2)), 0x93);
             t3 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s3)), 0x93);
 
-            lN[Z+0] = in[iN[Z+0]];
-            lN[Z+4] = in[iN[Z+4]];
-            lN[Z+1] = in[iN[Z+1]];
-            lN[Z+5] = in[iN[Z+5]];
-
-            sh[z+0] = _mm256_permute2x128_si256(t0, t1, 0x20);
-            sh[z+1] = _mm256_permute2x128_si256(t2, t3, 0x20);
-
-            s0 = (__m128i *)(&syms[in[iN[Z+2]]][lN[Z+2]]);
-            s1 = (__m128i *)(&syms[in[iN[Z+6]]][lN[Z+6]]);
-            s2 = (__m128i *)(&syms[in[iN[Z+3]]][lN[Z+3]]);
-            s3 = (__m128i *)(&syms[in[iN[Z+7]]][lN[Z+7]]);
-
-            t0 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s0)), 0x4E);
-            t1 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s1)), 0x4E);
-            t2 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s2)), 0x39);
-            t3 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s3)), 0x39);
-
-            lN[Z+2] = in[iN[Z+2]];
-            lN[Z+6] = in[iN[Z+6]];
-            lN[Z+3] = in[iN[Z+3]];
-            lN[Z+7] = in[iN[Z+7]];
-
-            sh[z+2] = _mm256_permute2x128_si256(t0, t1, 0x20);
-            sh[z+3] = _mm256_permute2x128_si256(t2, t3, 0x20);
-
-            // potential to set xmax, rf, bias, and SD in-situ here, removing
-            // the need to hold sh[] in regs.  Doing so doesn't seem to speed
-            // things up though.
+            __m256i sh0 = _mm256_permute2x128_si256(t0, t1, 0x20);
+            __m256i sh1 = _mm256_permute2x128_si256(t2, t3, 0x20);
+
+            lN[z+0] = in[iN[z+0]];
+            lN[z+4] = in[iN[z+4]];
+            lN[z+1] = in[iN[z+1]];
+            lN[z+5] = in[iN[z+5]];
+
+            // Initialise first half of xmax, rf, SD and bias vectors
+            __m128i *s4 = (__m128i *)(&syms[in[iN[z+2]]][lN[z+2]]);
+            __m128i *s5 = (__m128i *)(&syms[in[iN[z+6]]][lN[z+6]]);
+            __m128i *s6 = (__m128i *)(&syms[in[iN[z+3]]][lN[z+3]]);
+            __m128i *s7 = (__m128i *)(&syms[in[iN[z+7]]][lN[z+7]]);
+
+            __m256i t4, t5, t6, t7;
+            t4 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s4)), 0x4E);
+            t5 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s5)), 0x4E);
+            t6 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s6)), 0x39);
+            t7 = _mm256_shuffle_epi32(m128_to_256(_mm_loadu_si128(s7)), 0x39);
+
+            __m256i sh2 = _mm256_permute2x128_si256(t4, t5, 0x20);
+            __m256i sh3 = _mm256_permute2x128_si256(t6, t7, 0x20);
+
+            lN[z+2] = in[iN[z+2]];
+            lN[z+6] = in[iN[z+6]];
+            lN[z+3] = in[iN[z+3]];
+            lN[z+7] = in[iN[z+7]];
+
+#define SH_LOAD(A, B, C, D)                                        \
+            _mm256_or_si256(_mm256_or_si256(_mm256_and_si256(sh0, A), \
+                                            _mm256_and_si256(sh1, B)),\
+                            _mm256_or_si256(_mm256_and_si256(sh2, C), \
+                                            _mm256_and_si256(sh3, D)))
+            xmaxv[z/8] = SH_LOAD(xA, xB, xC, xD);
+            rfv  [z/8] = SH_LOAD(xB, xC, xD, xA);
+            SDv  [z/8] = SH_LOAD(xD, xA, xB, xC);
+            biasv[z/8] = SH_LOAD(xC, xD, xA, xB);
+
+            rfv  [z/8] = _mm256_shuffle_epi32(rfv  [z/8],  0x39);
+            SDv  [z/8] = _mm256_shuffle_epi32(SDv  [z/8],  0x93);
+            biasv[z/8] = _mm256_shuffle_epi32(biasv[z/8],0x4E);
         }
 
-        __m256i xA = _mm256_set_epi32(0,0,0,-1, 0,0,0,-1);
-        __m256i xB = _mm256_set_epi32(0,0,-1,0, 0,0,-1,0);
-        __m256i xC = _mm256_set_epi32(0,-1,0,0, 0,-1,0,0);
-        __m256i xD = _mm256_set_epi32(-1,0,0,0, -1,0,0,0);
-
-        // Extract 32-bit xmax elements from syms[] data (in sh vec array)
-/*
-#define SYM_LOAD(x, A, B, C, D)                                         \
-        _mm256_or_si256(_mm256_or_si256(_mm256_and_si256(sh[x+0], A),   \
-                                        _mm256_and_si256(sh[x+1], B)),  \
-                        _mm256_or_si256(_mm256_and_si256(sh[x+2], C),   \
-                                        _mm256_and_si256(sh[x+3], D)))
-*/
-        __m256i xmax1 = SYM_LOAD( 0, xA, xB, xC, xD);
-        __m256i xmax2 = SYM_LOAD( 4, xA, xB, xC, xD);
-        __m256i xmax3 = SYM_LOAD( 8, xA, xB, xC, xD);
-        __m256i xmax4 = SYM_LOAD(12, xA, xB, xC, xD);
-
         // ------------------------------------------------------------
         //      for (z = NX-1; z >= 0; z--) {
         //          if (ransN[z] >= x_max[z]) {
@@ -855,10 +869,10 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
         //              ransN[z] >>= 16;
         //          }
         //      }
-        __m256i cv1 = _mm256_cmpgt_epi32(Rv1, xmax1);
-        __m256i cv2 = _mm256_cmpgt_epi32(Rv2, xmax2);
-        __m256i cv3 = _mm256_cmpgt_epi32(Rv3, xmax3);
-        __m256i cv4 = _mm256_cmpgt_epi32(Rv4, xmax4);
+        __m256i cv1 = _mm256_cmpgt_epi32(Rv1, xmaxv[0]);
+        __m256i cv2 = _mm256_cmpgt_epi32(Rv2, xmaxv[1]);
+        __m256i cv3 = _mm256_cmpgt_epi32(Rv3, xmaxv[2]);
+        __m256i cv4 = _mm256_cmpgt_epi32(Rv4, xmaxv[3]);
 
         // Store bottom 16-bits at ptr16
         //
@@ -894,10 +908,6 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
         V12 = _mm256_permute4x64_epi64(V12, 0xd8);
         V34 = _mm256_permute4x64_epi64(V34, 0xd8);
 
-        // Load rcp_freq ready for later
-        __m256i rfv1 = _mm256_shuffle_epi32(SYM_LOAD( 0, xB, xC, xD, xA),0x39);
-        __m256i rfv2 = _mm256_shuffle_epi32(SYM_LOAD( 4, xB, xC, xD, xA),0x39);
-
         // Now we have bottom N 16-bit values in each V12/V34 to flush
         __m128i f =  _mm256_extractf128_si256(V34, 1);
         _mm_storeu_si128((__m128i *)(ptr16-8), f);
@@ -915,9 +925,6 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
         _mm_storeu_si128((__m128i *)(ptr16-8), f);
         ptr16 -= _mm_popcnt_u32(imask1);
 
-        __m256i rfv3 = _mm256_shuffle_epi32(SYM_LOAD( 8, xB, xC, xD, xA),0x39);
-        __m256i rfv4 = _mm256_shuffle_epi32(SYM_LOAD(12, xB, xC, xD, xA),0x39);
-
         __m256i Rs1, Rs2, Rs3, Rs4;
         Rs1 = _mm256_srli_epi32(Rv1,16);
         Rs2 = _mm256_srli_epi32(Rv2,16);
@@ -941,54 +948,41 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
         // (AVX512 allows us to hold it all in 64-bit lanes and use mullo_epi64
         // plus a shift.  KNC has mulhi_epi32, but not sure if this is
         // available.)
-        rfv1 = _mm256_mulhi_epu32(Rv1, rfv1);
-        rfv2 = _mm256_mulhi_epu32(Rv2, rfv2);
-        rfv3 = _mm256_mulhi_epu32(Rv3, rfv3);
-        rfv4 = _mm256_mulhi_epu32(Rv4, rfv4);
+        rfv[0] = _mm256_mulhi_epu32(Rv1, rfv[0]);
+        rfv[1] = _mm256_mulhi_epu32(Rv2, rfv[1]);
+        rfv[2] = _mm256_mulhi_epu32(Rv3, rfv[2]);
+        rfv[3] = _mm256_mulhi_epu32(Rv4, rfv[3]);
 
-        // Load cmpl_freq / rcp_shift from syms
-        __m256i SDv1 = _mm256_shuffle_epi32(SYM_LOAD( 0, xD, xA, xB, xC),0x93);
-        __m256i SDv2 = _mm256_shuffle_epi32(SYM_LOAD( 4, xD, xA, xB, xC),0x93);
-        // Load bias from syms[]
-        __m256i biasv1=_mm256_shuffle_epi32(SYM_LOAD( 0, xC, xD, xA, xB),0x4E);
-        __m256i biasv2=_mm256_shuffle_epi32(SYM_LOAD( 4, xC, xD, xA, xB),0x4E);
-
-        __m256i shiftv1 = _mm256_srli_epi32(SDv1, 16);
-        __m256i shiftv2 = _mm256_srli_epi32(SDv2, 16);
-
-        __m256i SDv3 = _mm256_shuffle_epi32(SYM_LOAD( 8, xD, xA, xB, xC),0x93);
-        __m256i SDv4 = _mm256_shuffle_epi32(SYM_LOAD(12, xD, xA, xB, xC),0x93);
-        __m256i biasv3=_mm256_shuffle_epi32(SYM_LOAD( 8, xC, xD, xA, xB),0x4E);
-        __m256i biasv4=_mm256_shuffle_epi32(SYM_LOAD(12, xC, xD, xA, xB),0x4E);
-
-        __m256i shiftv3 = _mm256_srli_epi32(SDv3, 16);
-        __m256i shiftv4 = _mm256_srli_epi32(SDv4, 16);
+        __m256i shiftv1 = _mm256_srli_epi32(SDv[0], 16);
+        __m256i shiftv2 = _mm256_srli_epi32(SDv[1], 16);
+        __m256i shiftv3 = _mm256_srli_epi32(SDv[2], 16);
+        __m256i shiftv4 = _mm256_srli_epi32(SDv[3], 16);
 
         shiftv1 = _mm256_sub_epi32(shiftv1, _mm256_set1_epi32(32));
         shiftv2 = _mm256_sub_epi32(shiftv2, _mm256_set1_epi32(32));
         shiftv3 = _mm256_sub_epi32(shiftv3, _mm256_set1_epi32(32));
         shiftv4 = _mm256_sub_epi32(shiftv4, _mm256_set1_epi32(32));
 
-        __m256i qv1 = _mm256_srlv_epi32(rfv1, shiftv1);
-        __m256i qv2 = _mm256_srlv_epi32(rfv2, shiftv2);
+        __m256i qv1 = _mm256_srlv_epi32(rfv[0], shiftv1);
+        __m256i qv2 = _mm256_srlv_epi32(rfv[1], shiftv2);
 
-        __m256i freqv1 = _mm256_and_si256(SDv1, _mm256_set1_epi32(0xffff));
-        __m256i freqv2 = _mm256_and_si256(SDv2, _mm256_set1_epi32(0xffff));
+        __m256i freqv1 = _mm256_and_si256(SDv[0], _mm256_set1_epi32(0xffff));
+        __m256i freqv2 = _mm256_and_si256(SDv[1], _mm256_set1_epi32(0xffff));
         qv1 = _mm256_mullo_epi32(qv1, freqv1);
         qv2 = _mm256_mullo_epi32(qv2, freqv2);
 
-        __m256i qv3 = _mm256_srlv_epi32(rfv3, shiftv3);
-        __m256i qv4 = _mm256_srlv_epi32(rfv4, shiftv4);
+        __m256i qv3 = _mm256_srlv_epi32(rfv[2], shiftv3);
+        __m256i qv4 = _mm256_srlv_epi32(rfv[3], shiftv4);
 
-        __m256i freqv3 = _mm256_and_si256(SDv3, _mm256_set1_epi32(0xffff));
-        __m256i freqv4 = _mm256_and_si256(SDv4, _mm256_set1_epi32(0xffff));
+        __m256i freqv3 = _mm256_and_si256(SDv[2], _mm256_set1_epi32(0xffff));
+        __m256i freqv4 = _mm256_and_si256(SDv[3], _mm256_set1_epi32(0xffff));
         qv3 = _mm256_mullo_epi32(qv3, freqv3);
         qv4 = _mm256_mullo_epi32(qv4, freqv4);
 
-        qv1 = _mm256_add_epi32(qv1, biasv1);
-        qv2 = _mm256_add_epi32(qv2, biasv2);
-        qv3 = _mm256_add_epi32(qv3, biasv3);
-        qv4 = _mm256_add_epi32(qv4, biasv4);
+        qv1 = _mm256_add_epi32(qv1, biasv[0]);
+        qv2 = _mm256_add_epi32(qv2, biasv[1]);
+        qv3 = _mm256_add_epi32(qv3, biasv[2]);
+        qv4 = _mm256_add_epi32(qv4, biasv[3]);
 
         for (z = 0; z < NX; z++)
             iN[z]--;
@@ -1539,27 +1533,11 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
             sv3 = _mm256_permute4x64_epi64(sv3, 0xd8); // shuffle;  AaBb
             sv3 = _mm256_packus_epi16(sv3, sv3);       // 16 to 8
 
-            // Method 1
             u.tbuf64[tidx][0] = _mm256_extract_epi64(sv1, 0);
             u.tbuf64[tidx][1] = _mm256_extract_epi64(sv1, 2);
             u.tbuf64[tidx][2] = _mm256_extract_epi64(sv3, 0);
             u.tbuf64[tidx][3] = _mm256_extract_epi64(sv3, 2);
 
-//          // Method 2
-//          sv1 = _mm256_permute4x64_epi64(sv1, 8); // x x 10 00
-//          _mm_storeu_si128((__m128i *)&u.tbuf64[tidx][0],
-//                           _mm256_extractf128_si256(sv1, 0));
-//          sv3 = _mm256_permute4x64_epi64(sv3, 8); // x x 10 00
-//          _mm_storeu_si128((__m128i *)&u.tbuf64[tidx][2],
-//                           _mm256_extractf128_si256(sv3, 0));
-
-//          // Method 3
-//          sv1 = _mm256_and_si256(sv1, _mm256_set_epi64x(0,-1,0,-1)); // AxBx
-//          sv3 = _mm256_and_si256(sv3, _mm256_set_epi64x(-1,0,-1,0)); // xCxD
-//          sv1 = _mm256_or_si256(sv1, sv3);                           // ACBD
-//          sv1 = _mm256_permute4x64_epi64(sv1, 0xD8); //rev 00 10 01 11; ABCD
-//          _mm256_storeu_si256((__m256i *)u.tbuf64[tidx], sv1);
-
             iN[0]++;
             if (++tidx == 32) {
                 iN[0]-=32;


=====================================
htscodecs/rANS_static32x16pr_avx512.c
=====================================
@@ -55,6 +55,74 @@
 #include "varint.h"
 #include "utils.h"
 
+#ifndef USE_GATHER
+
+// Speeds with Downfall mitigation Off/On and Zen4.
+//         <------ AVX512 --------->   <------- AVX2 -------->
+// -o4:    IntelOff IntelOn  AMDZen4   IntelOff IntelOn  AMDZen4
+// gcc7    544/1673 562/1711 448/1818  649/1515 645/1525 875/1624
+// gcc13   557/1672 576/1711 582/1761  630/1623 629/1652 866/1762
+// clang10 541/1547 564/1630 807/1912  620/1456 637/1481 837/1606
+// clang16 533/1431 555/1510 890/1611  629/1370 627/1406 996/1432
+//
+// Zen4 encode is particularly slow with gcc, but AVX2 encode is
+// faster and we use that already.
+static inline __m512i _mm512_i32gather_epi32x(__m512i idx, void *v, int size) {
+    uint32_t *b = (uint32_t *)v;
+
+#ifndef __clang__
+    volatile
+#endif
+    int c[16] __attribute__((aligned(32)));
+
+    //_mm512_store_si512((__m512i *)c, idx); // equivalent, but slower
+    _mm256_store_si256((__m256i *)(c),   _mm512_castsi512_si256(idx));
+    _mm256_store_si256((__m256i *)(c+8), _mm512_extracti64x4_epi64(idx, 1));
+
+    __m128i x0 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[0]]), b[c[1]], 1);
+    __m128i x1 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[2]]), b[c[3]], 1);
+    __m128i x2 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[4]]), b[c[5]], 1);
+    __m128i x3 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[6]]), b[c[7]], 1);
+
+    __m128i x01 = _mm_unpacklo_epi64(x0, x1);
+    __m128i x23 = _mm_unpacklo_epi64(x2, x3);
+    __m256i y0 =_mm256_castsi128_si256(x01);
+
+    __m128i x4 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[ 8]]), b[c[ 9]], 1);
+    __m128i x5 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[10]]), b[c[11]], 1);
+    __m128i x6 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[12]]), b[c[13]], 1);
+    __m128i x7 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[14]]), b[c[15]], 1);
+
+    __m128i x45 = _mm_unpacklo_epi64(x4, x5);
+    __m128i x67 = _mm_unpacklo_epi64(x6, x7);
+    __m256i y1 =_mm256_castsi128_si256(x45);
+
+    y0 = _mm256_inserti128_si256(y0, x23, 1);
+    y1 = _mm256_inserti128_si256(y1, x67, 1);
+
+    return _mm512_inserti64x4(_mm512_castsi256_si512(y0), y1, 1);
+}
+
+// 32-bit indices, 8-bit quantities into 32-bit lanes
+static inline __m512i _mm512_i32gather_epi32x1(__m512i idx, void *v) {
+    uint8_t *b = (uint8_t *)v;
+    volatile int c[16] __attribute__((aligned(32)));
+
+    //_mm512_store_si512((__m512i *)c, idx); // equivalent, but slower
+    _mm256_store_si256((__m256i *)(c),   _mm512_castsi512_si256(idx));
+    _mm256_store_si256((__m256i *)(c+8), _mm512_extracti64x4_epi64(idx, 1));
+
+    return _mm512_set_epi32(b[c[15]], b[c[14]], b[c[13]], b[c[12]],
+                            b[c[11]], b[c[10]], b[c[9]], b[c[8]],
+                            b[c[7]], b[c[6]], b[c[5]], b[c[4]],
+                            b[c[3]], b[c[2]], b[c[1]], b[c[0]]);
+}
+
+#else
+// real gathers
+#define _mm512_i32gather_epi32x _mm512_i32gather_epi32
+#endif
+
 unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in,
                                              unsigned int in_size,
                                              unsigned char *out,
@@ -149,8 +217,8 @@ unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in,
         __m512i c1 = _mm512_cvtepu8_epi32(_mm256_extracti128_si256(c12,0));
         __m512i c2 = _mm512_cvtepu8_epi32(_mm256_extracti128_si256(c12,1));
 #define SET512(a,b) \
-        __m512i a##1 = _mm512_i32gather_epi32(c1, b, 4); \
-        __m512i a##2 = _mm512_i32gather_epi32(c2, b, 4)
+        __m512i a##1 = _mm512_i32gather_epi32x(c1, b, 4); \
+        __m512i a##2 = _mm512_i32gather_epi32x(c2, b, 4)
 
         SET512(xmax, SB);
 
@@ -162,7 +230,6 @@ unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in,
         SET512(SDv,  SD);
         int pc2 = _mm_popcnt_u32(gt_mask2);
 
-        //Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1);
         Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1);
         Rp2 = _mm512_maskz_compress_epi32(gt_mask2, Rp2);
 
@@ -237,7 +304,7 @@ unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in,
 
     for (z = 32-1; z >= 0; z--)
         RansEncFlush(&ransN[z], &ptr);
-    
+
  empty:
     // Finalise block size and return it
     *out_size = (out_end - ptr) + tab_size;
@@ -305,8 +372,8 @@ unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in,
     // loop for the next cycle so we can remove some of the instr. latency.
     __m512i masked1 = _mm512_and_epi32(R1, maskv);
     __m512i masked2 = _mm512_and_epi32(R2, maskv);
-    __m512i S1 = _mm512_i32gather_epi32(masked1, (int *)s3, sizeof(*s3));
-    __m512i S2 = _mm512_i32gather_epi32(masked2, (int *)s3, sizeof(*s3));
+    __m512i S1 = _mm512_i32gather_epi32x(masked1, (int *)s3, sizeof(*s3));
+    __m512i S2 = _mm512_i32gather_epi32x(masked2, (int *)s3, sizeof(*s3));
 
     uint8_t overflow[64+64] = {0};
     for (i=0; i < out_end; i+=32) {
@@ -334,13 +401,13 @@ unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in,
       R1 = _mm512_add_epi32(
                _mm512_mullo_epi32(
                    _mm512_srli_epi32(R1, TF_SHIFT), f1), b1);
+      __mmask16 renorm_mask1, renorm_mask2;
+      renorm_mask1=_mm512_cmplt_epu32_mask(R1, _mm512_set1_epi32(RANS_BYTE_L));
       R2 = _mm512_add_epi32(
                _mm512_mullo_epi32(
                    _mm512_srli_epi32(R2, TF_SHIFT), f2), b2);
 
       // renorm. this is the interesting part:
-      __mmask16 renorm_mask1, renorm_mask2;
-      renorm_mask1=_mm512_cmplt_epu32_mask(R1, _mm512_set1_epi32(RANS_BYTE_L));
       renorm_mask2=_mm512_cmplt_epu32_mask(R2, _mm512_set1_epi32(RANS_BYTE_L));
       // advance by however many words we actually read
       sp += _mm_popcnt_u32(renorm_mask1);
@@ -349,34 +416,39 @@ unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in,
 
       // select masked only
       __m512i renorm_vals1, renorm_vals2;
-      renorm_vals1 = _mm512_maskz_expand_epi32(renorm_mask1, renorm_words1);
-      renorm_vals2 = _mm512_maskz_expand_epi32(renorm_mask2, renorm_words2);
-      // shift & add selected words
-      R1 = _mm512_mask_slli_epi32(R1, renorm_mask1, R1, 16);
-      R2 = _mm512_mask_slli_epi32(R2, renorm_mask2, R2, 16);
-      R1 = _mm512_add_epi32(R1, renorm_vals1);
-      R2 = _mm512_add_epi32(R2, renorm_vals2);
+
+      renorm_vals1 = _mm512_mask_expand_epi32(R1, renorm_mask1, renorm_words1);
+      renorm_vals2 = _mm512_mask_expand_epi32(R2, renorm_mask2, renorm_words2);
 
       // For start of next loop iteration.  This has been moved here
       // (and duplicated to before the loop starts) so we can do something
       // with the latency period of gather, such as finishing up the
-      // renorm offset and writing the results. 
+      // renorm offset and writing the results.
       __m512i S1_ = S1; // temporary copy for use in out[]=S later
       __m512i S2_ = S2;
 
-      masked1 = _mm512_and_epi32(R1, maskv);
-      masked2 = _mm512_and_epi32(R2, maskv);
-      // Gather is slow bit (half total time) - 30 cycle latency.
-      S1 = _mm512_i32gather_epi32(masked1, (int *)s3, sizeof(*s3));
-      S2 = _mm512_i32gather_epi32(masked2, (int *)s3, sizeof(*s3));
+      masked1 = _mm512_and_epi32(renorm_vals1, maskv);
+      S1 = _mm512_i32gather_epi32x(masked1, (int *)s3, sizeof(*s3));
+      masked2 = _mm512_and_epi32(renorm_vals2, maskv);
+      S2 = _mm512_i32gather_epi32x(masked2, (int *)s3, sizeof(*s3));
+
+      R1 = _mm512_mask_slli_epi32(R1, renorm_mask1, R1, 16);
+      R2 = _mm512_mask_slli_epi32(R2, renorm_mask2, R2, 16);
+
+      __m512i m16 = _mm512_set1_epi32(0xffff);
+      renorm_vals1 = _mm512_maskz_and_epi32(renorm_mask1, renorm_vals1, m16);
+      renorm_vals2 = _mm512_maskz_and_epi32(renorm_mask2, renorm_vals2, m16);
 
       // advance by however many words we actually read
       sp += _mm_popcnt_u32(renorm_mask2);
 
+      R1 = _mm512_add_epi32(R1, renorm_vals1);
+      R2 = _mm512_add_epi32(R2, renorm_vals2);
+
       //out[i+z] = S;
       _mm_storeu_si128((__m128i *)(out+i),    _mm512_cvtepi32_epi8(S1_));
       _mm_storeu_si128((__m128i *)(out+i+16), _mm512_cvtepi32_epi8(S2_));
-    }      
+    }
 
     _mm512_store_epi32(&Rv[ 0], R1);
     _mm512_store_epi32(&Rv[16], R2);
@@ -424,14 +496,14 @@ static inline void transpose_and_copy_avx512(uint8_t *out, int iN[32],
 //      iN[z] += 32;
 //  }
 
-    
+
     __m512i v1 = _mm512_set_epi32(15, 14, 13, 12, 11, 10,  9,  8,
                                    7,  6,  5,  4,  3,  2,  1,  0);
     v1 = _mm512_slli_epi32(v1, 5);
-    
+
     for (z = 0; z < 32; z++) {
-        __m512i t1 = _mm512_i32gather_epi32(v1, &t32[ 0][z], 4);
-        __m512i t2 = _mm512_i32gather_epi32(v1, &t32[16][z], 4);
+        __m512i t1 = _mm512_i32gather_epi32x(v1, &t32[ 0][z], 4);
+        __m512i t2 = _mm512_i32gather_epi32x(v1, &t32[16][z], 4);
         _mm_storeu_si128((__m128i*)(&out[iN[z]   ]), _mm512_cvtepi32_epi8(t1));
         _mm_storeu_si128((__m128i*)(&out[iN[z]+16]), _mm512_cvtepi32_epi8(t2));
         iN[z] += 32;
@@ -470,7 +542,7 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
     }
 
     cp = out;
-    int shift = encode_freq1(in, in_size, 32, syms, &cp); 
+    int shift = encode_freq1(in, in_size, 32, syms, &cp);
     if (shift < 0) {
         free(out_free);
         htscodecs_tls_free(syms);
@@ -483,7 +555,8 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
 
     uint8_t* ptr = out_end;
 
-    int iN[32], isz4 = in_size/32;
+    int iN[32] __attribute__((aligned(64)));
+    int isz4 = in_size/32;
     for (z = 0; z < 32; z++)
         iN[z] = (z+1)*isz4-2;
 
@@ -503,26 +576,29 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
     LOAD512(Rv, ransN);
 
     uint16_t *ptr16 = (uint16_t *)ptr;
-    __m512i last2 = _mm512_set_epi32(lN[31], lN[30], lN[29], lN[28],
-                                     lN[27], lN[26], lN[25], lN[24],
-                                     lN[23], lN[22], lN[21], lN[20],
-                                     lN[19], lN[18], lN[17], lN[16]);
-    __m512i last1 = _mm512_set_epi32(lN[15], lN[14], lN[13], lN[12],
-                                     lN[11], lN[10], lN[ 9], lN[ 8],
-                                     lN[ 7], lN[ 6], lN[ 5], lN[ 4],
-                                     lN[ 3], lN[ 2], lN[ 1], lN[ 0]);
-    
-    __m512i iN2 = _mm512_set_epi32(iN[31], iN[30], iN[29], iN[28],
-                                   iN[27], iN[26], iN[25], iN[24],
-                                   iN[23], iN[22], iN[21], iN[20],
-                                   iN[19], iN[18], iN[17], iN[16]);
-    __m512i iN1 = _mm512_set_epi32(iN[15], iN[14], iN[13], iN[12],
-                                   iN[11], iN[10], iN[ 9], iN[ 8],
-                                   iN[ 7], iN[ 6], iN[ 5], iN[ 4],
-                                   iN[ 3], iN[ 2], iN[ 1], iN[ 0]);
-
-    __m512i c1 = _mm512_i32gather_epi32(iN1, in, 1);
-    __m512i c2 = _mm512_i32gather_epi32(iN2, in, 1);
+    LOAD512(iN, iN);
+    LOAD512(last, lN);
+
+    __m512i c1 = _mm512_i32gather_epi32x1(iN1, in);
+    __m512i c2 = _mm512_i32gather_epi32x1(iN2, in);
+
+    // We cache the next 64 bytes locally and transpose.
+    // This means we can load 32 ints from t32[x] with load instructions
+    // instead of gathers.  The copy, transpose and expand are easier done
+    // in scalar code.
+#define BATCH 64
+    uint8_t t32[BATCH][32] __attribute__((aligned(64)));
+    int next_batch;
+    if (iN[0] > BATCH) {
+        int i, j;
+        for (i = 0; i < BATCH; i++)
+            // memcpy(c[i], &in[iN[i]-32], 32); fast mode
+            for (j = 0; j < 32; j++)
+                t32[BATCH-1-i][j] = in[iN[j]-1-i];
+        next_batch = BATCH;
+    } else {
+        next_batch = -1;
+    }
 
     for (; iN[0] >= 0; iN[0]--) {
         // Note, consider doing the same approach for the AVX2 encoder.
@@ -533,8 +609,6 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
         // FIXME: maybe we need to cope with in[31] read over-flow
         // on loop cycles 0, 1, 2 where gather reads 32-bits instead of
         // 8 bits.  Use set instead there on c2?
-        c1 = _mm512_and_si512(c1, _mm512_set1_epi32(0xff));
-        c2 = _mm512_and_si512(c2, _mm512_set1_epi32(0xff));
 
         // index into syms[0][0] array, used for x_max, rcp_freq, and bias
         __m512i vidx1 = _mm512_slli_epi32(c1, 8);
@@ -553,8 +627,8 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
         //      }
 
 #define SET512x(a,x) \
-        __m512i a##1 = _mm512_i32gather_epi32(vidx1, &syms[0][0].x, 4); \
-        __m512i a##2 = _mm512_i32gather_epi32(vidx2, &syms[0][0].x, 4)
+        __m512i a##1 = _mm512_i32gather_epi32x(vidx1, &syms[0][0].x, 4); \
+        __m512i a##2 = _mm512_i32gather_epi32x(vidx2, &syms[0][0].x, 4)
 
         // Start of next loop, moved here to remove latency.
         // last[z] = c[z]
@@ -564,8 +638,54 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
         last2 = c2;
         iN1 = _mm512_sub_epi32(iN1, _mm512_set1_epi32(1));
         iN2 = _mm512_sub_epi32(iN2, _mm512_set1_epi32(1));
-        c1 = _mm512_i32gather_epi32(iN1, in, 1);
-        c2 = _mm512_i32gather_epi32(iN2, in, 1);
+
+        // Code below is equivalent to this:
+        // c1 = _mm512_i32gather_epi32(iN1, in, 1);
+        // c2 = _mm512_i32gather_epi32(iN2, in, 1);
+
+        // Better when we have a power of 2
+        if (next_batch >= 0) {
+            if (--next_batch < 0 && iN[0] > BATCH) {
+                // Load 32 BATCH blocks of data.
+                // Executed once every BATCH cycles
+                int i, j;
+                uint8_t c[32][BATCH];
+                iN[0] += BATCH;
+                for (j = 0; j < 32; j++) {
+                    iN[j] -= BATCH;
+                    memcpy(c[j], &in[iN[j]-BATCH], BATCH);
+                }
+                // transpose matrix from 32xBATCH to BATCHx32
+                for (j = 0; j < 32; j++) {
+                    for (i = 0; i < BATCH; i+=16) {
+                        int z;
+                        for (z = 0; z < 16; z++)
+                            t32[i+z][j] = c[j][i+z];
+                    }
+                }
+                next_batch = BATCH-1;
+            }
+            if (next_batch >= 0) {
+                // Copy from one of our pre-loaded BATCHx32 tables
+                // Executed every cycle
+                __m128i c1_ = _mm_load_si128((__m128i *)&t32[next_batch][0]);
+                __m128i c2_ = _mm_load_si128((__m128i *)&t32[next_batch][16]);
+                c1 = _mm512_cvtepu8_epi32(c1_);
+                c2 = _mm512_cvtepu8_epi32(c2_);
+            }
+        }
+
+        if (next_batch < 0 && iN[0]) {
+            // no pre-loaded data as within BATCHx32 of input end
+            c1 = _mm512_i32gather_epi32x1(iN1, in);
+            c2 = _mm512_i32gather_epi32x1(iN2, in);
+
+            // Speeds up clang, even though not needed any more.
+            // Harmless to leave here.
+            c1 = _mm512_and_si512(c1, _mm512_set1_epi32(0xff));
+            c2 = _mm512_and_si512(c2, _mm512_set1_epi32(0xff));
+        }
+        // End of "equivalent to" code block
 
         SET512x(xmax, x_max); // high latency
 
@@ -758,10 +878,10 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
             _masked2 = _mm512_add_epi32(_masked2, _Lv2);
 
             // This is the biggest bottleneck
-            __m512i _Sv1 = _mm512_i32gather_epi32(_masked1, (int *)&s3F[0][0],
-                                                  sizeof(s3F[0][0]));
-            __m512i _Sv2 = _mm512_i32gather_epi32(_masked2, (int *)&s3F[0][0],
-                                                  sizeof(s3F[0][0]));
+            __m512i _Sv1 = _mm512_i32gather_epi32x(_masked1, (int *)&s3F[0][0],
+                                                   sizeof(s3F[0][0]));
+            __m512i _Sv2 = _mm512_i32gather_epi32x(_masked2, (int *)&s3F[0][0],
+                                                   sizeof(s3F[0][0]));
 
             //  f[z] = S[z]>>(TF_SHIFT_O1+8);
             __m512i _fv1 = _mm512_srli_epi32(_Sv1, TF_SHIFT_O1+8);
@@ -927,10 +1047,10 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
             _masked2 = _mm512_add_epi32(_masked2, _Lv2);
 
             // This is the biggest bottleneck
-            __m512i _Sv1 = _mm512_i32gather_epi32(_masked1, (int *)&s3F[0][0],
-                                                  sizeof(s3F[0][0]));
-            __m512i _Sv2 = _mm512_i32gather_epi32(_masked2, (int *)&s3F[0][0],
-                                                  sizeof(s3F[0][0]));
+            __m512i _Sv1 = _mm512_i32gather_epi32x(_masked1, (int *)&s3F[0][0],
+                                                   sizeof(s3F[0][0]));
+            __m512i _Sv2 = _mm512_i32gather_epi32x(_masked2, (int *)&s3F[0][0],
+                                                   sizeof(s3F[0][0]));
 
             //  f[z] = S[z]>>(TF_SHIFT_O1+8);
             __m512i _fv1 = _mm512_srli_epi32(_Sv1, TF_SHIFT_O1_FAST+8);


=====================================
htscodecs/rANS_static32x16pr_sse4.c
=====================================
@@ -807,19 +807,59 @@ unsigned char *rans_uncompress_O0_32x16_sse4(unsigned char *in,
 static inline void transpose_and_copy(uint8_t *out, int iN[32],
                                       uint8_t t[32][32]) {
     int z;
-#ifdef UBSAN
-    // Simplified version to avoid undefined behaviour sanitiser warnings.
+
+    // Simplified version from below.
+    // This is pretty good with zig cc, but slow on clang and very slow on
+    // gcc, even with -O3
+    /*
     for (z = 0; z < NX; z++) {
         int k;
         for (k = 0; k < 32; k++)
             out[iN[z]+k] = t[k][z];
         iN[z] += 32;
     }
-#else
-    // Unaligned access.  We know we can get away with this as this
-    // code is only ever executed on x86 platforms which permit this.
-    for (z = 0; z < NX; z+=4) {
-        *(uint64_t *)&out[iN[z]] =
+    */
+
+
+    // A better approach for clang and gcc can be had with some manual
+    // restructuring to attempt to do the two loops in explicit blocks.
+    // With gcc -O3 or -O2 -ftree-vectorize this is quite fast, as are clang
+    // and zig, but none beats the version below (or, for zig, the basic
+    // code above).
+    //
+    // It's left here in case we ever want to move to tidier code and
+    // to understand what auto-vectorises and what doesn't.
+    /*
+#define NZ 2
+#define NK 8
+    for (z = 0; z < 32; z+=NZ) {
+        for (int k = 0; k < 32; k+=NK) {
+            for (int z0 = z; z0 < z+NZ; z0++) {
+                uint8_t tmp[NK];// __attribute__((aligned(32)));
+                //uint8_t (*RESTRICT t0)[32] = &t[k];
+                for (int k0 = 0; k0 < NK; k0++)
+                    //tmp[k0] = t0[k0][z0];
+                    tmp[k0] = t[k+k0][z0];
+                memcpy(&out[iN[z0]+k], tmp, NK);
+            }
+        }
+        for (int z0 = z; z0 < z+NZ; z0++)
+            iN[z0] += 32;
+    }
+    */
+
+    // Manually unrolled code.
+    // This is fastest on gcc and clang and not far behind with zig cc.
+    // It also doesn't need aggressive gcc optimisation levels to be
+    // efficient.
+    //
+    // It works by constructing 64-bit ints and copying them with a single
+    // memory write.  The fixed size memcpys just boil down to a memory write,
+    // but unlike the earlier versions that did this directly, this isn't
+    // exploiting undefined behaviour.
+    for (z = 0; z < 32; z+=4) {
+        uint64_t i64;
+        i64 =
             ((uint64_t)(t[0][z])<< 0) +
             ((uint64_t)(t[1][z])<< 8) +
             ((uint64_t)(t[2][z])<<16) +
@@ -828,7 +868,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[5][z])<<40) +
             ((uint64_t)(t[6][z])<<48) +
             ((uint64_t)(t[7][z])<<56);
-        *(uint64_t *)&out[iN[z+1]] =
+        memcpy(&out[iN[z]], &i64, 8);
+        i64 =
             ((uint64_t)(t[0][z+1])<< 0) +
             ((uint64_t)(t[1][z+1])<< 8) +
             ((uint64_t)(t[2][z+1])<<16) +
@@ -837,7 +878,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[5][z+1])<<40) +
             ((uint64_t)(t[6][z+1])<<48) +
             ((uint64_t)(t[7][z+1])<<56);
-        *(uint64_t *)&out[iN[z+2]] =
+        memcpy(&out[iN[z+1]], &i64, 8);
+        i64 =
             ((uint64_t)(t[0][z+2])<< 0) +
             ((uint64_t)(t[1][z+2])<< 8) +
             ((uint64_t)(t[2][z+2])<<16) +
@@ -846,7 +888,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[5][z+2])<<40) +
             ((uint64_t)(t[6][z+2])<<48) +
             ((uint64_t)(t[7][z+2])<<56);
-        *(uint64_t *)&out[iN[z+3]] =
+        memcpy(&out[iN[z+2]], &i64, 8);
+        i64 =
             ((uint64_t)(t[0][z+3])<< 0) +
             ((uint64_t)(t[1][z+3])<< 8) +
             ((uint64_t)(t[2][z+3])<<16) +
@@ -855,8 +898,9 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[5][z+3])<<40) +
             ((uint64_t)(t[6][z+3])<<48) +
             ((uint64_t)(t[7][z+3])<<56);
+        memcpy(&out[iN[z+3]], &i64, 8);
 
-        *(uint64_t *)&out[iN[z]+8] =
+        i64 =
             ((uint64_t)(t[8+0][z])<< 0) +
             ((uint64_t)(t[8+1][z])<< 8) +
             ((uint64_t)(t[8+2][z])<<16) +
@@ -865,7 +909,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[8+5][z])<<40) +
             ((uint64_t)(t[8+6][z])<<48) +
             ((uint64_t)(t[8+7][z])<<56);
-        *(uint64_t *)&out[iN[z+1]+8] =
+        memcpy(&out[iN[z]+8], &i64, 8);
+        i64 =
             ((uint64_t)(t[8+0][z+1])<< 0) +
             ((uint64_t)(t[8+1][z+1])<< 8) +
             ((uint64_t)(t[8+2][z+1])<<16) +
@@ -874,7 +919,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[8+5][z+1])<<40) +
             ((uint64_t)(t[8+6][z+1])<<48) +
             ((uint64_t)(t[8+7][z+1])<<56);
-        *(uint64_t *)&out[iN[z+2]+8] =
+        memcpy(&out[iN[z+1]+8], &i64, 8);
+        i64 =
             ((uint64_t)(t[8+0][z+2])<< 0) +
             ((uint64_t)(t[8+1][z+2])<< 8) +
             ((uint64_t)(t[8+2][z+2])<<16) +
@@ -883,7 +929,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[8+5][z+2])<<40) +
             ((uint64_t)(t[8+6][z+2])<<48) +
             ((uint64_t)(t[8+7][z+2])<<56);
-        *(uint64_t *)&out[iN[z+3]+8] =
+        memcpy(&out[iN[z+2]+8], &i64, 8);
+        i64 =
             ((uint64_t)(t[8+0][z+3])<< 0) +
             ((uint64_t)(t[8+1][z+3])<< 8) +
             ((uint64_t)(t[8+2][z+3])<<16) +
@@ -892,8 +939,9 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[8+5][z+3])<<40) +
             ((uint64_t)(t[8+6][z+3])<<48) +
             ((uint64_t)(t[8+7][z+3])<<56);
+        memcpy(&out[iN[z+3]+8], &i64, 8);
 
-        *(uint64_t *)&out[iN[z]+16] =
+        i64 =
             ((uint64_t)(t[16+0][z])<< 0) +
             ((uint64_t)(t[16+1][z])<< 8) +
             ((uint64_t)(t[16+2][z])<<16) +
@@ -902,7 +950,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[16+5][z])<<40) +
             ((uint64_t)(t[16+6][z])<<48) +
             ((uint64_t)(t[16+7][z])<<56);
-        *(uint64_t *)&out[iN[z+1]+16] =
+        memcpy(&out[iN[z]+16], &i64, 8);
+        i64 =
             ((uint64_t)(t[16+0][z+1])<< 0) +
             ((uint64_t)(t[16+1][z+1])<< 8) +
             ((uint64_t)(t[16+2][z+1])<<16) +
@@ -911,7 +960,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[16+5][z+1])<<40) +
             ((uint64_t)(t[16+6][z+1])<<48) +
             ((uint64_t)(t[16+7][z+1])<<56);
-        *(uint64_t *)&out[iN[z+2]+16] =
+        memcpy(&out[iN[z+1]+16], &i64, 8);
+        i64 =
             ((uint64_t)(t[16+0][z+2])<< 0) +
             ((uint64_t)(t[16+1][z+2])<< 8) +
             ((uint64_t)(t[16+2][z+2])<<16) +
@@ -920,7 +970,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[16+5][z+2])<<40) +
             ((uint64_t)(t[16+6][z+2])<<48) +
             ((uint64_t)(t[16+7][z+2])<<56);
-        *(uint64_t *)&out[iN[z+3]+16] =
+        memcpy(&out[iN[z+2]+16], &i64, 8);
+        i64 =
             ((uint64_t)(t[16+0][z+3])<< 0) +
             ((uint64_t)(t[16+1][z+3])<< 8) +
             ((uint64_t)(t[16+2][z+3])<<16) +
@@ -929,8 +980,9 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[16+5][z+3])<<40) +
             ((uint64_t)(t[16+6][z+3])<<48) +
             ((uint64_t)(t[16+7][z+3])<<56);
+        memcpy(&out[iN[z+3]+16], &i64, 8);
 
-        *(uint64_t *)&out[iN[z]+24] =
+        i64 =
             ((uint64_t)(t[24+0][z])<< 0) +
             ((uint64_t)(t[24+1][z])<< 8) +
             ((uint64_t)(t[24+2][z])<<16) +
@@ -939,7 +991,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[24+5][z])<<40) +
             ((uint64_t)(t[24+6][z])<<48) +
             ((uint64_t)(t[24+7][z])<<56);
-        *(uint64_t *)&out[iN[z+1]+24] =
+        memcpy(&out[iN[z]+24], &i64, 8);
+        i64 =
             ((uint64_t)(t[24+0][z+1])<< 0) +
             ((uint64_t)(t[24+1][z+1])<< 8) +
             ((uint64_t)(t[24+2][z+1])<<16) +
@@ -948,7 +1001,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[24+5][z+1])<<40) +
             ((uint64_t)(t[24+6][z+1])<<48) +
             ((uint64_t)(t[24+7][z+1])<<56);
-        *(uint64_t *)&out[iN[z+2]+24] =
+        memcpy(&out[iN[z+1]+24], &i64, 8);
+        i64 =
             ((uint64_t)(t[24+0][z+2])<< 0) +
             ((uint64_t)(t[24+1][z+2])<< 8) +
             ((uint64_t)(t[24+2][z+2])<<16) +
@@ -957,7 +1011,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[24+5][z+2])<<40) +
             ((uint64_t)(t[24+6][z+2])<<48) +
             ((uint64_t)(t[24+7][z+2])<<56);
-        *(uint64_t *)&out[iN[z+3]+24] =
+        memcpy(&out[iN[z+2]+24], &i64, 8);
+        i64 =
             ((uint64_t)(t[24+0][z+3])<< 0) +
             ((uint64_t)(t[24+1][z+3])<< 8) +
             ((uint64_t)(t[24+2][z+3])<<16) +
@@ -966,13 +1021,13 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
             ((uint64_t)(t[24+5][z+3])<<40) +
             ((uint64_t)(t[24+6][z+3])<<48) +
             ((uint64_t)(t[24+7][z+3])<<56);
+        memcpy(&out[iN[z+3]+24], &i64, 8);
 
         iN[z+0] += 32;
         iN[z+1] += 32;
         iN[z+2] += 32;
         iN[z+3] += 32;
     }
-#endif
 }
 
 unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
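
The hunks above replace transpose_and_copy()'s type-punned 64-bit stores with
assembling the value in a local i64 and memcpy()ing it into the output buffer.
As a minimal sketch of the idiom in isolation (hypothetical helper names, not
part of htscodecs; assumes a modern compiler that lowers a fixed-size memcpy
to a single unaligned store):

    #include <stdint.h>
    #include <string.h>

    /* What the old code did: undefined behaviour, as &out[off] may be
     * misaligned and the cast breaks effective-type (aliasing) rules. */
    static inline void store64_punned(uint8_t *out, size_t off, uint64_t v) {
        *(uint64_t *)&out[off] = v;
    }

    /* What the new code does: well-defined, and gcc/clang typically lower
     * the fixed 8-byte memcpy to the same single store, so no speed cost. */
    static inline void store64_memcpy(uint8_t *out, size_t off, uint64_t v) {
        memcpy(&out[off], &v, 8);
    }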


=====================================
htscodecs/rANS_static4x16pr.c
=====================================
@@ -822,13 +822,7 @@ unsigned char *rans_uncompress_O1_4x16(unsigned char *in, unsigned int in_size,
  */
 #include "rANS_static32x16pr.h"
 
-// Test interface for restricting the auto-detection methods so we
-// can forcibly compare different implementations on the same machine.
-// See RANS_CPU_ defines in rANS_static4x16.h
 static int rans_cpu = 0xFFFF; // all
-void rans_set_cpu(int opts) {
-    rans_cpu = opts;
-}
 
 #if (defined(__GNUC__) || defined(__clang__)) && defined(__x86_64__)
 // Icc and Clang both also set __GNUC__ on linux, but not on Windows.
@@ -861,6 +855,7 @@ static int have_avx2    UNUSED = 0;
 static int have_avx512f UNUSED = 0;
 static int is_amd       UNUSED = 0;
 
+#define HAVE_HTSCODECS_TLS_CPU_INIT
 static void htscodecs_tls_cpu_init(void) {
     unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
     // These may be unused, depending on HAVE_* config.h macros
@@ -892,10 +887,6 @@ static void htscodecs_tls_cpu_init(void) {
 
     if (!have_popcnt) have_avx512f = have_avx2 = have_sse4_1 = 0;
     if (!have_ssse3)  have_sse4_1 = 0;
-
-    if (!(rans_cpu & RANS_CPU_ENC_AVX512)) have_avx512f = 0;
-    if (!(rans_cpu & RANS_CPU_ENC_AVX2))   have_avx2 = 0;
-    if (!(rans_cpu & RANS_CPU_ENC_SSE4))   have_sse4_1 = 0;
 }
 
 static inline
@@ -904,6 +895,15 @@ unsigned char *(*rans_enc_func(int do_simd, int order))
      unsigned int in_size,
      unsigned char *out,
      unsigned int *out_size) {
+
+    int have_e_sse4_1  = have_sse4_1;
+    int have_e_avx2    = have_avx2;
+    int have_e_avx512f = have_avx512f;
+
+    if (!(rans_cpu & RANS_CPU_ENC_AVX512)) have_e_avx512f = 0;
+    if (!(rans_cpu & RANS_CPU_ENC_AVX2))   have_e_avx2    = 0;
+    if (!(rans_cpu & RANS_CPU_ENC_SSE4))   have_e_sse4_1  = 0;
+
     if (!do_simd) { // SIMD disabled
         return order & 1
             ? rans_compress_O1_4x16
@@ -922,30 +922,41 @@ unsigned char *(*rans_enc_func(int do_simd, int order))
 #endif
 
     if (order & 1) {
+        // With simulated gathers, the AVX512 code is now slower than AVX2,
+        // so we avoid it unless the real AVX512 gather was requested via
+        // USE_GATHER.  Note for testing we use -c 0x0404 to enable AVX512
+        // and disable AVX2, in which case the AVX512 func must still be called.
+        int use_gather;
+#ifdef USE_GATHER
+        use_gather = 1;
+#else
+        use_gather = !have_e_avx2;
+#endif
+
 #if defined(HAVE_AVX512)
-        if (have_avx512f && (!is_amd || !have_avx2))
+        if (have_e_avx512f && (!is_amd || !have_e_avx2) && use_gather)
             return rans_compress_O1_32x16_avx512;
 #endif
 #if defined(HAVE_AVX2)
-        if (have_avx2)
+        if (have_e_avx2)
             return rans_compress_O1_32x16_avx2;
 #endif
 #if defined(HAVE_SSE4_1) && defined(HAVE_SSSE3) && defined(HAVE_POPCNT)
-        if (have_sse4_1) 
+        if (have_e_sse4_1)
             return rans_compress_O1_32x16;
 #endif
         return rans_compress_O1_32x16;
     } else {
 #if defined(HAVE_AVX512)
-        if (have_avx512f && (!is_amd || !have_avx2))
+        if (have_e_avx512f && (!is_amd || !have_e_avx2))
             return rans_compress_O0_32x16_avx512;
 #endif
 #if defined(HAVE_AVX2)
-        if (have_avx2)
+        if (have_e_avx2)
             return rans_compress_O0_32x16_avx2;
 #endif
 #if defined(HAVE_SSE4_1) && defined(HAVE_SSSE3) && defined(HAVE_POPCNT)
-        if (have_sse4_1)
+        if (have_e_sse4_1)
             return rans_compress_O0_32x16;
 #endif
         return rans_compress_O0_32x16;
@@ -959,6 +970,14 @@ unsigned char *(*rans_dec_func(int do_simd, int order))
      unsigned char *out,
      unsigned int out_size) {
 
+    int have_d_sse4_1  = have_sse4_1;
+    int have_d_avx2    = have_avx2;
+    int have_d_avx512f = have_avx512f;
+
+    if (!(rans_cpu & RANS_CPU_DEC_AVX512)) have_d_avx512f = 0;
+    if (!(rans_cpu & RANS_CPU_DEC_AVX2))   have_d_avx2    = 0;
+    if (!(rans_cpu & RANS_CPU_DEC_SSE4))   have_d_sse4_1  = 0;
+
     if (!do_simd) { // SIMD disabled
         return order & 1
             ? rans_uncompress_O1_4x16
@@ -978,29 +997,29 @@ unsigned char *(*rans_dec_func(int do_simd, int order))
 
     if (order & 1) {
 #if defined(HAVE_AVX512)
-        if (have_avx512f)
+        if (have_d_avx512f)
             return rans_uncompress_O1_32x16_avx512;
 #endif
 #if defined(HAVE_AVX2)
-        if (have_avx2)
+        if (have_d_avx2)
             return rans_uncompress_O1_32x16_avx2;
 #endif
 #if defined(HAVE_SSE4_1) && defined(HAVE_SSSE3) && defined(HAVE_POPCNT)
-        if (have_sse4_1)
+        if (have_d_sse4_1)
             return rans_uncompress_O1_32x16_sse4;
 #endif
         return rans_uncompress_O1_32x16;
     } else {
 #if defined(HAVE_AVX512)
-        if (have_avx512f && (!is_amd || !have_avx2))
+        if (have_d_avx512f)
             return rans_uncompress_O0_32x16_avx512;
 #endif
 #if defined(HAVE_AVX2)
-        if (have_avx2)
+        if (have_d_avx2)
             return rans_uncompress_O0_32x16_avx2;
 #endif
 #if defined(HAVE_SSE4_1) && defined(HAVE_SSSE3) && defined(HAVE_POPCNT)
-        if (have_sse4_1)
+        if (have_d_sse4_1)
             return rans_uncompress_O0_32x16_sse4;
 #endif
         return rans_uncompress_O0_32x16;
@@ -1123,6 +1142,16 @@ unsigned char *(*rans_dec_func(int do_simd, int order))
 
 #endif
 
+// Test interface for restricting the auto-detection methods so we
+// can forcibly compare different implementations on the same machine.
+// See RANS_CPU_ defines in rANS_static4x16.h
+void rans_set_cpu(int opts) {
+    rans_cpu = opts;
+#ifdef HAVE_HTSCODECS_TLS_CPU_INIT
+    htscodecs_tls_cpu_init();
+#endif
+}
+
 /*-----------------------------------------------------------------------------
  * Simple interface to the order-0 vs order-1 encoders and decoders.
  *
@@ -1158,9 +1187,10 @@ unsigned char *rans_compress_to_4x16(unsigned char *in, unsigned int in_size,
 
     if (in_size <= 20)
         order &= ~RANS_ORDER_STRIPE;
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
     if (in_size <= 1000)
         order &= ~RANS_ORDER_X32;
-
+#endif
     if (order & RANS_ORDER_STRIPE) {
         int N = (order>>8) & 0xff;
         if (N == 0) N = 4; // default for compatibility with old tests
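
The rans_set_cpu() test hook, moved above so that it re-runs
htscodecs_tls_cpu_init(), restricts which implementations
rans_enc_func()/rans_dec_func() may return.  A hedged sketch of the sort of
comparison it enables (the harness and its name are hypothetical;
rans_set_cpu(), rans_compress_4x16() and the RANS_CPU_*/RANS_ORDER_X32
constants are the library's own):

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    #include "htscodecs/rANS_static4x16.h"

    /* Hypothetical harness: restrict auto-detection to one SIMD level at a
     * time and check the encoders agree byte-for-byte.  The library still
     * falls back to scalar code if the CPU lacks a feature, and inputs of
     * <= 1000 bytes have the X32 flag stripped in non-fuzzing builds. */
    int compare_simd_encoders(uint8_t *in, uint32_t in_size) {
        uint32_t sz_avx2 = 0, sz_sse4 = 0;

        rans_set_cpu(RANS_CPU_ENC_AVX2 | RANS_CPU_DEC_AVX2);
        uint8_t *c_avx2 = rans_compress_4x16(in, in_size, &sz_avx2,
                                             1 | RANS_ORDER_X32);

        rans_set_cpu(RANS_CPU_ENC_SSE4 | RANS_CPU_DEC_SSE4);
        uint8_t *c_sse4 = rans_compress_4x16(in, in_size, &sz_sse4,
                                             1 | RANS_ORDER_X32);

        int same = c_avx2 && c_sse4 && sz_avx2 == sz_sse4 &&
                   memcmp(c_avx2, c_sse4, sz_avx2) == 0;
        free(c_avx2);
        free(c_sse4);
        return same;
    }

The encoded output is expected to be byte-identical whichever implementation
is selected, which is also what the new entropy_fuzz.c target below asserts.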


=====================================
htscodecs/tokenise_name3.c
=====================================
@@ -489,11 +489,11 @@ int build_trie(name_context *ctx, char *data, size_t len, int n) {
     for (nlines = i = 0; i < len; i++, nlines++) {
         t = ctx->t_head;
         t->count++;
-        while (i < len && data[i] > '\n') {
+        while (i < len && (unsigned char)data[i] > '\n') {
             unsigned char c = data[i++];
             if (c & 0x80)
                 //fprintf(stderr, "8-bit ASCII is unsupported\n");
-                abort();
+                return -1;
             c &= 127;
 
 
@@ -653,7 +653,7 @@ int search_trie(name_context *ctx, char *data, size_t len, int n, int *exact, in
             unsigned char c = data[i++];
             if (c & 0x80)
                 //fprintf(stderr, "8-bit ASCII is unsupported\n");
-                abort();
+                return -1;
             c &= 127;
 
             trie_t *x = t->next;
@@ -839,14 +839,14 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
                     //ctx->lc[pnum].last[ntok].token_delta=0;
                 } else if (mode == 1 && d < 256 && d >= 0 && ctx->lc[pnum].last[ntok].token_str == s-i) {
 #ifdef ENC_DEBUG
-                    fprintf(stderr, "Tok %d (dig-delta, %d / %d)\n", N_DDELTA, ctx->lc[pnum].last[ntok].token_int, v);
+                    fprintf(stderr, "Tok %d (dig0-delta, %d / %d)\n", N_DDELTA0, ctx->lc[pnum].last[ntok].token_int, v);
 #endif
                     //if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
                     if (encode_token_int1(ctx, ntok, N_DDELTA0, d) < 0) return -1;
                     //ctx->lc[pnum].last[ntok].token_delta=1;
                 } else {
 #ifdef ENC_DEBUG
-                    fprintf(stderr, "Tok %d (dig, %d / %d)\n", N_DIGITS, ctx->lc[pnum].last[ntok].token_int, v);
+                    fprintf(stderr, "Tok %d (dig0, %d / %d len %d)\n", N_DIGITS0, ctx->lc[pnum].last[ntok].token_int, v, s-i);
 #endif
                     if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
                     if (encode_token_int(ctx, ntok, N_DIGITS0, v) < 0) return -1;
@@ -854,7 +854,7 @@ static int encode_name(name_context *ctx, char *name, int len, int mode) {
                 }
             } else {
 #ifdef ENC_DEBUG
-                fprintf(stderr, "Tok %d (new dig, %d)\n", N_DIGITS, v);
+                fprintf(stderr, "Tok %d (new dig0, %d len %d)\n", N_DIGITS0, v, s-i);
 #endif
                 if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
                 if (encode_token_int(ctx, ntok, N_DIGITS0, v) < 0) return -1;
@@ -1577,7 +1577,7 @@ uint8_t *tok3_encode_names(char *blk, int len, int level, int use_arith,
             ctx->desc[i].dup_from = j;
             tot_size += 3; // flag, dup_from, ttype
         } else {
-            ctx->desc[i].dup_from = 0;
+            ctx->desc[i].dup_from = -1;
             tot_size += out_len + 1; // ttype
         }
     }
@@ -1585,7 +1585,7 @@ uint8_t *tok3_encode_names(char *blk, int len, int level, int use_arith,
 #if 0
     for (i = 0; i < ctx->max_tok*16; i++) {
         char fn[1024];
-        if (!ctx->desc[i].buf_l && !ctx->desc[i].dup_from) continue;
+        if (!ctx->desc[i].buf_l && ctx->desc[i].dup_from == -1) continue;
         sprintf(fn, "_tok.%02d_%02d.%d.comp", i>>4,i&15,i);
         FILE *fp = fopen(fn, "w");
         fwrite(ctx->desc[i].buf, 1, ctx->desc[i].buf_l, fp);
@@ -1623,7 +1623,7 @@ uint8_t *tok3_encode_names(char *blk, int len, int level, int use_arith,
             ttype8 |= 128;
             last_tnum = ctx->desc[i].tnum;
         }
-        if (ctx->desc[i].dup_from) {
+        if (ctx->desc[i].dup_from >= 0) {
             //fprintf(stderr, "Dup %d from %d, sz %d\n", i, ctx->desc[i].dup_from, ctx->desc[i].buf_l);
             *cp++ = ttype8 | 64;
             *cp++ = ctx->desc[i].dup_from >> 4;
@@ -1685,7 +1685,7 @@ uint8_t *tok3_decode_names(uint8_t *in, uint32_t sz, uint32_t *out_len) {
     while (o < sz) {
         uint8_t ttype = in[o++];
         if (ttype & 64) {
-            if (o+2 >= sz) goto err;
+            if (o+2 > sz) goto err;
             int j = in[o++]<<4;
             j += in[o++];
             if (ttype & 128) {


=====================================
m4/hts_check_compile_flags_needed.m4
=====================================
@@ -50,7 +50,7 @@ AC_CACHE_CHECK([_AC_LANG compiler flags needed for $1], CACHEVAR, [
     [ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS
      _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $6 $2"
      AC_LINK_IFELSE([m4_default([$3],[AC_LANG_PROGRAM()])],
-       [AS_VAR_SET(CACHEVAR,[$2])],
+       [AS_VAR_SET(CACHEVAR,["$2"])],
        [AS_VAR_SET(CACHEVAR,[unsupported])])
      _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])])
 AS_VAR_IF(CACHEVAR,unsupported, [


=====================================
tests/Makefile.am
=====================================
@@ -88,7 +88,16 @@ fuzzer_ldadd   = $(top_builddir)/htscodecs/libcodecsfuzz.a \
 	$(top_builddir)/htscodecs/libcodecsfuzz_avx2.a \
 	$(top_builddir)/htscodecs/libcodecsfuzz_avx512.a
 
-EXTRA_PROGRAMS = rans4x8_fuzz rans4x16pr_fuzz arith_dynamic_fuzz tokenise_name3_fuzz fqzcomp_qual_fuzz
+EXTRA_PROGRAMS = \
+	rans4x8_fuzz \
+	rans4x16pr_fuzz \
+	arith_dynamic_fuzz \
+	tokenise_name3_fuzz \
+	tokenise_name3_fuzzrt \
+	fqzcomp_qual_fuzz \
+	fqzcomp_qual_fuzzrt \
+	entropy_fuzz
+
 rans4x8_fuzz_SOURCES = rANS_static_fuzz.c
 rans4x8_fuzz_CFLAGS  = $(fuzzer_cflags)
 rans4x8_fuzz_LDFLAGS = $(fuzzer_ldflags)
@@ -109,7 +118,22 @@ tokenise_name3_fuzz_CFLAGS  = $(fuzzer_cflags)
 tokenise_name3_fuzz_LDFLAGS = $(fuzzer_ldflags)
 tokenise_name3_fuzz_LDADD   = $(fuzzer_ldadd)
 
+tokenise_name3_fuzzrt_SOURCES = tokenise_name3_fuzzrt.c
+tokenise_name3_fuzzrt_CFLAGS  = $(fuzzer_cflags)
+tokenise_name3_fuzzrt_LDFLAGS = $(fuzzer_ldflags)
+tokenise_name3_fuzzrt_LDADD   = $(fuzzer_ldadd)
+
 fqzcomp_qual_fuzz_SOURCES = fqzcomp_qual_fuzz.c
 fqzcomp_qual_fuzz_CFLAGS  = $(fuzzer_cflags)
 fqzcomp_qual_fuzz_LDFLAGS = $(fuzzer_ldflags)
 fqzcomp_qual_fuzz_LDADD   = $(fuzzer_ldadd)
+
+entropy_fuzz_SOURCES = entropy_fuzz.c
+entropy_fuzz_CFLAGS  = $(fuzzer_cflags)
+entropy_fuzz_LDFLAGS = $(fuzzer_ldflags)
+entropy_fuzz_LDADD   = $(fuzzer_ldadd)
+
+fqzcomp_qual_fuzzrt_SOURCES = fqzcomp_qual_fuzzrt.c
+fqzcomp_qual_fuzzrt_CFLAGS  = $(fuzzer_cflags)
+fqzcomp_qual_fuzzrt_LDFLAGS = $(fuzzer_ldflags)
+fqzcomp_qual_fuzzrt_LDADD   = $(fuzzer_ldadd)


=====================================
tests/entropy.c
=====================================
@@ -126,7 +126,7 @@ int main(int argc, char **argv) {
     unsigned char *in = load(infp, &in_size);
     int order_a[] = {0,1,                            // r4x8
                      64,65, 128,129, 192,193,        // r4x16, arith
-                     4,5, 68,69, 132,133, 194,197,   // r4x16 SIMD
+                     4,5, 68,69, 132,133, 196,197,   // r4x16 SIMD
                      };
     char *codec[] = {"r4x8", "r4x16", "r32x16", "arith"};
     int i, j;
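
For reference, the corrected pair follows the same pattern as the other SIMD
entries in this list, i.e. the non-SIMD value plus the 0x04 (RANS_ORDER_X32)
bit plus the order bit:

    0   + 4 -> 4, 5        64  + 4 -> 68, 69
    128 + 4 -> 132, 133    192 + 4 -> 196, 197

The old value 194 (192 + 2) did not set the X32 bit, so that SIMD combination
was never exercised.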


=====================================
tests/entropy_fuzz.c
=====================================
@@ -0,0 +1,151 @@
+/* Fuzz testing target. */
+/*
+ * Copyright (c) 2022 Genome Research Ltd.
+ * Author(s): Rob Davies
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *    1. Redistributions of source code must retain the above copyright notice,
+ *       this list of conditions and the following disclaimer.
+ *
+ *    2. Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials provided
+ *       with the distribution.
+ *
+ *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+ *       Institute nor the names of its contributors may be used to endorse
+ *       or promote products derived from this software without specific
+ *       prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
+ * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <config.h>
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <assert.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include "htscodecs/arith_dynamic.h"
+#include "htscodecs/rANS_static.h"
+#include "htscodecs/rANS_static4x16.h"
+#include "htscodecs/rANS_static32x16pr.h"
+
+int LLVMFuzzerTestOneInput(uint8_t *in, size_t in_size) {
+
+    const int order_a[] = {
+        0,1,        // No extras
+        0x40,0x41,  // RANS_ORDER_RLE
+        0x80,0x81,  // RANS_ORDER_PACK
+        0xc0,0xc1,  // RANS_ORDER_RLE|RANS_ORDER_PACK
+    };
+
+#if defined(__x86_64__)
+    const int cpu_enc_a[] = {
+        0, RANS_CPU_ENC_SSE4, RANS_CPU_ENC_AVX2, RANS_CPU_ENC_AVX512
+    };
+    const int cpu_dec_a[] = {
+        0, RANS_CPU_DEC_SSE4, RANS_CPU_DEC_AVX2, RANS_CPU_DEC_AVX512
+    };
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    const int cpu_enc_a[] = { 0, RANS_CPU_ENC_NEON };
+    const int cpu_dec_a[] = { 0, RANS_CPU_DEC_NEON };
+#else
+    const int cpu_enc_a[] = { 0 };
+    const int cpu_dec_a[] = { 0 };
+#endif
+    int i;
+
+    if (in_size > 200000)
+        return -1;
+
+    // rans_compress() only supports order 0 and 1
+    for (i = 0; i < 1; i++) {
+        uint8_t *comp, *uncomp;
+        uint32_t csize, usize;
+        comp = rans_compress(in, in_size, &csize, i);
+        if (!comp) abort();
+        uncomp = rans_uncompress(comp, csize, &usize);
+        if (!uncomp) abort();
+        if (usize != in_size) abort();
+        if (memcmp(uncomp, in, in_size) != 0) abort();
+        free(comp);
+        free(uncomp);
+    }
+
+    for (i = 0; i < sizeof(order_a) / sizeof(*order_a); i++) {
+        int order = order_a[i];
+        uint8_t *comp, *uncomp, *comp0 = NULL;
+        uint32_t csize, usize, csize0 = 0;
+        int c;
+        comp = rans_compress_4x16(in, in_size, &csize, order);
+        if (!comp) abort();
+        uncomp = rans_uncompress_4x16(comp, csize, &usize);
+        if (!uncomp) abort();
+        if (usize != in_size) abort();
+        if (memcmp(uncomp, in, in_size) != 0) abort();
+        free(comp);
+        free(uncomp);
+
+        comp = arith_compress(in, in_size, &csize, order);
+        if (!comp) abort();
+        uncomp = arith_uncompress(comp, csize, &usize);
+        if (!uncomp) abort();
+        if (usize != in_size) abort();
+        if (memcmp(uncomp, in, in_size) != 0) abort();
+        free(comp);
+        free(uncomp);
+        
+        // Check all SIMD variants for RANS_ORDER_X32
+        for (c = 0; c < sizeof(cpu_enc_a)/sizeof(*cpu_enc_a); c++) {
+            rans_set_cpu(cpu_enc_a[c]);
+            comp = rans_compress_4x16(in, in_size, &csize,
+                                      order | RANS_ORDER_X32);
+            if (!comp) abort();
+            if (comp0) {
+                if (csize != csize0 ||
+                    memcmp(comp0, comp, csize) != 0) {
+                    fprintf(stderr,
+                            "Compressed data mismatch order 0x%x cpu 0x%x\n",
+                            order, cpu_enc_a[c]);
+                    abort();
+                }
+                free(comp);
+            } else {
+                comp0 = comp;
+                csize0 = csize;
+            }
+        }
+        for (c = 0; c < sizeof(cpu_dec_a)/sizeof(*cpu_dec_a); c++) {
+            rans_set_cpu(cpu_dec_a[c]);
+            uncomp = rans_uncompress_4x16(comp0, csize0, &usize);
+            if (!uncomp) abort();
+            if (usize != in_size ||
+                memcmp(uncomp, in, in_size) != 0) {
+                fprintf(stderr,
+                        "Uncompressed data mismatch order 0x%x cpu 0x%x\n",
+                        order, cpu_dec_a[c]);
+                abort();
+            }
+            free(uncomp);
+        }
+        free(comp0);
+    }
+    return 0;
+}
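
entropy_fuzz.c only defines LLVMFuzzerTestOneInput(), so it is normally driven
by libFuzzer.  A hypothetical stand-alone driver of the usual kind, for
replaying a single corpus or crash file against the same entry point (this
driver is not part of the patch; the 200000-byte cap mirrors the in_size check
above):

    #include <stdint.h>
    #include <stdio.h>

    int LLVMFuzzerTestOneInput(uint8_t *in, size_t in_size);

    int main(int argc, char **argv) {
        if (argc < 2) {
            fprintf(stderr, "Usage: %s corpus-or-crash-file\n", argv[0]);
            return 1;
        }
        FILE *fp = fopen(argv[1], "rb");
        if (!fp) {
            perror(argv[1]);
            return 1;
        }

        /* Same upper bound as the in_size rejection in the target itself. */
        static uint8_t buf[200000];
        size_t n = fread(buf, 1, sizeof(buf), fp);
        fclose(fp);

        LLVMFuzzerTestOneInput(buf, n);
        return 0;
    }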


=====================================
tests/fqzcomp_qual_fuzzrt.c
=====================================
@@ -0,0 +1,137 @@
+/* Fuzz testing target. */
+/*
+ * Copyright (c) 2023 Genome Research Ltd.
+ * Author(s): James Bonfield
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *    1. Redistributions of source code must retain the above copyright notice,
+ *       this list of conditions and the following disclaimer.
+ *
+ *    2. Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials provided
+ *       with the distribution.
+ *
+ *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+ *       Institute nor the names of its contributors may be used to endorse
+ *       or promote products derived from this software without specific
+ *       prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
+ * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "config.h"
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <ctype.h>
+
+#include "htscodecs/fqzcomp_qual.h"
+
+#ifndef MAX_REC
+#define MAX_REC 5000
+#endif
+
+#ifndef MAX_SEQ
+#  define MAX_SEQ 5000
+#endif
+
+static unsigned int slice_len[MAX_REC];
+static unsigned int slice_flags[MAX_REC];
+
+static fqz_slice fixed_slice = {0};
+
+fqz_slice *fake_slice(size_t buf_len, int nrec) {
+    fixed_slice.len = slice_len;
+    fixed_slice.flags = slice_flags;
+    fixed_slice.num_records = nrec;
+
+    // 1 long record
+    if (nrec == 1) {
+	slice_len[0] = buf_len;
+	slice_flags[0] = 0; // FIXME
+	return &fixed_slice;
+    }
+
+    // N 1-byte records
+    if (nrec == buf_len) {
+	int i;
+	for (i = 0; i < buf_len; i++) {
+	    slice_len[i] = 1;
+	    slice_flags[i] = 0; // FIXME
+	}
+	return &fixed_slice;
+    }
+
+    // Otherwise variable length records
+
+    // Reproducibility of randomness
+    int seed = rand();
+    srand(0);
+
+    int nlen = buf_len/10+1;
+    int i, l, n = 0;
+    for (i = 0; i < buf_len; i+=l, n++) {
+	l = rand() % (nlen+1);
+	l += l==0;
+	slice_len[n] = i+l < buf_len ? l : buf_len-i;
+	slice_flags[n] = 0; // FIXME
+    }
+    fixed_slice.num_records = n;
+
+    srand(seed); // new random state
+
+    return &fixed_slice;
+}
+
+int LLVMFuzzerTestOneInput(uint8_t *in, size_t in_size) {
+    size_t c_size, u_size;
+
+    int mode = 0;
+    for (mode = 0; mode < 3; mode++) {
+	int mval[3] = {0,1,in_size};
+	fqz_slice *s = fake_slice(in_size, mval[mode]);
+
+	// Semi-random strat, but based on a few bits of input data
+	// for reproducibility.
+	// This lets the fuzzer explore the parameter space itself.
+	int strat = in_size ? in[0] & 3 : 0;
+	char *comp = fqz_compress(3, s, (char *)in, in_size, &c_size,
+				  strat, NULL);
+	if (!comp) {
+	    fprintf(stderr, "REJECT FQZ %d to (null)\n", (int)in_size);
+	    return -1;
+	}
+
+	char *uncomp = fqz_decompress(comp, c_size, &u_size, NULL, 0);
+	if (!uncomp)
+	    abort();
+
+	if (in_size != u_size)
+	    abort();
+
+	if (memcmp(uncomp, in, in_size) != 0)
+	    abort();
+    
+	free(comp);
+	free(uncomp);
+    }
+    
+    return 0;
+}
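
For comparison with the fuzz harness above, a hedged sketch of a direct
single-record round trip through the same calls (the quality string is
invented; the leading argument 3, strategy 0 and the fqz_slice fields mirror
the usage above):

    #include <stdlib.h>
    #include <string.h>

    #include "htscodecs/fqzcomp_qual.h"

    int main(void) {
        /* One record holding a single made-up quality string. */
        char qual[] = "IIIIIHHHHHGGGGFFFF####";
        size_t qlen = strlen(qual), c_size = 0, u_size = 0;

        unsigned int len = qlen, flags = 0;
        fqz_slice s = {0};
        s.num_records = 1;
        s.len   = &len;
        s.flags = &flags;

        char *comp = fqz_compress(3, &s, qual, qlen, &c_size, 0, NULL);
        if (!comp)
            return 1;

        char *uncomp = fqz_decompress(comp, c_size, &u_size, NULL, 0);
        int ok = uncomp && u_size == qlen && memcmp(uncomp, qual, qlen) == 0;

        free(comp);
        free(uncomp);
        return ok ? 0 : 1;
    }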


=====================================
tests/tokenise_name3_fuzzrt.c
=====================================
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2023 Genome Research Ltd.
+ * Author(s): James Bonfield
+ * 
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *    1. Redistributions of source code must retain the above copyright notice,
+ *       this list of conditions and the following disclaimer.
+ * 
+ *    2. Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials provided
+ *       with the distribution.
+ * 
+ *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+ *    Institute nor the names of its contributors may be used to endorse
+ *    or promote products derived from this software without specific
+ *    prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
+ * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// Round-trip fuzz testing.  While tokenise_name3_fuzz.c tests the name
+// decoder when given random input, this tests that it can encode and then
+// (if an error isn't reported) decode and get back the same content.
+//
+// It's complicated as we need to construct meta-data for how many names
+// we have.
+#include "config.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <limits.h>
+#include <ctype.h>
+#include <assert.h>
+#include <inttypes.h>
+#include <math.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <time.h>
+
+#include "htscodecs/tokenise_name3.h"
+
+int LLVMFuzzerTestOneInput(const uint8_t *in, size_t in_sz) {
+    int level, arith;
+    char in2[8193]; // +1 for the NUL appended below
+
+    // 4096 is default max size for libfuzzer anyway
+    if (in_sz > 8192)
+	return -1;
+
+    // Turn newlines to nuls so we can do round-trip testing
+    // on multi-name data.
+    int i;
+    for (i = 0; i < in_sz; i++)
+	in2[i] = in[i] == '\n' ? 0 : in[i];
+    if (in_sz && in2[in_sz-1] > '\n')
+	in2[in_sz++] = 0;
+
+    for (arith = 0; arith < 2; arith++) {
+	for (level = 1; level <= 9; level += 8) { // 1 & 9 only
+	    int clen;
+	    uint8_t *cdat = tok3_encode_names((char *)in2, in_sz, level,
+					      arith, &clen, NULL);
+	    if (!cdat)
+		// skip this input from corpus as it's unparseable
+		return -1;
+
+	    uint32_t ulen;
+	    uint8_t *udat = tok3_decode_names(cdat, clen, &ulen);
+	    if (!udat || ulen != in_sz)
+		abort();
+
+	    if (memcmp(in2, udat, ulen) != 0)
+		abort();
+
+	    free(cdat);
+	    free(udat);
+	}
+    }
+    
+    return 0;
+}
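
Similarly, a hedged sketch of a direct (non-fuzz) round trip through the name
tokeniser, using the NUL-separated layout the tester above constructs (the
names are invented; the tok3_encode_names()/tok3_decode_names() calls mirror
those above):

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    #include "htscodecs/tokenise_name3.h"

    int main(void) {
        /* Two made-up read names, NUL-separated, with a trailing NUL. */
        char names[] = "read.1\0read.2";
        int nlen = sizeof(names);

        int clen = 0;
        uint8_t *comp = tok3_encode_names(names, nlen, 9 /* level */,
                                          0 /* rans, not arith */, &clen, NULL);
        if (!comp)
            return 1;

        uint32_t ulen = 0;
        uint8_t *uncomp = tok3_decode_names(comp, clen, &ulen);
        int ok = uncomp && ulen == (uint32_t)nlen &&
                 memcmp(uncomp, names, nlen) == 0;

        free(comp);
        free(uncomp);
        return ok ? 0 : 1;
    }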



View it on GitLab: https://salsa.debian.org/med-team/htscodecs/-/commit/027895ec31ceedddca37a80b1aa299293a549fa3
