[med-svn] [Git][deflate-team/libdeflate][debian/latest] 2 commits: New upstream version 1.23

nick black (@nickblack) gitlab@salsa.debian.org
Tue Jan 14 14:45:39 GMT 2025



nick black pushed to branch debian/latest at deflate team / libdeflate


Commits:
3e0469f1 by nick black at 2025-01-14T09:42:27-05:00
New upstream version 1.23
- - - - -
02c1120d by nick black at 2025-01-14T09:42:27-05:00
Update upstream source from tag 'upstream/1.23'

Update to upstream version '1.23'
with Debian dir 26ece65b570aef692b6e36149b2d5f7a8a42045c
- - - - -


16 changed files:

- .github/workflows/ci.yml
- CMakeLists.txt
- NEWS.md
- README.md
- lib/arm/crc32_impl.h
- lib/crc32_multipliers.h
- lib/crc32_tables.h
- lib/x86/adler32_impl.h
- lib/x86/cpu_features.c
- lib/x86/cpu_features.h
- lib/x86/crc32_impl.h
- lib/x86/crc32_pclmul_template.h
- libdeflate.h
- + scripts/gen-crc32-consts.py
- − scripts/gen_crc32_multipliers.c
- − scripts/gen_crc32_tables.c


Changes:

=====================================
.github/workflows/ci.yml
=====================================
@@ -42,7 +42,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
      - uses: actions/checkout@v4
-      - uses: uraimo/run-on-arch-action@v2.5.0
+      - uses: uraimo/run-on-arch-action@v2.8.1
         with:
           arch: ${{ matrix.arch }}
           distro: ${{ matrix.distro }}
@@ -139,11 +139,8 @@ jobs:
     name: Build (Windows, Visual Studio ${{matrix.toolset}}, ${{matrix.platform}})
     strategy:
       matrix:
-        platform: [ARM64, ARM]
+        platform: [ARM64]
         toolset: [v143, ClangCL]
-        exclude: # Exclude unsupported combinations
-        - platform: ARM
-          toolset: ClangCL
     runs-on: windows-latest
     steps:
    - uses: actions/checkout@v4


=====================================
CMakeLists.txt
=====================================
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.7)
+cmake_minimum_required(VERSION 3.10)
 
 # Default to a release build.
 if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)


=====================================
NEWS.md
=====================================
@@ -1,5 +1,21 @@
 # libdeflate release notes
 
+## Version 1.23
+
+* Fixed bug introduced in 1.20 where incorrect checksums could be calculated if
+  libdeflate was compiled with clang at -O0 and run on a CPU supporting AVX512.
+
+* Fixed bug introduced in 1.20 where incorrect checksums could be calculated in
+  rare cases on macOS computers that support AVX512 and are running an older
+  version of macOS that contains a bug that corrupts AVX512 registers.  This
+  could occur only if code outside libdeflate enabled AVX512 in the thread.
+
+* Fixed build error when using -mno-evex512 with clang 18+ or gcc 14+.
+
+* Increased the minimum CMake version to 3.10.
+
+* Further optimized the x86 CRC code.
+
 ## Version 1.22
 
 * The CMake-based build system now implements a workaround for gcc being paired


=====================================
README.md
=====================================
@@ -144,6 +144,7 @@ libdeflate from a programming language other than C or C++, consider using the
 following bindings:
 
 * C#: [LibDeflate.NET](https://github.com/jzebedee/LibDeflate.NET)
+* Delphi: [libdeflate-pas](https://github.com/zedxxx/libdeflate-pas)
 * Go: [go-libdeflate](https://github.com/4kills/go-libdeflate)
 * Java: [libdeflate-java](https://github.com/astei/libdeflate-java)
 * Julia: [LibDeflate.jl](https://github.com/jakobnissen/LibDeflate.jl)


=====================================
lib/arm/crc32_impl.h
=====================================
@@ -434,13 +434,13 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
 		{ CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
 		{ CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
 	};
-	static const u64 _aligned_attribute(16) final_mults[3][2] = {
-		{ CRC32_X63_MODG, 0 },
-		{ CRC32_BARRETT_CONSTANT_1, 0 },
-		{ CRC32_BARRETT_CONSTANT_2, 0 },
+	static const u64 _aligned_attribute(16) barrett_consts[2][2] = {
+		{ CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_1 },
+		{ CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_2 },
+	};
+	static const u32 _aligned_attribute(16) mask32[4] = {
+		0, 0, 0xffffffff, 0
 	};
-	const uint8x16_t zeroes = vdupq_n_u8(0);
-	const uint8x16_t mask32 = vreinterpretq_u8_u64(vdupq_n_u64(0xFFFFFFFF));
 	const poly64x2_t multipliers_1 = load_multipliers(mults[0]);
 	uint8x16_t v0, v1, v2, v3;
 
@@ -497,24 +497,13 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
 	if (len)
 		v0 = fold_partial_vec(v0, p, len, multipliers_1);
 
-	/*
-	 * Fold 128 => 96 bits.  This also implicitly appends 32 zero bits,
-	 * which is equivalent to multiplying by x^32.  This is needed because
-	 * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
-	 */
-
-	v0 = veorq_u8(vextq_u8(v0, zeroes, 8),
-		      clmul_high(vextq_u8(zeroes, v0, 8), multipliers_1));
-
-	/* Fold 96 => 64 bits. */
-	v0 = veorq_u8(vextq_u8(v0, zeroes, 4),
-		      clmul_low(vandq_u8(v0, mask32),
-				load_multipliers(final_mults[0])));
-
-	/* Reduce 64 => 32 bits using Barrett reduction. */
-	v1 = clmul_low(vandq_u8(v0, mask32), load_multipliers(final_mults[1]));
-	v1 = clmul_low(vandq_u8(v1, mask32), load_multipliers(final_mults[2]));
-	return vgetq_lane_u32(vreinterpretq_u32_u8(veorq_u8(v0, v1)), 1);
+	/* Reduce to 32 bits, following lib/x86/crc32_pclmul_template.h */
+	v1 = clmul_low(v0, load_multipliers(barrett_consts[0]));
+	v1 = clmul_low(v1, load_multipliers(barrett_consts[1]));
+	v0 = veorq_u8(v0, vandq_u8(v1, vreinterpretq_u8_u32(vld1q_u32(mask32))));
+	v0 = clmul_high(v0, load_multipliers(barrett_consts[0]));
+	v0 = clmul_low(v0, load_multipliers(barrett_consts[1]));
+	return vgetq_lane_u32(vreinterpretq_u32_u8(v0), 2);
 }
 #undef SUFFIX
 #undef ATTRIBUTES


=====================================
lib/crc32_multipliers.h
=====================================
@@ -1,7 +1,7 @@
 /*
  * crc32_multipliers.h - constants for CRC-32 folding
  *
- * THIS FILE WAS GENERATED BY gen_crc32_multipliers.c.  DO NOT EDIT.
+ * THIS FILE WAS GENERATED BY gen-crc32-consts.py.  DO NOT EDIT.
  */
 
 #define CRC32_X159_MODG 0xae689191 /* x^159 mod G(x) */
@@ -100,10 +100,8 @@
 #define CRC32_X4127_MODG 0x1072db28 /* x^4127 mod G(x) */
 #define CRC32_X4063_MODG 0x0c30f51d /* x^4063 mod G(x) */
 
-#define CRC32_X63_MODG 0xb8bc6765 /* x^63 mod G(x) */
-#define CRC32_BARRETT_CONSTANT_1 0x00000001f7011641ULL /* floor(x^64 / G(x)) */
+#define CRC32_BARRETT_CONSTANT_1 0xb4e5b025f7011641ULL /* floor(x^95 / G(x)) */
 #define CRC32_BARRETT_CONSTANT_2 0x00000001db710641ULL /* G(x) */
-#define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 }
 
 #define CRC32_NUM_CHUNKS 4
 #define CRC32_MIN_VARIABLE_CHUNK_LEN 128UL


=====================================
lib/crc32_tables.h
=====================================
@@ -1,7 +1,7 @@
 /*
  * crc32_tables.h - data tables for CRC-32 computation
  *
- * THIS FILE WAS GENERATED BY gen_crc32_tables.c.  DO NOT EDIT.
+ * THIS FILE WAS GENERATED BY gen-crc32-consts.py.  DO NOT EDIT.
  */
 
 static const u32 crc32_slice1_table[] MAYBE_UNUSED = {


=====================================
lib/x86/adler32_impl.h
=====================================
@@ -82,7 +82,7 @@
  */
 #  define adler32_x86_avx512_vl256_vnni	adler32_x86_avx512_vl256_vnni
 #  define SUFFIX				   _avx512_vl256_vnni
-#  define ATTRIBUTES		_target_attribute("avx512bw,avx512vl,avx512vnni")
+#  define ATTRIBUTES		_target_attribute("avx512bw,avx512vl,avx512vnni" NO_EVEX512)
 #  define VL			32
 #  define USE_VNNI		1
 #  define USE_AVX512		1
@@ -95,7 +95,7 @@
  */
 #  define adler32_x86_avx512_vl512_vnni	adler32_x86_avx512_vl512_vnni
 #  define SUFFIX				   _avx512_vl512_vnni
-#  define ATTRIBUTES		_target_attribute("avx512bw,avx512vnni")
+#  define ATTRIBUTES		_target_attribute("avx512bw,avx512vnni" EVEX512)
 #  define VL			64
 #  define USE_VNNI		1
 #  define USE_AVX512		1


=====================================
lib/x86/cpu_features.c
=====================================
@@ -88,6 +88,27 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
 
 volatile u32 libdeflate_x86_cpu_features = 0;
 
+static inline bool
+os_supports_avx512(u64 xcr0)
+{
+#ifdef __APPLE__
+	/*
+	 * The Darwin kernel had a bug where it could corrupt the opmask
+	 * registers.  See
+	 * https://community.intel.com/t5/Software-Tuning-Performance/MacOS-Darwin-kernel-bug-clobbers-AVX-512-opmask-register-state/m-p/1327259
+	 * Darwin also does not initially set the XCR0 bits for AVX512, but they
+	 * are set if the thread tries to use AVX512 anyway.  Thus, to safely
+	 * and consistently use AVX512 on macOS we'd need to check the kernel
+	 * version as well as detect AVX512 support using a macOS-specific
+	 * method.  We don't bother with this, especially given Apple's
+	 * transition to arm64.
+	 */
+	return false;
+#else
+	return (xcr0 & 0xe6) == 0xe6;
+#endif
+}
+
 /*
  * Don't use 512-bit vectors on Intel CPUs before Rocket Lake and Sapphire
  * Rapids, due to the downclocking penalty.
@@ -140,7 +161,12 @@ void libdeflate_init_x86_cpu_features(void)
 		family += (a >> 20) & 0xff;
 	if (d & (1 << 26))
 		features |= X86_CPU_FEATURE_SSE2;
-	if (c & (1 << 1))
+	/*
+	 * No known CPUs have pclmulqdq without sse4.1, so in practice code
+	 * targeting pclmulqdq can use sse4.1 instructions.  But to be safe,
+	 * explicitly check for both the pclmulqdq and sse4.1 bits.
+	 */
+	if ((c & (1 << 1)) && (c & (1 << 19)))
 		features |= X86_CPU_FEATURE_PCLMULQDQ;
 	if (c & (1 << 27))
 		xcr0 = read_xcr(0);
@@ -152,21 +178,24 @@ void libdeflate_init_x86_cpu_features(void)
 
 	/* EAX=7, ECX=0: Extended Features */
 	cpuid(7, 0, &a, &b, &c, &d);
-	if ((b & (1 << 5)) && ((xcr0 & 0x6) == 0x6))
-		features |= X86_CPU_FEATURE_AVX2;
 	if (b & (1 << 8))
 		features |= X86_CPU_FEATURE_BMI2;
-	if (((xcr0 & 0xe6) == 0xe6) &&
-	    allow_512bit_vectors(manufacturer, family, model))
-		features |= X86_CPU_FEATURE_ZMM;
-	if ((b & (1 << 30)) && ((xcr0 & 0xe6) == 0xe6))
-		features |= X86_CPU_FEATURE_AVX512BW;
-	if ((b & (1U << 31)) && ((xcr0 & 0xe6) == 0xe6))
-		features |= X86_CPU_FEATURE_AVX512VL;
-	if ((c & (1 << 10)) && ((xcr0 & 0x6) == 0x6))
-		features |= X86_CPU_FEATURE_VPCLMULQDQ;
-	if ((c & (1 << 11)) && ((xcr0 & 0xe6) == 0xe6))
-		features |= X86_CPU_FEATURE_AVX512VNNI;
+	if ((xcr0 & 0x6) == 0x6) {
+		if (b & (1 << 5))
+			features |= X86_CPU_FEATURE_AVX2;
+		if (c & (1 << 10))
+			features |= X86_CPU_FEATURE_VPCLMULQDQ;
+	}
+	if (os_supports_avx512(xcr0)) {
+		if (allow_512bit_vectors(manufacturer, family, model))
+			features |= X86_CPU_FEATURE_ZMM;
+		if (b & (1 << 30))
+			features |= X86_CPU_FEATURE_AVX512BW;
+		if (b & (1U << 31))
+			features |= X86_CPU_FEATURE_AVX512VL;
+		if (c & (1 << 11))
+			features |= X86_CPU_FEATURE_AVX512VNNI;
+	}
 
 	/* EAX=7, ECX=1: Extended Features */
 	cpuid(7, 1, &a, &b, &c, &d);

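The restructured detection logic above hinges on two XCR0 masks: 0x6 (XMM and YMM state enabled by the OS) gates the AVX2/VPCLMULQDQ group, and 0xe6 (additionally opmask, ZMM_Hi256, and Hi16_ZMM state) gates the AVX-512 group, with macOS excluded outright. A minimal Python model of that gating, using the mask values from the hunk (the helper names are illustrative, not part of libdeflate):

    # Bit meanings in XCR0: 1 = XMM, 2 = YMM, 5 = opmask, 6 = ZMM_Hi256, 7 = Hi16_ZMM.
    XCR0_SSE_AVX = 0x06   # OS saves/restores XMM and YMM state
    XCR0_AVX512  = 0xe6   # ... plus opmask, ZMM_Hi256, and Hi16_ZMM state

    def os_supports_avx2(xcr0: int) -> bool:
        return (xcr0 & XCR0_SSE_AVX) == XCR0_SSE_AVX

    def os_supports_avx512(xcr0: int, on_macos: bool = False) -> bool:
        # Mirrors the new os_supports_avx512(): always false on macOS because
        # of the Darwin opmask-corruption bug cited in the comment above.
        if on_macos:
            return False
        return (xcr0 & XCR0_AVX512) == XCR0_AVX512

    assert os_supports_avx512(0xe7)                     # typical AVX-512 Linux box
    assert not os_supports_avx512(0x07)                 # AVX enabled, AVX-512 not
    assert not os_supports_avx512(0xe7, on_macos=True)  # macOS opt-out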

=====================================
lib/x86/cpu_features.h
=====================================
@@ -108,7 +108,8 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
 #  define HAVE_SSE2_NATIVE		0
 #endif
 
-#if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__))
+#if (defined(__PCLMUL__) && defined(__SSE4_1__)) || \
+	(defined(_MSC_VER) && defined(__AVX2__))
 #  define HAVE_PCLMULQDQ(features)	1
 #else
 #  define HAVE_PCLMULQDQ(features)	((features) & X86_CPU_FEATURE_PCLMULQDQ)
@@ -164,6 +165,15 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
 #  define HAVE_AVXVNNI(features)	((features) & X86_CPU_FEATURE_AVXVNNI)
 #endif
 
+#if (GCC_PREREQ(14, 0) || CLANG_PREREQ(18, 0, 18000000)) \
+	&& !defined(__EVEX512__) /* avoid subtracting the evex512 feature */
+#  define EVEX512	",evex512"	/* needed to override potential -mno-evex512 */
+#  define NO_EVEX512	",no-evex512"	/* needed for AVX10/256 compatibility */
+#else
+#  define EVEX512	""
+#  define NO_EVEX512	""
+#endif
+
 #endif /* ARCH_X86_32 || ARCH_X86_64 */
 
 #endif /* LIB_X86_CPU_FEATURES_H */


=====================================
lib/x86/crc32_impl.h
=====================================
@@ -44,31 +44,26 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
 };
 
 #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
-/* PCLMULQDQ implementation */
+/*
+ * PCLMULQDQ implementation.  This targets PCLMULQDQ+SSE4.1, since in practice
+ * all CPUs that support PCLMULQDQ also support SSE4.1.
+ */
 #  define crc32_x86_pclmulqdq	crc32_x86_pclmulqdq
 #  define SUFFIX			 _pclmulqdq
-#  define ATTRIBUTES		_target_attribute("pclmul")
+#  define ATTRIBUTES		_target_attribute("pclmul,sse4.1")
 #  define VL			16
-#  define USE_SSE4_1		0
 #  define USE_AVX512		0
 #  include "crc32_pclmul_template.h"
 
 /*
- * PCLMULQDQ/AVX implementation.  Compared to the regular PCLMULQDQ
- * implementation, this still uses 128-bit vectors, but it has two potential
- * benefits.  First, simply compiling against the AVX target can improve
- * performance significantly (e.g. 10100 MB/s to 16700 MB/s on Skylake) without
- * actually using any AVX intrinsics, probably due to the availability of
- * non-destructive VEX-encoded instructions.  Second, AVX support implies SSSE3
- * and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for efficient
- * handling of partial blocks.  (We *could* compile a variant with
- * PCLMULQDQ+SSE4.1 without AVX, but for simplicity we currently don't bother.)
+ * PCLMULQDQ/AVX implementation.  Same as above, but this is compiled with AVX
+ * enabled so that the compiler can generate VEX-coded instructions which can be
+ * slightly more efficient.  It still uses 128-bit vectors.
  */
 #  define crc32_x86_pclmulqdq_avx	crc32_x86_pclmulqdq_avx
 #  define SUFFIX				 _pclmulqdq_avx
 #  define ATTRIBUTES		_target_attribute("pclmul,avx")
 #  define VL			16
-#  define USE_SSE4_1		1
 #  define USE_AVX512		0
 #  include "crc32_pclmul_template.h"
 #endif
@@ -83,19 +78,20 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
  *
  * gcc 8.1 and 8.2 had a similar bug where they assumed that
  * _mm256_clmulepi64_epi128() always needed AVX512.  It's fixed in gcc 8.3.
+ *
+ * _mm256_zextsi128_si256() requires gcc 10.
  */
-#if (GCC_PREREQ(8, 3) || CLANG_PREREQ(6, 0, 10000000)) && \
+#if (GCC_PREREQ(10, 1) || CLANG_PREREQ(6, 0, 10000000)) && \
 	!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ)
 #  define crc32_x86_vpclmulqdq_avx2	crc32_x86_vpclmulqdq_avx2
 #  define SUFFIX				 _vpclmulqdq_avx2
 #  define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx2")
 #  define VL			32
-#  define USE_SSE4_1		1
 #  define USE_AVX512		0
 #  include "crc32_pclmul_template.h"
 #endif
 
-#if (GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \
+#if (GCC_PREREQ(10, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \
 	!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ)
 /*
  * VPCLMULQDQ/AVX512 implementation using 256-bit vectors.  This is very similar
@@ -103,12 +99,13 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
  * instruction and more registers.  This is used on CPUs that support AVX-512
  * but where using 512-bit vectors causes downclocking.  This should also be the
  * optimal implementation on CPUs that support AVX10/256 but not AVX10/512.
+ *
+ * _mm256_zextsi128_si256() requires gcc 10.
  */
 #  define crc32_x86_vpclmulqdq_avx512_vl256  crc32_x86_vpclmulqdq_avx512_vl256
 #  define SUFFIX				      _vpclmulqdq_avx512_vl256
-#  define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl")
+#  define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl" NO_EVEX512)
 #  define VL			32
-#  define USE_SSE4_1		1
 #  define USE_AVX512		1
 #  include "crc32_pclmul_template.h"
 
@@ -116,12 +113,13 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
  * VPCLMULQDQ/AVX512 implementation using 512-bit vectors.  This is used on CPUs
  * that have a good AVX-512 implementation including VPCLMULQDQ.  This should
  * also be the optimal implementation on CPUs that support AVX10/512.
+ *
+ * _mm512_zextsi128_si512() requires gcc 10.
  */
 #  define crc32_x86_vpclmulqdq_avx512_vl512  crc32_x86_vpclmulqdq_avx512_vl512
 #  define SUFFIX				      _vpclmulqdq_avx512_vl512
-#  define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl")
+#  define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl" EVEX512)
 #  define VL			64
-#  define USE_SSE4_1		1
 #  define USE_AVX512		1
 #  include "crc32_pclmul_template.h"
 #endif


=====================================
lib/x86/crc32_pclmul_template.h
=====================================
@@ -34,17 +34,13 @@
  * ATTRIBUTES:
  *	Target function attributes to use.  Must satisfy the dependencies of the
  *	other parameters as follows:
- *	   VL=16 && USE_SSE4_1=0 && USE_AVX512=0: at least pclmul
- *	   VL=16 && USE_SSE4_1=1 && USE_AVX512=0: at least pclmul,sse4.1
- *	   VL=32 && USE_SSE4_1=1 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2
- *	   VL=32 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
- *	   VL=64 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
+ *	   VL=16 && USE_AVX512=0: at least pclmul,sse4.1
+ *	   VL=32 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2
+ *	   VL=32 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
+ *	   VL=64 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
  *	   (Other combinations are not useful and have not been tested.)
  * VL:
  *	Vector length in bytes.  Must be 16, 32, or 64.
- * USE_SSE4_1:
- *	If 1, take advantage of SSE4.1 instructions such as pblendvb.
- *	If 0, assume that the CPU might not support SSE4.1.
  * USE_AVX512:
  *	If 1, take advantage of AVX-512 features such as masking and the
  *	vpternlog instruction.  This doesn't enable the use of 512-bit vectors;
@@ -55,7 +51,10 @@
  * instructions.  Note that the x86 crc32 instruction cannot be used, as it is
  * for a different polynomial, not the gzip one.  For an explanation of CRC
  * folding with carryless multiplication instructions, see
- * scripts/gen_crc32_multipliers.c and the following paper:
+ * scripts/gen-crc32-consts.py and the following blog posts and papers:
+ *
+ *	"An alternative exposition of crc32_4k_pclmulqdq"
+ *	https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq
  *
  *	"Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  *	https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
@@ -81,7 +80,7 @@
 #  define fold_vec		fold_vec256
 #  define VLOADU(p)		_mm256_loadu_si256((const void *)(p))
 #  define VXOR(a, b)		_mm256_xor_si256((a), (b))
-#  define M128I_TO_VEC(a)	_mm256_castsi128_si256(a)
+#  define M128I_TO_VEC(a)	_mm256_zextsi128_si256(a)
 #  define MULTS(a, b)		_mm256_set_epi64x(a, b, a, b)
 #  define MULTS_8V		MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)
 #  define MULTS_4V		MULTS(CRC32_X991_MODG, CRC32_X1055_MODG)
@@ -92,7 +91,7 @@
 #  define fold_vec		fold_vec512
 #  define VLOADU(p)		_mm512_loadu_si512((const void *)(p))
 #  define VXOR(a, b)		_mm512_xor_si512((a), (b))
-#  define M128I_TO_VEC(a)	_mm512_castsi128_si512(a)
+#  define M128I_TO_VEC(a)	_mm512_zextsi128_si512(a)
 #  define MULTS(a, b)		_mm512_set_epi64(a, b, a, b, a, b, a, b)
 #  define MULTS_8V		MULTS(CRC32_X4063_MODG, CRC32_X4127_MODG)
 #  define MULTS_4V		MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)
@@ -149,7 +148,6 @@ ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i /* __v8du */ mults)
 #define fold_vec512	ADD_SUFFIX(fold_vec512)
 #endif /* VL >= 64 */
 
-#if USE_SSE4_1
 /*
  * Given 'x' containing a 16-byte polynomial, and a pointer 'p' that points to
  * the next '1 <= len <= 15' data bytes, rearrange the concatenation of 'x' and
@@ -181,7 +179,6 @@ ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len,
 	return fold_vec128(x0, x1, mults_128b);
 }
 #define fold_lessthan16bytes	ADD_SUFFIX(fold_lessthan16bytes)
-#endif /* USE_SSE4_1 */
 
 static ATTRIBUTES u32
 ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
@@ -192,17 +189,16 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 	 * folding across 128 bits.  mults_128b differs from mults_1v when
 	 * VL != 16.  All multipliers are 64-bit, to match what pclmulqdq needs,
 	 * but since this is for CRC-32 only their low 32 bits are nonzero.
-	 * For more details, see scripts/gen_crc32_multipliers.c.
+	 * For more details, see scripts/gen-crc32-consts.py.
 	 */
 	const vec_t mults_8v = MULTS_8V;
 	const vec_t mults_4v = MULTS_4V;
 	const vec_t mults_2v = MULTS_2V;
 	const vec_t mults_1v = MULTS_1V;
 	const __m128i mults_128b = _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG);
-	const __m128i final_mult = _mm_set_epi64x(0, CRC32_X63_MODG);
-	const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
 	const __m128i barrett_reduction_constants =
 		_mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_1);
+	const __m128i mask32 = _mm_set_epi32(0, 0xFFFFFFFF, 0, 0);
 	vec_t v0, v1, v2, v3, v4, v5, v6, v7;
 	__m128i x0 = _mm_cvtsi32_si128(crc);
 	__m128i x1;
@@ -273,7 +269,6 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 			size_t align = -(uintptr_t)p & (VL-1);
 
 			len -= align;
-		#if USE_SSE4_1
 			x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), x0);
 			p += 16;
 			if (align & 15) {
@@ -296,11 +291,6 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 			v0 = _mm512_inserti64x4(v0, *(const __m256i *)(p + 16), 1);
 		#  endif
 			p -= 16;
-		#else
-			crc = crc32_slice1(crc, p, align);
-			p += align;
-			v0 = VXOR(VLOADU(p), M128I_TO_VEC(_mm_cvtsi32_si128(crc)));
-		#endif
 		} else {
 			v0 = VXOR(VLOADU(p), M128I_TO_VEC(x0));
 		}
@@ -395,86 +385,69 @@ less_than_vl_remaining:
 less_than_16_remaining:
 	len &= 15;
 
-	/*
-	 * If fold_lessthan16bytes() is available, handle any remainder
-	 * of 1 to 15 bytes now, before reducing to 32 bits.
-	 */
-#if USE_SSE4_1
+	/* Handle any remainder of 1 to 15 bytes. */
 	if (len)
 		x0 = fold_lessthan16bytes(x0, p, len, mults_128b);
-#endif
 #if USE_AVX512
 reduce_x0:
 #endif
-
 	/*
-	 * Fold 128 => 96 bits.  This also implicitly appends 32 zero bits,
-	 * which is equivalent to multiplying by x^32.  This is needed because
-	 * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
-	 */
-	x0 = _mm_xor_si128(_mm_srli_si128(x0, 8),
-			   _mm_clmulepi64_si128(x0, mults_128b, 0x10));
-
-	/* Fold 96 => 64 bits. */
-	x0 = _mm_xor_si128(_mm_srli_si128(x0, 4),
-			   _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
-						final_mult, 0x00));
-
-	/*
-	 * Reduce 64 => 32 bits using Barrett reduction.
-	 *
-	 * Let M(x) = A(x)*x^32 + B(x) be the remaining message.  The goal is to
-	 * compute R(x) = M(x) mod G(x).  Since degree(B(x)) < degree(G(x)):
-	 *
-	 *	R(x) = (A(x)*x^32 + B(x)) mod G(x)
-	 *	     = (A(x)*x^32) mod G(x) + B(x)
-	 *
-	 * Then, by the Division Algorithm there exists a unique q(x) such that:
-	 *
-	 *	A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
+	 * Generate the final n-bit CRC from the 128-bit x0 = A as follows:
 	 *
-	 * Since the left-hand side is of maximum degree 31, the right-hand side
-	 * must be too.  This implies that we can apply 'mod x^32' to the
-	 * right-hand side without changing its value:
+	 *	crc = x^n * A mod G
+	 *	    = x^n * (x^64*A_H + A_L) mod G
+	 *	    = x^n * (x^(64-n)*(x^n*A_H mod G) + A_L) mod G
 	 *
-	 *	(A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
+	 * I.e.:
+	 *	crc := 0
+	 *	crc := x^n * (x^(64-n)*crc + A_H) mod G
+	 *	crc := x^n * (x^(64-n)*crc + A_L) mod G
 	 *
-	 * Note that '+' is equivalent to '-' in polynomials over GF(2).
+	 * A_H and A_L denote the high and low 64 polynomial coefficients in A.
 	 *
-	 * We also know that:
+	 * Using Barrett reduction to do the 'mod G', this becomes:
 	 *
-	 *	              / A(x)*x^32 \
-	 *	q(x) = floor (  ---------  )
-	 *	              \    G(x)   /
+	 *	crc := floor((A_H * floor(x^(m+n) / G)) / x^m) * G mod x^n
+	 *	A_L := x^(64-n)*crc + A_L
+	 *	crc := floor((A_L * floor(x^(m+n) / G)) / x^m) * G mod x^n
 	 *
-	 * To compute this efficiently, we can multiply the top and bottom by
-	 * x^32 and move the division by G(x) to the top:
-	 *
-	 *	              / A(x) * floor(x^64 / G(x)) \
-	 *	q(x) = floor (  -------------------------  )
-	 *	              \           x^32            /
-	 *
-	 * Note that floor(x^64 / G(x)) is a constant.
-	 *
-	 * So finally we have:
-	 *
-	 *	                          / A(x) * floor(x^64 / G(x)) \
-	 *	R(x) = B(x) + G(x)*floor (  -------------------------  )
-	 *	                          \           x^32            /
+	 * For the gzip crc, n = 32 and the bit order is LSB (least significant
+	 * bit) first.  'm' must be an integer >= 63 (the max degree of A_L and
+	 * A_H) for sufficient precision to be carried through the calculation.
+	 * As the gzip crc is LSB-first we use m == 63, which results in
+	 * floor(x^(m+n) / G) being 64-bit which is the most pclmulqdq can
+	 * accept.  The multiplication with floor(x^(63+n) / G) then produces a
+	 * 127-bit product, and the floored division by x^63 just takes the
+	 * first qword.
 	 */
-	x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
-				  barrett_reduction_constants, 0x00);
-	x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32),
-				  barrett_reduction_constants, 0x10);
-	x0 = _mm_xor_si128(x0, x1);
-#if USE_SSE4_1
-	crc = _mm_extract_epi32(x0, 1);
+
+	/* tmp := floor((A_H * floor(x^(63+n) / G)) / x^63) */
+	x1 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x00);
+	/* tmp is in bits [0:64) of x1. */
+
+	/* crc := tmp * G mod x^n */
+	x1 = _mm_clmulepi64_si128(x1, barrett_reduction_constants, 0x10);
+	/* crc is in bits [64:64+n) of x1. */
+
+	/*
+	 * A_L := x^(64-n)*crc + A_L
+	 * crc is already aligned to add (XOR) it directly to A_L, after
+	 * selecting it using a mask.
+	 */
+#if USE_AVX512
+	x0 = _mm_ternarylogic_epi32(x0, x1, mask32, 0x78);
 #else
-	crc = _mm_cvtsi128_si32(_mm_shuffle_epi32(x0, 0x01));
-	/* Process up to 15 bytes left over at the end. */
-	crc = crc32_slice1(crc, p, len);
+	x0 = _mm_xor_si128(x0, _mm_and_si128(x1, mask32));
 #endif
-	return crc;
+	/*
+	 * crc := floor((A_L * floor(x^(m+n) / G)) / x^m) * G mod x^n
+	 * Same as previous but uses the low-order 64 coefficients of A.
+	 */
+	x0 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x01);
+	x0 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x10);
+
+	/* Extract the CRC from bits [64:64+n) of x0. */
+	return _mm_extract_epi32(x0, 2);
 }
 
 #undef vec_t
@@ -491,5 +464,4 @@ reduce_x0:
 #undef SUFFIX
 #undef ATTRIBUTES
 #undef VL
-#undef USE_SSE4_1
 #undef USE_AVX512

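The rewritten reduction comment above boils down to: with n = 32 and m = 63, one Barrett step computes x^n * a mod G for any a of degree at most 63 using two carryless multiplications, and the 128-bit input is consumed as two such steps (A_H, then A_L). A short Python model of exactly that recurrence, written in the natural MSB-first bit order for readability, so unlike CRC32_BARRETT_CONSTANT_1/2 the constants here are not bit-reversed (helper names are illustrative):

    import random

    G = 0x104c11db7                      # gzip CRC-32 generator polynomial

    def clmul(a, b):                     # carryless (GF(2)) multiplication
        r = 0
        while b:
            if b & 1:
                r ^= a
            a <<= 1
            b >>= 1
        return r

    def polydiv(a, b):                   # floor(a / b) over GF(2)
        q = 0
        while a.bit_length() >= b.bit_length():
            s = a.bit_length() - b.bit_length()
            q ^= 1 << s
            a ^= b << s
        return q

    def polymod(a, b):
        return a ^ clmul(polydiv(a, b), b)

    MU = polydiv(1 << 95, G)             # floor(x^95 / G), cf. CRC32_BARRETT_CONSTANT_1

    def barrett(a):                      # x^32 * a mod G, for deg(a) <= 63
        q = clmul(a, MU) >> 63           # floor((a * floor(x^95 / G)) / x^63)
        return clmul(q, G) & 0xffffffff  # q * G mod x^32

    def crc_of_128(a_hi, a_lo):          # x^32 * (x^64*A_H + A_L) mod G
        c = barrett(a_hi)                # crc := x^32 * A_H mod G
        return barrett((c << 32) ^ a_lo) # crc := x^32 * (x^32*crc + A_L) mod G

    for _ in range(1000):
        hi, lo = random.getrandbits(64), random.getrandbits(64)
        assert crc_of_128(hi, lo) == polymod(((hi << 64) | lo) << 32, G)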

=====================================
libdeflate.h
=====================================
@@ -13,8 +13,8 @@ extern "C" {
 #endif
 
 #define LIBDEFLATE_VERSION_MAJOR	1
-#define LIBDEFLATE_VERSION_MINOR	22
-#define LIBDEFLATE_VERSION_STRING	"1.22"
+#define LIBDEFLATE_VERSION_MINOR	23
+#define LIBDEFLATE_VERSION_STRING	"1.23"
 
 /*
  * Users of libdeflate.dll on Windows can define LIBDEFLATE_DLL to cause


=====================================
scripts/gen-crc32-consts.py
=====================================
@@ -0,0 +1,158 @@
+#!/usr/bin/env python3
+#
+# This script generates constants for efficient computation of the gzip CRC-32.
+
+import sys
+
+# This is the generator polynomial G(x) of the gzip CRC-32, represented as an
+# int using the natural mapping between bits and polynomial coefficients.
+G = 0x104c11db7
+
+# XOR (add) an iterable of polynomials.
+def xor(iterable):
+    res = 0
+    for val in iterable:
+        res ^= val
+    return res
+
+# Multiply two polynomials.
+def clmul(a, b):
+    return xor(a << i for i in range(b.bit_length()) if (b & (1 << i)) != 0)
+
+# Polynomial division floor(a / b).
+def div(a, b):
+    q = 0
+    while a.bit_length() >= b.bit_length():
+        q ^= 1 << (a.bit_length() - b.bit_length())
+        a ^= b << (a.bit_length() - b.bit_length())
+    return q
+
+# Reduce the polynomial 'a' modulo the polynomial 'b'.
+def reduce(a, b):
+    return a ^ clmul(div(a, b), b)
+
+# Reverse the bits of a polynomial.
+def bitreverse(poly, num_bits):
+    return xor(1 << (num_bits - 1 - i) for i in range(num_bits)
+               if (poly & (1 << i)) != 0)
+
+# Compute x^d mod G.
+def x_to_the_d(d):
+    if d < G.bit_length() - 1:
+        return 1 << d
+    t = x_to_the_d(d//2)
+    t = clmul(t, t)
+    if d % 2 != 0:
+        t <<= 1
+    return reduce(t, G)
+
+def gen_tables():
+    print('/*')
+    print(' * crc32_tables.h - data tables for CRC-32 computation')
+    print(' *')
+    print(' * THIS FILE WAS GENERATED BY gen-crc32-consts.py.  DO NOT EDIT.')
+    print(' */')
+    for n in [1, 8]:
+        print('')
+        print(f'static const u32 crc32_slice{n}_table[] MAYBE_UNUSED = {{')
+        # The i'th table entry is the CRC-32 of the message consisting of byte
+        # i % 256 followed by i // 256 zero bytes.
+        polys = [bitreverse(i % 256, 8) << (32 + 8*(i//256)) for i in range(256 * n)]
+        polys = [bitreverse(reduce(poly, G), 32) for poly in polys]
+        for i in range(0, len(polys), 4):
+            print(f'\t0x{polys[i+0]:08x}, 0x{polys[i+1]:08x}, 0x{polys[i+2]:08x}, 0x{polys[i+3]:08x},')
+        print('};')
+
+# Compute the constant multipliers needed for "folding" over various distances
+# with the gzip CRC-32.  Each such multiplier is x^d mod G(x) for some distance
+# d, in bits, over which the folding is occurring.
+#
+# Folding works as follows: let A(x) be a polynomial (possibly reduced partially
+# or fully mod G(x)) for part of the message, and let B(x) be a polynomial
+# (possibly reduced partially or fully mod G(x)) for a later part of the
+# message.  The unreduced combined polynomial is A(x)*x^d + B(x), where d is the
+# number of bits separating the two parts of the message plus len(B(x)).  Since
+# mod G(x) can be applied at any point, x^d mod G(x) can be precomputed and used
+# instead of x^d unreduced.  That allows the combined polynomial to be computed
+# relatively easily in a partially-reduced form A(x)*(x^d mod G(x)) + B(x), with
+# length max(len(A(x)) + 31, len(B(x))).  This does require doing a polynomial
+# multiplication (carryless multiplication).
+#
+# "Folding" in this way can be used for the entire CRC computation except the
+# final reduction to 32 bits; this works well when CPU support for carryless
+# multiplication is available.  It can also be used to combine CRCs of different
+# parts of the message that were computed using a different method.
+#
+# Note that the gzip CRC-32 uses bit-reversed polynomials.  I.e., the low order
+# bits are really the high order polynomial coefficients.
+def gen_multipliers():
+    print('/*')
+    print(' * crc32_multipliers.h - constants for CRC-32 folding')
+    print(' *')
+    print(' * THIS FILE WAS GENERATED BY gen-crc32-consts.py.  DO NOT EDIT.')
+    print(' */')
+    print('')
+
+    # Compute the multipliers needed for CRC-32 folding with carryless
+    # multiplication instructions that operate on the 64-bit halves of 128-bit
+    # segments.  Using the terminology from earlier, for each 64-bit fold
+    # len(A(x)) = 64, and len(B(x)) = 95 since a 64-bit polynomial multiplied by
+    # a 32-bit one produces a 95-bit one.  When A(x) is the low order polynomial
+    # half of a 128-bit segments (high order physical half), the separation
+    # between the message parts is the total length of the 128-bit segments
+    # separating the values.  When A(x) is the high order polynomial half, the
+    # separation is 64 bits greater.
+    for i in range(1, 33):
+        sep_lo = 128 * (i - 1)
+        sep_hi = sep_lo + 64
+        len_B = 95
+        for d in [sep_hi + len_B, # A(x) = high 64 polynomial bits (low 64 physical bits)
+                  sep_lo + len_B # A(x) = low 64 polynomial bits (high 64 physical bits)
+                  ]:
+            poly = bitreverse(x_to_the_d(d), 32)
+            print(f'#define CRC32_X{d}_MODG 0x{poly:08x} /* x^{d} mod G(x) */')
+        print('')
+
+    # Compute constants for the final 128 => 32 bit reduction.
+    poly = bitreverse(div(1 << 95, G), 64)
+    print(f'#define CRC32_BARRETT_CONSTANT_1 0x{poly:016x}ULL /* floor(x^95 / G(x)) */')
+    poly = bitreverse(G, 33)
+    print(f'#define CRC32_BARRETT_CONSTANT_2 0x{poly:016x}ULL /* G(x) */')
+
+    # Compute multipliers for combining the CRCs of separate chunks.
+    print('')
+    num_chunks = 4
+    table_len = 129
+    min_chunk_len = 128
+    print(f'#define CRC32_NUM_CHUNKS {num_chunks}')
+    print(f'#define CRC32_MIN_VARIABLE_CHUNK_LEN {min_chunk_len}UL')
+    print(f'#define CRC32_MAX_VARIABLE_CHUNK_LEN {(table_len-1) * min_chunk_len}UL')
+    print('')
+    print('/* Multipliers for implementations that use a variable chunk length */')
+    print('static const u32 crc32_mults_for_chunklen[][CRC32_NUM_CHUNKS - 1] MAYBE_UNUSED = {')
+    print('\t{ 0 /* unused row */ },')
+    for i in range(1, table_len):
+        chunk_len = i * min_chunk_len
+        print(f'\t/* chunk_len={chunk_len} */')
+        print('\t{ ', end='')
+        for j in range(num_chunks - 1, 0, -1):
+            d = (j * 8 * chunk_len) - 33
+            poly = bitreverse(x_to_the_d(d), 32)
+            print(f'0x{poly:08x} /* x^{d} mod G(x) */, ', end='')
+        print('},')
+    print('};')
+    fixed_chunk_len = 32768
+    print('')
+    print('/* Multipliers for implementations that use a large fixed chunk length */')
+    print(f'#define CRC32_FIXED_CHUNK_LEN {fixed_chunk_len}UL')
+    for j in range(1, num_chunks):
+        d = (j * 8 * fixed_chunk_len) - 33
+        poly = bitreverse(x_to_the_d(d), 32)
+        print(f'#define CRC32_FIXED_CHUNK_MULT_{j} 0x{poly:08x} /* x^{d} mod G(x) */')
+
+with open('lib/crc32_tables.h', 'w') as f:
+    sys.stdout = f
+    gen_tables()
+with open('lib/crc32_multipliers.h', 'w') as f:
+    sys.stdout = f
+    gen_multipliers()

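Running `python3 scripts/gen-crc32-consts.py` from the top-level source directory regenerates both headers in place (the script opens lib/crc32_tables.h and lib/crc32_multipliers.h via relative paths). Since the new x_to_the_d() uses square-and-multiply rather than an O(d) loop, a quick sanity check is to reproduce a folding multiplier with the bit-at-a-time loop from the deleted gen_crc32_multipliers.c further below; a sketch:

    # Cross-check CRC32_X159_MODG against the bit-at-a-time computation used by
    # the deleted gen_crc32_multipliers.c.  CRCPOLY is G(x) bit-reversed,
    # without the x^32 term.
    CRCPOLY = 0xedb88320

    def x_to_the_d_bitwise(d):
        rem = 0x80000000          # x^0 in the reflected 32-bit representation
        for _ in range(d):        # each step multiplies by x and reduces mod G
            rem = (rem >> 1) ^ (CRCPOLY if rem & 1 else 0)
        return rem

    assert x_to_the_d_bitwise(159) == 0xae689191  # CRC32_X159_MODG above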

=====================================
scripts/gen_crc32_multipliers.c deleted
=====================================
@@ -1,199 +0,0 @@
-/*
- * gen_crc32_multipliers.c
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * This program computes the constant multipliers needed for "folding" over
- * various distances with the gzip CRC-32.  Each such multiplier is x^D mod G(x)
- * for some distance D, in bits, over which the folding is occurring.
- *
- * Folding works as follows: let A(x) be a polynomial (possibly reduced
- * partially or fully mod G(x)) for part of the message, and let B(x) be a
- * polynomial (possibly reduced partially or fully mod G(x)) for a later part of
- * the message.  The unreduced combined polynomial is A(x)*x^D + B(x), where D
- * is the number of bits separating the two parts of the message plus len(B(x)).
- * Since mod G(x) can be applied at any point, x^D mod G(x) can be precomputed
- * and used instead of x^D unreduced.  That allows the combined polynomial to be
- * computed relatively easily in a partially-reduced form A(x)*(x^D mod G(x)) +
- * B(x), with length max(len(A(x)) + 31, len(B(x))).  This does require doing a
- * polynomial multiplication (carryless multiplication).
- *
- * "Folding" in this way can be used for the entire CRC computation except the
- * final reduction to 32 bits; this works well when CPU support for carryless
- * multiplication is available.  It can also be used to combine CRCs of
- * different parts of the message that were computed using a different method.
- *
- * Note that the gzip CRC-32 uses bit-reversed polynomials.  I.e., the low order
- * bits are really the high order polynomial coefficients.
- */
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#include "../common_defs.h"
-
-/* The generator polynomial G(x) for the gzip CRC-32 */
-#define CRCPOLY		0xEDB88320 /* G(x) without x^32 term */
-#define CRCPOLY_FULL	(((u64)CRCPOLY << 1) | 1) /* G(x) */
-
-/* Compute x^D mod G(x) */
-static u32
-compute_xD_modG(size_t D)
-{
-	/* Start with x^0 mod G(x) */
-	u32 remainder = 0x80000000;
-
-	/* Each iteration, 'remainder' becomes x^i mod G(x) */
-	for (size_t i = 1; i <= D; i++)
-		remainder = (remainder >> 1) ^ ((remainder & 1) ? CRCPOLY : 0);
-
-	/* Now 'remainder' is x^D mod G(x) */
-	return remainder;
-}
-
-/* Compute floor(x^64 / G(x)) */
-static u64
-compute_x64_div_G(void)
-{
-	u64 quotient = 0;
-	u64 dividend = 0x1;
-
-	for (int i = 0; i < 64 - 32 + 1; i++) {
-		if ((dividend >> i) & 1) {
-			quotient |= (u64)1 << i;
-			dividend ^= CRCPOLY_FULL << i;
-		}
-	}
-
-	return quotient;
-}
-
-static void
-gen_vec_folding_constants(void)
-{
-	/*
-	 * Compute the multipliers needed for CRC-32 folding with carryless
-	 * multiplication instructions that operate on the 64-bit halves of
-	 * 128-bit segments.  Using the terminology from earlier, for each 64-bit
-	 * fold len(A(x)) = 64, and len(B(x)) = 95 since a 64-bit polynomial
-	 * multiplied by a 32-bit one produces a 95-bit one.  When A(x) is the
-	 * low order polynomial half of a 128-bit segments (high order physical
-	 * half), the separation between the message parts is the total length
-	 * of the 128-bit segments separating the values.  When A(x) is the high
-	 * order polynomial half, the separation is 64 bits greater.
-	 */
-	for (int i = 1; i <= 32; i++) {
-		const int sep_lo = 128 * (i - 1);
-		const int sep_hi = sep_lo + 64;
-		const int len_B = 95;
-		int D;
-
-		/* A(x) = high 64 polynomial bits (low 64 physical bits) */
-		D = sep_hi + len_B;
-		printf("#define CRC32_X%d_MODG 0x%08"PRIx32" /* x^%d mod G(x) */\n",
-		       D, compute_xD_modG(D), D);
-
-		/* A(x) = low 64 polynomial bits (high 64 physical bits) */
-		D = sep_lo + len_B;
-		printf("#define CRC32_X%d_MODG 0x%08"PRIx32" /* x^%d mod G(x) */\n",
-		       D, compute_xD_modG(D), D);
-		printf("\n");
-	}
-
-	/* Multiplier for final 96 => 64 bit fold */
-	printf("#define CRC32_X63_MODG 0x%08"PRIx32" /* x^63 mod G(x) */\n",
-	       compute_xD_modG(63));
-
-	/*
-	 * Constants for final 64 => 32 bit reduction.  These constants are the
-	 * odd ones out, as this final reduction step can't use the regular CRC
-	 * folding described above.  It uses Barrett reduction instead.
-	 */
-	printf("#define CRC32_BARRETT_CONSTANT_1 0x%016"PRIx64"ULL /* floor(x^64 / G(x)) */\n",
-	       compute_x64_div_G());
-	printf("#define CRC32_BARRETT_CONSTANT_2 0x%016"PRIx64"ULL /* G(x) */\n",
-	       CRCPOLY_FULL);
-	printf("#define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 }\n");
-}
-
-/* Multipliers for combining the CRCs of separate chunks */
-static void
-gen_chunk_constants(void)
-{
-	const size_t num_chunks = 4;
-	const size_t table_len = 129;
-	const size_t min_chunk_len = 128;
-
-	printf("#define CRC32_NUM_CHUNKS %zu\n", num_chunks);
-	printf("#define CRC32_MIN_VARIABLE_CHUNK_LEN %zuUL\n", min_chunk_len);
-	printf("#define CRC32_MAX_VARIABLE_CHUNK_LEN %zuUL\n",
-	       (table_len - 1) * min_chunk_len);
-	printf("\n");
-	printf("/* Multipliers for implementations that use a variable chunk length */\n");
-	printf("static const u32 crc32_mults_for_chunklen[][CRC32_NUM_CHUNKS - 1] MAYBE_UNUSED = {\n");
-	printf("\t{ 0 /* unused row */ },\n");
-	for (size_t i = 1; i < table_len; i++) {
-		const size_t chunk_len = i*min_chunk_len;
-
-		printf("\t/* chunk_len=%zu */\n", chunk_len);
-		printf("\t{ ");
-		for (size_t j = num_chunks - 1; j >= 1; j--) {
-			const size_t D = (j * 8 * chunk_len) - 33;
-
-			printf("0x%08"PRIx32" /* x^%zu mod G(x) */, ",
-			       compute_xD_modG(D), D);
-		}
-		printf("},\n");
-	}
-	printf("};\n");
-	printf("\n");
-
-	printf("/* Multipliers for implementations that use a large fixed chunk length */\n");
-	const size_t fixed_chunk_len = 32768;
-	printf("#define CRC32_FIXED_CHUNK_LEN %zuUL\n", fixed_chunk_len);
-	for (int j = 1; j < num_chunks; j++) {
-		const size_t D = (j * 8 * fixed_chunk_len) - 33;
-
-		printf("#define CRC32_FIXED_CHUNK_MULT_%d 0x%08"PRIx32" /* x^%zu mod G(x) */\n",
-		       j, compute_xD_modG(D), D);
-	}
-}
-
-int
-main(void)
-{
-	printf("/*\n"
-	       " * crc32_multipliers.h - constants for CRC-32 folding\n"
-	       " *\n"
-	       " * THIS FILE WAS GENERATED BY gen_crc32_multipliers.c.  DO NOT EDIT.\n"
-	       " */\n"
-	       "\n");
-
-	gen_vec_folding_constants();
-	printf("\n");
-	gen_chunk_constants();
-	return 0;
-}


=====================================
scripts/gen_crc32_tables.c deleted
=====================================
@@ -1,105 +0,0 @@
-/*
- * gen_crc32_tables.c - a program for CRC-32 table generation
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include <stdio.h>
-
-#include "../common_defs.h"
-
-#define CRCPOLY	0xEDB88320 /* G(x) without x^32 term */
-
-static u32
-crc32_update_bit(u32 remainder, u8 next_bit)
-{
-	return (remainder >> 1) ^ (((remainder ^ next_bit) & 1) ? CRCPOLY : 0);
-}
-
-static u32
-crc32_update_byte(u32 remainder, u8 next_byte)
-{
-	for (int j = 0; j < 8; j++, next_byte >>= 1)
-		remainder = crc32_update_bit(remainder, next_byte & 1);
-	return remainder;
-}
-
-static void
-print_256_entries(const u32 *entries)
-{
-	for (size_t i = 0; i < 256 / 4; i++) {
-		printf("\t");
-		for (size_t j = 0; j < 4; j++) {
-			printf("0x%08x,", entries[i * 4 + j]);
-			if (j != 3)
-				printf(" ");
-		}
-		printf("\n");
-	}
-}
-
-int
-main(void)
-{
-	u32 crc32_table[0x800];
-
-	/* crc32_table[i] for 0 <= i < 0x100 is the CRC-32 of byte i. */
-	for (int i = 0; i < 0x100; i++)
-		crc32_table[i] = crc32_update_byte(0, i);
-
-	/*
-	 * crc32_table[i] for 0x100 <= i < 0x800 is the CRC-32 of byte i % 0x100
-	 * followed by i / 0x100 zero bytes.
-	 */
-	for (int i = 0x100; i < 0x800; i++)
-		crc32_table[i] = crc32_update_byte(crc32_table[i - 0x100], 0);
-
-	printf("/*\n");
-	printf(" * crc32_tables.h - data tables for CRC-32 computation\n");
-	printf(" *\n");
-	printf(" * THIS FILE WAS GENERATED BY gen_crc32_tables.c.  DO NOT EDIT.\n");
-	printf(" */\n");
-	printf("\n");
-	/*
-	 * Although crc32_slice1_table is the same as the first 256 entries of
-	 * crc32_slice8_table, we output these tables separately because any
-	 * combo of (slice1, slice8, slice1 && slice8, nothing) might be needed,
-	 * and it's simplest to let the compiler optimize out any unused tables.
-	 */
-	printf("static const u32 crc32_slice1_table[] MAYBE_UNUSED = {\n");
-	print_256_entries(&crc32_table[0x000]);
-	printf("};\n");
-	printf("\n");
-	printf("static const u32 crc32_slice8_table[] MAYBE_UNUSED = {\n");
-	print_256_entries(&crc32_table[0x000]);
-	print_256_entries(&crc32_table[0x100]);
-	print_256_entries(&crc32_table[0x200]);
-	print_256_entries(&crc32_table[0x300]);
-	print_256_entries(&crc32_table[0x400]);
-	print_256_entries(&crc32_table[0x500]);
-	print_256_entries(&crc32_table[0x600]);
-	print_256_entries(&crc32_table[0x700]);
-	printf("};\n");
-	return 0;
-}



View it on GitLab: https://salsa.debian.org/deflate-team/libdeflate/-/compare/aadbde3370db9278c86c07d23b54ad768a2b74f2...02c1120d828c894348087b62020eb21b2ba672c2
