[med-svn] [Git][deflate-team/libdeflate][debian/latest] 2 commits: New upstream version 1.23
nick black (@nickblack)
gitlab at salsa.debian.org
Tue Jan 14 14:45:39 GMT 2025
nick black pushed to branch debian/latest at deflate team / libdeflate
Commits:
3e0469f1 by nick black at 2025-01-14T09:42:27-05:00
New upstream version 1.23
- - - - -
02c1120d by nick black at 2025-01-14T09:42:27-05:00
Update upstream source from tag 'upstream/1.23'
Update to upstream version '1.23'
with Debian dir 26ece65b570aef692b6e36149b2d5f7a8a42045c
- - - - -
16 changed files:
- .github/workflows/ci.yml
- CMakeLists.txt
- NEWS.md
- README.md
- lib/arm/crc32_impl.h
- lib/crc32_multipliers.h
- lib/crc32_tables.h
- lib/x86/adler32_impl.h
- lib/x86/cpu_features.c
- lib/x86/cpu_features.h
- lib/x86/crc32_impl.h
- lib/x86/crc32_pclmul_template.h
- libdeflate.h
- + scripts/gen-crc32-consts.py
- − scripts/gen_crc32_multipliers.c
- − scripts/gen_crc32_tables.c
Changes:
=====================================
.github/workflows/ci.yml
=====================================
@@ -42,7 +42,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- - uses: uraimo/run-on-arch-action@v2.5.0
+ - uses: uraimo/run-on-arch-action@v2.8.1
with:
arch: ${{ matrix.arch }}
distro: ${{ matrix.distro }}
@@ -139,11 +139,8 @@ jobs:
name: Build (Windows, Visual Studio ${{matrix.toolset}}, ${{matrix.platform}})
strategy:
matrix:
- platform: [ARM64, ARM]
+ platform: [ARM64]
toolset: [v143, ClangCL]
- exclude: # Exclude unsupported combinations
- - platform: ARM
- toolset: ClangCL
runs-on: windows-latest
steps:
- uses: actions/checkout@v4
=====================================
CMakeLists.txt
=====================================
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.7)
+cmake_minimum_required(VERSION 3.10)
# Default to a release build.
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
=====================================
NEWS.md
=====================================
@@ -1,5 +1,21 @@
# libdeflate release notes
+## Version 1.23
+
+* Fixed bug introduced in 1.20 where incorrect checksums could be calculated if
+ libdeflate was compiled with clang at -O0 and run on a CPU supporting AVX512.
+
+* Fixed bug introduced in 1.20 where incorrect checksums could be calculated in
+ rare cases on macOS computers that support AVX512 and are running an older
+ version of macOS that contains a bug that corrupts AVX512 registers. This
+ could occur only if code outside libdeflate enabled AVX512 in the thread.
+
+* Fixed build error when using -mno-evex512 with clang 18+ or gcc 14+.
+
+* Increased the minimum CMake version to 3.10.
+
+* Further optimized the x86 CRC code.
+
## Version 1.22
* The CMake-based build system now implements a workaround for gcc being paired
=====================================
README.md
=====================================
@@ -144,6 +144,7 @@ libdeflate from a programming language other than C or C++, consider using the
following bindings:
* C#: [LibDeflate.NET](https://github.com/jzebedee/LibDeflate.NET)
+* Delphi: [libdeflate-pas](https://github.com/zedxxx/libdeflate-pas)
* Go: [go-libdeflate](https://github.com/4kills/go-libdeflate)
* Java: [libdeflate-java](https://github.com/astei/libdeflate-java)
* Julia: [LibDeflate.jl](https://github.com/jakobnissen/LibDeflate.jl)
=====================================
lib/arm/crc32_impl.h
=====================================
@@ -434,13 +434,13 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
{ CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
{ CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
};
- static const u64 _aligned_attribute(16) final_mults[3][2] = {
- { CRC32_X63_MODG, 0 },
- { CRC32_BARRETT_CONSTANT_1, 0 },
- { CRC32_BARRETT_CONSTANT_2, 0 },
+ static const u64 _aligned_attribute(16) barrett_consts[2][2] = {
+ { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_1 },
+ { CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_2 },
+ };
+ static const u32 _aligned_attribute(16) mask32[4] = {
+ 0, 0, 0xffffffff, 0
};
- const uint8x16_t zeroes = vdupq_n_u8(0);
- const uint8x16_t mask32 = vreinterpretq_u8_u64(vdupq_n_u64(0xFFFFFFFF));
const poly64x2_t multipliers_1 = load_multipliers(mults[0]);
uint8x16_t v0, v1, v2, v3;
@@ -497,24 +497,13 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
if (len)
v0 = fold_partial_vec(v0, p, len, multipliers_1);
- /*
- * Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
- * which is equivalent to multiplying by x^32. This is needed because
- * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
- */
-
- v0 = veorq_u8(vextq_u8(v0, zeroes, 8),
- clmul_high(vextq_u8(zeroes, v0, 8), multipliers_1));
-
- /* Fold 96 => 64 bits. */
- v0 = veorq_u8(vextq_u8(v0, zeroes, 4),
- clmul_low(vandq_u8(v0, mask32),
- load_multipliers(final_mults[0])));
-
- /* Reduce 64 => 32 bits using Barrett reduction. */
- v1 = clmul_low(vandq_u8(v0, mask32), load_multipliers(final_mults[1]));
- v1 = clmul_low(vandq_u8(v1, mask32), load_multipliers(final_mults[2]));
- return vgetq_lane_u32(vreinterpretq_u32_u8(veorq_u8(v0, v1)), 1);
+ /* Reduce to 32 bits, following lib/x86/crc32_pclmul_template.h */
+ v1 = clmul_low(v0, load_multipliers(barrett_consts[0]));
+ v1 = clmul_low(v1, load_multipliers(barrett_consts[1]));
+ v0 = veorq_u8(v0, vandq_u8(v1, vreinterpretq_u8_u32(vld1q_u32(mask32))));
+ v0 = clmul_high(v0, load_multipliers(barrett_consts[0]));
+ v0 = clmul_low(v0, load_multipliers(barrett_consts[1]));
+ return vgetq_lane_u32(vreinterpretq_u32_u8(v0), 2);
}
#undef SUFFIX
#undef ATTRIBUTES
=====================================
lib/crc32_multipliers.h
=====================================
@@ -1,7 +1,7 @@
/*
* crc32_multipliers.h - constants for CRC-32 folding
*
- * THIS FILE WAS GENERATED BY gen_crc32_multipliers.c. DO NOT EDIT.
+ * THIS FILE WAS GENERATED BY gen-crc32-consts.py. DO NOT EDIT.
*/
#define CRC32_X159_MODG 0xae689191 /* x^159 mod G(x) */
@@ -100,10 +100,8 @@
#define CRC32_X4127_MODG 0x1072db28 /* x^4127 mod G(x) */
#define CRC32_X4063_MODG 0x0c30f51d /* x^4063 mod G(x) */
-#define CRC32_X63_MODG 0xb8bc6765 /* x^63 mod G(x) */
-#define CRC32_BARRETT_CONSTANT_1 0x00000001f7011641ULL /* floor(x^64 / G(x)) */
+#define CRC32_BARRETT_CONSTANT_1 0xb4e5b025f7011641ULL /* floor(x^95 / G(x)) */
#define CRC32_BARRETT_CONSTANT_2 0x00000001db710641ULL /* G(x) */
-#define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 }
#define CRC32_NUM_CHUNKS 4
#define CRC32_MIN_VARIABLE_CHUNK_LEN 128UL
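The two Barrett constants above can be reproduced directly. Here is a minimal sketch (not upstream code) that mirrors the helper definitions of scripts/gen-crc32-consts.py shown later in this diff:

G = 0x104c11db7  # gzip CRC-32 generator polynomial G(x), natural bit order

def clmul(a, b):
    # Carryless (GF(2)) polynomial multiplication.
    r = 0
    while b:
        if b & 1:
            r ^= a
        a <<= 1
        b >>= 1
    return r

def polydiv(a, b):
    # floor(a / b) for GF(2) polynomials.
    q = 0
    while a.bit_length() >= b.bit_length():
        s = a.bit_length() - b.bit_length()
        q ^= 1 << s
        a ^= b << s
    return q

def bitreverse(poly, nbits):
    # Reverse the bit order, as the LSB-first gzip CRC requires.
    return sum(((poly >> i) & 1) << (nbits - 1 - i) for i in range(nbits))

print(hex(bitreverse(polydiv(1 << 95, G), 64)))  # CRC32_BARRETT_CONSTANT_1, 0xb4e5b025f7011641
print(hex(bitreverse(G, 33)))                    # CRC32_BARRETT_CONSTANT_2, 0x1db710641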
=====================================
lib/crc32_tables.h
=====================================
@@ -1,7 +1,7 @@
/*
* crc32_tables.h - data tables for CRC-32 computation
*
- * THIS FILE WAS GENERATED BY gen_crc32_tables.c. DO NOT EDIT.
+ * THIS FILE WAS GENERATED BY gen-crc32-consts.py. DO NOT EDIT.
*/
static const u32 crc32_slice1_table[] MAYBE_UNUSED = {
=====================================
lib/x86/adler32_impl.h
=====================================
@@ -82,7 +82,7 @@
*/
# define adler32_x86_avx512_vl256_vnni adler32_x86_avx512_vl256_vnni
# define SUFFIX _avx512_vl256_vnni
-# define ATTRIBUTES _target_attribute("avx512bw,avx512vl,avx512vnni")
+# define ATTRIBUTES _target_attribute("avx512bw,avx512vl,avx512vnni" NO_EVEX512)
# define VL 32
# define USE_VNNI 1
# define USE_AVX512 1
@@ -95,7 +95,7 @@
*/
# define adler32_x86_avx512_vl512_vnni adler32_x86_avx512_vl512_vnni
# define SUFFIX _avx512_vl512_vnni
-# define ATTRIBUTES _target_attribute("avx512bw,avx512vnni")
+# define ATTRIBUTES _target_attribute("avx512bw,avx512vnni" EVEX512)
# define VL 64
# define USE_VNNI 1
# define USE_AVX512 1
=====================================
lib/x86/cpu_features.c
=====================================
@@ -88,6 +88,27 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
volatile u32 libdeflate_x86_cpu_features = 0;
+static inline bool
+os_supports_avx512(u64 xcr0)
+{
+#ifdef __APPLE__
+ /*
+ * The Darwin kernel had a bug where it could corrupt the opmask
+ * registers. See
+ * https://community.intel.com/t5/Software-Tuning-Performance/MacOS-Darwin-kernel-bug-clobbers-AVX-512-opmask-register-state/m-p/1327259
+ * Darwin also does not initially set the XCR0 bits for AVX512, but they
+ * are set if the thread tries to use AVX512 anyway. Thus, to safely
+ * and consistently use AVX512 on macOS we'd need to check the kernel
+ * version as well as detect AVX512 support using a macOS-specific
+ * method. We don't bother with this, especially given Apple's
+ * transition to arm64.
+ */
+ return false;
+#else
+ return (xcr0 & 0xe6) == 0xe6;
+#endif
+}
+
/*
* Don't use 512-bit vectors on Intel CPUs before Rocket Lake and Sapphire
* Rapids, due to the downclocking penalty.
@@ -140,7 +161,12 @@ void libdeflate_init_x86_cpu_features(void)
family += (a >> 20) & 0xff;
if (d & (1 << 26))
features |= X86_CPU_FEATURE_SSE2;
- if (c & (1 << 1))
+ /*
+ * No known CPUs have pclmulqdq without sse4.1, so in practice code
+ * targeting pclmulqdq can use sse4.1 instructions. But to be safe,
+ * explicitly check for both the pclmulqdq and sse4.1 bits.
+ */
+ if ((c & (1 << 1)) && (c & (1 << 19)))
features |= X86_CPU_FEATURE_PCLMULQDQ;
if (c & (1 << 27))
xcr0 = read_xcr(0);
@@ -152,21 +178,24 @@ void libdeflate_init_x86_cpu_features(void)
/* EAX=7, ECX=0: Extended Features */
cpuid(7, 0, &a, &b, &c, &d);
- if ((b & (1 << 5)) && ((xcr0 & 0x6) == 0x6))
- features |= X86_CPU_FEATURE_AVX2;
if (b & (1 << 8))
features |= X86_CPU_FEATURE_BMI2;
- if (((xcr0 & 0xe6) == 0xe6) &&
- allow_512bit_vectors(manufacturer, family, model))
- features |= X86_CPU_FEATURE_ZMM;
- if ((b & (1 << 30)) && ((xcr0 & 0xe6) == 0xe6))
- features |= X86_CPU_FEATURE_AVX512BW;
- if ((b & (1U << 31)) && ((xcr0 & 0xe6) == 0xe6))
- features |= X86_CPU_FEATURE_AVX512VL;
- if ((c & (1 << 10)) && ((xcr0 & 0x6) == 0x6))
- features |= X86_CPU_FEATURE_VPCLMULQDQ;
- if ((c & (1 << 11)) && ((xcr0 & 0xe6) == 0xe6))
- features |= X86_CPU_FEATURE_AVX512VNNI;
+ if ((xcr0 & 0x6) == 0x6) {
+ if (b & (1 << 5))
+ features |= X86_CPU_FEATURE_AVX2;
+ if (c & (1 << 10))
+ features |= X86_CPU_FEATURE_VPCLMULQDQ;
+ }
+ if (os_supports_avx512(xcr0)) {
+ if (allow_512bit_vectors(manufacturer, family, model))
+ features |= X86_CPU_FEATURE_ZMM;
+ if (b & (1 << 30))
+ features |= X86_CPU_FEATURE_AVX512BW;
+ if (b & (1U << 31))
+ features |= X86_CPU_FEATURE_AVX512VL;
+ if (c & (1 << 11))
+ features |= X86_CPU_FEATURE_AVX512VNNI;
+ }
/* EAX=7, ECX=1: Extended Features */
cpuid(7, 1, &a, &b, &c, &d);
=====================================
lib/x86/cpu_features.h
=====================================
@@ -108,7 +108,8 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
# define HAVE_SSE2_NATIVE 0
#endif
-#if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__))
+#if (defined(__PCLMUL__) && defined(__SSE4_1__)) || \
+ (defined(_MSC_VER) && defined(__AVX2__))
# define HAVE_PCLMULQDQ(features) 1
#else
# define HAVE_PCLMULQDQ(features) ((features) & X86_CPU_FEATURE_PCLMULQDQ)
@@ -164,6 +165,15 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
# define HAVE_AVXVNNI(features) ((features) & X86_CPU_FEATURE_AVXVNNI)
#endif
+#if (GCC_PREREQ(14, 0) || CLANG_PREREQ(18, 0, 18000000)) \
+ && !defined(__EVEX512__) /* avoid subtracting the evex512 feature */
+# define EVEX512 ",evex512" /* needed to override potential -mno-evex512 */
+# define NO_EVEX512 ",no-evex512" /* needed for AVX10/256 compatibility */
+#else
+# define EVEX512 ""
+# define NO_EVEX512 ""
+#endif
+
#endif /* ARCH_X86_32 || ARCH_X86_64 */
#endif /* LIB_X86_CPU_FEATURES_H */
=====================================
lib/x86/crc32_impl.h
=====================================
@@ -44,31 +44,26 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
};
#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
-/* PCLMULQDQ implementation */
+/*
+ * PCLMULQDQ implementation. This targets PCLMULQDQ+SSE4.1, since in practice
+ * all CPUs that support PCLMULQDQ also support SSE4.1.
+ */
# define crc32_x86_pclmulqdq crc32_x86_pclmulqdq
# define SUFFIX _pclmulqdq
-# define ATTRIBUTES _target_attribute("pclmul")
+# define ATTRIBUTES _target_attribute("pclmul,sse4.1")
# define VL 16
-# define USE_SSE4_1 0
# define USE_AVX512 0
# include "crc32_pclmul_template.h"
/*
- * PCLMULQDQ/AVX implementation. Compared to the regular PCLMULQDQ
- * implementation, this still uses 128-bit vectors, but it has two potential
- * benefits. First, simply compiling against the AVX target can improve
- * performance significantly (e.g. 10100 MB/s to 16700 MB/s on Skylake) without
- * actually using any AVX intrinsics, probably due to the availability of
- * non-destructive VEX-encoded instructions. Second, AVX support implies SSSE3
- * and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for efficient
- * handling of partial blocks. (We *could* compile a variant with
- * PCLMULQDQ+SSE4.1 without AVX, but for simplicity we currently don't bother.)
+ * PCLMULQDQ/AVX implementation. Same as above, but this is compiled with AVX
+ * enabled so that the compiler can generate VEX-coded instructions which can be
+ * slightly more efficient. It still uses 128-bit vectors.
*/
# define crc32_x86_pclmulqdq_avx crc32_x86_pclmulqdq_avx
# define SUFFIX _pclmulqdq_avx
# define ATTRIBUTES _target_attribute("pclmul,avx")
# define VL 16
-# define USE_SSE4_1 1
# define USE_AVX512 0
# include "crc32_pclmul_template.h"
#endif
@@ -83,19 +78,20 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
*
* gcc 8.1 and 8.2 had a similar bug where they assumed that
* _mm256_clmulepi64_epi128() always needed AVX512. It's fixed in gcc 8.3.
+ *
+ * _mm256_zextsi128_si256() requires gcc 10.
*/
-#if (GCC_PREREQ(8, 3) || CLANG_PREREQ(6, 0, 10000000)) && \
+#if (GCC_PREREQ(10, 1) || CLANG_PREREQ(6, 0, 10000000)) && \
!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ)
# define crc32_x86_vpclmulqdq_avx2 crc32_x86_vpclmulqdq_avx2
# define SUFFIX _vpclmulqdq_avx2
# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2")
# define VL 32
-# define USE_SSE4_1 1
# define USE_AVX512 0
# include "crc32_pclmul_template.h"
#endif
-#if (GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \
+#if (GCC_PREREQ(10, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \
!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ)
/*
* VPCLMULQDQ/AVX512 implementation using 256-bit vectors. This is very similar
@@ -103,12 +99,13 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
* instruction and more registers. This is used on CPUs that support AVX-512
* but where using 512-bit vectors causes downclocking. This should also be the
* optimal implementation on CPUs that support AVX10/256 but not AVX10/512.
+ *
+ * _mm256_zextsi128_si256() requires gcc 10.
*/
# define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256
# define SUFFIX _vpclmulqdq_avx512_vl256
-# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl")
+# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl" NO_EVEX512)
# define VL 32
-# define USE_SSE4_1 1
# define USE_AVX512 1
# include "crc32_pclmul_template.h"
@@ -116,12 +113,13 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
* VPCLMULQDQ/AVX512 implementation using 512-bit vectors. This is used on CPUs
* that have a good AVX-512 implementation including VPCLMULQDQ. This should
* also be the optimal implementation on CPUs that support AVX10/512.
+ *
+ * _mm512_zextsi128_si512() requires gcc 10.
*/
# define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512
# define SUFFIX _vpclmulqdq_avx512_vl512
-# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl")
+# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl" EVEX512)
# define VL 64
-# define USE_SSE4_1 1
# define USE_AVX512 1
# include "crc32_pclmul_template.h"
#endif
=====================================
lib/x86/crc32_pclmul_template.h
=====================================
@@ -34,17 +34,13 @@
* ATTRIBUTES:
* Target function attributes to use. Must satisfy the dependencies of the
* other parameters as follows:
- * VL=16 && USE_SSE4_1=0 && USE_AVX512=0: at least pclmul
- * VL=16 && USE_SSE4_1=1 && USE_AVX512=0: at least pclmul,sse4.1
- * VL=32 && USE_SSE4_1=1 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2
- * VL=32 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
- * VL=64 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
+ * VL=16 && USE_AVX512=0: at least pclmul,sse4.1
+ * VL=32 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2
+ * VL=32 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
+ * VL=64 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
* (Other combinations are not useful and have not been tested.)
* VL:
* Vector length in bytes. Must be 16, 32, or 64.
- * USE_SSE4_1:
- * If 1, take advantage of SSE4.1 instructions such as pblendvb.
- * If 0, assume that the CPU might not support SSE4.1.
* USE_AVX512:
* If 1, take advantage of AVX-512 features such as masking and the
* vpternlog instruction. This doesn't enable the use of 512-bit vectors;
@@ -55,7 +51,10 @@
* instructions. Note that the x86 crc32 instruction cannot be used, as it is
* for a different polynomial, not the gzip one. For an explanation of CRC
* folding with carryless multiplication instructions, see
- * scripts/gen_crc32_multipliers.c and the following paper:
+ * scripts/gen-crc32-consts.py and the following blog posts and papers:
+ *
+ * "An alternative exposition of crc32_4k_pclmulqdq"
+ * https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq
*
* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
* https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
@@ -81,7 +80,7 @@
# define fold_vec fold_vec256
# define VLOADU(p) _mm256_loadu_si256((const void *)(p))
# define VXOR(a, b) _mm256_xor_si256((a), (b))
-# define M128I_TO_VEC(a) _mm256_castsi128_si256(a)
+# define M128I_TO_VEC(a) _mm256_zextsi128_si256(a)
# define MULTS(a, b) _mm256_set_epi64x(a, b, a, b)
# define MULTS_8V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)
# define MULTS_4V MULTS(CRC32_X991_MODG, CRC32_X1055_MODG)
@@ -92,7 +91,7 @@
# define fold_vec fold_vec512
# define VLOADU(p) _mm512_loadu_si512((const void *)(p))
# define VXOR(a, b) _mm512_xor_si512((a), (b))
-# define M128I_TO_VEC(a) _mm512_castsi128_si512(a)
+# define M128I_TO_VEC(a) _mm512_zextsi128_si512(a)
# define MULTS(a, b) _mm512_set_epi64(a, b, a, b, a, b, a, b)
# define MULTS_8V MULTS(CRC32_X4063_MODG, CRC32_X4127_MODG)
# define MULTS_4V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)
@@ -149,7 +148,6 @@ ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i /* __v8du */ mults)
#define fold_vec512 ADD_SUFFIX(fold_vec512)
#endif /* VL >= 64 */
-#if USE_SSE4_1
/*
* Given 'x' containing a 16-byte polynomial, and a pointer 'p' that points to
* the next '1 <= len <= 15' data bytes, rearrange the concatenation of 'x' and
@@ -181,7 +179,6 @@ ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len,
return fold_vec128(x0, x1, mults_128b);
}
#define fold_lessthan16bytes ADD_SUFFIX(fold_lessthan16bytes)
-#endif /* USE_SSE4_1 */
static ATTRIBUTES u32
ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
@@ -192,17 +189,16 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
* folding across 128 bits. mults_128b differs from mults_1v when
* VL != 16. All multipliers are 64-bit, to match what pclmulqdq needs,
* but since this is for CRC-32 only their low 32 bits are nonzero.
- * For more details, see scripts/gen_crc32_multipliers.c.
+ * For more details, see scripts/gen-crc32-consts.py.
*/
const vec_t mults_8v = MULTS_8V;
const vec_t mults_4v = MULTS_4V;
const vec_t mults_2v = MULTS_2V;
const vec_t mults_1v = MULTS_1V;
const __m128i mults_128b = _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG);
- const __m128i final_mult = _mm_set_epi64x(0, CRC32_X63_MODG);
- const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
const __m128i barrett_reduction_constants =
_mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_1);
+ const __m128i mask32 = _mm_set_epi32(0, 0xFFFFFFFF, 0, 0);
vec_t v0, v1, v2, v3, v4, v5, v6, v7;
__m128i x0 = _mm_cvtsi32_si128(crc);
__m128i x1;
@@ -273,7 +269,6 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
size_t align = -(uintptr_t)p & (VL-1);
len -= align;
- #if USE_SSE4_1
x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), x0);
p += 16;
if (align & 15) {
@@ -296,11 +291,6 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
v0 = _mm512_inserti64x4(v0, *(const __m256i *)(p + 16), 1);
# endif
p -= 16;
- #else
- crc = crc32_slice1(crc, p, align);
- p += align;
- v0 = VXOR(VLOADU(p), M128I_TO_VEC(_mm_cvtsi32_si128(crc)));
- #endif
} else {
v0 = VXOR(VLOADU(p), M128I_TO_VEC(x0));
}
@@ -395,86 +385,69 @@ less_than_vl_remaining:
less_than_16_remaining:
len &= 15;
- /*
- * If fold_lessthan16bytes() is available, handle any remainder
- * of 1 to 15 bytes now, before reducing to 32 bits.
- */
-#if USE_SSE4_1
+ /* Handle any remainder of 1 to 15 bytes. */
if (len)
x0 = fold_lessthan16bytes(x0, p, len, mults_128b);
-#endif
#if USE_AVX512
reduce_x0:
#endif
-
/*
- * Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
- * which is equivalent to multiplying by x^32. This is needed because
- * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
- */
- x0 = _mm_xor_si128(_mm_srli_si128(x0, 8),
- _mm_clmulepi64_si128(x0, mults_128b, 0x10));
-
- /* Fold 96 => 64 bits. */
- x0 = _mm_xor_si128(_mm_srli_si128(x0, 4),
- _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
- final_mult, 0x00));
-
- /*
- * Reduce 64 => 32 bits using Barrett reduction.
- *
- * Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to
- * compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)):
- *
- * R(x) = (A(x)*x^32 + B(x)) mod G(x)
- * = (A(x)*x^32) mod G(x) + B(x)
- *
- * Then, by the Division Algorithm there exists a unique q(x) such that:
- *
- * A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
+ * Generate the final n-bit CRC from the 128-bit x0 = A as follows:
*
- * Since the left-hand side is of maximum degree 31, the right-hand side
- * must be too. This implies that we can apply 'mod x^32' to the
- * right-hand side without changing its value:
+ * crc = x^n * A mod G
+ * = x^n * (x^64*A_H + A_L) mod G
+ * = x^n * (x^(64-n)*(x^n*A_H mod G) + A_L) mod G
*
- * (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
+ * I.e.:
+ * crc := 0
+ * crc := x^n * (x^(64-n)*crc + A_H) mod G
+ * crc := x^n * (x^(64-n)*crc + A_L) mod G
*
- * Note that '+' is equivalent to '-' in polynomials over GF(2).
+ * A_H and A_L denote the high and low 64 polynomial coefficients in A.
*
- * We also know that:
+ * Using Barrett reduction to do the 'mod G', this becomes:
*
- * / A(x)*x^32 \
- * q(x) = floor ( --------- )
- * \ G(x) /
+ * crc := floor((A_H * floor(x^(m+n) / G)) / x^m) * G mod x^n
+ * A_L := x^(64-n)*crc + A_L
+ * crc := floor((A_L * floor(x^(m+n) / G)) / x^m) * G mod x^n
*
- * To compute this efficiently, we can multiply the top and bottom by
- * x^32 and move the division by G(x) to the top:
- *
- * / A(x) * floor(x^64 / G(x)) \
- * q(x) = floor ( ------------------------- )
- * \ x^32 /
- *
- * Note that floor(x^64 / G(x)) is a constant.
- *
- * So finally we have:
- *
- * / A(x) * floor(x^64 / G(x)) \
- * R(x) = B(x) + G(x)*floor ( ------------------------- )
- * \ x^32 /
+ * For the gzip crc, n = 32 and the bit order is LSB (least significant
+ * bit) first. 'm' must be an integer >= 63 (the max degree of A_L and
+ * A_H) for sufficient precision to be carried through the calculation.
+ * As the gzip crc is LSB-first we use m == 63, which results in
+ * floor(x^(m+n) / G) being 64-bit which is the most pclmulqdq can
+ * accept. The multiplication with floor(x^(63+n) / G) then produces a
+ * 127-bit product, and the floored division by x^63 just takes the
+ * first qword.
*/
- x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
- barrett_reduction_constants, 0x00);
- x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32),
- barrett_reduction_constants, 0x10);
- x0 = _mm_xor_si128(x0, x1);
-#if USE_SSE4_1
- crc = _mm_extract_epi32(x0, 1);
+
+ /* tmp := floor((A_H * floor(x^(63+n) / G)) / x^63) */
+ x1 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x00);
+ /* tmp is in bits [0:64) of x1. */
+
+ /* crc := tmp * G mod x^n */
+ x1 = _mm_clmulepi64_si128(x1, barrett_reduction_constants, 0x10);
+ /* crc is in bits [64:64+n) of x1. */
+
+ /*
+ * A_L := x^(64-n)*crc + A_L
+ * crc is already aligned to add (XOR) it directly to A_L, after
+ * selecting it using a mask.
+ */
+#if USE_AVX512
+ x0 = _mm_ternarylogic_epi32(x0, x1, mask32, 0x78);
#else
- crc = _mm_cvtsi128_si32(_mm_shuffle_epi32(x0, 0x01));
- /* Process up to 15 bytes left over at the end. */
- crc = crc32_slice1(crc, p, len);
+ x0 = _mm_xor_si128(x0, _mm_and_si128(x1, mask32));
#endif
- return crc;
+ /*
+ * crc := floor((A_L * floor(x^(m+n) / G)) / x^m) * G mod x^n
+ * Same as previous but uses the low-order 64 coefficients of A.
+ */
+ x0 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x01);
+ x0 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x10);
+
+ /* Extract the CRC from bits [64:64+n) of x0. */
+ return _mm_extract_epi32(x0, 2);
}
#undef vec_t
@@ -491,5 +464,4 @@ reduce_x0:
#undef SUFFIX
#undef ATTRIBUTES
#undef VL
-#undef USE_SSE4_1
#undef USE_AVX512
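The reduction described in the new comment above can be checked with a short sketch in plain MSB-first polynomial arithmetic. This is an illustration only, not upstream code: it ignores the bit-reversal applied by the real LSB-first SIMD code and its constants, and uses n = 32, m = 63 exactly as in the comment.

import random

G = 0x104c11db7  # gzip CRC-32 generator polynomial G(x), degree 32

def clmul(a, b):
    # Carryless multiplication of two GF(2) polynomials.
    r = 0
    while b:
        if b & 1:
            r ^= a
        a <<= 1
        b >>= 1
    return r

def polydiv(a, b):
    # floor(a / b) over GF(2).
    q = 0
    while a.bit_length() >= b.bit_length():
        s = a.bit_length() - b.bit_length()
        q ^= 1 << s
        a ^= b << s
    return q

def polymod(a, b):
    return a ^ clmul(polydiv(a, b), b)

K = polydiv(1 << 95, G)  # floor(x^(m+n) / G) with m = 63, n = 32

def barrett(P):
    # x^32 * P mod G for deg(P) <= 63, using only two carryless multiplies.
    q = clmul(P, K) >> 63             # floor((P * K) / x^63)
    return clmul(q, G) & 0xffffffff   # (q * G) mod x^32

def crc_of_128bit_poly(A):
    # x^32 * A mod G for a 128-bit A, following the two steps in the comment.
    A_H, A_L = A >> 64, A & ((1 << 64) - 1)
    crc = barrett(A_H)                # crc := x^n * A_H mod G
    crc = barrett((crc << 32) ^ A_L)  # crc := x^n * (x^(64-n)*crc + A_L) mod G
    return crc

A = random.getrandbits(128)
assert crc_of_128bit_poly(A) == polymod(A << 32, G)

In the SIMD code above, the same two barrett() steps appear once for A_H and once for A_L, and the final CRC is read from bits [64:96) of x0 via _mm_extract_epi32(x0, 2).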
=====================================
libdeflate.h
=====================================
@@ -13,8 +13,8 @@ extern "C" {
#endif
#define LIBDEFLATE_VERSION_MAJOR 1
-#define LIBDEFLATE_VERSION_MINOR 22
-#define LIBDEFLATE_VERSION_STRING "1.22"
+#define LIBDEFLATE_VERSION_MINOR 23
+#define LIBDEFLATE_VERSION_STRING "1.23"
/*
* Users of libdeflate.dll on Windows can define LIBDEFLATE_DLL to cause
=====================================
scripts/gen-crc32-consts.py
=====================================
@@ -0,0 +1,158 @@
+#!/usr/bin/env python3
+#
+# This script generates constants for efficient computation of the gzip CRC-32.
+
+import sys
+
+# This is the generator polynomial G(x) of the gzip CRC-32, represented as an
+# int using the natural mapping between bits and polynomial coefficients.
+G = 0x104c11db7
+
+# XOR (add) an iterable of polynomials.
+def xor(iterable):
+ res = 0
+ for val in iterable:
+ res ^= val
+ return res
+
+# Multiply two polynomials.
+def clmul(a, b):
+ return xor(a << i for i in range(b.bit_length()) if (b & (1 << i)) != 0)
+
+# Polynomial division floor(a / b).
+def div(a, b):
+ q = 0
+ while a.bit_length() >= b.bit_length():
+ q ^= 1 << (a.bit_length() - b.bit_length())
+ a ^= b << (a.bit_length() - b.bit_length())
+ return q
+
+# Reduce the polynomial 'a' modulo the polynomial 'b'.
+def reduce(a, b):
+ return a ^ clmul(div(a, b), b)
+
+# Reverse the bits of a polynomial.
+def bitreverse(poly, num_bits):
+ return xor(1 << (num_bits - 1 - i) for i in range(num_bits)
+ if (poly & (1 << i)) != 0)
+
+# Compute x^d mod G.
+def x_to_the_d(d):
+ if d < G.bit_length() - 1:
+ return 1 << d
+ t = x_to_the_d(d//2)
+ t = clmul(t, t)
+ if d % 2 != 0:
+ t <<= 1
+ return reduce(t, G)
+
+def gen_tables():
+ print('/*')
+ print(' * crc32_tables.h - data tables for CRC-32 computation')
+ print(' *')
+ print(' * THIS FILE WAS GENERATED BY gen-crc32-consts.py. DO NOT EDIT.')
+ print(' */')
+ for n in [1, 8]:
+ print('')
+ print(f'static const u32 crc32_slice{n}_table[] MAYBE_UNUSED = {{')
+ # The i'th table entry is the CRC-32 of the message consisting of byte
+ # i % 256 followed by i // 256 zero bytes.
+ polys = [bitreverse(i % 256, 8) << (32 + 8*(i//256)) for i in range(256 * n)]
+ polys = [bitreverse(reduce(poly, G), 32) for poly in polys]
+ for i in range(0, len(polys), 4):
+ print(f'\t0x{polys[i+0]:08x}, 0x{polys[i+1]:08x}, 0x{polys[i+2]:08x}, 0x{polys[i+3]:08x},')
+ print('};')
+
+# Compute the constant multipliers needed for "folding" over various distances
+# with the gzip CRC-32. Each such multiplier is x^d mod G(x) for some distance
+# d, in bits, over which the folding is occurring.
+#
+# Folding works as follows: let A(x) be a polynomial (possibly reduced partially
+# or fully mod G(x)) for part of the message, and let B(x) be a polynomial
+# (possibly reduced partially or fully mod G(x)) for a later part of the
+# message. The unreduced combined polynomial is A(x)*x^d + B(x), where d is the
+# number of bits separating the two parts of the message plus len(B(x)). Since
+# mod G(x) can be applied at any point, x^d mod G(x) can be precomputed and used
+# instead of x^d unreduced. That allows the combined polynomial to be computed
+# relatively easily in a partially-reduced form A(x)*(x^d mod G(x)) + B(x), with
+# length max(len(A(x)) + 31, len(B(x))). This does require doing a polynomial
+# multiplication (carryless multiplication).
+#
+# "Folding" in this way can be used for the entire CRC computation except the
+# final reduction to 32 bits; this works well when CPU support for carryless
+# multiplication is available. It can also be used to combine CRCs of different
+# parts of the message that were computed using a different method.
+#
+# Note that the gzip CRC-32 uses bit-reversed polynomials. I.e., the low order
+# bits are really the high order polynomial coefficients.
+def gen_multipliers():
+ print('/*')
+ print(' * crc32_multipliers.h - constants for CRC-32 folding')
+ print(' *')
+ print(' * THIS FILE WAS GENERATED BY gen-crc32-consts.py. DO NOT EDIT.')
+ print(' */')
+ print('')
+
+ # Compute the multipliers needed for CRC-32 folding with carryless
+ # multiplication instructions that operate on the 64-bit halves of 128-bit
+ # segments. Using the terminology from earlier, for each 64-bit fold
+ # len(A(x)) = 64, and len(B(x)) = 95 since a 64-bit polynomial multiplied by
+ # a 32-bit one produces a 95-bit one. When A(x) is the low order polynomial
+# half of a 128-bit segment (high order physical half), the separation
+ # between the message parts is the total length of the 128-bit segments
+ # separating the values. When A(x) is the high order polynomial half, the
+ # separation is 64 bits greater.
+ for i in range(1, 33):
+ sep_lo = 128 * (i - 1)
+ sep_hi = sep_lo + 64
+ len_B = 95
+ for d in [sep_hi + len_B, # A(x) = high 64 polynomial bits (low 64 physical bits)
+ sep_lo + len_B # A(x) = low 64 polynomial bits (high 64 physical bits)
+ ]:
+ poly = bitreverse(x_to_the_d(d), 32)
+ print(f'#define CRC32_X{d}_MODG 0x{poly:08x} /* x^{d} mod G(x) */')
+ print('')
+
+ # Compute constants for the final 128 => 32 bit reduction.
+ poly = bitreverse(div(1 << 95, G), 64)
+ print(f'#define CRC32_BARRETT_CONSTANT_1 0x{poly:016x}ULL /* floor(x^95 / G(x)) */')
+ poly = bitreverse(G, 33)
+ print(f'#define CRC32_BARRETT_CONSTANT_2 0x{poly:016x}ULL /* G(x) */')
+
+ # Compute multipliers for combining the CRCs of separate chunks.
+ print('')
+ num_chunks = 4
+ table_len = 129
+ min_chunk_len = 128
+ print(f'#define CRC32_NUM_CHUNKS {num_chunks}')
+ print(f'#define CRC32_MIN_VARIABLE_CHUNK_LEN {min_chunk_len}UL')
+ print(f'#define CRC32_MAX_VARIABLE_CHUNK_LEN {(table_len-1) * min_chunk_len}UL')
+ print('')
+ print('/* Multipliers for implementations that use a variable chunk length */')
+ print('static const u32 crc32_mults_for_chunklen[][CRC32_NUM_CHUNKS - 1] MAYBE_UNUSED = {')
+ print('\t{ 0 /* unused row */ },')
+ for i in range(1, table_len):
+ chunk_len = i * min_chunk_len
+ print(f'\t/* chunk_len={chunk_len} */')
+ print('\t{ ', end='')
+ for j in range(num_chunks - 1, 0, -1):
+ d = (j * 8 * chunk_len) - 33
+ poly = bitreverse(x_to_the_d(d), 32)
+ print(f'0x{poly:08x} /* x^{d} mod G(x) */, ', end='')
+ print('},')
+ print('};')
+ fixed_chunk_len = 32768
+ print('')
+ print('/* Multipliers for implementations that use a large fixed chunk length */')
+ print(f'#define CRC32_FIXED_CHUNK_LEN {fixed_chunk_len}UL')
+ for j in range(1, num_chunks):
+ d = (j * 8 * fixed_chunk_len) - 33
+ poly = bitreverse(x_to_the_d(d), 32)
+ print(f'#define CRC32_FIXED_CHUNK_MULT_{j} 0x{poly:08x} /* x^{d} mod G(x) */')
+
+with open('lib/crc32_tables.h', 'w') as f:
+ sys.stdout = f
+ gen_tables()
+with open('lib/crc32_multipliers.h', 'w') as f:
+ sys.stdout = f
+ gen_multipliers()
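As a quick sanity check of the table semantics (an illustration assuming Python 3 with its standard zlib module, not part of the generator): the slice-1 entries are raw per-byte CRC-32 remainders with no init or final inversion, and feeding them to the classic table-per-byte loop reproduces zlib.crc32.

import zlib

# Textbook bit-at-a-time construction on the reflected polynomial 0xEDB88320;
# this should yield the same 256 values as crc32_slice1_table above.
def table_entry(i):
    c = i
    for _ in range(8):
        c = ((c >> 1) ^ 0xEDB88320) if (c & 1) else (c >> 1)
    return c

table = [table_entry(i) for i in range(256)]

def crc32_slice1(data):
    # Classic slice-by-1 loop, with the gzip init and final inversion applied
    # here rather than baked into the table entries.
    crc = 0xFFFFFFFF
    for b in data:
        crc = (crc >> 8) ^ table[(crc ^ b) & 0xFF]
    return crc ^ 0xFFFFFFFF

assert crc32_slice1(b"hello, world") == zlib.crc32(b"hello, world")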
=====================================
scripts/gen_crc32_multipliers.c deleted
=====================================
@@ -1,199 +0,0 @@
-/*
- * gen_crc32_multipliers.c
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * This program computes the constant multipliers needed for "folding" over
- * various distances with the gzip CRC-32. Each such multiplier is x^D mod G(x)
- * for some distance D, in bits, over which the folding is occurring.
- *
- * Folding works as follows: let A(x) be a polynomial (possibly reduced
- * partially or fully mod G(x)) for part of the message, and let B(x) be a
- * polynomial (possibly reduced partially or fully mod G(x)) for a later part of
- * the message. The unreduced combined polynomial is A(x)*x^D + B(x), where D
- * is the number of bits separating the two parts of the message plus len(B(x)).
- * Since mod G(x) can be applied at any point, x^D mod G(x) can be precomputed
- * and used instead of x^D unreduced. That allows the combined polynomial to be
- * computed relatively easily in a partially-reduced form A(x)*(x^D mod G(x)) +
- * B(x), with length max(len(A(x)) + 31, len(B(x))). This does require doing a
- * polynomial multiplication (carryless multiplication).
- *
- * "Folding" in this way can be used for the entire CRC computation except the
- * final reduction to 32 bits; this works well when CPU support for carryless
- * multiplication is available. It can also be used to combine CRCs of
- * different parts of the message that were computed using a different method.
- *
- * Note that the gzip CRC-32 uses bit-reversed polynomials. I.e., the low order
- * bits are really the high order polynomial coefficients.
- */
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#include "../common_defs.h"
-
-/* The generator polynomial G(x) for the gzip CRC-32 */
-#define CRCPOLY 0xEDB88320 /* G(x) without x^32 term */
-#define CRCPOLY_FULL (((u64)CRCPOLY << 1) | 1) /* G(x) */
-
-/* Compute x^D mod G(x) */
-static u32
-compute_xD_modG(size_t D)
-{
- /* Start with x^0 mod G(x) */
- u32 remainder = 0x80000000;
-
- /* Each iteration, 'remainder' becomes x^i mod G(x) */
- for (size_t i = 1; i <= D; i++)
- remainder = (remainder >> 1) ^ ((remainder & 1) ? CRCPOLY : 0);
-
- /* Now 'remainder' is x^D mod G(x) */
- return remainder;
-}
-
-/* Compute floor(x^64 / G(x)) */
-static u64
-compute_x64_div_G(void)
-{
- u64 quotient = 0;
- u64 dividend = 0x1;
-
- for (int i = 0; i < 64 - 32 + 1; i++) {
- if ((dividend >> i) & 1) {
- quotient |= (u64)1 << i;
- dividend ^= CRCPOLY_FULL << i;
- }
- }
-
- return quotient;
-}
-
-static void
-gen_vec_folding_constants(void)
-{
- /*
- * Compute the multipliers needed for CRC-32 folding with carryless
- * multiplication instructions that operate on the 64-bit halves of
- * 128-bit segments. Using the terminology from earlier, for each 64-bit
- * fold len(A(x)) = 64, and len(B(x)) = 95 since a 64-bit polynomial
- * multiplied by a 32-bit one produces a 95-bit one. When A(x) is the
- * low order polynomial half of a 128-bit segments (high order physical
- * half), the separation between the message parts is the total length
- * of the 128-bit segments separating the values. When A(x) is the high
- * order polynomial half, the separation is 64 bits greater.
- */
- for (int i = 1; i <= 32; i++) {
- const int sep_lo = 128 * (i - 1);
- const int sep_hi = sep_lo + 64;
- const int len_B = 95;
- int D;
-
- /* A(x) = high 64 polynomial bits (low 64 physical bits) */
- D = sep_hi + len_B;
- printf("#define CRC32_X%d_MODG 0x%08"PRIx32" /* x^%d mod G(x) */\n",
- D, compute_xD_modG(D), D);
-
- /* A(x) = low 64 polynomial bits (high 64 physical bits) */
- D = sep_lo + len_B;
- printf("#define CRC32_X%d_MODG 0x%08"PRIx32" /* x^%d mod G(x) */\n",
- D, compute_xD_modG(D), D);
- printf("\n");
- }
-
- /* Multiplier for final 96 => 64 bit fold */
- printf("#define CRC32_X63_MODG 0x%08"PRIx32" /* x^63 mod G(x) */\n",
- compute_xD_modG(63));
-
- /*
- * Constants for final 64 => 32 bit reduction. These constants are the
- * odd ones out, as this final reduction step can't use the regular CRC
- * folding described above. It uses Barrett reduction instead.
- */
- printf("#define CRC32_BARRETT_CONSTANT_1 0x%016"PRIx64"ULL /* floor(x^64 / G(x)) */\n",
- compute_x64_div_G());
- printf("#define CRC32_BARRETT_CONSTANT_2 0x%016"PRIx64"ULL /* G(x) */\n",
- CRCPOLY_FULL);
- printf("#define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 }\n");
-}
-
-/* Multipliers for combining the CRCs of separate chunks */
-static void
-gen_chunk_constants(void)
-{
- const size_t num_chunks = 4;
- const size_t table_len = 129;
- const size_t min_chunk_len = 128;
-
- printf("#define CRC32_NUM_CHUNKS %zu\n", num_chunks);
- printf("#define CRC32_MIN_VARIABLE_CHUNK_LEN %zuUL\n", min_chunk_len);
- printf("#define CRC32_MAX_VARIABLE_CHUNK_LEN %zuUL\n",
- (table_len - 1) * min_chunk_len);
- printf("\n");
- printf("/* Multipliers for implementations that use a variable chunk length */\n");
- printf("static const u32 crc32_mults_for_chunklen[][CRC32_NUM_CHUNKS - 1] MAYBE_UNUSED = {\n");
- printf("\t{ 0 /* unused row */ },\n");
- for (size_t i = 1; i < table_len; i++) {
- const size_t chunk_len = i*min_chunk_len;
-
- printf("\t/* chunk_len=%zu */\n", chunk_len);
- printf("\t{ ");
- for (size_t j = num_chunks - 1; j >= 1; j--) {
- const size_t D = (j * 8 * chunk_len) - 33;
-
- printf("0x%08"PRIx32" /* x^%zu mod G(x) */, ",
- compute_xD_modG(D), D);
- }
- printf("},\n");
- }
- printf("};\n");
- printf("\n");
-
- printf("/* Multipliers for implementations that use a large fixed chunk length */\n");
- const size_t fixed_chunk_len = 32768;
- printf("#define CRC32_FIXED_CHUNK_LEN %zuUL\n", fixed_chunk_len);
- for (int j = 1; j < num_chunks; j++) {
- const size_t D = (j * 8 * fixed_chunk_len) - 33;
-
- printf("#define CRC32_FIXED_CHUNK_MULT_%d 0x%08"PRIx32" /* x^%zu mod G(x) */\n",
- j, compute_xD_modG(D), D);
- }
-}
-
-int
-main(void)
-{
- printf("/*\n"
- " * crc32_multipliers.h - constants for CRC-32 folding\n"
- " *\n"
- " * THIS FILE WAS GENERATED BY gen_crc32_multipliers.c. DO NOT EDIT.\n"
- " */\n"
- "\n");
-
- gen_vec_folding_constants();
- printf("\n");
- gen_chunk_constants();
- return 0;
-}
=====================================
scripts/gen_crc32_tables.c deleted
=====================================
@@ -1,105 +0,0 @@
-/*
- * gen_crc32_tables.c - a program for CRC-32 table generation
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include <stdio.h>
-
-#include "../common_defs.h"
-
-#define CRCPOLY 0xEDB88320 /* G(x) without x^32 term */
-
-static u32
-crc32_update_bit(u32 remainder, u8 next_bit)
-{
- return (remainder >> 1) ^ (((remainder ^ next_bit) & 1) ? CRCPOLY : 0);
-}
-
-static u32
-crc32_update_byte(u32 remainder, u8 next_byte)
-{
- for (int j = 0; j < 8; j++, next_byte >>= 1)
- remainder = crc32_update_bit(remainder, next_byte & 1);
- return remainder;
-}
-
-static void
-print_256_entries(const u32 *entries)
-{
- for (size_t i = 0; i < 256 / 4; i++) {
- printf("\t");
- for (size_t j = 0; j < 4; j++) {
- printf("0x%08x,", entries[i * 4 + j]);
- if (j != 3)
- printf(" ");
- }
- printf("\n");
- }
-}
-
-int
-main(void)
-{
- u32 crc32_table[0x800];
-
- /* crc32_table[i] for 0 <= i < 0x100 is the CRC-32 of byte i. */
- for (int i = 0; i < 0x100; i++)
- crc32_table[i] = crc32_update_byte(0, i);
-
- /*
- * crc32_table[i] for 0x100 <= i < 0x800 is the CRC-32 of byte i % 0x100
- * followed by i / 0x100 zero bytes.
- */
- for (int i = 0x100; i < 0x800; i++)
- crc32_table[i] = crc32_update_byte(crc32_table[i - 0x100], 0);
-
- printf("/*\n");
- printf(" * crc32_tables.h - data tables for CRC-32 computation\n");
- printf(" *\n");
- printf(" * THIS FILE WAS GENERATED BY gen_crc32_tables.c. DO NOT EDIT.\n");
- printf(" */\n");
- printf("\n");
- /*
- * Although crc32_slice1_table is the same as the first 256 entries of
- * crc32_slice8_table, we output these tables separately because any
- * combo of (slice1, slice8, slice1 && slice8, nothing) might be needed,
- * and it's simplest to let the compiler optimize out any unused tables.
- */
- printf("static const u32 crc32_slice1_table[] MAYBE_UNUSED = {\n");
- print_256_entries(&crc32_table[0x000]);
- printf("};\n");
- printf("\n");
- printf("static const u32 crc32_slice8_table[] MAYBE_UNUSED = {\n");
- print_256_entries(&crc32_table[0x000]);
- print_256_entries(&crc32_table[0x100]);
- print_256_entries(&crc32_table[0x200]);
- print_256_entries(&crc32_table[0x300]);
- print_256_entries(&crc32_table[0x400]);
- print_256_entries(&crc32_table[0x500]);
- print_256_entries(&crc32_table[0x600]);
- print_256_entries(&crc32_table[0x700]);
- printf("};\n");
- return 0;
-}
View it on GitLab: https://salsa.debian.org/deflate-team/libdeflate/-/compare/aadbde3370db9278c86c07d23b54ad768a2b74f2...02c1120d828c894348087b62020eb21b2ba672c2
--
You're receiving this email because of your account on salsa.debian.org.