[med-svn] [Git][med-team/libdeflate][upstream] New upstream version 1.14

Lance Lin (@linqigang) gitlab at salsa.debian.org
Mon Oct 3 13:01:04 BST 2022



Lance Lin pushed to branch upstream at Debian Med / libdeflate


Commits:
d43d7389 by Lance Lin at 2022-10-03T18:59:36+07:00
New upstream version 1.14
- - - - -


18 changed files:

- Makefile
- NEWS.md
- README.md
- common_defs.h
- lib/arm/adler32_impl.h
- lib/arm/cpu_features.c
- lib/arm/cpu_features.h
- lib/arm/matchfinder_impl.h
- lib/decompress_template.h
- lib/deflate_compress.c
- lib/deflate_constants.h
- lib/deflate_decompress.c
- lib/x86/cpu_features.h
- lib/x86/decompress_impl.h
- lib/x86/matchfinder_impl.h
- libdeflate.h
- scripts/afl-fuzz/fuzz.sh
- scripts/run_tests.sh


Changes:

=====================================
Makefile
=====================================
@@ -55,12 +55,13 @@ cc-option = $(shell if $(CC) $(1) -c -x c /dev/null -o /dev/null \
 
 override CFLAGS :=							\
 	-O2 -fomit-frame-pointer -std=c99 -I. -Wall -Wundef		\
-	$(call cc-option,-Wpedantic)					\
 	$(call cc-option,-Wdeclaration-after-statement)			\
+	$(call cc-option,-Wimplicit-fallthrough)			\
 	$(call cc-option,-Wmissing-prototypes)				\
+	$(call cc-option,-Wpedantic)					\
+	$(call cc-option,-Wshadow)					\
 	$(call cc-option,-Wstrict-prototypes)				\
 	$(call cc-option,-Wvla)						\
-	$(call cc-option,-Wimplicit-fallthrough)			\
 	$(CFLAGS)
 
 FREESTANDING :=


=====================================
NEWS.md
=====================================
@@ -1,5 +1,31 @@
 # libdeflate release notes
 
+## Version 1.14
+
+Significantly improved decompression performance on all platforms.  Examples
+include (measuring DEFLATE only):
+
+| Platform                           | Speedup over v1.13 |
+|------------------------------------|--------------------|
+| x86_64 (Intel Comet Lake), gcc     | 1.287x             |
+| x86_64 (Intel Comet Lake), clang   | 1.437x             |
+| x86_64 (Intel Ice Lake), gcc       | 1.332x             |
+| x86_64 (Intel Ice Lake), clang     | 1.296x             |
+| x86_64 (Intel Sandy Bridge), gcc   | 1.162x             |
+| x86_64 (Intel Sandy Bridge), clang | 1.092x             |
+| x86_64 (AMD Zen 2), gcc            | 1.263x             |
+| x86_64 (AMD Zen 2), clang          | 1.259x             |
+| i386 (Intel Comet Lake), gcc       | 1.570x             |
+| i386 (Intel Comet Lake), clang     | 1.344x             |
+| arm64 (Apple M1), clang            | 1.306x             |
+| arm64 (Cortex-A76), clang          | 1.355x             |
+| arm64 (Cortex-A55), clang          | 1.190x             |
+| arm32 (Cortex-A76), clang          | 1.665x             |
+| arm32 (Cortex-A55), clang          | 1.283x             |
+
+Thanks to Dougall Johnson (https://dougallj.wordpress.com/) for ideas for many
+of the improvements.
+
 ## Version 1.13
 
 * Changed the 32-bit Windows build of the library to use the default calling


=====================================
README.md
=====================================
@@ -27,11 +27,13 @@ For the release notes, see the [NEWS file](NEWS.md).
 ## Table of Contents
 
 - [Building](#building)
-  - [For UNIX](#for-unix)
-  - [For macOS](#for-macos)
-  - [For Windows](#for-windows)
-    - [Using Cygwin](#using-cygwin)
-    - [Using MSYS2](#using-msys2)
+  - [Using the Makefile](#using-the-makefile)
+    - [For UNIX](#for-unix)
+    - [For macOS](#for-macos)
+    - [For Windows](#for-windows)
+      - [Using Cygwin](#using-cygwin)
+      - [Using MSYS2](#using-msys2)
+  - [Using a custom build system](#using-a-custom-build-system)
 - [API](#api)
 - [Bindings for other programming languages](#bindings-for-other-programming-languages)
 - [DEFLATE vs. zlib vs. gzip](#deflate-vs-zlib-vs-gzip)
@@ -42,7 +44,14 @@ For the release notes, see the [NEWS file](NEWS.md).
 
 # Building
 
-## For UNIX
+libdeflate and the provided programs like `gzip` can be built using the included
+Makefile.  If only the library is needed, it can alternatively be easily
+integrated into applications and built using any build system; see [Using a
+custom build system](#using-a-custom-build-system).
+
+## Using the Makefile
+
+### For UNIX
 
 Just run `make`, then (if desired) `make install`.  You need GNU Make and either
 GCC or Clang.  GCC is recommended because it builds slightly faster binaries.
@@ -57,7 +66,7 @@ There are also many options which can be set on the `make` command line, e.g. to
 omit library features or to customize the directories into which `make install`
 installs files.  See the Makefile for details.
 
-## For macOS
+### For macOS
 
 Prebuilt macOS binaries can be installed with [Homebrew](https://brew.sh):
 
@@ -65,7 +74,7 @@ Prebuilt macOS binaries can be installed with [Homebrew](https://brew.sh):
 
 But if you need to build the binaries yourself, see the section for UNIX above.
 
-## For Windows
+### For Windows
 
 Prebuilt Windows binaries can be downloaded from
 https://github.com/ebiggers/libdeflate/releases.  But if you need to build the
@@ -84,7 +93,7 @@ binaries built with MinGW will be significantly faster.
 Also note that 64-bit binaries are faster than 32-bit binaries and should be
 preferred whenever possible.
 
-### Using Cygwin
+#### Using Cygwin
 
 Run the Cygwin installer, available from https://cygwin.com/setup-x86_64.exe.
 When you get to the package selection screen, choose the following additional
@@ -119,7 +128,7 @@ or to build 32-bit binaries:
 
     make CC=i686-w64-mingw32-gcc
 
-### Using MSYS2
+#### Using MSYS2
 
 Run the MSYS2 installer, available from http://www.msys2.org/.  After
 installing, open an MSYS2 shell and run:
@@ -161,6 +170,23 @@ and run the following commands:
 
 Or to build 32-bit binaries, do the same but use "MSYS2 MinGW 32-bit" instead.
 
+## Using a custom build system
+
+The source files of the library are designed to be compilable directly, without
+any prerequisite step like running a `./configure` script.  Therefore, as an
+alternative to building the library using the provided Makefile, the library
+source files can be easily integrated directly into your application and built
+using any build system.
+
+You should compile both `lib/*.c` and `lib/*/*.c`.  You don't need to worry
+about excluding irrelevant architecture-specific code, as this is already
+handled in the source files themselves using `#ifdef`s.
+
+It is **strongly** recommended to use either gcc or clang, and to use `-O2`.
+
+If you are doing a freestanding build with `-ffreestanding`, you must add
+`-DFREESTANDING` as well; otherwise, performance will suffer greatly.
+
 # API
 
 libdeflate has a simple API that is not zlib-compatible.  You can create
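
As a concrete illustration of the custom-build-system and API text added above, the sketch below round-trips a buffer through the public functions declared in libdeflate.h (libdeflate_alloc_compressor(), libdeflate_deflate_compress(), libdeflate_deflate_decompress(), and friends). It is an illustrative example rather than code from the patch; it can be built by compiling it together with `lib/*.c` and `lib/*/*.c` as the README describes, or by linking against the installed library.

    /* Minimal round trip through the public libdeflate API. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "libdeflate.h"

    int main(void)
    {
        static const char orig[] = "example data, example data, example data";
        struct libdeflate_compressor *c = libdeflate_alloc_compressor(6);
        struct libdeflate_decompressor *d = libdeflate_alloc_decompressor();
        size_t bound, csize, dsize;
        char dbuf[sizeof(orig)];
        void *cbuf;

        if (c == NULL || d == NULL)
            return 1;

        /* Worst-case compressed size for this input, then compress. */
        bound = libdeflate_deflate_compress_bound(c, sizeof(orig));
        cbuf = malloc(bound);
        if (cbuf == NULL)
            return 1;
        csize = libdeflate_deflate_compress(c, orig, sizeof(orig), cbuf, bound);
        if (csize == 0)
            return 1;  /* 0 means the output didn't fit in 'bound' bytes */

        /* Decompress into a buffer of exactly the original size. */
        if (libdeflate_deflate_decompress(d, cbuf, csize, dbuf, sizeof(dbuf),
                                          &dsize) != LIBDEFLATE_SUCCESS ||
            dsize != sizeof(orig) || memcmp(orig, dbuf, dsize) != 0)
            return 1;

        printf("%zu -> %zu -> %zu bytes\n", sizeof(orig), csize, dsize);
        libdeflate_free_compressor(c);
        libdeflate_free_decompressor(d);
        free(cbuf);
        return 0;
    }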


=====================================
common_defs.h
=====================================
@@ -144,8 +144,17 @@ typedef size_t machine_word_t;
 /* restrict - hint that writes only occur through the given pointer */
 #ifdef __GNUC__
 #  define restrict		__restrict__
+#elif defined(_MSC_VER)
+    /*
+     * Don't use MSVC's __restrict; it has nonstandard behavior.
+     * Standard restrict is okay, if it is supported.
+     */
+#  if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
+#    define restrict		restrict
+#  else
+#    define restrict
+#  endif
 #else
-/* Don't use MSVC's __restrict; it has nonstandard behavior. */
 #  define restrict
 #endif
 
@@ -200,6 +209,7 @@ typedef size_t machine_word_t;
 #define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
 #define STATIC_ASSERT(expr)	((void)sizeof(char[1 - 2 * !(expr)]))
 #define ALIGN(n, a)		(((n) + (a) - 1) & ~((a) - 1))
+#define ROUND_UP(n, d)		((d) * DIV_ROUND_UP((n), (d)))
 
 /* ========================================================================== */
 /*                           Endianness handling                              */
@@ -513,8 +523,10 @@ bsr32(u32 v)
 #ifdef __GNUC__
 	return 31 - __builtin_clz(v);
 #elif defined(_MSC_VER)
-	_BitScanReverse(&v, v);
-	return v;
+	unsigned long i;
+
+	_BitScanReverse(&i, v);
+	return i;
 #else
 	unsigned i = 0;
 
@@ -529,9 +541,11 @@ bsr64(u64 v)
 {
 #ifdef __GNUC__
 	return 63 - __builtin_clzll(v);
-#elif defined(_MSC_VER) && defined(_M_X64)
-	_BitScanReverse64(&v, v);
-	return v;
+#elif defined(_MSC_VER) && defined(_WIN64)
+	unsigned long i;
+
+	_BitScanReverse64(&i, v);
+	return i;
 #else
 	unsigned i = 0;
 
@@ -563,8 +577,10 @@ bsf32(u32 v)
 #ifdef __GNUC__
 	return __builtin_ctz(v);
 #elif defined(_MSC_VER)
-	_BitScanForward(&v, v);
-	return v;
+	unsigned long i;
+
+	_BitScanForward(&i, v);
+	return i;
 #else
 	unsigned i = 0;
 
@@ -579,9 +595,11 @@ bsf64(u64 v)
 {
 #ifdef __GNUC__
 	return __builtin_ctzll(v);
-#elif defined(_MSC_VER) && defined(_M_X64)
-	_BitScanForward64(&v, v);
-	return v;
+#elif defined(_MSC_VER) && defined(_WIN64)
+	unsigned long i;
+
+	_BitScanForward64(&i, v);
+	return i;
 #else
 	unsigned i = 0;
 

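Two details in this hunk are easy to gloss over: the new ROUND_UP() works for any divisor, whereas ALIGN() assumes a power-of-two alignment, and the MSVC bit-scan fallbacks now write the index through a separate `unsigned long`, matching the _BitScanReverse()/_BitScanForward() prototypes (with the 64-bit variants keyed off _WIN64, presumably so they also cover ARM64 Windows). A standalone check of the rounding macros, with their bodies copied from the hunk above:

    /* Standalone check of the rounding macros from the hunk above. */
    #include <assert.h>
    #include <stdio.h>

    #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))
    #define ALIGN(n, a)         (((n) + (a) - 1) & ~((a) - 1))
    #define ROUND_UP(n, d)      ((d) * DIV_ROUND_UP((n), (d)))

    int main(void)
    {
        /* ALIGN() relies on the alignment being a power of two... */
        assert(ALIGN(100, 16) == 112);
        /* ...while ROUND_UP() accepts any divisor, e.g. a 24-byte record. */
        assert(ROUND_UP(100, 24) == 120);
        assert(ROUND_UP(120, 24) == 120);
        /* For power-of-two divisors the two agree. */
        assert(ROUND_UP(100, 16) == ALIGN(100, 16));
        printf("rounding macros OK\n");
        return 0;
    }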

=====================================
lib/arm/adler32_impl.h
=====================================
@@ -30,7 +30,9 @@
 
 #include "cpu_features.h"
 
+/* Regular NEON implementation */
 #if HAVE_NEON_INTRIN && CPU_IS_LITTLE_ENDIAN()
+#  define adler32_neon		adler32_neon
 #  define FUNCNAME		adler32_neon
 #  define FUNCNAME_CHUNK	adler32_neon_chunk
 #  define IMPL_ALIGNMENT	16
@@ -140,18 +142,119 @@ adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end,
 	*s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3];
 }
 #  include "../adler32_vec_template.h"
-#  if HAVE_NEON_NATIVE
-#    define DEFAULT_IMPL	adler32_neon
+#endif /* Regular NEON implementation */
+
+/* NEON+dotprod implementation */
+#if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN()
+#  define adler32_neon_dotprod	adler32_neon_dotprod
+#  define FUNCNAME		adler32_neon_dotprod
+#  define FUNCNAME_CHUNK	adler32_neon_dotprod_chunk
+#  define IMPL_ALIGNMENT	16
+#  define IMPL_SEGMENT_LEN	64
+#  define IMPL_MAX_CHUNK_LEN	MAX_CHUNK_LEN
+#  if HAVE_DOTPROD_NATIVE
+#    define ATTRIBUTES
 #  else
+#    ifdef __clang__
+#      define ATTRIBUTES  __attribute__((target("dotprod")))
+     /*
+      * With gcc, arch=armv8.2-a is needed for dotprod intrinsics, unless the
+      * default target is armv8.3-a or later in which case it must be omitted.
+      * armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT.
+      */
+#    elif defined(__ARM_FEATURE_JCVT)
+#      define ATTRIBUTES  __attribute__((target("+dotprod")))
+#    else
+#      define ATTRIBUTES  __attribute__((target("arch=armv8.2-a+dotprod")))
+#    endif
+#  endif
+#  include <arm_neon.h>
+static forceinline ATTRIBUTES void
+adler32_neon_dotprod_chunk(const uint8x16_t *p, const uint8x16_t * const end,
+			   u32 *s1, u32 *s2)
+{
+	const uint8x16_t mults_a = {
+		64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
+	};
+	const uint8x16_t mults_b = {
+		48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
+	};
+	const uint8x16_t mults_c = {
+		32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+	};
+	const uint8x16_t mults_d = {
+		16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,
+	};
+	const uint8x16_t ones = {
+		 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1 , 1,  1,
+	};
+	uint32x4_t v_s1_a = { 0, 0, 0, 0 };
+	uint32x4_t v_s1_b = { 0, 0, 0, 0 };
+	uint32x4_t v_s1_c = { 0, 0, 0, 0 };
+	uint32x4_t v_s1_d = { 0, 0, 0, 0 };
+	uint32x4_t v_s2_a = { 0, 0, 0, 0 };
+	uint32x4_t v_s2_b = { 0, 0, 0, 0 };
+	uint32x4_t v_s2_c = { 0, 0, 0, 0 };
+	uint32x4_t v_s2_d = { 0, 0, 0, 0 };
+	uint32x4_t v_s1_sums_a = { 0, 0, 0, 0 };
+	uint32x4_t v_s1_sums_b = { 0, 0, 0, 0 };
+	uint32x4_t v_s1_sums_c = { 0, 0, 0, 0 };
+	uint32x4_t v_s1_sums_d = { 0, 0, 0, 0 };
+	uint32x4_t v_s1;
+	uint32x4_t v_s2;
+
+	do {
+		uint8x16_t bytes_a = *p++;
+		uint8x16_t bytes_b = *p++;
+		uint8x16_t bytes_c = *p++;
+		uint8x16_t bytes_d = *p++;
+
+		v_s1_sums_a += v_s1_a;
+		v_s1_a = vdotq_u32(v_s1_a, bytes_a, ones);
+		v_s2_a = vdotq_u32(v_s2_a, bytes_a, mults_a);
+
+		v_s1_sums_b += v_s1_b;
+		v_s1_b = vdotq_u32(v_s1_b, bytes_b, ones);
+		v_s2_b = vdotq_u32(v_s2_b, bytes_b, mults_b);
+
+		v_s1_sums_c += v_s1_c;
+		v_s1_c = vdotq_u32(v_s1_c, bytes_c, ones);
+		v_s2_c = vdotq_u32(v_s2_c, bytes_c, mults_c);
+
+		v_s1_sums_d += v_s1_d;
+		v_s1_d = vdotq_u32(v_s1_d, bytes_d, ones);
+		v_s2_d = vdotq_u32(v_s2_d, bytes_d, mults_d);
+	} while (p != end);
+
+	v_s1 = v_s1_a + v_s1_b + v_s1_c + v_s1_d;
+	v_s2 = v_s2_a + v_s2_b + v_s2_c + v_s2_d +
+	       vqshlq_n_u32(v_s1_sums_a + v_s1_sums_b +
+			    v_s1_sums_c + v_s1_sums_d, 6);
+	*s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3];
+	*s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3];
+}
+#  include "../adler32_vec_template.h"
+#endif /* NEON+dotprod implementation */
+
+#if defined(adler32_neon_dotprod) && HAVE_DOTPROD_NATIVE
+#define DEFAULT_IMPL	adler32_neon_dotprod
+#else
 static inline adler32_func_t
 arch_select_adler32_func(void)
 {
-	if (HAVE_NEON(get_arm_cpu_features()))
+	const u32 features MAYBE_UNUSED = get_arm_cpu_features();
+
+#ifdef adler32_neon_dotprod
+	if (HAVE_NEON(features) && HAVE_DOTPROD(features))
+		return adler32_neon_dotprod;
+#endif
+#ifdef adler32_neon
+	if (HAVE_NEON(features))
 		return adler32_neon;
+#endif
 	return NULL;
 }
-#    define arch_select_adler32_func	arch_select_adler32_func
-#  endif /* !HAVE_NEON_NATIVE */
-#endif /* HAVE_NEON_INTRIN && CPU_IS_LITTLE_ENDIAN() */
+#define arch_select_adler32_func	arch_select_adler32_func
+#endif
 
 #endif /* LIB_ARM_ADLER32_IMPL_H */
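
For orientation when reading the vector code above, this is what a plain scalar Adler-32 looks like (an illustrative reference, not libdeflate's actual scalar fallback). The NEON and dotprod variants accumulate the same s1 (byte sum) and s2 (sum of successive s1 values): processing a 64-byte chunk at once turns the s2 update into 64*s1 plus the chunk's bytes weighted 64..1, which is where the mults_a..mults_d constants and the vqshlq_n_u32(..., 6) shift (multiply by 64) of the accumulated s1 sums come from.

    /*
     * Scalar reference Adler-32: s1 is the running byte sum, s2 the running
     * sum of s1 values, both modulo 65521.  The initial value is 1.
     */
    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    #define ADLER_MOD 65521

    static uint32_t adler32_scalar(uint32_t adler, const uint8_t *p, size_t len)
    {
        uint32_t s1 = adler & 0xFFFF;
        uint32_t s2 = adler >> 16;
        size_t i;

        for (i = 0; i < len; i++) {
            s1 = (s1 + p[i]) % ADLER_MOD;
            s2 = (s2 + s1) % ADLER_MOD;
        }
        return (s2 << 16) | s1;
    }

    int main(void)
    {
        /* Adler-32 of "abc": s1 = 1+97+98+99 = 295, s2 = 98+196+295 = 589. */
        assert(adler32_scalar(1, (const uint8_t *)"abc", 3) == 0x024D0127);
        return 0;
    }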


=====================================
lib/arm/cpu_features.c
=====================================
@@ -126,6 +126,8 @@ static u32 query_arm_cpu_features(void)
 		features |= ARM_CPU_FEATURE_CRC32;
 	if (hwcap & (1 << 17))	/* HWCAP_SHA3 */
 		features |= ARM_CPU_FEATURE_SHA3;
+	if (hwcap & (1 << 20))	/* HWCAP_ASIMDDP */
+		features |= ARM_CPU_FEATURE_DOTPROD;
 #endif
 	return features;
 }
@@ -140,12 +142,13 @@ static const struct {
 	const char *name;
 	u32 feature;
 } feature_sysctls[] = {
-	{ "hw.optional.neon",		ARM_CPU_FEATURE_NEON },
-	{ "hw.optional.AdvSIMD",	ARM_CPU_FEATURE_NEON },
-	{ "hw.optional.arm.FEAT_PMULL",	ARM_CPU_FEATURE_PMULL },
-	{ "hw.optional.armv8_crc32",	ARM_CPU_FEATURE_CRC32 },
-	{ "hw.optional.armv8_2_sha3",	ARM_CPU_FEATURE_SHA3 },
-	{ "hw.optional.arm.FEAT_SHA3",	ARM_CPU_FEATURE_SHA3 },
+	{ "hw.optional.neon",		  ARM_CPU_FEATURE_NEON },
+	{ "hw.optional.AdvSIMD",	  ARM_CPU_FEATURE_NEON },
+	{ "hw.optional.arm.FEAT_PMULL",	  ARM_CPU_FEATURE_PMULL },
+	{ "hw.optional.armv8_crc32",	  ARM_CPU_FEATURE_CRC32 },
+	{ "hw.optional.armv8_2_sha3",	  ARM_CPU_FEATURE_SHA3 },
+	{ "hw.optional.arm.FEAT_SHA3",	  ARM_CPU_FEATURE_SHA3 },
+	{ "hw.optional.arm.FEAT_DotProd", ARM_CPU_FEATURE_DOTPROD },
 };
 
 static u32 query_arm_cpu_features(void)
@@ -173,6 +176,7 @@ static const struct cpu_feature arm_cpu_feature_table[] = {
 	{ARM_CPU_FEATURE_PMULL,		"pmull"},
 	{ARM_CPU_FEATURE_CRC32,		"crc32"},
 	{ARM_CPU_FEATURE_SHA3,		"sha3"},
+	{ARM_CPU_FEATURE_DOTPROD,	"dotprod"},
 };
 
 volatile u32 libdeflate_arm_cpu_features = 0;
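
The Linux branch above tests HWCAP bit 20 (HWCAP_ASIMDDP), i.e. the Advanced SIMD dot-product instructions used by the new adler32_neon_dotprod() code; the macOS branch does the same via the hw.optional.arm.FEAT_DotProd sysctl. A standalone sketch of the same runtime check on aarch64 Linux, using getauxval() with the bit written out literally as in the hunk:

    /* Probe the dotprod HWCAP bit on aarch64 Linux, as in the hunk above. */
    #include <stdio.h>
    #include <sys/auxv.h>   /* getauxval(), AT_HWCAP */

    int main(void)
    {
        unsigned long hwcap = getauxval(AT_HWCAP);

        if (hwcap & (1UL << 20))    /* HWCAP_ASIMDDP */
            printf("dotprod (UDOT/SDOT) supported\n");
        else
            printf("dotprod not supported\n");
        return 0;
    }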


=====================================
lib/arm/cpu_features.h
=====================================
@@ -46,11 +46,13 @@
 #define ARM_CPU_FEATURE_PMULL		0x00000002
 #define ARM_CPU_FEATURE_CRC32		0x00000004
 #define ARM_CPU_FEATURE_SHA3		0x00000008
+#define ARM_CPU_FEATURE_DOTPROD		0x00000010
 
-#define HAVE_NEON(features)	(HAVE_NEON_NATIVE  || ((features) & ARM_CPU_FEATURE_NEON))
-#define HAVE_PMULL(features)	(HAVE_PMULL_NATIVE || ((features) & ARM_CPU_FEATURE_PMULL))
-#define HAVE_CRC32(features)	(HAVE_CRC32_NATIVE || ((features) & ARM_CPU_FEATURE_CRC32))
-#define HAVE_SHA3(features)	(HAVE_SHA3_NATIVE  || ((features) & ARM_CPU_FEATURE_SHA3))
+#define HAVE_NEON(features)	(HAVE_NEON_NATIVE    || ((features) & ARM_CPU_FEATURE_NEON))
+#define HAVE_PMULL(features)	(HAVE_PMULL_NATIVE   || ((features) & ARM_CPU_FEATURE_PMULL))
+#define HAVE_CRC32(features)	(HAVE_CRC32_NATIVE   || ((features) & ARM_CPU_FEATURE_CRC32))
+#define HAVE_SHA3(features)	(HAVE_SHA3_NATIVE    || ((features) & ARM_CPU_FEATURE_SHA3))
+#define HAVE_DOTPROD(features)	(HAVE_DOTPROD_NATIVE || ((features) & ARM_CPU_FEATURE_DOTPROD))
 
 #if HAVE_DYNAMIC_ARM_CPU_FEATURES
 #define ARM_CPU_FEATURES_KNOWN		0x80000000
@@ -156,6 +158,24 @@ static inline u32 get_arm_cpu_features(void) { return 0; }
 #  define HAVE_SHA3_INTRIN	0
 #endif
 
+/* dotprod */
+#ifdef __aarch64__
+#  ifdef __ARM_FEATURE_DOTPROD
+#    define HAVE_DOTPROD_NATIVE	1
+#  else
+#    define HAVE_DOTPROD_NATIVE	0
+#  endif
+#  define HAVE_DOTPROD_TARGET \
+	(HAVE_DYNAMIC_ARM_CPU_FEATURES && \
+	 (GCC_PREREQ(8, 1) || __has_builtin(__builtin_neon_vdotq_v)))
+#  define HAVE_DOTPROD_INTRIN \
+	(HAVE_NEON_INTRIN && (HAVE_DOTPROD_NATIVE || HAVE_DOTPROD_TARGET))
+#else
+#  define HAVE_DOTPROD_NATIVE	0
+#  define HAVE_DOTPROD_TARGET	0
+#  define HAVE_DOTPROD_INTRIN	0
+#endif
+
 /*
  * Work around bugs in arm_acle.h and arm_neon.h where sometimes intrinsics are
  * only defined when the corresponding __ARM_FEATURE_* macro is defined.  The
@@ -170,6 +190,9 @@ static inline u32 get_arm_cpu_features(void) { return 0; }
 #if HAVE_SHA3_INTRIN && !HAVE_SHA3_NATIVE && defined(__clang__)
 #  define __ARM_FEATURE_SHA3	1
 #endif
+#if HAVE_DOTPROD_INTRIN && !HAVE_DOTPROD_NATIVE && defined(__clang__)
+#  define __ARM_FEATURE_DOTPROD	1
+#endif
 #if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && \
 	(defined(__clang__) || defined(__arm__))
 #  include <arm_acle.h>
@@ -179,6 +202,10 @@ static inline u32 get_arm_cpu_features(void) { return 0; }
 #  include <arm_neon.h>
 #  undef __ARM_FEATURE_SHA3
 #endif
+#if HAVE_DOTPROD_INTRIN && !HAVE_DOTPROD_NATIVE && defined(__clang__)
+#  include <arm_neon.h>
+#  undef __ARM_FEATURE_DOTPROD
+#endif
 
 #endif /* __arm__ || __aarch64__ */
 


=====================================
lib/arm/matchfinder_impl.h
=====================================
@@ -28,7 +28,9 @@
 #ifndef LIB_ARM_MATCHFINDER_IMPL_H
 #define LIB_ARM_MATCHFINDER_IMPL_H
 
-#ifdef __ARM_NEON
+#include "cpu_features.h"
+
+#if HAVE_NEON_NATIVE
 #  include <arm_neon.h>
 static forceinline void
 matchfinder_init_neon(mf_pos_t *data, size_t size)
@@ -81,6 +83,6 @@ matchfinder_rebase_neon(mf_pos_t *data, size_t size)
 }
 #define matchfinder_rebase matchfinder_rebase_neon
 
-#endif /* __ARM_NEON */
+#endif /* HAVE_NEON_NATIVE */
 
 #endif /* LIB_ARM_MATCHFINDER_IMPL_H */


=====================================
lib/decompress_template.h
=====================================
@@ -31,7 +31,17 @@
  * target instruction sets.
  */
 
-static enum libdeflate_result ATTRIBUTES
+#ifndef ATTRIBUTES
+#  define ATTRIBUTES
+#endif
+#ifndef EXTRACT_VARBITS
+#  define EXTRACT_VARBITS(word, count)	((word) & BITMASK(count))
+#endif
+#ifndef EXTRACT_VARBITS8
+#  define EXTRACT_VARBITS8(word, count)	((word) & BITMASK((u8)(count)))
+#endif
+
+static enum libdeflate_result ATTRIBUTES MAYBE_UNUSED
 FUNCNAME(struct libdeflate_decompressor * restrict d,
 	 const void * restrict in, size_t in_nbytes,
 	 void * restrict out, size_t out_nbytes_avail,
@@ -39,102 +49,141 @@ FUNCNAME(struct libdeflate_decompressor * restrict d,
 {
 	u8 *out_next = out;
 	u8 * const out_end = out_next + out_nbytes_avail;
+	u8 * const out_fastloop_end =
+		out_end - MIN(out_nbytes_avail, FASTLOOP_MAX_BYTES_WRITTEN);
+
+	/* Input bitstream state; see deflate_decompress.c for documentation */
 	const u8 *in_next = in;
 	const u8 * const in_end = in_next + in_nbytes;
+	const u8 * const in_fastloop_end =
+		in_end - MIN(in_nbytes, FASTLOOP_MAX_BYTES_READ);
 	bitbuf_t bitbuf = 0;
-	unsigned bitsleft = 0;
+	bitbuf_t saved_bitbuf;
+	u32 bitsleft = 0;
 	size_t overread_count = 0;
-	unsigned i;
-	unsigned is_final_block;
+
+	bool is_final_block;
 	unsigned block_type;
-	u16 len;
-	u16 nlen;
 	unsigned num_litlen_syms;
 	unsigned num_offset_syms;
-	u16 tmp16;
-	u32 tmp32;
+	bitbuf_t litlen_tablemask;
+	u32 entry;
 
 next_block:
-	/* Starting to read the next block.  */
+	/* Starting to read the next block */
 	;
 
-	STATIC_ASSERT(CAN_ENSURE(1 + 2 + 5 + 5 + 4));
-	ENSURE_BITS(1 + 2 + 5 + 5 + 4);
+	STATIC_ASSERT(CAN_CONSUME(1 + 2 + 5 + 5 + 4 + 3));
+	REFILL_BITS();
 
-	/* BFINAL: 1 bit  */
-	is_final_block = POP_BITS(1);
+	/* BFINAL: 1 bit */
+	is_final_block = bitbuf & BITMASK(1);
 
-	/* BTYPE: 2 bits  */
-	block_type = POP_BITS(2);
+	/* BTYPE: 2 bits */
+	block_type = (bitbuf >> 1) & BITMASK(2);
 
 	if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) {
 
-		/* Dynamic Huffman block.  */
+		/* Dynamic Huffman block */
 
-		/* The order in which precode lengths are stored.  */
+		/* The order in which precode lengths are stored */
 		static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
 			16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
 		};
 
 		unsigned num_explicit_precode_lens;
+		unsigned i;
 
-		/* Read the codeword length counts.  */
+		/* Read the codeword length counts. */
 
-		STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == ((1 << 5) - 1) + 257);
-		num_litlen_syms = POP_BITS(5) + 257;
+		STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 257 + BITMASK(5));
+		num_litlen_syms = 257 + ((bitbuf >> 3) & BITMASK(5));
 
-		STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == ((1 << 5) - 1) + 1);
-		num_offset_syms = POP_BITS(5) + 1;
+		STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 1 + BITMASK(5));
+		num_offset_syms = 1 + ((bitbuf >> 8) & BITMASK(5));
 
-		STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == ((1 << 4) - 1) + 4);
-		num_explicit_precode_lens = POP_BITS(4) + 4;
+		STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == 4 + BITMASK(4));
+		num_explicit_precode_lens = 4 + ((bitbuf >> 13) & BITMASK(4));
 
 		d->static_codes_loaded = false;
 
-		/* Read the precode codeword lengths.  */
+		/*
+		 * Read the precode codeword lengths.
+		 *
+		 * A 64-bit bitbuffer is just one bit too small to hold the
+		 * maximum number of precode lens, so to minimize branches we
+		 * merge one len with the previous fields.
+		 */
 		STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1);
-		for (i = 0; i < num_explicit_precode_lens; i++) {
-			ENSURE_BITS(3);
-			d->u.precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3);
+		if (CAN_CONSUME(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) {
+			d->u.precode_lens[deflate_precode_lens_permutation[0]] =
+				(bitbuf >> 17) & BITMASK(3);
+			bitbuf >>= 20;
+			bitsleft -= 20;
+			REFILL_BITS();
+			i = 1;
+			do {
+				d->u.precode_lens[deflate_precode_lens_permutation[i]] =
+					bitbuf & BITMASK(3);
+				bitbuf >>= 3;
+				bitsleft -= 3;
+			} while (++i < num_explicit_precode_lens);
+		} else {
+			bitbuf >>= 17;
+			bitsleft -= 17;
+			i = 0;
+			do {
+				if ((u8)bitsleft < 3)
+					REFILL_BITS();
+				d->u.precode_lens[deflate_precode_lens_permutation[i]] =
+					bitbuf & BITMASK(3);
+				bitbuf >>= 3;
+				bitsleft -= 3;
+			} while (++i < num_explicit_precode_lens);
 		}
-
 		for (; i < DEFLATE_NUM_PRECODE_SYMS; i++)
 			d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0;
 
-		/* Build the decode table for the precode.  */
+		/* Build the decode table for the precode. */
 		SAFETY_CHECK(build_precode_decode_table(d));
 
-		/* Expand the literal/length and offset codeword lengths.  */
-		for (i = 0; i < num_litlen_syms + num_offset_syms; ) {
-			u32 entry;
+		/* Decode the litlen and offset codeword lengths. */
+		i = 0;
+		do {
 			unsigned presym;
 			u8 rep_val;
 			unsigned rep_count;
 
-			ENSURE_BITS(DEFLATE_MAX_PRE_CODEWORD_LEN + 7);
+			if ((u8)bitsleft < DEFLATE_MAX_PRE_CODEWORD_LEN + 7)
+				REFILL_BITS();
 
-			/* (The code below assumes that the precode decode table
-			 * does not have any subtables.)  */
+			/*
+			 * The code below assumes that the precode decode table
+			 * doesn't have any subtables.
+			 */
 			STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);
 
-			/* Read the next precode symbol.  */
-			entry = d->u.l.precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)];
-			REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
-			presym = entry >> HUFFDEC_RESULT_SHIFT;
+			/* Decode the next precode symbol. */
+			entry = d->u.l.precode_decode_table[
+				bitbuf & BITMASK(DEFLATE_MAX_PRE_CODEWORD_LEN)];
+			bitbuf >>= (u8)entry;
+			bitsleft -= entry; /* optimization: subtract full entry */
+			presym = entry >> 16;
 
 			if (presym < 16) {
-				/* Explicit codeword length  */
+				/* Explicit codeword length */
 				d->u.l.lens[i++] = presym;
 				continue;
 			}
 
-			/* Run-length encoded codeword lengths  */
+			/* Run-length encoded codeword lengths */
 
-			/* Note: we don't need verify that the repeat count
-			 * doesn't overflow the number of elements, since we
-			 * have enough extra spaces to allow for the worst-case
-			 * overflow (138 zeroes when only 1 length was
-			 * remaining).
+			/*
+			 * Note: we don't need verify that the repeat count
+			 * doesn't overflow the number of elements, since we've
+			 * sized the lens array to have enough extra space to
+			 * allow for the worst-case overrun (138 zeroes when
+			 * only 1 length was remaining).
 			 *
 			 * In the case of the small repeat counts (presyms 16
 			 * and 17), it is fastest to always write the maximum
@@ -149,11 +198,13 @@ next_block:
 			STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1);
 
 			if (presym == 16) {
-				/* Repeat the previous length 3 - 6 times  */
+				/* Repeat the previous length 3 - 6 times. */
 				SAFETY_CHECK(i != 0);
 				rep_val = d->u.l.lens[i - 1];
-				STATIC_ASSERT(3 + ((1 << 2) - 1) == 6);
-				rep_count = 3 + POP_BITS(2);
+				STATIC_ASSERT(3 + BITMASK(2) == 6);
+				rep_count = 3 + (bitbuf & BITMASK(2));
+				bitbuf >>= 2;
+				bitsleft -= 2;
 				d->u.l.lens[i + 0] = rep_val;
 				d->u.l.lens[i + 1] = rep_val;
 				d->u.l.lens[i + 2] = rep_val;
@@ -162,9 +213,11 @@ next_block:
 				d->u.l.lens[i + 5] = rep_val;
 				i += rep_count;
 			} else if (presym == 17) {
-				/* Repeat zero 3 - 10 times  */
-				STATIC_ASSERT(3 + ((1 << 3) - 1) == 10);
-				rep_count = 3 + POP_BITS(3);
+				/* Repeat zero 3 - 10 times. */
+				STATIC_ASSERT(3 + BITMASK(3) == 10);
+				rep_count = 3 + (bitbuf & BITMASK(3));
+				bitbuf >>= 3;
+				bitsleft -= 3;
 				d->u.l.lens[i + 0] = 0;
 				d->u.l.lens[i + 1] = 0;
 				d->u.l.lens[i + 2] = 0;
@@ -177,25 +230,45 @@ next_block:
 				d->u.l.lens[i + 9] = 0;
 				i += rep_count;
 			} else {
-				/* Repeat zero 11 - 138 times  */
-				STATIC_ASSERT(11 + ((1 << 7) - 1) == 138);
-				rep_count = 11 + POP_BITS(7);
+				/* Repeat zero 11 - 138 times. */
+				STATIC_ASSERT(11 + BITMASK(7) == 138);
+				rep_count = 11 + (bitbuf & BITMASK(7));
+				bitbuf >>= 7;
+				bitsleft -= 7;
 				memset(&d->u.l.lens[i], 0,
 				       rep_count * sizeof(d->u.l.lens[i]));
 				i += rep_count;
 			}
-		}
+		} while (i < num_litlen_syms + num_offset_syms);
+
 	} else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
+		u16 len, nlen;
 
-		/* Uncompressed block: copy 'len' bytes literally from the input
-		 * buffer to the output buffer.  */
+		/*
+		 * Uncompressed block: copy 'len' bytes literally from the input
+		 * buffer to the output buffer.
+		 */
 
-		ALIGN_INPUT();
+		bitsleft -= 3; /* for BTYPE and BFINAL */
 
-		SAFETY_CHECK(in_end - in_next >= 4);
+		/*
+		 * Align the bitstream to the next byte boundary.  This means
+		 * the next byte boundary as if we were reading a byte at a
+		 * time.  Therefore, we have to rewind 'in_next' by any bytes
+		 * that have been refilled but not actually consumed yet (not
+		 * counting overread bytes, which don't increment 'in_next').
+		 */
+		bitsleft = (u8)bitsleft;
+		SAFETY_CHECK(overread_count <= (bitsleft >> 3));
+		in_next -= (bitsleft >> 3) - overread_count;
+		overread_count = 0;
+		bitbuf = 0;
+		bitsleft = 0;
 
-		len = READ_U16();
-		nlen = READ_U16();
+		SAFETY_CHECK(in_end - in_next >= 4);
+		len = get_unaligned_le16(in_next);
+		nlen = get_unaligned_le16(in_next + 2);
+		in_next += 4;
 
 		SAFETY_CHECK(len == (u16)~nlen);
 		if (unlikely(len > out_end - out_next))
@@ -209,6 +282,8 @@ next_block:
 		goto block_done;
 
 	} else {
+		unsigned i;
+
 		SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);
 
 		/*
@@ -221,6 +296,9 @@ next_block:
 		 * dynamic Huffman block.
 		 */
 
+		bitbuf >>= 3; /* for BTYPE and BFINAL */
+		bitsleft -= 3;
+
 		if (d->static_codes_loaded)
 			goto have_decode_tables;
 
@@ -245,169 +323,442 @@ next_block:
 		num_offset_syms = 32;
 	}
 
-	/* Decompressing a Huffman block (either dynamic or static)  */
+	/* Decompressing a Huffman block (either dynamic or static) */
 
 	SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
 	SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));
 have_decode_tables:
-
-	/* The main DEFLATE decode loop  */
-	for (;;) {
-		u32 entry;
-		u32 length;
-		u32 offset;
+	litlen_tablemask = BITMASK(d->litlen_tablebits);
+
+	/*
+	 * This is the "fastloop" for decoding literals and matches.  It does
+	 * bounds checks on in_next and out_next in the loop conditions so that
+	 * additional bounds checks aren't needed inside the loop body.
+	 *
+	 * To reduce latency, the bitbuffer is refilled and the next litlen
+	 * decode table entry is preloaded before each loop iteration.
+	 */
+	if (in_next >= in_fastloop_end || out_next >= out_fastloop_end)
+		goto generic_loop;
+	REFILL_BITS_IN_FASTLOOP();
+	entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+	do {
+		u32 length, offset, lit;
 		const u8 *src;
 		u8 *dst;
 
-		/* Decode a litlen symbol.  */
-		ENSURE_BITS(DEFLATE_MAX_LITLEN_CODEWORD_LEN);
-		entry = d->u.litlen_decode_table[BITS(LITLEN_TABLEBITS)];
-		if (entry & HUFFDEC_SUBTABLE_POINTER) {
-			/* Litlen subtable required (uncommon case)  */
-			REMOVE_BITS(LITLEN_TABLEBITS);
-			entry = d->u.litlen_decode_table[
-				((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
-				BITS(entry & HUFFDEC_LENGTH_MASK)];
-		}
-		REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
+		/*
+		 * Consume the bits for the litlen decode table entry.  Save the
+		 * original bitbuf for later, in case the extra match length
+		 * bits need to be extracted from it.
+		 */
+		saved_bitbuf = bitbuf;
+		bitbuf >>= (u8)entry;
+		bitsleft -= entry; /* optimization: subtract full entry */
+
+		/*
+		 * Begin by checking for a "fast" literal, i.e. a literal that
+		 * doesn't need a subtable.
+		 */
 		if (entry & HUFFDEC_LITERAL) {
-			/* Literal  */
-			if (unlikely(out_next == out_end))
-				return LIBDEFLATE_INSUFFICIENT_SPACE;
-			*out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT);
-			continue;
+			/*
+			 * On 64-bit platforms, we decode up to 2 extra fast
+			 * literals in addition to the primary item, as this
+			 * increases performance and still leaves enough bits
+			 * remaining for what follows.  We could actually do 3,
+			 * assuming LITLEN_TABLEBITS=11, but that actually
+			 * decreases performance slightly (perhaps by messing
+			 * with the branch prediction of the conditional refill
+			 * that happens later while decoding the match offset).
+			 *
+			 * Note: the definitions of FASTLOOP_MAX_BYTES_WRITTEN
+			 * and FASTLOOP_MAX_BYTES_READ need to be updated if the
+			 * number of extra literals decoded here is changed.
+			 */
+			if (/* enough bits for 2 fast literals + length + offset preload? */
+			    CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS +
+							 LENGTH_MAXBITS,
+							 OFFSET_TABLEBITS) &&
+			    /* enough bits for 2 fast literals + slow literal + litlen preload? */
+			    CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS +
+							 DEFLATE_MAX_LITLEN_CODEWORD_LEN,
+							 LITLEN_TABLEBITS)) {
+				/* 1st extra fast literal */
+				lit = entry >> 16;
+				entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+				saved_bitbuf = bitbuf;
+				bitbuf >>= (u8)entry;
+				bitsleft -= entry;
+				*out_next++ = lit;
+				if (entry & HUFFDEC_LITERAL) {
+					/* 2nd extra fast literal */
+					lit = entry >> 16;
+					entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+					saved_bitbuf = bitbuf;
+					bitbuf >>= (u8)entry;
+					bitsleft -= entry;
+					*out_next++ = lit;
+					if (entry & HUFFDEC_LITERAL) {
+						/*
+						 * Another fast literal, but
+						 * this one is in lieu of the
+						 * primary item, so it doesn't
+						 * count as one of the extras.
+						 */
+						lit = entry >> 16;
+						entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+						REFILL_BITS_IN_FASTLOOP();
+						*out_next++ = lit;
+						continue;
+					}
+				}
+			} else {
+				/*
+				 * Decode a literal.  While doing so, preload
+				 * the next litlen decode table entry and refill
+				 * the bitbuffer.  To reduce latency, we've
+				 * arranged for there to be enough "preloadable"
+				 * bits remaining to do the table preload
+				 * independently of the refill.
+				 */
+				STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(
+						LITLEN_TABLEBITS, LITLEN_TABLEBITS));
+				lit = entry >> 16;
+				entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+				REFILL_BITS_IN_FASTLOOP();
+				*out_next++ = lit;
+				continue;
+			}
 		}
 
-		/* Match or end-of-block  */
-
-		entry >>= HUFFDEC_RESULT_SHIFT;
-		ENSURE_BITS(MAX_ENSURE);
+		/*
+		 * It's not a literal entry, so it can be a length entry, a
+		 * subtable pointer entry, or an end-of-block entry.  Detect the
+		 * two unlikely cases by testing the HUFFDEC_EXCEPTIONAL flag.
+		 */
+		if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
+			/* Subtable pointer or end-of-block entry */
 
-		/* Pop the extra length bits and add them to the length base to
-		 * produce the full length.  */
-		length = (entry >> HUFFDEC_LENGTH_BASE_SHIFT) +
-			 POP_BITS(entry & HUFFDEC_EXTRA_LENGTH_BITS_MASK);
+			if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
+				goto block_done;
 
-		/* The match destination must not end after the end of the
-		 * output buffer.  For efficiency, combine this check with the
-		 * end-of-block check.  We're using 0 for the special
-		 * end-of-block length, so subtract 1 and it turn it into
-		 * SIZE_MAX.  */
-		STATIC_ASSERT(HUFFDEC_END_OF_BLOCK_LENGTH == 0);
-		if (unlikely((size_t)length - 1 >= out_end - out_next)) {
-			if (unlikely(length != HUFFDEC_END_OF_BLOCK_LENGTH))
-				return LIBDEFLATE_INSUFFICIENT_SPACE;
-			goto block_done;
+			/*
+			 * A subtable is required.  Load and consume the
+			 * subtable entry.  The subtable entry can be of any
+			 * type: literal, length, or end-of-block.
+			 */
+			entry = d->u.litlen_decode_table[(entry >> 16) +
+				EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+			saved_bitbuf = bitbuf;
+			bitbuf >>= (u8)entry;
+			bitsleft -= entry;
+
+			/*
+			 * 32-bit platforms that use the byte-at-a-time refill
+			 * method have to do a refill here for there to always
+			 * be enough bits to decode a literal that requires a
+			 * subtable, then preload the next litlen decode table
+			 * entry; or to decode a match length that requires a
+			 * subtable, then preload the offset decode table entry.
+			 */
+			if (!CAN_CONSUME_AND_THEN_PRELOAD(DEFLATE_MAX_LITLEN_CODEWORD_LEN,
+							  LITLEN_TABLEBITS) ||
+			    !CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXBITS,
+							  OFFSET_TABLEBITS))
+				REFILL_BITS_IN_FASTLOOP();
+			if (entry & HUFFDEC_LITERAL) {
+				/* Decode a literal that required a subtable. */
+				lit = entry >> 16;
+				entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+				REFILL_BITS_IN_FASTLOOP();
+				*out_next++ = lit;
+				continue;
+			}
+			if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
+				goto block_done;
+			/* Else, it's a length that required a subtable. */
 		}
 
-		/* Decode the match offset.  */
-
-		entry = d->offset_decode_table[BITS(OFFSET_TABLEBITS)];
-		if (entry & HUFFDEC_SUBTABLE_POINTER) {
-			/* Offset subtable required (uncommon case)  */
-			REMOVE_BITS(OFFSET_TABLEBITS);
-			entry = d->offset_decode_table[
-				((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
-				BITS(entry & HUFFDEC_LENGTH_MASK)];
-		}
-		REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
-		entry >>= HUFFDEC_RESULT_SHIFT;
-
-		STATIC_ASSERT(CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
-					 DEFLATE_MAX_OFFSET_CODEWORD_LEN) &&
-			      CAN_ENSURE(DEFLATE_MAX_EXTRA_OFFSET_BITS));
-		if (!CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
-				DEFLATE_MAX_OFFSET_CODEWORD_LEN +
-				DEFLATE_MAX_EXTRA_OFFSET_BITS))
-			ENSURE_BITS(DEFLATE_MAX_EXTRA_OFFSET_BITS);
-
-		/* Pop the extra offset bits and add them to the offset base to
-		 * produce the full offset.  */
-		offset = (entry & HUFFDEC_OFFSET_BASE_MASK) +
-			 POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT);
-
-		/* The match source must not begin before the beginning of the
-		 * output buffer.  */
-		SAFETY_CHECK(offset <= out_next - (const u8 *)out);
+		/*
+		 * Decode the match length: the length base value associated
+		 * with the litlen symbol (which we extract from the decode
+		 * table entry), plus the extra length bits.  We don't need to
+		 * consume the extra length bits here, as they were included in
+		 * the bits consumed by the entry earlier.  We also don't need
+		 * to check for too-long matches here, as this is inside the
+		 * fastloop where it's already been verified that the output
+		 * buffer has enough space remaining to copy a max-length match.
+		 */
+		length = entry >> 16;
+		length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
 
 		/*
-		 * Copy the match: 'length' bytes at 'out_next - offset' to
-		 * 'out_next', possibly overlapping.  If the match doesn't end
-		 * too close to the end of the buffer and offset >= WORDBYTES ||
-		 * offset == 1, take a fast path which copies a word at a time
-		 * -- potentially more than the length of the match, but that's
-		 * fine as long as we check for enough extra space.
-		 *
-		 * The remaining cases are not performance-critical so are
-		 * handled by a simple byte-by-byte copy.
+		 * Decode the match offset.  There are enough "preloadable" bits
+		 * remaining to preload the offset decode table entry, but a
+		 * refill might be needed before consuming it.
 		 */
+		STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXFASTBITS,
+							   OFFSET_TABLEBITS));
+		entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)];
+		if (CAN_CONSUME_AND_THEN_PRELOAD(OFFSET_MAXBITS,
+						 LITLEN_TABLEBITS)) {
+			/*
+			 * Decoding a match offset on a 64-bit platform.  We may
+			 * need to refill once, but then we can decode the whole
+			 * offset and preload the next litlen table entry.
+			 */
+			if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
+				/* Offset codeword requires a subtable */
+				if (unlikely((u8)bitsleft < OFFSET_MAXBITS +
+					     LITLEN_TABLEBITS - PRELOAD_SLACK))
+					REFILL_BITS_IN_FASTLOOP();
+				bitbuf >>= OFFSET_TABLEBITS;
+				bitsleft -= OFFSET_TABLEBITS;
+				entry = d->offset_decode_table[(entry >> 16) +
+					EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+			} else if (unlikely((u8)bitsleft < OFFSET_MAXFASTBITS +
+					    LITLEN_TABLEBITS - PRELOAD_SLACK))
+				REFILL_BITS_IN_FASTLOOP();
+		} else {
+			/* Decoding a match offset on a 32-bit platform */
+			REFILL_BITS_IN_FASTLOOP();
+			if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
+				/* Offset codeword requires a subtable */
+				bitbuf >>= OFFSET_TABLEBITS;
+				bitsleft -= OFFSET_TABLEBITS;
+				entry = d->offset_decode_table[(entry >> 16) +
+					EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+				REFILL_BITS_IN_FASTLOOP();
+				/* No further refill needed before extra bits */
+				STATIC_ASSERT(CAN_CONSUME(
+					OFFSET_MAXBITS - OFFSET_TABLEBITS));
+			} else {
+				/* No refill needed before extra bits */
+				STATIC_ASSERT(CAN_CONSUME(OFFSET_MAXFASTBITS));
+			}
+		}
+		saved_bitbuf = bitbuf;
+		bitbuf >>= (u8)entry;
+		bitsleft -= entry; /* optimization: subtract full entry */
+		offset = entry >> 16;
+		offset += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
 
+		/* Validate the match offset; needed even in the fastloop. */
+		SAFETY_CHECK(offset <= out_next - (const u8 *)out);
 		src = out_next - offset;
 		dst = out_next;
 		out_next += length;
 
-		if (UNALIGNED_ACCESS_IS_FAST &&
-		    /* max overrun is writing 3 words for a min length match */
-		    likely(out_end - out_next >=
-			   3 * WORDBYTES - DEFLATE_MIN_MATCH_LEN)) {
-			if (offset >= WORDBYTES) { /* words don't overlap? */
-				copy_word_unaligned(src, dst);
+		/*
+		 * Before starting to issue the instructions to copy the match,
+		 * refill the bitbuffer and preload the litlen decode table
+		 * entry for the next loop iteration.  This can increase
+		 * performance by allowing the latency of the match copy to
+		 * overlap with these other operations.  To further reduce
+		 * latency, we've arranged for there to be enough bits remaining
+		 * to do the table preload independently of the refill, except
+		 * on 32-bit platforms using the byte-at-a-time refill method.
+		 */
+		if (!CAN_CONSUME_AND_THEN_PRELOAD(
+			MAX(OFFSET_MAXBITS - OFFSET_TABLEBITS,
+			    OFFSET_MAXFASTBITS),
+			LITLEN_TABLEBITS) &&
+		    unlikely((u8)bitsleft < LITLEN_TABLEBITS - PRELOAD_SLACK))
+			REFILL_BITS_IN_FASTLOOP();
+		entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+		REFILL_BITS_IN_FASTLOOP();
+
+		/*
+		 * Copy the match.  On most CPUs the fastest method is a
+		 * word-at-a-time copy, unconditionally copying about 5 words
+		 * since this is enough for most matches without being too much.
+		 *
+		 * The normal word-at-a-time copy works for offset >= WORDBYTES,
+		 * which is most cases.  The case of offset == 1 is also common
+		 * and is worth optimizing for, since it is just RLE encoding of
+		 * the previous byte, which is the result of compressing long
+		 * runs of the same byte.
+		 *
+		 * Writing past the match 'length' is allowed here, since it's
+		 * been ensured there is enough output space left for a slight
+		 * overrun.  FASTLOOP_MAX_BYTES_WRITTEN needs to be updated if
+		 * the maximum possible overrun here is changed.
+		 */
+		if (UNALIGNED_ACCESS_IS_FAST && offset >= WORDBYTES) {
+			store_word_unaligned(load_word_unaligned(src), dst);
+			src += WORDBYTES;
+			dst += WORDBYTES;
+			store_word_unaligned(load_word_unaligned(src), dst);
+			src += WORDBYTES;
+			dst += WORDBYTES;
+			store_word_unaligned(load_word_unaligned(src), dst);
+			src += WORDBYTES;
+			dst += WORDBYTES;
+			store_word_unaligned(load_word_unaligned(src), dst);
+			src += WORDBYTES;
+			dst += WORDBYTES;
+			store_word_unaligned(load_word_unaligned(src), dst);
+			src += WORDBYTES;
+			dst += WORDBYTES;
+			while (dst < out_next) {
+				store_word_unaligned(load_word_unaligned(src), dst);
+				src += WORDBYTES;
+				dst += WORDBYTES;
+				store_word_unaligned(load_word_unaligned(src), dst);
 				src += WORDBYTES;
 				dst += WORDBYTES;
-				copy_word_unaligned(src, dst);
+				store_word_unaligned(load_word_unaligned(src), dst);
 				src += WORDBYTES;
 				dst += WORDBYTES;
-				do {
-					copy_word_unaligned(src, dst);
-					src += WORDBYTES;
-					dst += WORDBYTES;
-				} while (dst < out_next);
-			} else if (offset == 1) {
-				/* RLE encoding of previous byte, common if the
-				 * data contains many repeated bytes */
-				machine_word_t v = repeat_byte(*src);
+				store_word_unaligned(load_word_unaligned(src), dst);
+				src += WORDBYTES;
+				dst += WORDBYTES;
+				store_word_unaligned(load_word_unaligned(src), dst);
+				src += WORDBYTES;
+				dst += WORDBYTES;
+			}
+		} else if (UNALIGNED_ACCESS_IS_FAST && offset == 1) {
+			machine_word_t v;
 
+			/*
+			 * This part tends to get auto-vectorized, so keep it
+			 * copying a multiple of 16 bytes at a time.
+			 */
+			v = (machine_word_t)0x0101010101010101 * src[0];
+			store_word_unaligned(v, dst);
+			dst += WORDBYTES;
+			store_word_unaligned(v, dst);
+			dst += WORDBYTES;
+			store_word_unaligned(v, dst);
+			dst += WORDBYTES;
+			store_word_unaligned(v, dst);
+			dst += WORDBYTES;
+			while (dst < out_next) {
+				store_word_unaligned(v, dst);
+				dst += WORDBYTES;
+				store_word_unaligned(v, dst);
+				dst += WORDBYTES;
 				store_word_unaligned(v, dst);
 				dst += WORDBYTES;
 				store_word_unaligned(v, dst);
 				dst += WORDBYTES;
-				do {
-					store_word_unaligned(v, dst);
-					dst += WORDBYTES;
-				} while (dst < out_next);
-			} else {
-				*dst++ = *src++;
-				*dst++ = *src++;
-				do {
-					*dst++ = *src++;
-				} while (dst < out_next);
 			}
+		} else if (UNALIGNED_ACCESS_IS_FAST) {
+			store_word_unaligned(load_word_unaligned(src), dst);
+			src += offset;
+			dst += offset;
+			store_word_unaligned(load_word_unaligned(src), dst);
+			src += offset;
+			dst += offset;
+			do {
+				store_word_unaligned(load_word_unaligned(src), dst);
+				src += offset;
+				dst += offset;
+				store_word_unaligned(load_word_unaligned(src), dst);
+				src += offset;
+				dst += offset;
+			} while (dst < out_next);
 		} else {
-			STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3);
 			*dst++ = *src++;
 			*dst++ = *src++;
 			do {
 				*dst++ = *src++;
 			} while (dst < out_next);
 		}
+	} while (in_next < in_fastloop_end && out_next < out_fastloop_end);
+
+	/*
+	 * This is the generic loop for decoding literals and matches.  This
+	 * handles cases where in_next and out_next are close to the end of
+	 * their respective buffers.  Usually this loop isn't performance-
+	 * critical, as most time is spent in the fastloop above instead.  We
+	 * therefore omit some optimizations here in favor of smaller code.
+	 */
+generic_loop:
+	for (;;) {
+		u32 length, offset;
+		const u8 *src;
+		u8 *dst;
+
+		REFILL_BITS();
+		entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+		saved_bitbuf = bitbuf;
+		bitbuf >>= (u8)entry;
+		bitsleft -= entry;
+		if (unlikely(entry & HUFFDEC_SUBTABLE_POINTER)) {
+			entry = d->u.litlen_decode_table[(entry >> 16) +
+					EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+			saved_bitbuf = bitbuf;
+			bitbuf >>= (u8)entry;
+			bitsleft -= entry;
+		}
+		length = entry >> 16;
+		if (entry & HUFFDEC_LITERAL) {
+			if (unlikely(out_next == out_end))
+				return LIBDEFLATE_INSUFFICIENT_SPACE;
+			*out_next++ = length;
+			continue;
+		}
+		if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
+			goto block_done;
+		length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
+		if (unlikely(length > out_end - out_next))
+			return LIBDEFLATE_INSUFFICIENT_SPACE;
+
+		if (!CAN_CONSUME(LENGTH_MAXBITS + OFFSET_MAXBITS))
+			REFILL_BITS();
+		entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)];
+		if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
+			bitbuf >>= OFFSET_TABLEBITS;
+			bitsleft -= OFFSET_TABLEBITS;
+			entry = d->offset_decode_table[(entry >> 16) +
+					EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+			if (!CAN_CONSUME(OFFSET_MAXBITS))
+				REFILL_BITS();
+		}
+		offset = entry >> 16;
+		offset += EXTRACT_VARBITS8(bitbuf, entry) >> (u8)(entry >> 8);
+		bitbuf >>= (u8)entry;
+		bitsleft -= entry;
+
+		SAFETY_CHECK(offset <= out_next - (const u8 *)out);
+		src = out_next - offset;
+		dst = out_next;
+		out_next += length;
+
+		STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3);
+		*dst++ = *src++;
+		*dst++ = *src++;
+		do {
+			*dst++ = *src++;
+		} while (dst < out_next);
 	}
 
 block_done:
-	/* Finished decoding a block.  */
+	/* Finished decoding a block */
 
 	if (!is_final_block)
 		goto next_block;
 
-	/* That was the last block.  */
+	/* That was the last block. */
 
-	/* Discard any readahead bits and check for excessive overread */
-	ALIGN_INPUT();
+	bitsleft = (u8)bitsleft;
+
+	/*
+	 * If any of the implicit appended zero bytes were consumed (not just
+	 * refilled) before hitting end of stream, then the data is bad.
+	 */
+	SAFETY_CHECK(overread_count <= (bitsleft >> 3));
+
+	/* Optionally return the actual number of bytes consumed. */
+	if (actual_in_nbytes_ret) {
+		/* Don't count bytes that were refilled but not consumed. */
+		in_next -= (bitsleft >> 3) - overread_count;
 
-	/* Optionally return the actual number of bytes read */
-	if (actual_in_nbytes_ret)
 		*actual_in_nbytes_ret = in_next - (u8 *)in;
+	}
 
-	/* Optionally return the actual number of bytes written */
+	/* Optionally return the actual number of bytes written. */
 	if (actual_out_nbytes_ret) {
 		*actual_out_nbytes_ret = out_next - (u8 *)out;
 	} else {
@@ -419,3 +770,5 @@ block_done:
 
 #undef FUNCNAME
 #undef ATTRIBUTES
+#undef EXTRACT_VARBITS
+#undef EXTRACT_VARBITS8
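
Much of the speedup in this template comes from consuming a whole decode-table entry at once: the low 8 bits of an entry hold the total number of bits to drop (codeword plus extra bits), bits 8-15 hold the codeword length alone (so shifting the saved bitbuffer by it isolates the extra bits), and bits 16 and up carry the decoded value, while 'bitsleft -= entry' is safe because only the low 8 bits of bitsleft are meaningful. The sketch below walks through that layout as implied by the code above; it is an illustration of the trick, not libdeflate's table builder (the real entry format is defined in deflate_decompress.c).

    /* Worked example of consuming one packed length entry, as in the hunk. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define BITMASK(n)              (((uint64_t)1 << (n)) - 1)
    #define EXTRACT_VARBITS8(w, c)  ((w) & BITMASK((uint8_t)(c)))

    /* Pack a value, codeword length, and extra-bit count into one entry. */
    static uint32_t make_entry(uint32_t base, uint32_t cwlen, uint32_t extra)
    {
        return (base << 16) | (cwlen << 8) | (cwlen + extra);
    }

    int main(void)
    {
        /* Length symbol 273: base 35, 3 extra bits; assume a 7-bit codeword. */
        uint32_t entry = make_entry(35, 7, 3);
        /* Bitbuffer: 7 codeword bits (value irrelevant here), then extra = 5. */
        uint64_t bitbuf = (5u << 7) | 0x55;
        uint64_t saved_bitbuf = bitbuf;
        uint32_t bitsleft = 64;
        uint32_t length;

        bitbuf >>= (uint8_t)entry;      /* drop codeword + extra bits */
        bitsleft -= entry;              /* only (u8)bitsleft matters */
        length = (entry >> 16) +
                 (EXTRACT_VARBITS8(saved_bitbuf, entry) >> (uint8_t)(entry >> 8));
        assert((uint8_t)bitsleft == 64 - 10);
        assert(length == 35 + 5);
        printf("decoded length %u, %u consumable bits left\n",
               length, (unsigned)(uint8_t)bitsleft);
        return 0;
    }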


=====================================
lib/deflate_compress.c
=====================================
@@ -475,7 +475,7 @@ struct deflate_output_bitstream;
 struct libdeflate_compressor {
 
 	/* Pointer to the compress() implementation chosen at allocation time */
-	void (*impl)(struct libdeflate_compressor *c, const u8 *in,
+	void (*impl)(struct libdeflate_compressor *restrict c, const u8 *in,
 		     size_t in_nbytes, struct deflate_output_bitstream *os);
 
 	/* The compression level with which this compressor was created */
@@ -1041,7 +1041,6 @@ compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[],
 		unsigned parent = A[node] >> NUM_SYMBOL_BITS;
 		unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
 		unsigned depth = parent_depth + 1;
-		unsigned len = depth;
 
 		/*
 		 * Set the depth of this node so that it is available when its
@@ -1054,19 +1053,19 @@ compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[],
 		 * constraint.  This is not the optimal method for generating
 		 * length-limited Huffman codes!  But it should be good enough.
 		 */
-		if (len >= max_codeword_len) {
-			len = max_codeword_len;
+		if (depth >= max_codeword_len) {
+			depth = max_codeword_len;
 			do {
-				len--;
-			} while (len_counts[len] == 0);
+				depth--;
+			} while (len_counts[depth] == 0);
 		}
 
 		/*
 		 * Account for the fact that we have a non-leaf node at the
 		 * current depth.
 		 */
-		len_counts[len]--;
-		len_counts[len + 1] += 2;
+		len_counts[depth]--;
+		len_counts[depth + 1] += 2;
 	}
 }
 
@@ -1189,11 +1188,9 @@ gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[],
 			(next_codewords[len - 1] + len_counts[len - 1]) << 1;
 
 	for (sym = 0; sym < num_syms; sym++) {
-		u8 len = lens[sym];
-		u32 codeword = next_codewords[len]++;
-
 		/* DEFLATE requires bit-reversed codewords. */
-		A[sym] = reverse_codeword(codeword, len);
+		A[sym] = reverse_codeword(next_codewords[lens[sym]]++,
+					  lens[sym]);
 	}
 }
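
The gen_codewords() change keeps the bit-reversal step: DEFLATE packs bits into the stream LSB-first but defines Huffman codewords MSB-first, so the compressor stores each codeword reversed. libdeflate's reverse_codeword() itself isn't shown in this diff; a straightforward way to implement that kind of helper is:

    /* Reverse the low 'len' bits of a codeword (illustrative helper). */
    #include <assert.h>
    #include <stdint.h>

    static uint32_t reverse_low_bits(uint32_t codeword, unsigned len)
    {
        uint32_t reversed = 0;
        unsigned i;

        for (i = 0; i < len; i++) {
            reversed = (reversed << 1) | (codeword & 1);
            codeword >>= 1;
        }
        return reversed;
    }

    int main(void)
    {
        assert(reverse_low_bits(0x05, 5) == 0x14);  /* 00101 -> 10100 */
        assert(reverse_low_bits(0x01, 1) == 0x01);  /* 1-bit code is unchanged */
        return 0;
    }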
 


=====================================
lib/deflate_constants.h
=====================================
@@ -49,11 +49,8 @@
 /*
  * Maximum number of extra bits that may be required to represent a match
  * length or offset.
- *
- * TODO: are we going to have full DEFLATE64 support?  If so, up to 16
- * length bits must be supported.
  */
 #define DEFLATE_MAX_EXTRA_LENGTH_BITS		5
-#define DEFLATE_MAX_EXTRA_OFFSET_BITS		14
+#define DEFLATE_MAX_EXTRA_OFFSET_BITS		13
 
 #endif /* LIB_DEFLATE_CONSTANTS_H */


=====================================
lib/deflate_decompress.c
=====================================
@@ -26,22 +26,20 @@
  *
  * ---------------------------------------------------------------------------
  *
- * This is a highly optimized DEFLATE decompressor.  When compiled with gcc on
- * x86_64, it decompresses data in about 52% of the time of zlib (48% if BMI2
- * instructions are available).  On other architectures it should still be
- * significantly faster than zlib, but the difference may be smaller.
+ * This is a highly optimized DEFLATE decompressor.  It is much faster than
+ * vanilla zlib, typically well over twice as fast, though results vary by CPU.
  *
- * Why this is faster than zlib's implementation:
+ * Why this is faster than vanilla zlib:
  *
  * - Word accesses rather than byte accesses when reading input
  * - Word accesses rather than byte accesses when copying matches
  * - Faster Huffman decoding combined with various DEFLATE-specific tricks
- * - Larger bitbuffer variable that doesn't need to be filled as often
+ * - Larger bitbuffer variable that doesn't need to be refilled as often
  * - Other optimizations to remove unnecessary branches
  * - Only full-buffer decompression is supported, so the code doesn't need to
  *   support stopping and resuming decompression.
- * - On x86_64, compile a version of the decompression routine using BMI2
- *   instructions and use it automatically at runtime when supported.
+ * - On x86_64, a version of the decompression routine is compiled with BMI2
+ *   instructions enabled and is used automatically at runtime when supported.
  */
 
 #include <limits.h>
@@ -66,77 +64,6 @@
 #  define SAFETY_CHECK(expr)	if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA
 #endif
 
-/*
- * Each TABLEBITS number is the base-2 logarithm of the number of entries in the
- * main portion of the corresponding decode table.  Each number should be large
- * enough to ensure that for typical data, the vast majority of symbols can be
- * decoded by a direct lookup of the next TABLEBITS bits of compressed data.
- * However, this must be balanced against the fact that a larger table requires
- * more memory and requires more time to fill.
- *
- * Note: you cannot change a TABLEBITS number without also changing the
- * corresponding ENOUGH number!
- */
-#define PRECODE_TABLEBITS	7
-#define LITLEN_TABLEBITS	10
-#define OFFSET_TABLEBITS	8
-
-/*
- * Each ENOUGH number is the maximum number of decode table entries that may be
- * required for the corresponding Huffman code, including the main table and all
- * subtables.  Each number depends on three parameters:
- *
- *	(1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS)
- *	(2) the number of main table bits (the TABLEBITS numbers defined above)
- *	(3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN)
- *
- * The ENOUGH numbers were computed using the utility program 'enough' from
- * zlib.  This program enumerates all possible relevant Huffman codes to find
- * the worst-case usage of decode table entries.
- */
-#define PRECODE_ENOUGH		128	/* enough 19 7 7	*/
-#define LITLEN_ENOUGH		1334	/* enough 288 10 15	*/
-#define OFFSET_ENOUGH		402	/* enough 32 8 15	*/
-
-/*
- * The main DEFLATE decompressor structure.  Since this implementation only
- * supports full buffer decompression, this structure does not store the entire
- * decompression state, but rather only some arrays that are too large to
- * comfortably allocate on the stack.
- */
-struct libdeflate_decompressor {
-
-	/*
-	 * The arrays aren't all needed at the same time.  'precode_lens' and
-	 * 'precode_decode_table' are unneeded after 'lens' has been filled.
-	 * Furthermore, 'lens' need not be retained after building the litlen
-	 * and offset decode tables.  In fact, 'lens' can be in union with
-	 * 'litlen_decode_table' provided that 'offset_decode_table' is separate
-	 * and is built first.
-	 */
-
-	union {
-		u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS];
-
-		struct {
-			u8 lens[DEFLATE_NUM_LITLEN_SYMS +
-				DEFLATE_NUM_OFFSET_SYMS +
-				DEFLATE_MAX_LENS_OVERRUN];
-
-			u32 precode_decode_table[PRECODE_ENOUGH];
-		} l;
-
-		u32 litlen_decode_table[LITLEN_ENOUGH];
-	} u;
-
-	u32 offset_decode_table[OFFSET_ENOUGH];
-
-	/* used only during build_decode_table() */
-	u16 sorted_syms[DEFLATE_MAX_NUM_SYMS];
-
-	bool static_codes_loaded;
-};
-
 /*****************************************************************************
  *				Input bitstream                              *
  *****************************************************************************/
@@ -144,19 +71,32 @@ struct libdeflate_decompressor {
 /*
  * The state of the "input bitstream" consists of the following variables:
  *
- *	- in_next: pointer to the next unread byte in the input buffer
+ *	- in_next: a pointer to the next unread byte in the input buffer
  *
- *	- in_end: pointer just past the end of the input buffer
+ *	- in_end: a pointer to just past the end of the input buffer
  *
  *	- bitbuf: a word-sized variable containing bits that have been read from
- *		  the input buffer.  The buffered bits are right-aligned
- *		  (they're the low-order bits).
+ *		  the input buffer or from the implicit appended zero bytes
+ *
+ *	- bitsleft: the number of bits in 'bitbuf' available to be consumed.
+ *		    After REFILL_BITS_BRANCHLESS(), 'bitbuf' can actually
+ *		    contain more bits than this.  However, only the bits counted
+ *		    by 'bitsleft' can actually be consumed; the rest can only be
+ *		    used for preloading.
  *
- *	- bitsleft: number of bits in 'bitbuf' that are valid.
+ *		    As a micro-optimization, we allow bits 8 and higher of
+ *		    'bitsleft' to contain garbage.  When consuming the bits
+ *		    associated with a decode table entry, this allows us to do
+ *		    'bitsleft -= entry' instead of 'bitsleft -= (u8)entry'.
+ *		    On some CPUs, this helps reduce instruction dependencies.
+ *		    This does have the disadvantage that 'bitsleft' sometimes
+ *		    needs to be cast to 'u8', such as when it's used as a shift
+ *		    amount in REFILL_BITS_BRANCHLESS().  But that one happens
+ *		    for free since most CPUs ignore high bits in shift amounts.
  *
- * To make it easier for the compiler to optimize the code by keeping variables
- * in registers, these are declared as normal variables and manipulated using
- * macros.
+ *	- overread_count: the total number of implicit appended zero bytes that
+ *			  have been loaded into the bitbuffer, including any
+ *			  counted by 'bitsleft' and any already consumed
  */
 
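To make the roles of these variables concrete, here is a minimal standalone bit-reader sketch (illustrative only, not the library's code; the struct, names, and fixed 64-bit buffer are assumptions) that uses a byte-at-a-time refill and appends implicit zero bytes at the end of the input:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified bitstream state, mirroring the variables described above. */
struct bitreader {
	const uint8_t *in_next;  /* next unread input byte */
	const uint8_t *in_end;   /* one past the end of the input */
	uint64_t bitbuf;         /* buffered bits, right-aligned */
	unsigned bitsleft;       /* number of consumable bits in bitbuf */
	unsigned overread_count; /* implicit zero bytes appended so far */
};

/* Byte-at-a-time refill; appends implicit zero bytes at end of input. */
static void refill(struct bitreader *br)
{
	while (br->bitsleft <= 64 - 8) {
		if (br->in_next != br->in_end)
			br->bitbuf |= (uint64_t)*br->in_next++ << br->bitsleft;
		else
			br->overread_count++; /* implicit zero byte */
		br->bitsleft += 8;
	}
}

/* Remove and return the next 'n' bits. */
static unsigned pop_bits(struct bitreader *br, unsigned n)
{
	unsigned bits = br->bitbuf & (((uint64_t)1 << n) - 1);

	br->bitbuf >>= n;
	br->bitsleft -= n;
	return bits;
}

int main(void)
{
	static const uint8_t in[] = { 0xA5, 0x3C };
	struct bitreader br = { in, in + sizeof(in), 0, 0, 0 };

	refill(&br);
	printf("low 4 bits:  0x%x\n", pop_bits(&br, 4)); /* 0x5 */
	printf("next 8 bits: 0x%x\n", pop_bits(&br, 8)); /* 0xCA */
	return 0;
}

In the decompressor itself these are ordinary local variables manipulated via the macros below, so the compiler can keep them in registers.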
 /*
@@ -164,38 +104,123 @@ struct libdeflate_decompressor {
  * performance, this should have size equal to a machine word.
  *
  * 64-bit platforms have a significant advantage: they get a bigger bitbuffer
- * which they have to fill less often.
+ * which they don't have to refill as often.
  */
 typedef machine_word_t bitbuf_t;
+#define BITBUF_NBITS	(8 * (int)sizeof(bitbuf_t))
+
+/* BITMASK(n) returns a bitmask of length 'n'. */
+#define BITMASK(n)	(((bitbuf_t)1 << (n)) - 1)
 
 /*
- * Number of bits the bitbuffer variable can hold.
- *
- * This is one less than the obvious value because of the optimized arithmetic
- * in FILL_BITS_WORDWISE() that leaves 'bitsleft' in the range
- * [WORDBITS - 8, WORDBITS - 1] rather than [WORDBITS - 7, WORDBITS].
+ * MAX_BITSLEFT is the maximum number of consumable bits, i.e. the maximum value
+ * of '(u8)bitsleft'.  This is the size of the bitbuffer variable, minus 1 if
+ * the branchless refill method is being used (see REFILL_BITS_BRANCHLESS()).
  */
-#define BITBUF_NBITS	(8 * sizeof(bitbuf_t) - 1)
+#define MAX_BITSLEFT	\
+	(UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS - 1 : BITBUF_NBITS)
 
 /*
- * The maximum number of bits that can be ensured in the bitbuffer variable,
- * i.e. the maximum value of 'n' that can be passed ENSURE_BITS(n).  The decoder
- * only reads whole bytes from memory, so this is the lowest value of 'bitsleft'
- * at which another byte cannot be read without first consuming some bits.
+ * CONSUMABLE_NBITS is the minimum number of bits that are guaranteed to be
+ * consumable (counted in 'bitsleft') immediately after refilling the bitbuffer.
+ * Since only whole bytes can be added to 'bitsleft', the worst case is
+ * 'MAX_BITSLEFT - 7': the smallest amount where another byte doesn't fit.
  */
-#define MAX_ENSURE	(BITBUF_NBITS - 7)
+#define CONSUMABLE_NBITS	(MAX_BITSLEFT - 7)
 
 /*
- * Evaluates to true if 'n' is a valid argument to ENSURE_BITS(n), or false if
- * 'n' is too large to be passed to ENSURE_BITS(n).  Note: if 'n' is a compile
- * time constant, then this expression will be a compile-time constant.
- * Therefore, CAN_ENSURE() can be used to choose between alternative
- * implementations at compile time.
+ * FASTLOOP_PRELOADABLE_NBITS is the minimum number of bits that are guaranteed
+ * to be preloadable immediately after REFILL_BITS_IN_FASTLOOP().  (It is *not*
+ * guaranteed after REFILL_BITS(), since REFILL_BITS() falls back to a
+ * byte-at-a-time refill method near the end of input.)  This may exceed the
+ * number of consumable bits (counted by 'bitsleft').  Any bits not counted in
+ * 'bitsleft' can only be used for precomputation and cannot be consumed.
+ */
+#define FASTLOOP_PRELOADABLE_NBITS	\
+	(UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS : CONSUMABLE_NBITS)
+
+/*
+ * PRELOAD_SLACK is the minimum number of bits that are guaranteed to be
+ * preloadable but not consumable, following REFILL_BITS_IN_FASTLOOP() and any
+ * subsequent consumptions.  This is 1 bit if the branchless refill method is
+ * being used, and 0 bits otherwise.
+ */
+#define PRELOAD_SLACK	MAX(0, FASTLOOP_PRELOADABLE_NBITS - MAX_BITSLEFT)
+
+/*
+ * CAN_CONSUME(n) is true if it's guaranteed that if the bitbuffer has just been
+ * refilled, then it's always possible to consume 'n' bits from it.  'n' should
+ * be a compile-time constant, to enable compile-time evaluation.
+ */
+#define CAN_CONSUME(n)	(CONSUMABLE_NBITS >= (n))
+
+/*
+ * CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) is true if it's
+ * guaranteed that after REFILL_BITS_IN_FASTLOOP(), it's always possible to
+ * consume 'consume_nbits' bits, then preload 'preload_nbits' bits.  The
+ * arguments should be compile-time constants to enable compile-time evaluation.
+ */
+#define CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits)	\
+	(CONSUMABLE_NBITS >= (consume_nbits) &&				\
+	 FASTLOOP_PRELOADABLE_NBITS >= (consume_nbits) + (preload_nbits))
+
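As a worked example of these constants (a sketch assuming a 64-bit word with fast unaligned access, i.e. UNALIGNED_ACCESS_IS_FAST is nonzero; plain numbers rather than the macros):

#include <stdio.h>

int main(void)
{
	const int BITBUF_NBITS = 64;
	const int MAX_BITSLEFT = BITBUF_NBITS - 1;		/* 63 */
	const int CONSUMABLE_NBITS = MAX_BITSLEFT - 7;		/* 56 */
	const int FASTLOOP_PRELOADABLE_NBITS = BITBUF_NBITS;	/* 64 */
	const int PRELOAD_SLACK =
		FASTLOOP_PRELOADABLE_NBITS - MAX_BITSLEFT;	/* 1 */

	printf("MAX_BITSLEFT=%d CONSUMABLE=%d PRELOADABLE=%d SLACK=%d\n",
	       MAX_BITSLEFT, CONSUMABLE_NBITS,
	       FASTLOOP_PRELOADABLE_NBITS, PRELOAD_SLACK);

	/*
	 * CAN_CONSUME_AND_THEN_PRELOAD(20, 15): e.g. consume a 15-bit codeword
	 * plus 5 extra bits, then preload another 15-bit codeword.
	 * 20 <= 56 and 20 + 15 <= 64, so this holds on such a platform.
	 */
	printf("CAN_CONSUME_AND_THEN_PRELOAD(20, 15) -> %d\n",
	       CONSUMABLE_NBITS >= 20 &&
	       FASTLOOP_PRELOADABLE_NBITS >= 20 + 15);
	return 0;
}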
+/*
+ * REFILL_BITS_BRANCHLESS() branchlessly refills the bitbuffer variable by
+ * reading the next word from the input buffer and updating 'in_next' and
+ * 'bitsleft' based on how many bits were refilled -- counting whole bytes only.
+ * This is much faster than reading a byte at a time, at least if the CPU is
+ * little endian and supports fast unaligned memory accesses.
+ *
+ * The simplest way of branchlessly updating 'bitsleft' would be:
+ *
+ *	bitsleft += (MAX_BITSLEFT - bitsleft) & ~7;
+ *
+ * To make it faster, we define MAX_BITSLEFT to be 'WORDBITS - 1' rather than
+ * WORDBITS, so that in binary it looks like 111111 or 11111.  Then, we update
+ * 'bitsleft' by just setting the bits above the low 3 bits:
+ *
+ *	bitsleft |= MAX_BITSLEFT & ~7;
+ *
+ * That compiles down to a single instruction like 'or $0x38, %rbp'.  Using
+ * 'MAX_BITSLEFT == WORDBITS - 1' also has the advantage that refills can be
+ * done when 'bitsleft == MAX_BITSLEFT' without invoking undefined behavior.
+ *
+ * The simplest way of branchlessly updating 'in_next' would be:
+ *
+ *	in_next += (MAX_BITSLEFT - bitsleft) >> 3;
+ *
+ * With 'MAX_BITSLEFT == WORDBITS - 1' we could use an XOR instead, though this
+ * isn't really better:
+ *
+ *	in_next += (MAX_BITSLEFT ^ bitsleft) >> 3;
+ *
+ * An alternative which can be marginally better is the following:
+ *
+ *	in_next += sizeof(bitbuf_t) - 1;
+ *	in_next -= (bitsleft >> 3) & 0x7;
+ *
+ * It seems this would increase the number of CPU instructions from 3 (sub, shr,
+ * add) to 4 (add, shr, and, sub).  However, if the CPU has a bitfield
+ * extraction instruction (e.g. arm's ubfx), it stays at 3, and is potentially
+ * more efficient because the length of the longest dependency chain decreases
+ * from 3 to 2.  This alternative also has the advantage that it ignores the
+ * high bits in 'bitsleft', so it is compatible with the micro-optimization we
+ * use where we let the high bits of 'bitsleft' contain garbage.
  */
-#define CAN_ENSURE(n)	((n) <= MAX_ENSURE)
+#define REFILL_BITS_BRANCHLESS()					\
+do {									\
+	bitbuf |= get_unaligned_leword(in_next) << (u8)bitsleft;	\
+	in_next += sizeof(bitbuf_t) - 1;				\
+	in_next -= (bitsleft >> 3) & 0x7;				\
+	bitsleft |= MAX_BITSLEFT & ~7;					\
+} while (0)
 
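To see the arithmetic with concrete numbers, here is a small standalone trace of one branchless refill (assuming a 64-bit word, so MAX_BITSLEFT == 63; this is a sketch of the updates only, not the macro itself):

#include <stdio.h>

int main(void)
{
	const unsigned MAX_BITSLEFT = 63;
	unsigned bitsleft = 16;	/* 16 consumable bits before the refill */
	unsigned in_advance;

	/* 'in_next' advances by the number of whole bytes actually added:
	 * sizeof(bitbuf_t) - 1 minus the whole bytes still buffered. */
	in_advance = 8 - 1 - ((bitsleft >> 3) & 0x7);

	/* Branchless 'bitsleft' update: set all bits above the low 3. */
	bitsleft |= MAX_BITSLEFT & ~7u;

	printf("in_next advances by %u bytes\n", in_advance);	/* 5 */
	printf("bitsleft after refill: %u\n", bitsleft);	/* 56 */
	printf("bits refilled: %u\n", bitsleft - 16);		/* 40 */
	return 0;
}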
 /*
- * Fill the bitbuffer variable, reading one byte at a time.
+ * REFILL_BITS() loads bits from the input buffer until the bitbuffer variable
+ * contains at least CONSUMABLE_NBITS consumable bits.
+ *
+ * This checks for the end of input, and it doesn't guarantee
+ * FASTLOOP_PRELOADABLE_NBITS, so it can't be used in the fastloop.
  *
  * If we would overread the input buffer, we just don't read anything, leaving
  * the bits zeroed but marking them filled.  This simplifies the decompressor
@@ -212,181 +237,179 @@ typedef machine_word_t bitbuf_t;
  * or return an error.  However, we do it to be slightly more friendly to the
  * not-recommended use case of decompressing with an unknown output size.)
  */
-#define FILL_BITS_BYTEWISE()					\
-do {								\
-	if (likely(in_next != in_end)) {			\
-		bitbuf |= (bitbuf_t)*in_next++ << bitsleft;	\
-	} else {						\
-		overread_count++;				\
-		SAFETY_CHECK(overread_count <= sizeof(bitbuf));	\
-	}							\
-	bitsleft += 8;						\
-} while (bitsleft <= BITBUF_NBITS - 8)
-
-/*
- * Fill the bitbuffer variable by reading the next word from the input buffer
- * and branchlessly updating 'in_next' and 'bitsleft' based on how many bits
- * were filled.  This can be significantly faster than FILL_BITS_BYTEWISE().
- * However, for this to work correctly, the word must be interpreted in
- * little-endian format.  In addition, the memory access may be unaligned.
- * Therefore, this method is most efficient on little-endian architectures that
- * support fast unaligned access, such as x86 and x86_64.
- *
- * For faster updating of 'bitsleft', we consider the bitbuffer size in bits to
- * be 1 less than the word size and therefore be all 1 bits.  Then the number of
- * bits filled is the value of the 0 bits in position >= 3 when changed to 1.
- * E.g. if words are 64 bits and bitsleft = 16 = b010000 then we refill b101000
- * = 40 bits = 5 bytes.  This uses only 4 operations to update 'in_next' and
- * 'bitsleft': one each of +, ^, >>, and |.  (Not counting operations the
- * compiler optimizes out.)  In contrast, the alternative of:
- *
- *	in_next += (BITBUF_NBITS - bitsleft) >> 3;
- *	bitsleft += (BITBUF_NBITS - bitsleft) & ~7;
- *
- * (where BITBUF_NBITS would be WORDBITS rather than WORDBITS - 1) would on
- * average refill an extra bit, but uses 5 operations: two +, and one each of
- * -, >>, and &.  Also the - and & must be completed before 'bitsleft' can be
- * updated, while the current solution updates 'bitsleft' with no dependencies.
- */
-#define FILL_BITS_WORDWISE()					\
-do {								\
-	/* BITBUF_NBITS must be all 1's in binary, see above */	\
-	STATIC_ASSERT((BITBUF_NBITS & (BITBUF_NBITS + 1)) == 0);\
-								\
-	bitbuf |= get_unaligned_leword(in_next) << bitsleft;	\
-	in_next += (bitsleft ^ BITBUF_NBITS) >> 3;		\
-	bitsleft |= BITBUF_NBITS & ~7;				\
+#define REFILL_BITS()							\
+do {									\
+	if (UNALIGNED_ACCESS_IS_FAST &&					\
+	    likely(in_end - in_next >= sizeof(bitbuf_t))) {		\
+		REFILL_BITS_BRANCHLESS();				\
+	} else {							\
+		while ((u8)bitsleft < CONSUMABLE_NBITS) {		\
+			if (likely(in_next != in_end)) {		\
+				bitbuf |= (bitbuf_t)*in_next++ <<	\
+					  (u8)bitsleft;			\
+			} else {					\
+				overread_count++;			\
+				SAFETY_CHECK(overread_count <=		\
+					     sizeof(bitbuf_t));		\
+			}						\
+			bitsleft += 8;					\
+		}							\
+	}								\
 } while (0)
 
 /*
- * Does the bitbuffer variable currently contain at least 'n' bits?
- */
-#define HAVE_BITS(n) (bitsleft >= (n))
-
-/*
- * Load more bits from the input buffer until the specified number of bits is
- * present in the bitbuffer variable.  'n' cannot be too large; see MAX_ENSURE
- * and CAN_ENSURE().
+ * REFILL_BITS_IN_FASTLOOP() is like REFILL_BITS(), but it doesn't check for the
+ * end of the input.  It can only be used in the fastloop.
  */
-#define ENSURE_BITS(n)						\
-if (!HAVE_BITS(n)) {						\
-	if (CPU_IS_LITTLE_ENDIAN() &&				\
-	    UNALIGNED_ACCESS_IS_FAST &&				\
-	    likely(in_end - in_next >= sizeof(bitbuf_t)))	\
-		FILL_BITS_WORDWISE();				\
-	else							\
-		FILL_BITS_BYTEWISE();				\
-}
+#define REFILL_BITS_IN_FASTLOOP()					\
+do {									\
+	STATIC_ASSERT(UNALIGNED_ACCESS_IS_FAST ||			\
+		      FASTLOOP_PRELOADABLE_NBITS == CONSUMABLE_NBITS);	\
+	if (UNALIGNED_ACCESS_IS_FAST) {					\
+		REFILL_BITS_BRANCHLESS();				\
+	} else {							\
+		while ((u8)bitsleft < CONSUMABLE_NBITS) {		\
+			bitbuf |= (bitbuf_t)*in_next++ << (u8)bitsleft;	\
+			bitsleft += 8;					\
+		}							\
+	}								\
+} while (0)
 
 /*
- * Return the next 'n' bits from the bitbuffer variable without removing them.
+ * This is the worst-case maximum number of output bytes that are written to
+ * during each iteration of the fastloop.  The worst case is 2 literals, then a
+ * match of length DEFLATE_MAX_MATCH_LEN.  Additionally, some slack space must
+ * be included for the intentional overrun in the match copy implementation.
  */
-#define BITS(n) ((u32)bitbuf & (((u32)1 << (n)) - 1))
+#define FASTLOOP_MAX_BYTES_WRITTEN	\
+	(2 + DEFLATE_MAX_MATCH_LEN + (5 * WORDBYTES) - 1)
 
 /*
- * Remove the next 'n' bits from the bitbuffer variable.
+ * This is the worst-case maximum number of input bytes that are read during
+ * each iteration of the fastloop.  To get this value, we first compute the
+ * greatest number of bits that can be refilled during a loop iteration.  The
+ * refill at the beginning can add at most MAX_BITSLEFT, and the amount that can
+ * be refilled later is no more than the maximum amount that can be consumed by
+ * 2 literals that don't need a subtable, then a match.  We convert this value
+ * to bytes, rounding up; this gives the maximum number of bytes that 'in_next'
+ * can be advanced.  Finally, we add sizeof(bitbuf_t) to account for
+ * REFILL_BITS_BRANCHLESS() reading a word past 'in_next'.
  */
-#define REMOVE_BITS(n) (bitbuf >>= (n), bitsleft -= (n))
+#define FASTLOOP_MAX_BYTES_READ					\
+	(DIV_ROUND_UP(MAX_BITSLEFT + (2 * LITLEN_TABLEBITS) +	\
+		      LENGTH_MAXBITS + OFFSET_MAXBITS, 8) +	\
+	 sizeof(bitbuf_t))
 
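Plugging in the relevant constants gives the following worked values (a sketch assuming a 64-bit word and the TABLEBITS values defined further below; the DEFLATE limits used are the 258-byte maximum match, 15-bit maximum codewords, and 5/13 maximum extra length/offset bits):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	/* DEFLATE format limits */
	const int MAX_MATCH_LEN = 258;
	const int MAX_CODEWORD_LEN = 15;
	const int MAX_EXTRA_LENGTH_BITS = 5, MAX_EXTRA_OFFSET_BITS = 13;

	/* Implementation constants assumed here */
	const int WORDBYTES = 8, MAX_BITSLEFT = 63, LITLEN_TABLEBITS = 11;

	int length_maxbits = MAX_CODEWORD_LEN + MAX_EXTRA_LENGTH_BITS;	/* 20 */
	int offset_maxbits = MAX_CODEWORD_LEN + MAX_EXTRA_OFFSET_BITS;	/* 28 */

	int max_written = 2 + MAX_MATCH_LEN + (5 * WORDBYTES) - 1;	/* 299 */
	int max_read = DIV_ROUND_UP(MAX_BITSLEFT + (2 * LITLEN_TABLEBITS) +
				    length_maxbits + offset_maxbits, 8) +
		       WORDBYTES;					/* 25 */

	printf("FASTLOOP_MAX_BYTES_WRITTEN = %d\n", max_written);
	printf("FASTLOOP_MAX_BYTES_READ    = %d\n", max_read);
	return 0;
}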
-/*
- * Remove and return the next 'n' bits from the bitbuffer variable.
- */
-#define POP_BITS(n) (tmp32 = BITS(n), REMOVE_BITS(n), tmp32)
+/*****************************************************************************
+ *                              Huffman decoding                             *
+ *****************************************************************************/
 
 /*
- * Verify that the input buffer hasn't been overread, then align the input to
- * the next byte boundary, discarding any remaining bits in the current byte.
+ * The fastest way to decode Huffman-encoded data is basically to use a decode
+ * table that maps the next TABLEBITS bits of data to their symbol.  Each entry
+ * decode_table[i] maps to the symbol whose codeword is a prefix of 'i'.  A
+ * symbol with codeword length 'n' has '2**(TABLEBITS-n)' entries in the table.
  *
- * Note that if the bitbuffer variable currently contains more than 7 bits, then
- * we must rewind 'in_next', effectively putting those bits back.  Only the bits
- * in what would be the "current" byte if we were reading one byte at a time can
- * be actually discarded.
- */
-#define ALIGN_INPUT()							\
-do {									\
-	SAFETY_CHECK(overread_count <= (bitsleft >> 3));		\
-	in_next -= (bitsleft >> 3) - overread_count;			\
-	overread_count = 0;						\
-	bitbuf = 0;							\
-	bitsleft = 0;							\
-} while(0)
-
-/*
- * Read a 16-bit value from the input.  This must have been preceded by a call
- * to ALIGN_INPUT(), and the caller must have already checked for overread.
+ * Ideally, TABLEBITS and the maximum codeword length would be the same; some
+ * compression formats are designed with this goal in mind.  Unfortunately, in
+ * DEFLATE, the maximum litlen and offset codeword lengths are 15 bits, which is
+ * too large for a practical TABLEBITS.  It's not *that* much larger, though, so
+ * the workaround is to use a single level of subtables.  In the main table,
+ * entries for prefixes of codewords longer than TABLEBITS contain a "pointer"
+ * to the appropriate subtable along with the number of bits it is indexed with.
+ *
+ * The most efficient way to allocate subtables is to allocate them dynamically
+ * after the main table.  The worst-case number of table entries needed,
+ * including subtables, is precomputable; see the ENOUGH constants below.
+ *
+ * A useful optimization is to store the codeword lengths in the decode table so
+ * that they don't have to be looked up by indexing a separate table that maps
+ * symbols to their codeword lengths.  We basically do this; however, for the
+ * litlen and offset codes we also implement some DEFLATE-specific optimizations
+ * that build in the consideration of the "extra bits" and the
+ * literal/length/end-of-block division.  For the exact decode table entry
+ * format we use, see the definitions of the *_decode_results[] arrays below.
  */
-#define READ_U16() (tmp16 = get_unaligned_le16(in_next), in_next += 2, tmp16)
 
-/*****************************************************************************
- *                              Huffman decoding                             *
- *****************************************************************************/
 
 /*
- * A decode table for order TABLEBITS consists of a main table of (1 <<
- * TABLEBITS) entries followed by a variable number of subtables.
- *
- * The decoding algorithm takes the next TABLEBITS bits of compressed data and
- * uses them as an index into the decode table.  The resulting entry is either a
- * "direct entry", meaning that it contains the value desired, or a "subtable
- * pointer", meaning that the entry references a subtable that must be indexed
- * using more bits of the compressed data to decode the symbol.
- *
- * Each decode table (a main table along with its subtables, if any) is
- * associated with a Huffman code.  Logically, the result of a decode table
- * lookup is a symbol from the alphabet from which the corresponding Huffman
- * code was constructed.  A symbol with codeword length n <= TABLEBITS is
- * associated with 2**(TABLEBITS - n) direct entries in the table, whereas a
- * symbol with codeword length n > TABLEBITS is associated with one or more
- * subtable entries.
- *
- * On top of this basic design, we implement several optimizations:
- *
- * - We store the length of each codeword directly in each of its decode table
- *   entries.  This allows the codeword length to be produced without indexing
- *   an additional table.
- *
- * - When beneficial, we don't store the Huffman symbol itself, but instead data
- *   generated from it.  For example, when decoding an offset symbol in DEFLATE,
- *   it's more efficient if we can decode the offset base and number of extra
- *   offset bits directly rather than decoding the offset symbol and then
- *   looking up both of those values in an additional table or tables.
- *
- * The size of each decode table entry is 32 bits, which provides slightly
- * better performance than 16-bit entries on 32- and 64-bit processors, provided
- * that the table doesn't get so large that it takes up too much memory and
- * starts generating cache misses.  The bits of each decode table entry are
- * defined as follows:
- *
- * - Bits 30 -- 31: flags (see below)
- * - Bits 8 -- 29: decode result: a Huffman symbol or related data
- * - Bits 0 -- 7: codeword length
+ * These are the TABLEBITS values we use for each of the DEFLATE Huffman codes,
+ * along with their corresponding ENOUGH values.
+ *
+ * For the precode, we use PRECODE_TABLEBITS == 7 since this is the maximum
+ * precode codeword length.  This avoids ever needing subtables.
+ *
+ * For the litlen and offset codes, we cannot realistically avoid ever needing
+ * subtables, since litlen and offset codewords can be up to 15 bits.  A higher
+ * TABLEBITS reduces the number of lookups that need a subtable, which increases
+ * performance; however, it increases memory usage and makes building the table
+ * take longer, which decreases performance.  We choose values that work well in
+ * practice, making subtables rarely needed without making the tables too large.
+ *
+ * Our choice of OFFSET_TABLEBITS == 8 is a bit low; without any special
+ * considerations, 9 would fit the trade-off curve better.  However, there is a
+ * performance benefit to using exactly 8 bits when it is a compile-time
+ * constant, as many CPUs can take the low byte more easily than the low 9 bits.
+ *
+ * zlib treats its equivalents of TABLEBITS as maximum values; whenever it
+ * builds a table, it caps the actual table_bits to the longest codeword.  This
+ * makes sense in theory, as there's no need for the table to be any larger than
+ * needed to support the longest codeword.  However, having the table bits be a
+ * compile-time constant is beneficial to the performance of the decode loop, so
+ * there is a trade-off.  libdeflate currently uses the dynamic table_bits
+ * strategy for the litlen table only, due to its larger maximum size.
+ * PRECODE_TABLEBITS and OFFSET_TABLEBITS are smaller, so going dynamic there
+ * isn't as useful, and OFFSET_TABLEBITS=8 is useful as mentioned above.
+ *
+ * Each TABLEBITS value has a corresponding ENOUGH value that gives the
+ * worst-case maximum number of decode table entries, including the main table
+ * and all subtables.  The ENOUGH value depends on three parameters:
+ *
+ *	(1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS)
+ *	(2) the maximum number of main table bits (*_TABLEBITS)
+ *	(3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN)
+ *
+ * The ENOUGH values were computed using the utility program 'enough' from zlib.
  */
+#define PRECODE_TABLEBITS	7
+#define PRECODE_ENOUGH		128	/* enough 19 7 7	*/
+#define LITLEN_TABLEBITS	11
+#define LITLEN_ENOUGH		2342	/* enough 288 11 15	*/
+#define OFFSET_TABLEBITS	8
+#define OFFSET_ENOUGH		402	/* enough 32 8 15	*/
 
 /*
- * This flag is set in all main decode table entries that represent subtable
- * pointers.
+ * make_decode_table_entry() creates a decode table entry for the given symbol
+ * by combining the static part 'decode_results[sym]' with the dynamic part
+ * 'len', which is the remaining codeword length (the codeword length for main
+ * table entries, or the codeword length minus TABLEBITS for subtable entries).
+ *
+ * In all cases, we add 'len' to each of the two low-order bytes to create the
+ * appropriately-formatted decode table entry.  See the definitions of the
+ * *_decode_results[] arrays below, where the entry format is described.
  */
-#define HUFFDEC_SUBTABLE_POINTER	0x80000000
+static forceinline u32
+make_decode_table_entry(const u32 decode_results[], u32 sym, u32 len)
+{
+	return decode_results[sym] + (len << 8) + len;
+}
 
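For instance (illustrative values only): take a hypothetical static part for literal 65 ('A'), formatted as in the litlen literal layout shown further below, and combine it with a 7-bit codeword length using the same rule as make_decode_table_entry():

#include <stdint.h>
#include <stdio.h>

/* Same combination rule as make_decode_table_entry() above. */
static uint32_t make_entry(uint32_t static_part, uint32_t len)
{
	return static_part + (len << 8) + len;
}

int main(void)
{
	/* Hypothetical static part: literal 'A' (65) in bits 23-16 plus a
	 * flag in bit 31, matching the litlen literal format shown below. */
	uint32_t static_part = 0x80000000u | (65u << 16);
	uint32_t entry = make_entry(static_part, 7);	/* 7-bit codeword */

	printf("entry     = 0x%08x\n", (unsigned)entry);	/* 0x80410707 */
	printf("low byte  = %u (bits to consume)\n", (unsigned)(entry & 0xff));
	printf("bits 11-8 = %u (codeword length)\n", (unsigned)((entry >> 8) & 0xf));
	printf("literal   = %u\n", (unsigned)((entry >> 16) & 0xff));
	return 0;
}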
 /*
- * This flag is set in all entries in the litlen decode table that represent
- * literals.
+ * Here is the format of our precode decode table entries.  Bits not explicitly
+ * described contain zeroes:
+ *
+ *	Bit 20-16:  presym
+ *	Bit 10-8:   codeword length [not used]
+ *	Bit 2-0:    codeword length
+ *
+ * The precode decode table never has subtables, since we use
+ * PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN.
+ *
+ * precode_decode_results[] contains the static part of the entry for each
+ * symbol.  make_decode_table_entry() produces the final entries.
  */
-#define HUFFDEC_LITERAL			0x40000000
-
-/* Mask for extracting the codeword length from a decode table entry.  */
-#define HUFFDEC_LENGTH_MASK		0xFF
-
-/* Shift to extract the decode result from a decode table entry.  */
-#define HUFFDEC_RESULT_SHIFT		8
-
-/* Shift a decode result into its position in the decode table entry.  */
-#define HUFFDEC_RESULT_ENTRY(result)	((u32)(result) << HUFFDEC_RESULT_SHIFT)
-
-/* The decode result for each precode symbol.  There is no special optimization
- * for the precode; the decode result is simply the symbol value.  */
-static const u32 precode_decode_results[DEFLATE_NUM_PRECODE_SYMS] = {
-#define ENTRY(presym)	HUFFDEC_RESULT_ENTRY(presym)
+static const u32 precode_decode_results[] = {
+#define ENTRY(presym)	((u32)presym << 16)
 	ENTRY(0)   , ENTRY(1)   , ENTRY(2)   , ENTRY(3)   ,
 	ENTRY(4)   , ENTRY(5)   , ENTRY(6)   , ENTRY(7)   ,
 	ENTRY(8)   , ENTRY(9)   , ENTRY(10)  , ENTRY(11)  ,
@@ -395,13 +418,97 @@ static const u32 precode_decode_results[DEFLATE_NUM_PRECODE_SYMS] = {
 #undef ENTRY
 };
 
-/* The decode result for each litlen symbol.  For literals, this is the literal
- * value itself and the HUFFDEC_LITERAL flag.  For lengths, this is the length
- * base and the number of extra length bits.  */
-static const u32 litlen_decode_results[DEFLATE_NUM_LITLEN_SYMS] = {
+/* Litlen and offset decode table entry flags */
+
+/* Indicates a literal entry in the litlen decode table */
+#define HUFFDEC_LITERAL			0x80000000
+
+/* Indicates that HUFFDEC_SUBTABLE_POINTER or HUFFDEC_END_OF_BLOCK is set */
+#define HUFFDEC_EXCEPTIONAL		0x00008000
+
+/* Indicates a subtable pointer entry in the litlen or offset decode table */
+#define HUFFDEC_SUBTABLE_POINTER	0x00004000
 
-	/* Literals  */
-#define ENTRY(literal)	(HUFFDEC_LITERAL | HUFFDEC_RESULT_ENTRY(literal))
+/* Indicates an end-of-block entry in the litlen decode table */
+#define HUFFDEC_END_OF_BLOCK		0x00002000
+
+/* Maximum number of bits that can be consumed by decoding a match length */
+#define LENGTH_MAXBITS		(DEFLATE_MAX_LITLEN_CODEWORD_LEN + \
+				 DEFLATE_MAX_EXTRA_LENGTH_BITS)
+#define LENGTH_MAXFASTBITS	(LITLEN_TABLEBITS /* no subtable needed */ + \
+				 DEFLATE_MAX_EXTRA_LENGTH_BITS)
+
+/*
+ * Here is the format of our litlen decode table entries.  Bits not explicitly
+ * described contain zeroes:
+ *
+ *	Literals:
+ *		Bit 31:     1 (HUFFDEC_LITERAL)
+ *		Bit 23-16:  literal value
+ *		Bit 15:     0 (!HUFFDEC_EXCEPTIONAL)
+ *		Bit 14:     0 (!HUFFDEC_SUBTABLE_POINTER)
+ *		Bit 13:     0 (!HUFFDEC_END_OF_BLOCK)
+ *		Bit 11-8:   remaining codeword length [not used]
+ *		Bit 3-0:    remaining codeword length
+ *	Lengths:
+ *		Bit 31:     0 (!HUFFDEC_LITERAL)
+ *		Bit 24-16:  length base value
+ *		Bit 15:     0 (!HUFFDEC_EXCEPTIONAL)
+ *		Bit 14:     0 (!HUFFDEC_SUBTABLE_POINTER)
+ *		Bit 13:     0 (!HUFFDEC_END_OF_BLOCK)
+ *		Bit 11-8:   remaining codeword length
+ *		Bit 4-0:    remaining codeword length + number of extra bits
+ *	End of block:
+ *		Bit 31:     0 (!HUFFDEC_LITERAL)
+ *		Bit 15:     1 (HUFFDEC_EXCEPTIONAL)
+ *		Bit 14:     0 (!HUFFDEC_SUBTABLE_POINTER)
+ *		Bit 13:     1 (HUFFDEC_END_OF_BLOCK)
+ *		Bit 11-8:   remaining codeword length [not used]
+ *		Bit 3-0:    remaining codeword length
+ *	Subtable pointer:
+ *		Bit 31:     0 (!HUFFDEC_LITERAL)
+ *		Bit 30-16:  index of start of subtable
+ *		Bit 15:     1 (HUFFDEC_EXCEPTIONAL)
+ *		Bit 14:     1 (HUFFDEC_SUBTABLE_POINTER)
+ *		Bit 13:     0 (!HUFFDEC_END_OF_BLOCK)
+ *		Bit 11-8:   number of subtable bits
+ *		Bit 3-0:    number of main table bits
+ *
+ * This format has several desirable properties:
+ *
+ *	- The codeword length, length slot base, and number of extra length bits
+ *	  are all built in.  This eliminates the need to separately look up this
+ *	  information by indexing separate arrays by symbol or length slot.
+ *
+ *	- The HUFFDEC_* flags enable easily distinguishing between the different
+ *	  types of entries.  The HUFFDEC_LITERAL flag enables a fast path for
+ *	  literals; the high bit is used for this, as some CPUs can test the
+ *	  high bit more easily than other bits.  The HUFFDEC_EXCEPTIONAL flag
+ *	  makes it possible to detect the two unlikely cases (subtable pointer
+ *	  and end of block) in a single bit flag test.
+ *
+ *	- The low byte is the number of bits that need to be removed from the
+ *	  bitstream; this makes this value easily accessible, and it enables the
+ *	  micro-optimization of doing 'bitsleft -= entry' instead of
+ *	  'bitsleft -= (u8)entry'.  It also includes the number of extra bits,
+ *	  so they don't need to be removed separately.
+ *
+ *	- The flags in bits 15-13 are arranged to be 0 when the
+ *	  "remaining codeword length" in bits 11-8 is needed, making this value
+ *	  fairly easily accessible as well via a shift and downcast.
+ *
+ *	- Similarly, bits 13-12 are 0 when the "subtable bits" in bits 11-8 are
+ *	  needed, making it possible to extract this value with '& 0x3F' rather
+ *	  than '& 0xF'.  This value is only used as a shift amount, so this can
+ *	  save an 'and' instruction as the masking by 0x3F happens implicitly.
+ *
+ * litlen_decode_results[] contains the static part of the entry for each
+ * symbol.  make_decode_table_entry() produces the final entries.
+ */
+static const u32 litlen_decode_results[] = {
+
+	/* Literals */
+#define ENTRY(literal)	(HUFFDEC_LITERAL | ((u32)literal << 16))
 	ENTRY(0)   , ENTRY(1)   , ENTRY(2)   , ENTRY(3)   ,
 	ENTRY(4)   , ENTRY(5)   , ENTRY(6)   , ENTRY(7)   ,
 	ENTRY(8)   , ENTRY(9)   , ENTRY(10)  , ENTRY(11)  ,
@@ -468,17 +575,12 @@ static const u32 litlen_decode_results[DEFLATE_NUM_LITLEN_SYMS] = {
 	ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) ,
 #undef ENTRY
 
-#define HUFFDEC_EXTRA_LENGTH_BITS_MASK	0xFF
-#define HUFFDEC_LENGTH_BASE_SHIFT	8
-#define HUFFDEC_END_OF_BLOCK_LENGTH	0
-
-#define ENTRY(length_base, num_extra_bits)	HUFFDEC_RESULT_ENTRY(	\
-	((u32)(length_base) << HUFFDEC_LENGTH_BASE_SHIFT) | (num_extra_bits))
-
-	/* End of block  */
-	ENTRY(HUFFDEC_END_OF_BLOCK_LENGTH, 0),
+	/* End of block */
+	HUFFDEC_EXCEPTIONAL | HUFFDEC_END_OF_BLOCK,
 
-	/* Lengths  */
+	/* Lengths */
+#define ENTRY(length_base, num_extra_bits)	\
+	(((u32)(length_base) << 16) | (num_extra_bits))
 	ENTRY(3  , 0) , ENTRY(4  , 0) , ENTRY(5  , 0) , ENTRY(6  , 0),
 	ENTRY(7  , 0) , ENTRY(8  , 0) , ENTRY(9  , 0) , ENTRY(10 , 0),
 	ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1),
@@ -490,16 +592,35 @@ static const u32 litlen_decode_results[DEFLATE_NUM_LITLEN_SYMS] = {
 #undef ENTRY
 };
 
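As an illustration of consuming a length entry in this format (a standalone sketch with made-up values that follows the field layout above; it is not the decompressor's actual decode loop):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical length entry: base 11, 1 extra bit, 7-bit codeword,
	 * built with the same rule as make_decode_table_entry() above. */
	uint32_t entry = ((11u << 16) | 1u) + (7u << 8) + 7u;

	/* Pretend the low bits of the already-positioned bit buffer hold the
	 * 7-bit codeword, followed by one extra bit whose value is 1. */
	uint64_t saved_bitbuf = 0xAB;	/* bit 7 (the extra bit) is 1 */

	uint32_t total_bits = entry & 0xff;		/* 8: codeword + extra */
	uint32_t codeword_len = (entry >> 8) & 0xff;	/* 7 (bits 15-12 are 0) */
	uint32_t extra = (saved_bitbuf & ((1u << total_bits) - 1)) >> codeword_len;
	uint32_t length = (entry >> 16) + extra;	/* 11 + 1 = 12 */

	printf("decoded match length = %u\n", (unsigned)length);
	return 0;
}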
-/* The decode result for each offset symbol.  This is the offset base and the
- * number of extra offset bits.  */
-static const u32 offset_decode_results[DEFLATE_NUM_OFFSET_SYMS] = {
+/* Maximum number of bits that can be consumed by decoding a match offset */
+#define OFFSET_MAXBITS		(DEFLATE_MAX_OFFSET_CODEWORD_LEN + \
+				 DEFLATE_MAX_EXTRA_OFFSET_BITS)
+#define OFFSET_MAXFASTBITS	(OFFSET_TABLEBITS /* no subtable needed */ + \
+				 DEFLATE_MAX_EXTRA_OFFSET_BITS)
 
-#define HUFFDEC_EXTRA_OFFSET_BITS_SHIFT 16
-#define HUFFDEC_OFFSET_BASE_MASK (((u32)1 << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) - 1)
-
-#define ENTRY(offset_base, num_extra_bits)	HUFFDEC_RESULT_ENTRY(	\
-		((u32)(num_extra_bits) << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) | \
-		(offset_base))
+/*
+ * Here is the format of our offset decode table entries.  Bits not explicitly
+ * described contain zeroes:
+ *
+ *	Offsets:
+ *		Bit 31-16:  offset base value
+ *		Bit 15:     0 (!HUFFDEC_EXCEPTIONAL)
+ *		Bit 14:     0 (!HUFFDEC_SUBTABLE_POINTER)
+ *		Bit 11-8:   remaining codeword length
+ *		Bit 4-0:    remaining codeword length + number of extra bits
+ *	Subtable pointer:
+ *		Bit 31-16:  index of start of subtable
+ *		Bit 15:     1 (HUFFDEC_EXCEPTIONAL)
+ *		Bit 14:     1 (HUFFDEC_SUBTABLE_POINTER)
+ *		Bit 11-8:   number of subtable bits
+ *		Bit 3-0:    number of main table bits
+ *
+ * These work the same way as the length entries and subtable pointer entries in
+ * the litlen decode table; see litlen_decode_results[] above.
+ */
+static const u32 offset_decode_results[] = {
+#define ENTRY(offset_base, num_extra_bits)	\
+	(((u32)(offset_base) << 16) | (num_extra_bits))
 	ENTRY(1     , 0)  , ENTRY(2     , 0)  , ENTRY(3     , 0)  , ENTRY(4     , 0)  ,
 	ENTRY(5     , 1)  , ENTRY(7     , 1)  , ENTRY(9     , 2)  , ENTRY(13    , 2) ,
 	ENTRY(17    , 3)  , ENTRY(25    , 3)  , ENTRY(33    , 4)  , ENTRY(49    , 4)  ,
@@ -507,10 +628,55 @@ static const u32 offset_decode_results[DEFLATE_NUM_OFFSET_SYMS] = {
 	ENTRY(257   , 7)  , ENTRY(385   , 7)  , ENTRY(513   , 8)  , ENTRY(769   , 8)  ,
 	ENTRY(1025  , 9)  , ENTRY(1537  , 9)  , ENTRY(2049  , 10) , ENTRY(3073  , 10) ,
 	ENTRY(4097  , 11) , ENTRY(6145  , 11) , ENTRY(8193  , 12) , ENTRY(12289 , 12) ,
-	ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(32769 , 14) , ENTRY(49153 , 14) ,
+	ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) ,
 #undef ENTRY
 };
 
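For the subtable-pointer case, the documented layout implies a second lookup along the lines of the following sketch (toy sizes and values; the real tables and pointer entries are built by build_decode_table() below):

#include <stdint.h>
#include <stdio.h>

#define BITMASK(n)	((1u << (n)) - 1)

/* Follow a subtable pointer using the field layout documented above:
 * bits 30-16 = subtable start index, bits 11-8 = number of subtable bits,
 * bits 3-0 = number of main table bits. */
static uint32_t follow_subtable(const uint32_t *table, uint32_t entry,
				uint64_t bitbuf)
{
	uint32_t start = entry >> 16;
	uint32_t subtable_bits = (entry >> 8) & 0x3f;	/* bits 13-12 are 0 */
	uint32_t main_bits = entry & 0xf;

	return table[start + ((bitbuf >> main_bits) & BITMASK(subtable_bits))];
}

int main(void)
{
	uint32_t table[8] = { 0 };
	/* Toy pointer entry: subtable starts at index 4, is indexed with
	 * 2 bits, and the (toy) main table is indexed with 2 bits. */
	uint32_t ptr_entry = (4u << 16) | 0x8000u | 0x4000u | (2u << 8) | 2u;
	uint64_t bitbuf = 0x8;	/* low 2 bits already used; next 2 bits = 10 */

	table[4] = 100; table[5] = 101; table[6] = 102; table[7] = 103;

	/* Prints 102: subtable index 4 + 0b10 */
	printf("subtable entry = %u\n",
	       (unsigned)follow_subtable(table, ptr_entry, bitbuf));
	return 0;
}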
+/*
+ * The main DEFLATE decompressor structure.  Since libdeflate only supports
+ * full-buffer decompression, this structure doesn't store the entire
+ * decompression state, most of which is in stack variables.  Instead, this
+ * struct just contains the decode tables and some temporary arrays used for
+ * building them, as these are too large to comfortably allocate on the stack.
+ *
+ * Storing the decode tables in the decompressor struct also allows the decode
+ * tables for the static codes to be reused whenever two static Huffman blocks
+ * are decoded without an intervening dynamic block, even across streams.
+ */
+struct libdeflate_decompressor {
+
+	/*
+	 * The arrays aren't all needed at the same time.  'precode_lens' and
+	 * 'precode_decode_table' are unneeded after 'lens' has been filled.
+	 * Furthermore, 'lens' need not be retained after building the litlen
+	 * and offset decode tables.  In fact, 'lens' can be in union with
+	 * 'litlen_decode_table' provided that 'offset_decode_table' is separate
+	 * and is built first.
+	 */
+
+	union {
+		u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS];
+
+		struct {
+			u8 lens[DEFLATE_NUM_LITLEN_SYMS +
+				DEFLATE_NUM_OFFSET_SYMS +
+				DEFLATE_MAX_LENS_OVERRUN];
+
+			u32 precode_decode_table[PRECODE_ENOUGH];
+		} l;
+
+		u32 litlen_decode_table[LITLEN_ENOUGH];
+	} u;
+
+	u32 offset_decode_table[OFFSET_ENOUGH];
+
+	/* used only during build_decode_table() */
+	u16 sorted_syms[DEFLATE_MAX_NUM_SYMS];
+
+	bool static_codes_loaded;
+	unsigned litlen_tablebits;
+};
+
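A rough way to see what the union saves (a standalone sketch using the symbol counts and ENOUGH values above; the value of DEFLATE_MAX_LENS_OVERRUN is assumed here, as it is defined elsewhere in libdeflate):

#include <stdint.h>
#include <stdio.h>

#define NUM_PRECODE_SYMS	19
#define NUM_LITLEN_SYMS		288
#define NUM_OFFSET_SYMS		32
#define LENS_OVERRUN		137	/* assumed DEFLATE_MAX_LENS_OVERRUN */
#define PRECODE_ENOUGH		128
#define LITLEN_ENOUGH		2342
#define OFFSET_ENOUGH		402

struct with_union {
	union {
		uint8_t precode_lens[NUM_PRECODE_SYMS];
		struct {
			uint8_t lens[NUM_LITLEN_SYMS + NUM_OFFSET_SYMS +
				     LENS_OVERRUN];
			uint32_t precode_decode_table[PRECODE_ENOUGH];
		} l;
		uint32_t litlen_decode_table[LITLEN_ENOUGH];
	} u;
	uint32_t offset_decode_table[OFFSET_ENOUGH];
};

struct without_union {
	uint8_t lens[NUM_LITLEN_SYMS + NUM_OFFSET_SYMS + LENS_OVERRUN];
	uint32_t precode_decode_table[PRECODE_ENOUGH];
	uint32_t litlen_decode_table[LITLEN_ENOUGH];
	uint32_t offset_decode_table[OFFSET_ENOUGH];
};

int main(void)
{
	printf("with union:    %zu bytes\n", sizeof(struct with_union));
	printf("without union: %zu bytes\n", sizeof(struct without_union));
	return 0;
}

The litlen decode table dominates the union, so overlapping 'lens' and the precode table with it costs essentially nothing.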
 /*
  * Build a table for fast decoding of symbols from a Huffman code.  As input,
  * this function takes the codeword length of each symbol which may be used in
@@ -534,17 +700,21 @@ static const u32 offset_decode_results[DEFLATE_NUM_OFFSET_SYMS] = {
  * @num_syms
  *	The number of symbols in the code, including all unused symbols.
  * @decode_results
- *	An array which provides, for each symbol, the actual value to store into
- *	the decode table.  This value will be directly produced as the result of
- *	decoding that symbol, thereby moving the indirection out of the decode
- *	loop and into the table initialization.
+ *	An array which gives the incomplete decode result for each symbol.  The
+ *	needed values in this array will be combined with codeword lengths to
+ *	make the final decode table entries using make_decode_table_entry().
  * @table_bits
  *	The log base-2 of the number of main table entries to use.
+ *	If @table_bits_ret != NULL, then @table_bits is treated as a maximum
+ *	value and it will be decreased if a smaller table would be sufficient.
  * @max_codeword_len
  *	The maximum allowed codeword length for this Huffman code.
  *	Must be <= DEFLATE_MAX_CODEWORD_LEN.
  * @sorted_syms
  *	A temporary array of length @num_syms.
+ * @table_bits_ret
+ *	If non-NULL, then the dynamic table_bits is enabled, and the actual
+ *	table_bits value will be returned here.
  *
  * Returns %true if successful; %false if the codeword lengths do not form a
  * valid Huffman code.
@@ -554,9 +724,10 @@ build_decode_table(u32 decode_table[],
 		   const u8 lens[],
 		   const unsigned num_syms,
 		   const u32 decode_results[],
-		   const unsigned table_bits,
-		   const unsigned max_codeword_len,
-		   u16 *sorted_syms)
+		   unsigned table_bits,
+		   unsigned max_codeword_len,
+		   u16 *sorted_syms,
+		   unsigned *table_bits_ret)
 {
 	unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
 	unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1];
@@ -576,6 +747,17 @@ build_decode_table(u32 decode_table[],
 	for (sym = 0; sym < num_syms; sym++)
 		len_counts[lens[sym]]++;
 
+	/*
+	 * Determine the actual maximum codeword length that was used, and
+	 * decrease table_bits to it if allowed.
+	 */
+	while (max_codeword_len > 1 && len_counts[max_codeword_len] == 0)
+		max_codeword_len--;
+	if (table_bits_ret != NULL) {
+		table_bits = MIN(table_bits, max_codeword_len);
+		*table_bits_ret = table_bits;
+	}
+
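As a concrete illustration of this capping (a standalone sketch with a hypothetical but complete set of codeword-length counts): if the longest codeword actually used is 8 bits, LITLEN_TABLEBITS == 11 gets capped to 8, shrinking the main table from 2048 to 256 entries:

#include <stdio.h>

#define MIN(a, b)	((a) < (b) ? (a) : (b))

int main(void)
{
	/* Hypothetical histogram: one codeword each of lengths 1..7 and two
	 * of length 8 (a complete code), so lengths 9..15 are unused. */
	unsigned len_counts[15 + 1] = { 0, 1, 1, 1, 1, 1, 1, 1, 2 };
	unsigned max_codeword_len = 15;
	unsigned table_bits = 11;	/* LITLEN_TABLEBITS */

	while (max_codeword_len > 1 && len_counts[max_codeword_len] == 0)
		max_codeword_len--;
	table_bits = MIN(table_bits, max_codeword_len);

	printf("table_bits capped to %u: %u main entries instead of %u\n",
	       table_bits, 1u << table_bits, 1u << 11);
	return 0;
}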
 	/*
 	 * Sort the symbols primarily by increasing codeword length and
 	 * secondarily by increasing symbol value; or equivalently by their
@@ -632,7 +814,7 @@ build_decode_table(u32 decode_table[],
 			 */
 
 			/* sym=0, len=1 (arbitrary) */
-			entry = decode_results[0] | 1;
+			entry = make_decode_table_entry(decode_results, 0, 1);
 		} else {
 			/*
 			 * Allow codes with a single used symbol, with codeword
@@ -648,7 +830,8 @@ build_decode_table(u32 decode_table[],
 			if (codespace_used != (1U << (max_codeword_len - 1)) ||
 			    len_counts[1] != 1)
 				return false;
-			entry = decode_results[*sorted_syms] | 1;
+			entry = make_decode_table_entry(decode_results,
+							*sorted_syms, 1);
 		}
 		/*
 		 * Note: the decode table still must be fully initialized, in
@@ -696,7 +879,8 @@ build_decode_table(u32 decode_table[],
 
 			/* Fill the first entry for the current codeword. */
 			decode_table[codeword] =
-				decode_results[*sorted_syms++] | len;
+				make_decode_table_entry(decode_results,
+							*sorted_syms++, len);
 
 			if (codeword == cur_table_end - 1) {
 				/* Last codeword (all 1's) */
@@ -779,19 +963,18 @@ build_decode_table(u32 decode_table[],
 
 			/*
 			 * Create the entry that points from the main table to
-			 * the subtable.  This entry contains the index of the
-			 * start of the subtable and the number of bits with
-			 * which the subtable is indexed (the log base 2 of the
-			 * number of entries it contains).
+			 * the subtable.
 			 */
 			decode_table[subtable_prefix] =
+				((u32)subtable_start << 16) |
+				HUFFDEC_EXCEPTIONAL |
 				HUFFDEC_SUBTABLE_POINTER |
-				HUFFDEC_RESULT_ENTRY(subtable_start) |
-				subtable_bits;
+				(subtable_bits << 8) | table_bits;
 		}
 
 		/* Fill the subtable entries for the current codeword. */
-		entry = decode_results[*sorted_syms++] | (len - table_bits);
+		entry = make_decode_table_entry(decode_results, *sorted_syms++,
+						len - table_bits);
 		i = subtable_start + (codeword >> table_bits);
 		stride = 1U << (len - table_bits);
 		do {
@@ -818,13 +1001,17 @@ build_precode_decode_table(struct libdeflate_decompressor *d)
 	/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
 	STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128);
 
+	STATIC_ASSERT(ARRAY_LEN(precode_decode_results) ==
+		      DEFLATE_NUM_PRECODE_SYMS);
+
 	return build_decode_table(d->u.l.precode_decode_table,
 				  d->u.precode_lens,
 				  DEFLATE_NUM_PRECODE_SYMS,
 				  precode_decode_results,
 				  PRECODE_TABLEBITS,
 				  DEFLATE_MAX_PRE_CODEWORD_LEN,
-				  d->sorted_syms);
+				  d->sorted_syms,
+				  NULL);
 }
 
 /* Build the decode table for the literal/length code.  */
@@ -833,7 +1020,10 @@ build_litlen_decode_table(struct libdeflate_decompressor *d,
 			  unsigned num_litlen_syms, unsigned num_offset_syms)
 {
 	/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
-	STATIC_ASSERT(LITLEN_TABLEBITS == 10 && LITLEN_ENOUGH == 1334);
+	STATIC_ASSERT(LITLEN_TABLEBITS == 11 && LITLEN_ENOUGH == 2342);
+
+	STATIC_ASSERT(ARRAY_LEN(litlen_decode_results) ==
+		      DEFLATE_NUM_LITLEN_SYMS);
 
 	return build_decode_table(d->u.litlen_decode_table,
 				  d->u.l.lens,
@@ -841,7 +1031,8 @@ build_litlen_decode_table(struct libdeflate_decompressor *d,
 				  litlen_decode_results,
 				  LITLEN_TABLEBITS,
 				  DEFLATE_MAX_LITLEN_CODEWORD_LEN,
-				  d->sorted_syms);
+				  d->sorted_syms,
+				  &d->litlen_tablebits);
 }
 
 /* Build the decode table for the offset code.  */
@@ -852,33 +1043,17 @@ build_offset_decode_table(struct libdeflate_decompressor *d,
 	/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
 	STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402);
 
+	STATIC_ASSERT(ARRAY_LEN(offset_decode_results) ==
+		      DEFLATE_NUM_OFFSET_SYMS);
+
 	return build_decode_table(d->offset_decode_table,
 				  d->u.l.lens + num_litlen_syms,
 				  num_offset_syms,
 				  offset_decode_results,
 				  OFFSET_TABLEBITS,
 				  DEFLATE_MAX_OFFSET_CODEWORD_LEN,
-				  d->sorted_syms);
-}
-
-static forceinline machine_word_t
-repeat_byte(u8 b)
-{
-	machine_word_t v;
-
-	STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
-
-	v = b;
-	v |= v << 8;
-	v |= v << 16;
-	v |= v << ((WORDBITS == 64) ? 32 : 0);
-	return v;
-}
-
-static forceinline void
-copy_word_unaligned(const void *src, void *dst)
-{
-	store_word_unaligned(load_word_unaligned(src), dst);
+				  d->sorted_syms,
+				  NULL);
 }
 
 /*****************************************************************************
@@ -886,12 +1061,15 @@ copy_word_unaligned(const void *src, void *dst)
  *****************************************************************************/
 
 typedef enum libdeflate_result (*decompress_func_t)
-	(struct libdeflate_decompressor *d,
-	 const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail,
+	(struct libdeflate_decompressor * restrict d,
+	 const void * restrict in, size_t in_nbytes,
+	 void * restrict out, size_t out_nbytes_avail,
 	 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);
 
 #define FUNCNAME deflate_decompress_default
-#define ATTRIBUTES
+#undef ATTRIBUTES
+#undef EXTRACT_VARBITS
+#undef EXTRACT_VARBITS8
 #include "decompress_template.h"
 
 /* Include architecture-specific implementation(s) if available. */


=====================================
lib/x86/cpu_features.h
=====================================
@@ -156,6 +156,8 @@ typedef char  __v64qi __attribute__((__vector_size__(64)));
 #define HAVE_BMI2_TARGET \
 	(HAVE_DYNAMIC_X86_CPU_FEATURES && \
 	 (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pdep_di)))
+#define HAVE_BMI2_INTRIN \
+	(HAVE_BMI2_NATIVE || (HAVE_BMI2_TARGET && HAVE_TARGET_INTRINSICS))
 
 #endif /* __i386__ || __x86_64__ */
 


=====================================
lib/x86/decompress_impl.h
=====================================
@@ -4,18 +4,46 @@
 #include "cpu_features.h"
 
 /* BMI2 optimized version */
-#if HAVE_BMI2_TARGET && !HAVE_BMI2_NATIVE
-#  define FUNCNAME		deflate_decompress_bmi2
-#  define ATTRIBUTES		__attribute__((target("bmi2")))
+#if HAVE_BMI2_INTRIN
+#  define deflate_decompress_bmi2	deflate_decompress_bmi2
+#  define FUNCNAME			deflate_decompress_bmi2
+#  if !HAVE_BMI2_NATIVE
+#    define ATTRIBUTES			__attribute__((target("bmi2")))
+#  endif
+   /*
+    * Even with __attribute__((target("bmi2"))), gcc doesn't reliably use the
+    * bzhi instruction for 'word & BITMASK(count)'.  So use the bzhi intrinsic
+    * explicitly.  EXTRACT_VARBITS() is equivalent to 'word & BITMASK(count)';
+    * EXTRACT_VARBITS8() is equivalent to 'word & BITMASK((u8)count)'.
+    * Nevertheless, their implementation using the bzhi intrinsic is identical,
+    * as the bzhi instruction truncates the count to 8 bits implicitly.
+    */
+#  ifndef __clang__
+#    include <immintrin.h>
+#    ifdef __x86_64__
+#      define EXTRACT_VARBITS(word, count)  _bzhi_u64((word), (count))
+#      define EXTRACT_VARBITS8(word, count) _bzhi_u64((word), (count))
+#    else
+#      define EXTRACT_VARBITS(word, count)  _bzhi_u32((word), (count))
+#      define EXTRACT_VARBITS8(word, count) _bzhi_u32((word), (count))
+#    endif
+#  endif
 #  include "../decompress_template.h"
+#endif /* HAVE_BMI2_INTRIN */
+
+#if defined(deflate_decompress_bmi2) && HAVE_BMI2_NATIVE
+#define DEFAULT_IMPL	deflate_decompress_bmi2
+#else
 static inline decompress_func_t
 arch_select_decompress_func(void)
 {
+#ifdef deflate_decompress_bmi2
 	if (HAVE_BMI2(get_x86_cpu_features()))
 		return deflate_decompress_bmi2;
+#endif
 	return NULL;
 }
-#  define arch_select_decompress_func	arch_select_decompress_func
+#define arch_select_decompress_func	arch_select_decompress_func
 #endif
 
 #endif /* LIB_X86_DECOMPRESS_IMPL_H */

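For reference, here is a portable sketch of what these two macros compute (the non-BMI2 definition is assumed to be a plain mask, per the equivalence stated in the comment above):

#include <stdint.h>
#include <stdio.h>

#define BITMASK(n)	(((uint64_t)1 << (n)) - 1)

/* Portable equivalents of EXTRACT_VARBITS() and EXTRACT_VARBITS8().  The
 * BMI2 build maps both to bzhi, which only looks at the low 8 bits of the
 * count, making the two operations identical there. */
static uint64_t extract_varbits(uint64_t word, unsigned count)
{
	return word & BITMASK(count);
}

static uint64_t extract_varbits8(uint64_t word, unsigned count)
{
	return word & BITMASK((uint8_t)count);
}

int main(void)
{
	uint64_t word = 0x123456789abcdef0ull;

	printf("0x%llx\n", (unsigned long long)extract_varbits(word, 12));
	/* Garbage in the high bits of the count gives the same result: */
	printf("0x%llx\n", (unsigned long long)extract_varbits8(word, 0x100 + 12));
	return 0;
}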

=====================================
lib/x86/matchfinder_impl.h
=====================================
@@ -28,7 +28,9 @@
 #ifndef LIB_X86_MATCHFINDER_IMPL_H
 #define LIB_X86_MATCHFINDER_IMPL_H
 
-#ifdef __AVX2__
+#include "cpu_features.h"
+
+#if HAVE_AVX2_NATIVE
 #  include <immintrin.h>
 static forceinline void
 matchfinder_init_avx2(mf_pos_t *data, size_t size)
@@ -73,7 +75,7 @@ matchfinder_rebase_avx2(mf_pos_t *data, size_t size)
 }
 #define matchfinder_rebase matchfinder_rebase_avx2
 
-#elif defined(__SSE2__)
+#elif HAVE_SSE2_NATIVE
 #  include <emmintrin.h>
 static forceinline void
 matchfinder_init_sse2(mf_pos_t *data, size_t size)
@@ -117,6 +119,6 @@ matchfinder_rebase_sse2(mf_pos_t *data, size_t size)
 	} while (size != 0);
 }
 #define matchfinder_rebase matchfinder_rebase_sse2
-#endif /* __SSE2__ */
+#endif /* HAVE_SSE2_NATIVE */
 
 #endif /* LIB_X86_MATCHFINDER_IMPL_H */


=====================================
libdeflate.h
=====================================
@@ -10,8 +10,8 @@ extern "C" {
 #endif
 
 #define LIBDEFLATE_VERSION_MAJOR	1
-#define LIBDEFLATE_VERSION_MINOR	13
-#define LIBDEFLATE_VERSION_STRING	"1.13"
+#define LIBDEFLATE_VERSION_MINOR	14
+#define LIBDEFLATE_VERSION_STRING	"1.14"
 
 #include <stddef.h>
 #include <stdint.h>


=====================================
scripts/afl-fuzz/fuzz.sh
=====================================
@@ -119,7 +119,9 @@ fi
 CFLAGS+=" -DLIBDEFLATE_ENABLE_ASSERTIONS"
 
 sudo sh -c "echo core > /proc/sys/kernel/core_pattern"
-sudo sh -c "echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor"
+if [ -e /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor ]; then
+	sudo sh -c "echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor"
+fi
 
 NPROC=$(getconf _NPROCESSORS_ONLN)
 


=====================================
scripts/run_tests.sh
=====================================
@@ -143,7 +143,7 @@ build_and_run_tests() {
 			features+=(avx2 avx bmi2 pclmul sse2)
 			;;
 		arm*|aarch*)
-			features+=(sha3 crc32 pmull neon)
+			features+=(dotprod sha3 crc32 pmull neon)
 			;;
 		esac
 	fi



View it on GitLab: https://salsa.debian.org/med-team/libdeflate/-/commit/d43d738959ea84d3eb9b539896d3492476ba9423
