[med-svn] [Git][med-team/libdeflate][master] 2 commits: New upstream version 1.14
Lance Lin (@linqigang)
gitlab at salsa.debian.org
Mon Oct 3 13:05:32 BST 2022
Lance Lin pushed to branch master at Debian Med / libdeflate
Commits:
d43d7389 by Lance Lin at 2022-10-03T18:59:36+07:00
New upstream version 1.14
- - - - -
7b8f0113 by Lance Lin at 2022-10-03T18:59:36+07:00
Update upstream source from tag 'upstream/1.14'
Update to upstream version '1.14'
with Debian dir f8515a34364e380ac874fe9d992a0755cba8641b
- - - - -
18 changed files:
- Makefile
- NEWS.md
- README.md
- common_defs.h
- lib/arm/adler32_impl.h
- lib/arm/cpu_features.c
- lib/arm/cpu_features.h
- lib/arm/matchfinder_impl.h
- lib/decompress_template.h
- lib/deflate_compress.c
- lib/deflate_constants.h
- lib/deflate_decompress.c
- lib/x86/cpu_features.h
- lib/x86/decompress_impl.h
- lib/x86/matchfinder_impl.h
- libdeflate.h
- scripts/afl-fuzz/fuzz.sh
- scripts/run_tests.sh
Changes:
=====================================
Makefile
=====================================
@@ -55,12 +55,13 @@ cc-option = $(shell if $(CC) $(1) -c -x c /dev/null -o /dev/null \
override CFLAGS := \
-O2 -fomit-frame-pointer -std=c99 -I. -Wall -Wundef \
- $(call cc-option,-Wpedantic) \
$(call cc-option,-Wdeclaration-after-statement) \
+ $(call cc-option,-Wimplicit-fallthrough) \
$(call cc-option,-Wmissing-prototypes) \
+ $(call cc-option,-Wpedantic) \
+ $(call cc-option,-Wshadow) \
$(call cc-option,-Wstrict-prototypes) \
$(call cc-option,-Wvla) \
- $(call cc-option,-Wimplicit-fallthrough) \
$(CFLAGS)
FREESTANDING :=
=====================================
NEWS.md
=====================================
@@ -1,5 +1,31 @@
# libdeflate release notes
+## Version 1.14
+
+Significantly improved decompression performance on all platforms. Examples
+include (measuring DEFLATE only):
+
+| Platform | Speedup over v1.13 |
+|------------------------------------|--------------------|
+| x86_64 (Intel Comet Lake), gcc | 1.287x |
+| x86_64 (Intel Comet Lake), clang | 1.437x |
+| x86_64 (Intel Ice Lake), gcc | 1.332x |
+| x86_64 (Intel Ice Lake), clang | 1.296x |
+| x86_64 (Intel Sandy Bridge), gcc | 1.162x |
+| x86_64 (Intel Sandy Bridge), clang | 1.092x |
+| x86_64 (AMD Zen 2), gcc | 1.263x |
+| x86_64 (AMD Zen 2), clang | 1.259x |
+| i386 (Intel Comet Lake), gcc | 1.570x |
+| i386 (Intel Comet Lake), clang | 1.344x |
+| arm64 (Apple M1), clang | 1.306x |
+| arm64 (Cortex-A76), clang | 1.355x |
+| arm64 (Cortex-A55), clang | 1.190x |
+| arm32 (Cortex-A76), clang | 1.665x |
+| arm32 (Cortex-A55), clang | 1.283x |
+
+Thanks to Dougall Johnson (https://dougallj.wordpress.com/) for ideas for many
+of the improvements.
+
## Version 1.13
* Changed the 32-bit Windows build of the library to use the default calling
=====================================
README.md
=====================================
@@ -27,11 +27,13 @@ For the release notes, see the [NEWS file](NEWS.md).
## Table of Contents
- [Building](#building)
- - [For UNIX](#for-unix)
- - [For macOS](#for-macos)
- - [For Windows](#for-windows)
- - [Using Cygwin](#using-cygwin)
- - [Using MSYS2](#using-msys2)
+ - [Using the Makefile](#using-the-makefile)
+ - [For UNIX](#for-unix)
+ - [For macOS](#for-macos)
+ - [For Windows](#for-windows)
+ - [Using Cygwin](#using-cygwin)
+ - [Using MSYS2](#using-msys2)
+ - [Using a custom build system](#using-a-custom-build-system)
- [API](#api)
- [Bindings for other programming languages](#bindings-for-other-programming-languages)
- [DEFLATE vs. zlib vs. gzip](#deflate-vs-zlib-vs-gzip)
@@ -42,7 +44,14 @@ For the release notes, see the [NEWS file](NEWS.md).
# Building
-## For UNIX
+libdeflate and the provided programs like `gzip` can be built using the provided
+Makefile. If only the library is needed, it can alternatively be easily
+integrated into applications and built using any build system; see [Using a
+custom build system](#using-a-custom-build-system).
+
+## Using the Makefile
+
+### For UNIX
Just run `make`, then (if desired) `make install`. You need GNU Make and either
GCC or Clang. GCC is recommended because it builds slightly faster binaries.
@@ -57,7 +66,7 @@ There are also many options which can be set on the `make` command line, e.g. to
omit library features or to customize the directories into which `make install`
installs files. See the Makefile for details.
-## For macOS
+### For macOS
Prebuilt macOS binaries can be installed with [Homebrew](https://brew.sh):
@@ -65,7 +74,7 @@ Prebuilt macOS binaries can be installed with [Homebrew](https://brew.sh):
But if you need to build the binaries yourself, see the section for UNIX above.
-## For Windows
+### For Windows
Prebuilt Windows binaries can be downloaded from
https://github.com/ebiggers/libdeflate/releases. But if you need to build the
@@ -84,7 +93,7 @@ binaries built with MinGW will be significantly faster.
Also note that 64-bit binaries are faster than 32-bit binaries and should be
preferred whenever possible.
-### Using Cygwin
+#### Using Cygwin
Run the Cygwin installer, available from https://cygwin.com/setup-x86_64.exe.
When you get to the package selection screen, choose the following additional
@@ -119,7 +128,7 @@ or to build 32-bit binaries:
make CC=i686-w64-mingw32-gcc
-### Using MSYS2
+#### Using MSYS2
Run the MSYS2 installer, available from http://www.msys2.org/. After
installing, open an MSYS2 shell and run:
@@ -161,6 +170,23 @@ and run the following commands:
Or to build 32-bit binaries, do the same but use "MSYS2 MinGW 32-bit" instead.
+## Using a custom build system
+
+The source files of the library are designed to be compilable directly, without
+any prerequisite step like running a `./configure` script. Therefore, as an
+alternative to building the library using the provided Makefile, the library
+source files can be easily integrated directly into your application and built
+using any build system.
+
+You should compile both `lib/*.c` and `lib/*/*.c`. You don't need to worry
+about excluding irrelevant architecture-specific code, as this is already
+handled in the source files themselves using `#ifdef`s.
+
+It is **strongly** recommended to use either gcc or clang, and to use `-O2`.
+
+If you are doing a freestanding build with `-ffreestanding`, you must add
+`-DFREESTANDING` as well, otherwise performance will suffer greatly.
+
# API
libdeflate has a simple API that is not zlib-compatible. You can create
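For orientation, one-shot compression with this API typically looks like the following sketch (hedged: the function names are taken from libdeflate.h, but the error handling and the choice of compression level 6 are illustrative only):

#include <stdlib.h>
#include <libdeflate.h>

/* Compress 'in' (in_nbytes bytes) into a newly allocated buffer.
 * Returns the compressed size, or 0 on failure. */
static size_t
compress_buffer(const void *in, size_t in_nbytes, void **out_ret)
{
        struct libdeflate_compressor *c = libdeflate_alloc_compressor(6);
        size_t out_avail, out_nbytes;
        void *out;

        if (c == NULL)
                return 0;
        /* Worst-case DEFLATE output size for this input size */
        out_avail = libdeflate_deflate_compress_bound(c, in_nbytes);
        out = malloc(out_avail);
        if (out == NULL) {
                libdeflate_free_compressor(c);
                return 0;
        }
        out_nbytes = libdeflate_deflate_compress(c, in, in_nbytes, out, out_avail);
        libdeflate_free_compressor(c);
        if (out_nbytes == 0) {  /* 0 means the output didn't fit */
                free(out);
                return 0;
        }
        *out_ret = out;
        return out_nbytes;
}

Decompression mirrors this with libdeflate_alloc_decompressor() and libdeflate_deflate_decompress(), which needs a known or upper-bounded uncompressed size since only full-buffer decompression is supported.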
=====================================
common_defs.h
=====================================
@@ -144,8 +144,17 @@ typedef size_t machine_word_t;
/* restrict - hint that writes only occur through the given pointer */
#ifdef __GNUC__
# define restrict __restrict__
+#elif defined(_MSC_VER)
+ /*
+ * Don't use MSVC's __restrict; it has nonstandard behavior.
+ * Standard restrict is okay, if it is supported.
+ */
+# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
+# define restrict restrict
+# else
+# define restrict
+# endif
#else
-/* Don't use MSVC's __restrict; it has nonstandard behavior. */
# define restrict
#endif
@@ -200,6 +209,7 @@ typedef size_t machine_word_t;
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define STATIC_ASSERT(expr) ((void)sizeof(char[1 - 2 * !(expr)]))
#define ALIGN(n, a) (((n) + (a) - 1) & ~((a) - 1))
+#define ROUND_UP(n, d) ((d) * DIV_ROUND_UP((n), (d)))
/* ========================================================================== */
/* Endianness handling */
@@ -513,8 +523,10 @@ bsr32(u32 v)
#ifdef __GNUC__
return 31 - __builtin_clz(v);
#elif defined(_MSC_VER)
- _BitScanReverse(&v, v);
- return v;
+ unsigned long i;
+
+ _BitScanReverse(&i, v);
+ return i;
#else
unsigned i = 0;
@@ -529,9 +541,11 @@ bsr64(u64 v)
{
#ifdef __GNUC__
return 63 - __builtin_clzll(v);
-#elif defined(_MSC_VER) && defined(_M_X64)
- _BitScanReverse64(&v, v);
- return v;
+#elif defined(_MSC_VER) && defined(_WIN64)
+ unsigned long i;
+
+ _BitScanReverse64(&i, v);
+ return i;
#else
unsigned i = 0;
@@ -563,8 +577,10 @@ bsf32(u32 v)
#ifdef __GNUC__
return __builtin_ctz(v);
#elif defined(_MSC_VER)
- _BitScanForward(&v, v);
- return v;
+ unsigned long i;
+
+ _BitScanForward(&i, v);
+ return i;
#else
unsigned i = 0;
@@ -579,9 +595,11 @@ bsf64(u64 v)
{
#ifdef __GNUC__
return __builtin_ctzll(v);
-#elif defined(_MSC_VER) && defined(_M_X64)
- _BitScanForward64(&v, v);
- return v;
+#elif defined(_MSC_VER) && defined(_WIN64)
+ unsigned long i;
+
+ _BitScanForward64(&i, v);
+ return i;
#else
unsigned i = 0;
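For reference, the bit-scan helpers patched above all share the same contract: given a nonzero word, bsr returns the index of the highest set bit and bsf the index of the lowest. A portable illustration (not libdeflate's exact fallback code):

/* bsr32: index of the highest set bit of a nonzero v (e.g. 0x10 -> 4). */
static unsigned
bsr32_portable(unsigned int v)
{
        unsigned i = 0;

        while ((v >>= 1) != 0)
                i++;
        return i;
}

/* bsf32: index of the lowest set bit of a nonzero v (e.g. 0x18 -> 3). */
static unsigned
bsf32_portable(unsigned int v)
{
        unsigned i = 0;

        while (!(v & 1)) {
                v >>= 1;
                i++;
        }
        return i;
}

The MSVC fix above routes the intrinsics' result through a local unsigned long, since _BitScanReverse()/_BitScanForward() write their output through an unsigned long pointer, and switches the 64-bit variants from _M_X64 to _WIN64 so they also cover ARM64 Windows.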
=====================================
lib/arm/adler32_impl.h
=====================================
@@ -30,7 +30,9 @@
#include "cpu_features.h"
+/* Regular NEON implementation */
#if HAVE_NEON_INTRIN && CPU_IS_LITTLE_ENDIAN()
+# define adler32_neon adler32_neon
# define FUNCNAME adler32_neon
# define FUNCNAME_CHUNK adler32_neon_chunk
# define IMPL_ALIGNMENT 16
@@ -140,18 +142,119 @@ adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end,
*s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3];
}
# include "../adler32_vec_template.h"
-# if HAVE_NEON_NATIVE
-# define DEFAULT_IMPL adler32_neon
+#endif /* Regular NEON implementation */
+
+/* NEON+dotprod implementation */
+#if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN()
+# define adler32_neon_dotprod adler32_neon_dotprod
+# define FUNCNAME adler32_neon_dotprod
+# define FUNCNAME_CHUNK adler32_neon_dotprod_chunk
+# define IMPL_ALIGNMENT 16
+# define IMPL_SEGMENT_LEN 64
+# define IMPL_MAX_CHUNK_LEN MAX_CHUNK_LEN
+# if HAVE_DOTPROD_NATIVE
+# define ATTRIBUTES
# else
+# ifdef __clang__
+# define ATTRIBUTES __attribute__((target("dotprod")))
+ /*
+ * With gcc, arch=armv8.2-a is needed for dotprod intrinsics, unless the
+ * default target is armv8.3-a or later in which case it must be omitted.
+ * armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT.
+ */
+# elif defined(__ARM_FEATURE_JCVT)
+# define ATTRIBUTES __attribute__((target("+dotprod")))
+# else
+# define ATTRIBUTES __attribute__((target("arch=armv8.2-a+dotprod")))
+# endif
+# endif
+# include <arm_neon.h>
+static forceinline ATTRIBUTES void
+adler32_neon_dotprod_chunk(const uint8x16_t *p, const uint8x16_t * const end,
+ u32 *s1, u32 *s2)
+{
+ const uint8x16_t mults_a = {
+ 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
+ };
+ const uint8x16_t mults_b = {
+ 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
+ };
+ const uint8x16_t mults_c = {
+ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+ };
+ const uint8x16_t mults_d = {
+ 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
+ };
+ const uint8x16_t ones = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

+ };
+ uint32x4_t v_s1_a = { 0, 0, 0, 0 };
+ uint32x4_t v_s1_b = { 0, 0, 0, 0 };
+ uint32x4_t v_s1_c = { 0, 0, 0, 0 };
+ uint32x4_t v_s1_d = { 0, 0, 0, 0 };
+ uint32x4_t v_s2_a = { 0, 0, 0, 0 };
+ uint32x4_t v_s2_b = { 0, 0, 0, 0 };
+ uint32x4_t v_s2_c = { 0, 0, 0, 0 };
+ uint32x4_t v_s2_d = { 0, 0, 0, 0 };
+ uint32x4_t v_s1_sums_a = { 0, 0, 0, 0 };
+ uint32x4_t v_s1_sums_b = { 0, 0, 0, 0 };
+ uint32x4_t v_s1_sums_c = { 0, 0, 0, 0 };
+ uint32x4_t v_s1_sums_d = { 0, 0, 0, 0 };
+ uint32x4_t v_s1;
+ uint32x4_t v_s2;
+
+ do {
+ uint8x16_t bytes_a = *p++;
+ uint8x16_t bytes_b = *p++;
+ uint8x16_t bytes_c = *p++;
+ uint8x16_t bytes_d = *p++;
+
+ v_s1_sums_a += v_s1_a;
+ v_s1_a = vdotq_u32(v_s1_a, bytes_a, ones);
+ v_s2_a = vdotq_u32(v_s2_a, bytes_a, mults_a);
+
+ v_s1_sums_b += v_s1_b;
+ v_s1_b = vdotq_u32(v_s1_b, bytes_b, ones);
+ v_s2_b = vdotq_u32(v_s2_b, bytes_b, mults_b);
+
+ v_s1_sums_c += v_s1_c;
+ v_s1_c = vdotq_u32(v_s1_c, bytes_c, ones);
+ v_s2_c = vdotq_u32(v_s2_c, bytes_c, mults_c);
+
+ v_s1_sums_d += v_s1_d;
+ v_s1_d = vdotq_u32(v_s1_d, bytes_d, ones);
+ v_s2_d = vdotq_u32(v_s2_d, bytes_d, mults_d);
+ } while (p != end);
+
+ v_s1 = v_s1_a + v_s1_b + v_s1_c + v_s1_d;
+ v_s2 = v_s2_a + v_s2_b + v_s2_c + v_s2_d +
+ vqshlq_n_u32(v_s1_sums_a + v_s1_sums_b +
+ v_s1_sums_c + v_s1_sums_d, 6);
+ *s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3];
+ *s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3];
+}
+# include "../adler32_vec_template.h"
+#endif /* NEON+dotprod implementation */
+
+#if defined(adler32_neon_dotprod) && HAVE_DOTPROD_NATIVE
+#define DEFAULT_IMPL adler32_neon_dotprod
+#else
static inline adler32_func_t
arch_select_adler32_func(void)
{
- if (HAVE_NEON(get_arm_cpu_features()))
+ const u32 features MAYBE_UNUSED = get_arm_cpu_features();
+
+#ifdef adler32_neon_dotprod
+ if (HAVE_NEON(features) && HAVE_DOTPROD(features))
+ return adler32_neon_dotprod;
+#endif
+#ifdef adler32_neon
+ if (HAVE_NEON(features))
return adler32_neon;
+#endif
return NULL;
}
-# define arch_select_adler32_func arch_select_adler32_func
-# endif /* !HAVE_NEON_NATIVE */
-#endif /* HAVE_NEON_INTRIN && CPU_IS_LITTLE_ENDIAN() */
+#define arch_select_adler32_func arch_select_adler32_func
+#endif
#endif /* LIB_ARM_ADLER32_IMPL_H */
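The dotprod kernel above evaluates the standard Adler-32 update in 64-byte segments, which is why the multiplier vectors count down from 64 to 1 and the accumulated s1 sums are scaled by 2^6. As a point of reference, a scalar sketch of the per-chunk update being vectorized (modulo reductions and the adler32_vec_template.h plumbing omitted; this is an illustration, not the library's code):

/*
 * Process n bytes at p, updating the running Adler-32 halves s1 and s2.
 * Expanding the byte-by-byte recurrence gives the closed form the SIMD
 * code computes:
 *      s1' = s1 + sum(p[i])
 *      s2' = s2 + n*s1 + sum((n - i) * p[i])
 */
static void
adler32_chunk_scalar(const unsigned char *p, size_t n,
                     unsigned int *s1, unsigned int *s2)
{
        size_t i;

        for (i = 0; i < n; i++) {
                *s1 += p[i];
                *s2 += *s1;
        }
}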
=====================================
lib/arm/cpu_features.c
=====================================
@@ -126,6 +126,8 @@ static u32 query_arm_cpu_features(void)
features |= ARM_CPU_FEATURE_CRC32;
if (hwcap & (1 << 17)) /* HWCAP_SHA3 */
features |= ARM_CPU_FEATURE_SHA3;
+ if (hwcap & (1 << 20)) /* HWCAP_ASIMDDP */
+ features |= ARM_CPU_FEATURE_DOTPROD;
#endif
return features;
}
@@ -140,12 +142,13 @@ static const struct {
const char *name;
u32 feature;
} feature_sysctls[] = {
- { "hw.optional.neon", ARM_CPU_FEATURE_NEON },
- { "hw.optional.AdvSIMD", ARM_CPU_FEATURE_NEON },
- { "hw.optional.arm.FEAT_PMULL", ARM_CPU_FEATURE_PMULL },
- { "hw.optional.armv8_crc32", ARM_CPU_FEATURE_CRC32 },
- { "hw.optional.armv8_2_sha3", ARM_CPU_FEATURE_SHA3 },
- { "hw.optional.arm.FEAT_SHA3", ARM_CPU_FEATURE_SHA3 },
+ { "hw.optional.neon", ARM_CPU_FEATURE_NEON },
+ { "hw.optional.AdvSIMD", ARM_CPU_FEATURE_NEON },
+ { "hw.optional.arm.FEAT_PMULL", ARM_CPU_FEATURE_PMULL },
+ { "hw.optional.armv8_crc32", ARM_CPU_FEATURE_CRC32 },
+ { "hw.optional.armv8_2_sha3", ARM_CPU_FEATURE_SHA3 },
+ { "hw.optional.arm.FEAT_SHA3", ARM_CPU_FEATURE_SHA3 },
+ { "hw.optional.arm.FEAT_DotProd", ARM_CPU_FEATURE_DOTPROD },
};
static u32 query_arm_cpu_features(void)
@@ -173,6 +176,7 @@ static const struct cpu_feature arm_cpu_feature_table[] = {
{ARM_CPU_FEATURE_PMULL, "pmull"},
{ARM_CPU_FEATURE_CRC32, "crc32"},
{ARM_CPU_FEATURE_SHA3, "sha3"},
+ {ARM_CPU_FEATURE_DOTPROD, "dotprod"},
};
volatile u32 libdeflate_arm_cpu_features = 0;
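The Linux path above keys off bit 20 of the AT_HWCAP auxiliary vector, which the kernel defines as HWCAP_ASIMDDP on arm64. A minimal standalone check along the same lines (Linux-only, for illustration):

#include <stdio.h>
#include <sys/auxv.h>           /* getauxval(), AT_HWCAP */

int
main(void)
{
        unsigned long hwcap = getauxval(AT_HWCAP);

        /* 1 << 20 is HWCAP_ASIMDDP (Advanced SIMD dot product) on arm64. */
        if (hwcap & (1UL << 20))
                printf("dotprod: supported\n");
        else
                printf("dotprod: not supported\n");
        return 0;
}

On Apple platforms the equivalent information comes from the hw.optional.arm.FEAT_DotProd sysctl added to feature_sysctls[] above.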
=====================================
lib/arm/cpu_features.h
=====================================
@@ -46,11 +46,13 @@
#define ARM_CPU_FEATURE_PMULL 0x00000002
#define ARM_CPU_FEATURE_CRC32 0x00000004
#define ARM_CPU_FEATURE_SHA3 0x00000008
+#define ARM_CPU_FEATURE_DOTPROD 0x00000010
-#define HAVE_NEON(features) (HAVE_NEON_NATIVE || ((features) & ARM_CPU_FEATURE_NEON))
-#define HAVE_PMULL(features) (HAVE_PMULL_NATIVE || ((features) & ARM_CPU_FEATURE_PMULL))
-#define HAVE_CRC32(features) (HAVE_CRC32_NATIVE || ((features) & ARM_CPU_FEATURE_CRC32))
-#define HAVE_SHA3(features) (HAVE_SHA3_NATIVE || ((features) & ARM_CPU_FEATURE_SHA3))
+#define HAVE_NEON(features) (HAVE_NEON_NATIVE || ((features) & ARM_CPU_FEATURE_NEON))
+#define HAVE_PMULL(features) (HAVE_PMULL_NATIVE || ((features) & ARM_CPU_FEATURE_PMULL))
+#define HAVE_CRC32(features) (HAVE_CRC32_NATIVE || ((features) & ARM_CPU_FEATURE_CRC32))
+#define HAVE_SHA3(features) (HAVE_SHA3_NATIVE || ((features) & ARM_CPU_FEATURE_SHA3))
+#define HAVE_DOTPROD(features) (HAVE_DOTPROD_NATIVE || ((features) & ARM_CPU_FEATURE_DOTPROD))
#if HAVE_DYNAMIC_ARM_CPU_FEATURES
#define ARM_CPU_FEATURES_KNOWN 0x80000000
@@ -156,6 +158,24 @@ static inline u32 get_arm_cpu_features(void) { return 0; }
# define HAVE_SHA3_INTRIN 0
#endif
+/* dotprod */
+#ifdef __aarch64__
+# ifdef __ARM_FEATURE_DOTPROD
+# define HAVE_DOTPROD_NATIVE 1
+# else
+# define HAVE_DOTPROD_NATIVE 0
+# endif
+# define HAVE_DOTPROD_TARGET \
+ (HAVE_DYNAMIC_ARM_CPU_FEATURES && \
+ (GCC_PREREQ(8, 1) || __has_builtin(__builtin_neon_vdotq_v)))
+# define HAVE_DOTPROD_INTRIN \
+ (HAVE_NEON_INTRIN && (HAVE_DOTPROD_NATIVE || HAVE_DOTPROD_TARGET))
+#else
+# define HAVE_DOTPROD_NATIVE 0
+# define HAVE_DOTPROD_TARGET 0
+# define HAVE_DOTPROD_INTRIN 0
+#endif
+
/*
* Work around bugs in arm_acle.h and arm_neon.h where sometimes intrinsics are
* only defined when the corresponding __ARM_FEATURE_* macro is defined. The
@@ -170,6 +190,9 @@ static inline u32 get_arm_cpu_features(void) { return 0; }
#if HAVE_SHA3_INTRIN && !HAVE_SHA3_NATIVE && defined(__clang__)
# define __ARM_FEATURE_SHA3 1
#endif
+#if HAVE_DOTPROD_INTRIN && !HAVE_DOTPROD_NATIVE && defined(__clang__)
+# define __ARM_FEATURE_DOTPROD 1
+#endif
#if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && \
(defined(__clang__) || defined(__arm__))
# include <arm_acle.h>
@@ -179,6 +202,10 @@ static inline u32 get_arm_cpu_features(void) { return 0; }
# include <arm_neon.h>
# undef __ARM_FEATURE_SHA3
#endif
+#if HAVE_DOTPROD_INTRIN && !HAVE_DOTPROD_NATIVE && defined(__clang__)
+# include <arm_neon.h>
+# undef __ARM_FEATURE_DOTPROD
+#endif
#endif /* __arm__ || __aarch64__ */
=====================================
lib/arm/matchfinder_impl.h
=====================================
@@ -28,7 +28,9 @@
#ifndef LIB_ARM_MATCHFINDER_IMPL_H
#define LIB_ARM_MATCHFINDER_IMPL_H
-#ifdef __ARM_NEON
+#include "cpu_features.h"
+
+#if HAVE_NEON_NATIVE
# include <arm_neon.h>
static forceinline void
matchfinder_init_neon(mf_pos_t *data, size_t size)
@@ -81,6 +83,6 @@ matchfinder_rebase_neon(mf_pos_t *data, size_t size)
}
#define matchfinder_rebase matchfinder_rebase_neon
-#endif /* __ARM_NEON */
+#endif /* HAVE_NEON_NATIVE */
#endif /* LIB_ARM_MATCHFINDER_IMPL_H */
=====================================
lib/decompress_template.h
=====================================
@@ -31,7 +31,17 @@
* target instruction sets.
*/
-static enum libdeflate_result ATTRIBUTES
+#ifndef ATTRIBUTES
+# define ATTRIBUTES
+#endif
+#ifndef EXTRACT_VARBITS
+# define EXTRACT_VARBITS(word, count) ((word) & BITMASK(count))
+#endif
+#ifndef EXTRACT_VARBITS8
+# define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count)))
+#endif
+
+static enum libdeflate_result ATTRIBUTES MAYBE_UNUSED
FUNCNAME(struct libdeflate_decompressor * restrict d,
const void * restrict in, size_t in_nbytes,
void * restrict out, size_t out_nbytes_avail,
@@ -39,102 +49,141 @@ FUNCNAME(struct libdeflate_decompressor * restrict d,
{
u8 *out_next = out;
u8 * const out_end = out_next + out_nbytes_avail;
+ u8 * const out_fastloop_end =
+ out_end - MIN(out_nbytes_avail, FASTLOOP_MAX_BYTES_WRITTEN);
+
+ /* Input bitstream state; see deflate_decompress.c for documentation */
const u8 *in_next = in;
const u8 * const in_end = in_next + in_nbytes;
+ const u8 * const in_fastloop_end =
+ in_end - MIN(in_nbytes, FASTLOOP_MAX_BYTES_READ);
bitbuf_t bitbuf = 0;
- unsigned bitsleft = 0;
+ bitbuf_t saved_bitbuf;
+ u32 bitsleft = 0;
size_t overread_count = 0;
- unsigned i;
- unsigned is_final_block;
+
+ bool is_final_block;
unsigned block_type;
- u16 len;
- u16 nlen;
unsigned num_litlen_syms;
unsigned num_offset_syms;
- u16 tmp16;
- u32 tmp32;
+ bitbuf_t litlen_tablemask;
+ u32 entry;
next_block:
- /* Starting to read the next block. */
+ /* Starting to read the next block */
;
- STATIC_ASSERT(CAN_ENSURE(1 + 2 + 5 + 5 + 4));
- ENSURE_BITS(1 + 2 + 5 + 5 + 4);
+ STATIC_ASSERT(CAN_CONSUME(1 + 2 + 5 + 5 + 4 + 3));
+ REFILL_BITS();
- /* BFINAL: 1 bit */
- is_final_block = POP_BITS(1);
+ /* BFINAL: 1 bit */
+ is_final_block = bitbuf & BITMASK(1);
- /* BTYPE: 2 bits */
- block_type = POP_BITS(2);
+ /* BTYPE: 2 bits */
+ block_type = (bitbuf >> 1) & BITMASK(2);
if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) {
- /* Dynamic Huffman block. */
+ /* Dynamic Huffman block */
- /* The order in which precode lengths are stored. */
+ /* The order in which precode lengths are stored */
static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
};
unsigned num_explicit_precode_lens;
+ unsigned i;
- /* Read the codeword length counts. */
+ /* Read the codeword length counts. */
- STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == ((1 << 5) - 1) + 257);
- num_litlen_syms = POP_BITS(5) + 257;
+ STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 257 + BITMASK(5));
+ num_litlen_syms = 257 + ((bitbuf >> 3) & BITMASK(5));
- STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == ((1 << 5) - 1) + 1);
- num_offset_syms = POP_BITS(5) + 1;
+ STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 1 + BITMASK(5));
+ num_offset_syms = 1 + ((bitbuf >> 8) & BITMASK(5));
- STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == ((1 << 4) - 1) + 4);
- num_explicit_precode_lens = POP_BITS(4) + 4;
+ STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == 4 + BITMASK(4));
+ num_explicit_precode_lens = 4 + ((bitbuf >> 13) & BITMASK(4));
d->static_codes_loaded = false;
- /* Read the precode codeword lengths. */
+ /*
+ * Read the precode codeword lengths.
+ *
+ * A 64-bit bitbuffer is just one bit too small to hold the
+ * maximum number of precode lens, so to minimize branches we
+ * merge one len with the previous fields.
+ */
STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1);
- for (i = 0; i < num_explicit_precode_lens; i++) {
- ENSURE_BITS(3);
- d->u.precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3);
+ if (CAN_CONSUME(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) {
+ d->u.precode_lens[deflate_precode_lens_permutation[0]] =
+ (bitbuf >> 17) & BITMASK(3);
+ bitbuf >>= 20;
+ bitsleft -= 20;
+ REFILL_BITS();
+ i = 1;
+ do {
+ d->u.precode_lens[deflate_precode_lens_permutation[i]] =
+ bitbuf & BITMASK(3);
+ bitbuf >>= 3;
+ bitsleft -= 3;
+ } while (++i < num_explicit_precode_lens);
+ } else {
+ bitbuf >>= 17;
+ bitsleft -= 17;
+ i = 0;
+ do {
+ if ((u8)bitsleft < 3)
+ REFILL_BITS();
+ d->u.precode_lens[deflate_precode_lens_permutation[i]] =
+ bitbuf & BITMASK(3);
+ bitbuf >>= 3;
+ bitsleft -= 3;
+ } while (++i < num_explicit_precode_lens);
}
-
for (; i < DEFLATE_NUM_PRECODE_SYMS; i++)
d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0;
- /* Build the decode table for the precode. */
+ /* Build the decode table for the precode. */
SAFETY_CHECK(build_precode_decode_table(d));
- /* Expand the literal/length and offset codeword lengths. */
- for (i = 0; i < num_litlen_syms + num_offset_syms; ) {
- u32 entry;
+ /* Decode the litlen and offset codeword lengths. */
+ i = 0;
+ do {
unsigned presym;
u8 rep_val;
unsigned rep_count;
- ENSURE_BITS(DEFLATE_MAX_PRE_CODEWORD_LEN + 7);
+ if ((u8)bitsleft < DEFLATE_MAX_PRE_CODEWORD_LEN + 7)
+ REFILL_BITS();
- /* (The code below assumes that the precode decode table
- * does not have any subtables.) */
+ /*
+ * The code below assumes that the precode decode table
+ * doesn't have any subtables.
+ */
STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);
- /* Read the next precode symbol. */
- entry = d->u.l.precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)];
- REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
- presym = entry >> HUFFDEC_RESULT_SHIFT;
+ /* Decode the next precode symbol. */
+ entry = d->u.l.precode_decode_table[
+ bitbuf & BITMASK(DEFLATE_MAX_PRE_CODEWORD_LEN)];
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry; /* optimization: subtract full entry */
+ presym = entry >> 16;
if (presym < 16) {
- /* Explicit codeword length */
+ /* Explicit codeword length */
d->u.l.lens[i++] = presym;
continue;
}
- /* Run-length encoded codeword lengths */
+ /* Run-length encoded codeword lengths */
- /* Note: we don't need verify that the repeat count
- * doesn't overflow the number of elements, since we
- * have enough extra spaces to allow for the worst-case
- * overflow (138 zeroes when only 1 length was
- * remaining).
+ /*
+ * Note: we don't need to verify that the repeat count
+ * doesn't overflow the number of elements, since we've
+ * sized the lens array to have enough extra space to
+ * allow for the worst-case overrun (138 zeroes when
+ * only 1 length was remaining).
*
* In the case of the small repeat counts (presyms 16
* and 17), it is fastest to always write the maximum
@@ -149,11 +198,13 @@ next_block:
STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1);
if (presym == 16) {
- /* Repeat the previous length 3 - 6 times */
+ /* Repeat the previous length 3 - 6 times. */
SAFETY_CHECK(i != 0);
rep_val = d->u.l.lens[i - 1];
- STATIC_ASSERT(3 + ((1 << 2) - 1) == 6);
- rep_count = 3 + POP_BITS(2);
+ STATIC_ASSERT(3 + BITMASK(2) == 6);
+ rep_count = 3 + (bitbuf & BITMASK(2));
+ bitbuf >>= 2;
+ bitsleft -= 2;
d->u.l.lens[i + 0] = rep_val;
d->u.l.lens[i + 1] = rep_val;
d->u.l.lens[i + 2] = rep_val;
@@ -162,9 +213,11 @@ next_block:
d->u.l.lens[i + 5] = rep_val;
i += rep_count;
} else if (presym == 17) {
- /* Repeat zero 3 - 10 times */
- STATIC_ASSERT(3 + ((1 << 3) - 1) == 10);
- rep_count = 3 + POP_BITS(3);
+ /* Repeat zero 3 - 10 times. */
+ STATIC_ASSERT(3 + BITMASK(3) == 10);
+ rep_count = 3 + (bitbuf & BITMASK(3));
+ bitbuf >>= 3;
+ bitsleft -= 3;
d->u.l.lens[i + 0] = 0;
d->u.l.lens[i + 1] = 0;
d->u.l.lens[i + 2] = 0;
@@ -177,25 +230,45 @@ next_block:
d->u.l.lens[i + 9] = 0;
i += rep_count;
} else {
- /* Repeat zero 11 - 138 times */
- STATIC_ASSERT(11 + ((1 << 7) - 1) == 138);
- rep_count = 11 + POP_BITS(7);
+ /* Repeat zero 11 - 138 times. */
+ STATIC_ASSERT(11 + BITMASK(7) == 138);
+ rep_count = 11 + (bitbuf & BITMASK(7));
+ bitbuf >>= 7;
+ bitsleft -= 7;
memset(&d->u.l.lens[i], 0,
rep_count * sizeof(d->u.l.lens[i]));
i += rep_count;
}
- }
+ } while (i < num_litlen_syms + num_offset_syms);
+
} else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
+ u16 len, nlen;
- /* Uncompressed block: copy 'len' bytes literally from the input
- * buffer to the output buffer. */
+ /*
+ * Uncompressed block: copy 'len' bytes literally from the input
+ * buffer to the output buffer.
+ */
- ALIGN_INPUT();
+ bitsleft -= 3; /* for BTYPE and BFINAL */
- SAFETY_CHECK(in_end - in_next >= 4);
+ /*
+ * Align the bitstream to the next byte boundary. This means
+ * the next byte boundary as if we were reading a byte at a
+ * time. Therefore, we have to rewind 'in_next' by any bytes
+ * that have been refilled but not actually consumed yet (not
+ * counting overread bytes, which don't increment 'in_next').
+ */
+ bitsleft = (u8)bitsleft;
+ SAFETY_CHECK(overread_count <= (bitsleft >> 3));
+ in_next -= (bitsleft >> 3) - overread_count;
+ overread_count = 0;
+ bitbuf = 0;
+ bitsleft = 0;
- len = READ_U16();
- nlen = READ_U16();
+ SAFETY_CHECK(in_end - in_next >= 4);
+ len = get_unaligned_le16(in_next);
+ nlen = get_unaligned_le16(in_next + 2);
+ in_next += 4;
SAFETY_CHECK(len == (u16)~nlen);
if (unlikely(len > out_end - out_next))
@@ -209,6 +282,8 @@ next_block:
goto block_done;
} else {
+ unsigned i;
+
SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);
/*
@@ -221,6 +296,9 @@ next_block:
* dynamic Huffman block.
*/
+ bitbuf >>= 3; /* for BTYPE and BFINAL */
+ bitsleft -= 3;
+
if (d->static_codes_loaded)
goto have_decode_tables;
@@ -245,169 +323,442 @@ next_block:
num_offset_syms = 32;
}
- /* Decompressing a Huffman block (either dynamic or static) */
+ /* Decompressing a Huffman block (either dynamic or static) */
SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));
have_decode_tables:
-
- /* The main DEFLATE decode loop */
- for (;;) {
- u32 entry;
- u32 length;
- u32 offset;
+ litlen_tablemask = BITMASK(d->litlen_tablebits);
+
+ /*
+ * This is the "fastloop" for decoding literals and matches. It does
+ * bounds checks on in_next and out_next in the loop conditions so that
+ * additional bounds checks aren't needed inside the loop body.
+ *
+ * To reduce latency, the bitbuffer is refilled and the next litlen
+ * decode table entry is preloaded before each loop iteration.
+ */
+ if (in_next >= in_fastloop_end || out_next >= out_fastloop_end)
+ goto generic_loop;
+ REFILL_BITS_IN_FASTLOOP();
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ do {
+ u32 length, offset, lit;
const u8 *src;
u8 *dst;
- /* Decode a litlen symbol. */
- ENSURE_BITS(DEFLATE_MAX_LITLEN_CODEWORD_LEN);
- entry = d->u.litlen_decode_table[BITS(LITLEN_TABLEBITS)];
- if (entry & HUFFDEC_SUBTABLE_POINTER) {
- /* Litlen subtable required (uncommon case) */
- REMOVE_BITS(LITLEN_TABLEBITS);
- entry = d->u.litlen_decode_table[
- ((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
- BITS(entry & HUFFDEC_LENGTH_MASK)];
- }
- REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
+ /*
+ * Consume the bits for the litlen decode table entry. Save the
+ * original bitbuf for later, in case the extra match length
+ * bits need to be extracted from it.
+ */
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry; /* optimization: subtract full entry */
+
+ /*
+ * Begin by checking for a "fast" literal, i.e. a literal that
+ * doesn't need a subtable.
+ */
if (entry & HUFFDEC_LITERAL) {
- /* Literal */
- if (unlikely(out_next == out_end))
- return LIBDEFLATE_INSUFFICIENT_SPACE;
- *out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT);
- continue;
+ /*
+ * On 64-bit platforms, we decode up to 2 extra fast
+ * literals in addition to the primary item, as this
+ * increases performance and still leaves enough bits
+ * remaining for what follows. We could actually do 3,
+ * assuming LITLEN_TABLEBITS=11, but that actually
+ * decreases performance slightly (perhaps by messing
+ * with the branch prediction of the conditional refill
+ * that happens later while decoding the match offset).
+ *
+ * Note: the definitions of FASTLOOP_MAX_BYTES_WRITTEN
+ * and FASTLOOP_MAX_BYTES_READ need to be updated if the
+ * number of extra literals decoded here is changed.
+ */
+ if (/* enough bits for 2 fast literals + length + offset preload? */
+ CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS +
+ LENGTH_MAXBITS,
+ OFFSET_TABLEBITS) &&
+ /* enough bits for 2 fast literals + slow literal + litlen preload? */
+ CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS +
+ DEFLATE_MAX_LITLEN_CODEWORD_LEN,
+ LITLEN_TABLEBITS)) {
+ /* 1st extra fast literal */
+ lit = entry >> 16;
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+ *out_next++ = lit;
+ if (entry & HUFFDEC_LITERAL) {
+ /* 2nd extra fast literal */
+ lit = entry >> 16;
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+ *out_next++ = lit;
+ if (entry & HUFFDEC_LITERAL) {
+ /*
+ * Another fast literal, but
+ * this one is in lieu of the
+ * primary item, so it doesn't
+ * count as one of the extras.
+ */
+ lit = entry >> 16;
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ REFILL_BITS_IN_FASTLOOP();
+ *out_next++ = lit;
+ continue;
+ }
+ }
+ } else {
+ /*
+ * Decode a literal. While doing so, preload
+ * the next litlen decode table entry and refill
+ * the bitbuffer. To reduce latency, we've
+ * arranged for there to be enough "preloadable"
+ * bits remaining to do the table preload
+ * independently of the refill.
+ */
+ STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(
+ LITLEN_TABLEBITS, LITLEN_TABLEBITS));
+ lit = entry >> 16;
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ REFILL_BITS_IN_FASTLOOP();
+ *out_next++ = lit;
+ continue;
+ }
}
- /* Match or end-of-block */
-
- entry >>= HUFFDEC_RESULT_SHIFT;
- ENSURE_BITS(MAX_ENSURE);
+ /*
+ * It's not a literal entry, so it can be a length entry, a
+ * subtable pointer entry, or an end-of-block entry. Detect the
+ * two unlikely cases by testing the HUFFDEC_EXCEPTIONAL flag.
+ */
+ if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
+ /* Subtable pointer or end-of-block entry */
- /* Pop the extra length bits and add them to the length base to
- * produce the full length. */
- length = (entry >> HUFFDEC_LENGTH_BASE_SHIFT) +
- POP_BITS(entry & HUFFDEC_EXTRA_LENGTH_BITS_MASK);
+ if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
+ goto block_done;
- /* The match destination must not end after the end of the
- * output buffer. For efficiency, combine this check with the
- * end-of-block check. We're using 0 for the special
- * end-of-block length, so subtract 1 and it turn it into
- * SIZE_MAX. */
- STATIC_ASSERT(HUFFDEC_END_OF_BLOCK_LENGTH == 0);
- if (unlikely((size_t)length - 1 >= out_end - out_next)) {
- if (unlikely(length != HUFFDEC_END_OF_BLOCK_LENGTH))
- return LIBDEFLATE_INSUFFICIENT_SPACE;
- goto block_done;
+ /*
+ * A subtable is required. Load and consume the
+ * subtable entry. The subtable entry can be of any
+ * type: literal, length, or end-of-block.
+ */
+ entry = d->u.litlen_decode_table[(entry >> 16) +
+ EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+
+ /*
+ * 32-bit platforms that use the byte-at-a-time refill
+ * method have to do a refill here for there to always
+ * be enough bits to decode a literal that requires a
+ * subtable, then preload the next litlen decode table
+ * entry; or to decode a match length that requires a
+ * subtable, then preload the offset decode table entry.
+ */
+ if (!CAN_CONSUME_AND_THEN_PRELOAD(DEFLATE_MAX_LITLEN_CODEWORD_LEN,
+ LITLEN_TABLEBITS) ||
+ !CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXBITS,
+ OFFSET_TABLEBITS))
+ REFILL_BITS_IN_FASTLOOP();
+ if (entry & HUFFDEC_LITERAL) {
+ /* Decode a literal that required a subtable. */
+ lit = entry >> 16;
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ REFILL_BITS_IN_FASTLOOP();
+ *out_next++ = lit;
+ continue;
+ }
+ if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
+ goto block_done;
+ /* Else, it's a length that required a subtable. */
}
- /* Decode the match offset. */
-
- entry = d->offset_decode_table[BITS(OFFSET_TABLEBITS)];
- if (entry & HUFFDEC_SUBTABLE_POINTER) {
- /* Offset subtable required (uncommon case) */
- REMOVE_BITS(OFFSET_TABLEBITS);
- entry = d->offset_decode_table[
- ((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
- BITS(entry & HUFFDEC_LENGTH_MASK)];
- }
- REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
- entry >>= HUFFDEC_RESULT_SHIFT;
-
- STATIC_ASSERT(CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
- DEFLATE_MAX_OFFSET_CODEWORD_LEN) &&
- CAN_ENSURE(DEFLATE_MAX_EXTRA_OFFSET_BITS));
- if (!CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
- DEFLATE_MAX_OFFSET_CODEWORD_LEN +
- DEFLATE_MAX_EXTRA_OFFSET_BITS))
- ENSURE_BITS(DEFLATE_MAX_EXTRA_OFFSET_BITS);
-
- /* Pop the extra offset bits and add them to the offset base to
- * produce the full offset. */
- offset = (entry & HUFFDEC_OFFSET_BASE_MASK) +
- POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT);
-
- /* The match source must not begin before the beginning of the
- * output buffer. */
- SAFETY_CHECK(offset <= out_next - (const u8 *)out);
+ /*
+ * Decode the match length: the length base value associated
+ * with the litlen symbol (which we extract from the decode
+ * table entry), plus the extra length bits. We don't need to
+ * consume the extra length bits here, as they were included in
+ * the bits consumed by the entry earlier. We also don't need
+ * to check for too-long matches here, as this is inside the
+ * fastloop where it's already been verified that the output
+ * buffer has enough space remaining to copy a max-length match.
+ */
+ length = entry >> 16;
+ length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
/*
- * Copy the match: 'length' bytes at 'out_next - offset' to
- * 'out_next', possibly overlapping. If the match doesn't end
- * too close to the end of the buffer and offset >= WORDBYTES ||
- * offset == 1, take a fast path which copies a word at a time
- * -- potentially more than the length of the match, but that's
- * fine as long as we check for enough extra space.
- *
- * The remaining cases are not performance-critical so are
- * handled by a simple byte-by-byte copy.
+ * Decode the match offset. There are enough "preloadable" bits
+ * remaining to preload the offset decode table entry, but a
+ * refill might be needed before consuming it.
*/
+ STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXFASTBITS,
+ OFFSET_TABLEBITS));
+ entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)];
+ if (CAN_CONSUME_AND_THEN_PRELOAD(OFFSET_MAXBITS,
+ LITLEN_TABLEBITS)) {
+ /*
+ * Decoding a match offset on a 64-bit platform. We may
+ * need to refill once, but then we can decode the whole
+ * offset and preload the next litlen table entry.
+ */
+ if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
+ /* Offset codeword requires a subtable */
+ if (unlikely((u8)bitsleft < OFFSET_MAXBITS +
+ LITLEN_TABLEBITS - PRELOAD_SLACK))
+ REFILL_BITS_IN_FASTLOOP();
+ bitbuf >>= OFFSET_TABLEBITS;
+ bitsleft -= OFFSET_TABLEBITS;
+ entry = d->offset_decode_table[(entry >> 16) +
+ EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+ } else if (unlikely((u8)bitsleft < OFFSET_MAXFASTBITS +
+ LITLEN_TABLEBITS - PRELOAD_SLACK))
+ REFILL_BITS_IN_FASTLOOP();
+ } else {
+ /* Decoding a match offset on a 32-bit platform */
+ REFILL_BITS_IN_FASTLOOP();
+ if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
+ /* Offset codeword requires a subtable */
+ bitbuf >>= OFFSET_TABLEBITS;
+ bitsleft -= OFFSET_TABLEBITS;
+ entry = d->offset_decode_table[(entry >> 16) +
+ EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+ REFILL_BITS_IN_FASTLOOP();
+ /* No further refill needed before extra bits */
+ STATIC_ASSERT(CAN_CONSUME(
+ OFFSET_MAXBITS - OFFSET_TABLEBITS));
+ } else {
+ /* No refill needed before extra bits */
+ STATIC_ASSERT(CAN_CONSUME(OFFSET_MAXFASTBITS));
+ }
+ }
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry; /* optimization: subtract full entry */
+ offset = entry >> 16;
+ offset += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
+ /* Validate the match offset; needed even in the fastloop. */
+ SAFETY_CHECK(offset <= out_next - (const u8 *)out);
src = out_next - offset;
dst = out_next;
out_next += length;
- if (UNALIGNED_ACCESS_IS_FAST &&
- /* max overrun is writing 3 words for a min length match */
- likely(out_end - out_next >=
- 3 * WORDBYTES - DEFLATE_MIN_MATCH_LEN)) {
- if (offset >= WORDBYTES) { /* words don't overlap? */
- copy_word_unaligned(src, dst);
+ /*
+ * Before starting to issue the instructions to copy the match,
+ * refill the bitbuffer and preload the litlen decode table
+ * entry for the next loop iteration. This can increase
+ * performance by allowing the latency of the match copy to
+ * overlap with these other operations. To further reduce
+ * latency, we've arranged for there to be enough bits remaining
+ * to do the table preload independently of the refill, except
+ * on 32-bit platforms using the byte-at-a-time refill method.
+ */
+ if (!CAN_CONSUME_AND_THEN_PRELOAD(
+ MAX(OFFSET_MAXBITS - OFFSET_TABLEBITS,
+ OFFSET_MAXFASTBITS),
+ LITLEN_TABLEBITS) &&
+ unlikely((u8)bitsleft < LITLEN_TABLEBITS - PRELOAD_SLACK))
+ REFILL_BITS_IN_FASTLOOP();
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ REFILL_BITS_IN_FASTLOOP();
+
+ /*
+ * Copy the match. On most CPUs the fastest method is a
+ * word-at-a-time copy, unconditionally copying about 5 words
+ * since this is enough for most matches without being too much.
+ *
+ * The normal word-at-a-time copy works for offset >= WORDBYTES,
+ * which is most cases. The case of offset == 1 is also common
+ * and is worth optimizing for, since it is just RLE encoding of
+ * the previous byte, which is the result of compressing long
+ * runs of the same byte.
+ *
+ * Writing past the match 'length' is allowed here, since it's
+ * been ensured there is enough output space left for a slight
+ * overrun. FASTLOOP_MAX_BYTES_WRITTEN needs to be updated if
+ * the maximum possible overrun here is changed.
+ */
+ if (UNALIGNED_ACCESS_IS_FAST && offset >= WORDBYTES) {
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ while (dst < out_next) {
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
src += WORDBYTES;
dst += WORDBYTES;
- copy_word_unaligned(src, dst);
+ store_word_unaligned(load_word_unaligned(src), dst);
src += WORDBYTES;
dst += WORDBYTES;
- do {
- copy_word_unaligned(src, dst);
- src += WORDBYTES;
- dst += WORDBYTES;
- } while (dst < out_next);
- } else if (offset == 1) {
- /* RLE encoding of previous byte, common if the
- * data contains many repeated bytes */
- machine_word_t v = repeat_byte(*src);
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ }
+ } else if (UNALIGNED_ACCESS_IS_FAST && offset == 1) {
+ machine_word_t v;
+ /*
+ * This part tends to get auto-vectorized, so keep it
+ * copying a multiple of 16 bytes at a time.
+ */
+ v = (machine_word_t)0x0101010101010101 * src[0];
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ while (dst < out_next) {
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
store_word_unaligned(v, dst);
dst += WORDBYTES;
store_word_unaligned(v, dst);
dst += WORDBYTES;
- do {
- store_word_unaligned(v, dst);
- dst += WORDBYTES;
- } while (dst < out_next);
- } else {
- *dst++ = *src++;
- *dst++ = *src++;
- do {
- *dst++ = *src++;
- } while (dst < out_next);
}
+ } else if (UNALIGNED_ACCESS_IS_FAST) {
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += offset;
+ dst += offset;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += offset;
+ dst += offset;
+ do {
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += offset;
+ dst += offset;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += offset;
+ dst += offset;
+ } while (dst < out_next);
} else {
- STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3);
*dst++ = *src++;
*dst++ = *src++;
do {
*dst++ = *src++;
} while (dst < out_next);
}
+ } while (in_next < in_fastloop_end && out_next < out_fastloop_end);
+
+ /*
+ * This is the generic loop for decoding literals and matches. This
+ * handles cases where in_next and out_next are close to the end of
+ * their respective buffers. Usually this loop isn't performance-
+ * critical, as most time is spent in the fastloop above instead. We
+ * therefore omit some optimizations here in favor of smaller code.
+ */
+generic_loop:
+ for (;;) {
+ u32 length, offset;
+ const u8 *src;
+ u8 *dst;
+
+ REFILL_BITS();
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+ if (unlikely(entry & HUFFDEC_SUBTABLE_POINTER)) {
+ entry = d->u.litlen_decode_table[(entry >> 16) +
+ EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+ }
+ length = entry >> 16;
+ if (entry & HUFFDEC_LITERAL) {
+ if (unlikely(out_next == out_end))
+ return LIBDEFLATE_INSUFFICIENT_SPACE;
+ *out_next++ = length;
+ continue;
+ }
+ if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
+ goto block_done;
+ length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
+ if (unlikely(length > out_end - out_next))
+ return LIBDEFLATE_INSUFFICIENT_SPACE;
+
+ if (!CAN_CONSUME(LENGTH_MAXBITS + OFFSET_MAXBITS))
+ REFILL_BITS();
+ entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)];
+ if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
+ bitbuf >>= OFFSET_TABLEBITS;
+ bitsleft -= OFFSET_TABLEBITS;
+ entry = d->offset_decode_table[(entry >> 16) +
+ EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+ if (!CAN_CONSUME(OFFSET_MAXBITS))
+ REFILL_BITS();
+ }
+ offset = entry >> 16;
+ offset += EXTRACT_VARBITS8(bitbuf, entry) >> (u8)(entry >> 8);
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+
+ SAFETY_CHECK(offset <= out_next - (const u8 *)out);
+ src = out_next - offset;
+ dst = out_next;
+ out_next += length;
+
+ STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3);
+ *dst++ = *src++;
+ *dst++ = *src++;
+ do {
+ *dst++ = *src++;
+ } while (dst < out_next);
}
block_done:
- /* Finished decoding a block. */
+ /* Finished decoding a block */
if (!is_final_block)
goto next_block;
- /* That was the last block. */
+ /* That was the last block. */
- /* Discard any readahead bits and check for excessive overread */
- ALIGN_INPUT();
+ bitsleft = (u8)bitsleft;
+
+ /*
+ * If any of the implicit appended zero bytes were consumed (not just
+ * refilled) before hitting end of stream, then the data is bad.
+ */
+ SAFETY_CHECK(overread_count <= (bitsleft >> 3));
+
+ /* Optionally return the actual number of bytes consumed. */
+ if (actual_in_nbytes_ret) {
+ /* Don't count bytes that were refilled but not consumed. */
+ in_next -= (bitsleft >> 3) - overread_count;
- /* Optionally return the actual number of bytes read */
- if (actual_in_nbytes_ret)
*actual_in_nbytes_ret = in_next - (u8 *)in;
+ }
- /* Optionally return the actual number of bytes written */
+ /* Optionally return the actual number of bytes written. */
if (actual_out_nbytes_ret) {
*actual_out_nbytes_ret = out_next - (u8 *)out;
} else {
@@ -419,3 +770,5 @@ block_done:
#undef FUNCNAME
#undef ATTRIBUTES
+#undef EXTRACT_VARBITS
+#undef EXTRACT_VARBITS8
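Two of the ideas in the fastloop's match copy above are worth spelling out: word-at-a-time stores that deliberately overrun the match length (permitted because FASTLOOP_MAX_BYTES_WRITTEN reserves slack at the end of the output buffer), and broadcasting a byte into a whole word for the offset == 1 case, which is just run-length encoding of the previous byte. A simplified sketch of both, using memcpy for the unaligned accesses instead of the library's load/store helpers:

#include <string.h>

/*
 * Copy a 'length'-byte match from 'dst - offset' to 'dst', assuming
 * offset >= sizeof(size_t) (so each word load only reads bytes already
 * written) and that the caller guarantees room for a word-sized overrun.
 */
static void
copy_match_words(unsigned char *dst, size_t offset, size_t length)
{
        const unsigned char *src = dst - offset;
        unsigned char *end = dst + length;

        do {
                memcpy(dst, src, sizeof(size_t));       /* one unaligned word */
                dst += sizeof(size_t);
                src += sizeof(size_t);
        } while (dst < end);
}

/* offset == 1: broadcast the previous byte into a word, then store words. */
static void
copy_match_rle(unsigned char *dst, size_t length)
{
        size_t v = (size_t)0x0101010101010101ULL * dst[-1];
        unsigned char *end = dst + length;

        do {
                memcpy(dst, &v, sizeof(v));
                dst += sizeof(v);
        } while (dst < end);
}

The real code additionally unrolls the first several stores (about five words) before entering the loop, since most matches are finished by then.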
=====================================
lib/deflate_compress.c
=====================================
@@ -475,7 +475,7 @@ struct deflate_output_bitstream;
struct libdeflate_compressor {
/* Pointer to the compress() implementation chosen at allocation time */
- void (*impl)(struct libdeflate_compressor *c, const u8 *in,
+ void (*impl)(struct libdeflate_compressor *restrict c, const u8 *in,
size_t in_nbytes, struct deflate_output_bitstream *os);
/* The compression level with which this compressor was created */
@@ -1041,7 +1041,6 @@ compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[],
unsigned parent = A[node] >> NUM_SYMBOL_BITS;
unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
unsigned depth = parent_depth + 1;
- unsigned len = depth;
/*
* Set the depth of this node so that it is available when its
@@ -1054,19 +1053,19 @@ compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[],
* constraint. This is not the optimal method for generating
* length-limited Huffman codes! But it should be good enough.
*/
- if (len >= max_codeword_len) {
- len = max_codeword_len;
+ if (depth >= max_codeword_len) {
+ depth = max_codeword_len;
do {
- len--;
- } while (len_counts[len] == 0);
+ depth--;
+ } while (len_counts[depth] == 0);
}
/*
* Account for the fact that we have a non-leaf node at the
* current depth.
*/
- len_counts[len]--;
- len_counts[len + 1] += 2;
+ len_counts[depth]--;
+ len_counts[depth + 1] += 2;
}
}
@@ -1189,11 +1188,9 @@ gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[],
(next_codewords[len - 1] + len_counts[len - 1]) << 1;
for (sym = 0; sym < num_syms; sym++) {
- u8 len = lens[sym];
- u32 codeword = next_codewords[len]++;
-
/* DEFLATE requires bit-reversed codewords. */
- A[sym] = reverse_codeword(codeword, len);
+ A[sym] = reverse_codeword(next_codewords[lens[sym]]++,
+ lens[sym]);
}
}
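The rewritten loop above still applies reverse_codeword() because DEFLATE packs Huffman codewords into the bitstream starting from their most significant bit, while every other field is packed LSB-first; reversing the bits up front lets the compressor emit everything the same way. An illustrative bit reversal (the library's reverse_codeword() may be implemented differently, e.g. via a lookup table or bit tricks):

/* Reverse the low 'len' bits of 'codeword'. */
static unsigned
reverse_bits(unsigned codeword, unsigned len)
{
        unsigned reversed = 0;

        while (len--) {
                reversed = (reversed << 1) | (codeword & 1);
                codeword >>= 1;
        }
        return reversed;
}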
=====================================
lib/deflate_constants.h
=====================================
@@ -49,11 +49,8 @@
/*
* Maximum number of extra bits that may be required to represent a match
* length or offset.
- *
- * TODO: are we going to have full DEFLATE64 support? If so, up to 16
- * length bits must be supported.
*/
#define DEFLATE_MAX_EXTRA_LENGTH_BITS 5
-#define DEFLATE_MAX_EXTRA_OFFSET_BITS 14
+#define DEFLATE_MAX_EXTRA_OFFSET_BITS 13
#endif /* LIB_DEFLATE_CONSTANTS_H */
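For reference, 13 is the number of extra bits used by the largest standard DEFLATE offset symbols (base 24577 + (2^13 - 1) = 32768, the maximum offset), while the old value of 14 matched DEFLATE64's extended distance codes; with the DEFLATE64 note above removed, 13 suffices.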
=====================================
lib/deflate_decompress.c
=====================================
@@ -26,22 +26,20 @@
*
* ---------------------------------------------------------------------------
*
- * This is a highly optimized DEFLATE decompressor. When compiled with gcc on
- * x86_64, it decompresses data in about 52% of the time of zlib (48% if BMI2
- * instructions are available). On other architectures it should still be
- * significantly faster than zlib, but the difference may be smaller.
+ * This is a highly optimized DEFLATE decompressor. It is much faster than
+ * vanilla zlib, typically well over twice as fast, though results vary by CPU.
*
- * Why this is faster than zlib's implementation:
+ * Why this is faster than vanilla zlib:
*
* - Word accesses rather than byte accesses when reading input
* - Word accesses rather than byte accesses when copying matches
* - Faster Huffman decoding combined with various DEFLATE-specific tricks
- * - Larger bitbuffer variable that doesn't need to be filled as often
+ * - Larger bitbuffer variable that doesn't need to be refilled as often
* - Other optimizations to remove unnecessary branches
* - Only full-buffer decompression is supported, so the code doesn't need to
* support stopping and resuming decompression.
- * - On x86_64, compile a version of the decompression routine using BMI2
- * instructions and use it automatically at runtime when supported.
+ * - On x86_64, a version of the decompression routine is compiled with BMI2
+ * instructions enabled and is used automatically at runtime when supported.
*/
#include <limits.h>
@@ -66,77 +64,6 @@
# define SAFETY_CHECK(expr) if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA
#endif
-/*
- * Each TABLEBITS number is the base-2 logarithm of the number of entries in the
- * main portion of the corresponding decode table. Each number should be large
- * enough to ensure that for typical data, the vast majority of symbols can be
- * decoded by a direct lookup of the next TABLEBITS bits of compressed data.
- * However, this must be balanced against the fact that a larger table requires
- * more memory and requires more time to fill.
- *
- * Note: you cannot change a TABLEBITS number without also changing the
- * corresponding ENOUGH number!
- */
-#define PRECODE_TABLEBITS 7
-#define LITLEN_TABLEBITS 10
-#define OFFSET_TABLEBITS 8
-
-/*
- * Each ENOUGH number is the maximum number of decode table entries that may be
- * required for the corresponding Huffman code, including the main table and all
- * subtables. Each number depends on three parameters:
- *
- * (1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS)
- * (2) the number of main table bits (the TABLEBITS numbers defined above)
- * (3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN)
- *
- * The ENOUGH numbers were computed using the utility program 'enough' from
- * zlib. This program enumerates all possible relevant Huffman codes to find
- * the worst-case usage of decode table entries.
- */
-#define PRECODE_ENOUGH 128 /* enough 19 7 7 */
-#define LITLEN_ENOUGH 1334 /* enough 288 10 15 */
-#define OFFSET_ENOUGH 402 /* enough 32 8 15 */
-
-/*
- * The main DEFLATE decompressor structure. Since this implementation only
- * supports full buffer decompression, this structure does not store the entire
- * decompression state, but rather only some arrays that are too large to
- * comfortably allocate on the stack.
- */
-struct libdeflate_decompressor {
-
- /*
- * The arrays aren't all needed at the same time. 'precode_lens' and
- * 'precode_decode_table' are unneeded after 'lens' has been filled.
- * Furthermore, 'lens' need not be retained after building the litlen
- * and offset decode tables. In fact, 'lens' can be in union with
- * 'litlen_decode_table' provided that 'offset_decode_table' is separate
- * and is built first.
- */
-
- union {
- u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS];
-
- struct {
- u8 lens[DEFLATE_NUM_LITLEN_SYMS +
- DEFLATE_NUM_OFFSET_SYMS +
- DEFLATE_MAX_LENS_OVERRUN];
-
- u32 precode_decode_table[PRECODE_ENOUGH];
- } l;
-
- u32 litlen_decode_table[LITLEN_ENOUGH];
- } u;
-
- u32 offset_decode_table[OFFSET_ENOUGH];
-
- /* used only during build_decode_table() */
- u16 sorted_syms[DEFLATE_MAX_NUM_SYMS];
-
- bool static_codes_loaded;
-};
-
/*****************************************************************************
* Input bitstream *
*****************************************************************************/
@@ -144,19 +71,32 @@ struct libdeflate_decompressor {
/*
* The state of the "input bitstream" consists of the following variables:
*
- * - in_next: pointer to the next unread byte in the input buffer
+ * - in_next: a pointer to the next unread byte in the input buffer
*
- * - in_end: pointer just past the end of the input buffer
+ * - in_end: a pointer to just past the end of the input buffer
*
* - bitbuf: a word-sized variable containing bits that have been read from
- * the input buffer. The buffered bits are right-aligned
- * (they're the low-order bits).
+ * the input buffer or from the implicit appended zero bytes
+ *
+ * - bitsleft: the number of bits in 'bitbuf' available to be consumed.
+ * After REFILL_BITS_BRANCHLESS(), 'bitbuf' can actually
+ * contain more bits than this. However, only the bits counted
+ * by 'bitsleft' can actually be consumed; the rest can only be
+ * used for preloading.
*
- * - bitsleft: number of bits in 'bitbuf' that are valid.
+ * As a micro-optimization, we allow bits 8 and higher of
+ * 'bitsleft' to contain garbage. When consuming the bits
+ * associated with a decode table entry, this allows us to do
+ * 'bitsleft -= entry' instead of 'bitsleft -= (u8)entry'.
+ * On some CPUs, this helps reduce instruction dependencies.
+ * This does have the disadvantage that 'bitsleft' sometimes
+ * needs to be cast to 'u8', such as when it's used as a shift
+ * amount in REFILL_BITS_BRANCHLESS(). But that one happens
+ * for free since most CPUs ignore high bits in shift amounts.
*
- * To make it easier for the compiler to optimize the code by keeping variables
- * in registers, these are declared as normal variables and manipulated using
- * macros.
+ * - overread_count: the total number of implicit appended zero bytes that
+ * have been loaded into the bitbuffer, including any
+ * counted by 'bitsleft' and any already consumed
*/
/*
@@ -164,38 +104,123 @@ struct libdeflate_decompressor {
* performance, this should have size equal to a machine word.
*
* 64-bit platforms have a significant advantage: they get a bigger bitbuffer
- * which they have to fill less often.
+ * which they don't have to refill as often.
*/
typedef machine_word_t bitbuf_t;
+#define BITBUF_NBITS (8 * (int)sizeof(bitbuf_t))
+
+/* BITMASK(n) returns a bitmask of length 'n'. */
+#define BITMASK(n) (((bitbuf_t)1 << (n)) - 1)
/*
- * Number of bits the bitbuffer variable can hold.
- *
- * This is one less than the obvious value because of the optimized arithmetic
- * in FILL_BITS_WORDWISE() that leaves 'bitsleft' in the range
- * [WORDBITS - 8, WORDBITS - 1] rather than [WORDBITS - 7, WORDBITS].
+ * MAX_BITSLEFT is the maximum number of consumable bits, i.e. the maximum value
+ * of '(u8)bitsleft'. This is the size of the bitbuffer variable, minus 1 if
+ * the branchless refill method is being used (see REFILL_BITS_BRANCHLESS()).
*/
-#define BITBUF_NBITS (8 * sizeof(bitbuf_t) - 1)
+#define MAX_BITSLEFT \
+ (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS - 1 : BITBUF_NBITS)
/*
- * The maximum number of bits that can be ensured in the bitbuffer variable,
- * i.e. the maximum value of 'n' that can be passed ENSURE_BITS(n). The decoder
- * only reads whole bytes from memory, so this is the lowest value of 'bitsleft'
- * at which another byte cannot be read without first consuming some bits.
+ * CONSUMABLE_NBITS is the minimum number of bits that are guaranteed to be
+ * consumable (counted in 'bitsleft') immediately after refilling the bitbuffer.
+ * Since only whole bytes can be added to 'bitsleft', the worst case is
+ * 'MAX_BITSLEFT - 7': the smallest amount where another byte doesn't fit.
*/
-#define MAX_ENSURE (BITBUF_NBITS - 7)
+#define CONSUMABLE_NBITS (MAX_BITSLEFT - 7)
/*
- * Evaluates to true if 'n' is a valid argument to ENSURE_BITS(n), or false if
- * 'n' is too large to be passed to ENSURE_BITS(n). Note: if 'n' is a compile
- * time constant, then this expression will be a compile-type constant.
- * Therefore, CAN_ENSURE() can be used choose between alternative
- * implementations at compile time.
+ * FASTLOOP_PRELOADABLE_NBITS is the minimum number of bits that are guaranteed
+ * to be preloadable immediately after REFILL_BITS_IN_FASTLOOP(). (It is *not*
+ * guaranteed after REFILL_BITS(), since REFILL_BITS() falls back to a
+ * byte-at-a-time refill method near the end of input.) This may exceed the
+ * number of consumable bits (counted by 'bitsleft'). Any bits not counted in
+ * 'bitsleft' can only be used for precomputation and cannot be consumed.
+ */
+#define FASTLOOP_PRELOADABLE_NBITS \
+ (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS : CONSUMABLE_NBITS)
+
+/*
+ * PRELOAD_SLACK is the minimum number of bits that are guaranteed to be
+ * preloadable but not consumable, following REFILL_BITS_IN_FASTLOOP() and any
+ * subsequent consumptions. This is 1 bit if the branchless refill method is
+ * being used, and 0 bits otherwise.
+ */
+#define PRELOAD_SLACK MAX(0, FASTLOOP_PRELOADABLE_NBITS - MAX_BITSLEFT)
+
+/*
+ * CAN_CONSUME(n) is true if it's guaranteed that if the bitbuffer has just been
+ * refilled, then it's always possible to consume 'n' bits from it. 'n' should
+ * be a compile-time constant, to enable compile-time evaluation.
+ */
+#define CAN_CONSUME(n) (CONSUMABLE_NBITS >= (n))
+
+/*
+ * CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) is true if it's
+ * guaranteed that after REFILL_BITS_IN_FASTLOOP(), it's always possible to
+ * consume 'consume_nbits' bits, then preload 'preload_nbits' bits. The
+ * arguments should be compile-time constants to enable compile-time evaluation.
+ */
+#define CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) \
+ (CONSUMABLE_NBITS >= (consume_nbits) && \
+ FASTLOOP_PRELOADABLE_NBITS >= (consume_nbits) + (preload_nbits))
+
+/*
+ * REFILL_BITS_BRANCHLESS() branchlessly refills the bitbuffer variable by
+ * reading the next word from the input buffer and updating 'in_next' and
+ * 'bitsleft' based on how many bits were refilled -- counting whole bytes only.
+ * This is much faster than reading a byte at a time, at least if the CPU is
+ * little endian and supports fast unaligned memory accesses.
+ *
+ * The simplest way of branchlessly updating 'bitsleft' would be:
+ *
+ * bitsleft += (MAX_BITSLEFT - bitsleft) & ~7;
+ *
+ * To make it faster, we define MAX_BITSLEFT to be 'WORDBITS - 1' rather than
+ * WORDBITS, so that in binary it looks like 111111 or 11111. Then, we update
+ * 'bitsleft' by just setting the bits above the low 3 bits:
+ *
+ * bitsleft |= MAX_BITSLEFT & ~7;
+ *
+ * That compiles down to a single instruction like 'or $0x38, %rbp'. Using
+ * 'MAX_BITSLEFT == WORDBITS - 1' also has the advantage that refills can be
+ * done when 'bitsleft == MAX_BITSLEFT' without invoking undefined behavior.
+ *
+ * The simplest way of branchlessly updating 'in_next' would be:
+ *
+ * in_next += (MAX_BITSLEFT - bitsleft) >> 3;
+ *
+ * With 'MAX_BITSLEFT == WORDBITS - 1' we could use an XOR instead, though this
+ * isn't really better:
+ *
+ * in_next += (MAX_BITSLEFT ^ bitsleft) >> 3;
+ *
+ * An alternative which can be marginally better is the following:
+ *
+ * in_next += sizeof(bitbuf_t) - 1;
+ * in_next -= (bitsleft >> 3) & 0x7;
+ *
+ * It seems this would increase the number of CPU instructions from 3 (sub, shr,
+ * add) to 4 (add, shr, and, sub). However, if the CPU has a bitfield
+ * extraction instruction (e.g. arm's ubfx), it stays at 3, and is potentially
+ * more efficient because the length of the longest dependency chain decreases
+ * from 3 to 2. This alternative also has the advantage that it ignores the
+ * high bits in 'bitsleft', so it is compatible with the micro-optimization we
+ * use where we let the high bits of 'bitsleft' contain garbage.
*/
-#define CAN_ENSURE(n) ((n) <= MAX_ENSURE)
+#define REFILL_BITS_BRANCHLESS() \
+do { \
+ bitbuf |= get_unaligned_leword(in_next) << (u8)bitsleft; \
+ in_next += sizeof(bitbuf_t) - 1; \
+ in_next -= (bitsleft >> 3) & 0x7; \
+ bitsleft |= MAX_BITSLEFT & ~7; \
+} while (0)
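Worked example of the refill arithmetic (standalone sketch, assuming a 64-bit bitbuffer so MAX_BITSLEFT == 63): starting from bitsleft = 16, the refill advances the input by 7 - 2 = 5 bytes and sets bitsleft to 16 | 0x38 = 56, so exactly the 40 refilled bits are accounted for.

    /* Illustrative check of the REFILL_BITS_BRANCHLESS() bookkeeping. */
    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t bitsleft = 16;         /* consumable bits before refill */
            ptrdiff_t advance;

            /* bytes the input pointer moves forward: 7 - ((16 >> 3) & 7) */
            advance = (ptrdiff_t)sizeof(uint64_t) - 1 - ((bitsleft >> 3) & 0x7);
            assert(advance == 5);

            /* bits counted afterwards: 16 | 0x38 == 56 == 16 + 5 * 8 */
            bitsleft |= 63 & ~7;
            assert(bitsleft == 56);
            return 0;
    }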
/*
- * Fill the bitbuffer variable, reading one byte at a time.
+ * REFILL_BITS() loads bits from the input buffer until the bitbuffer variable
+ * contains at least CONSUMABLE_NBITS consumable bits.
+ *
+ * This checks for the end of input, and it doesn't guarantee
+ * FASTLOOP_PRELOADABLE_NBITS, so it can't be used in the fastloop.
*
* If we would overread the input buffer, we just don't read anything, leaving
* the bits zeroed but marking them filled. This simplifies the decompressor
@@ -212,181 +237,179 @@ typedef machine_word_t bitbuf_t;
* or return an error. However, we do it to be slightly more friendly to the
* not-recommended use case of decompressing with an unknown output size.)
*/
-#define FILL_BITS_BYTEWISE() \
-do { \
- if (likely(in_next != in_end)) { \
- bitbuf |= (bitbuf_t)*in_next++ << bitsleft; \
- } else { \
- overread_count++; \
- SAFETY_CHECK(overread_count <= sizeof(bitbuf)); \
- } \
- bitsleft += 8; \
-} while (bitsleft <= BITBUF_NBITS - 8)
-
-/*
- * Fill the bitbuffer variable by reading the next word from the input buffer
- * and branchlessly updating 'in_next' and 'bitsleft' based on how many bits
- * were filled. This can be significantly faster than FILL_BITS_BYTEWISE().
- * However, for this to work correctly, the word must be interpreted in
- * little-endian format. In addition, the memory access may be unaligned.
- * Therefore, this method is most efficient on little-endian architectures that
- * support fast unaligned access, such as x86 and x86_64.
- *
- * For faster updating of 'bitsleft', we consider the bitbuffer size in bits to
- * be 1 less than the word size and therefore be all 1 bits. Then the number of
- * bits filled is the value of the 0 bits in position >= 3 when changed to 1.
- * E.g. if words are 64 bits and bitsleft = 16 = b010000 then we refill b101000
- * = 40 bits = 5 bytes. This uses only 4 operations to update 'in_next' and
- * 'bitsleft': one each of +, ^, >>, and |. (Not counting operations the
- * compiler optimizes out.) In contrast, the alternative of:
- *
- * in_next += (BITBUF_NBITS - bitsleft) >> 3;
- * bitsleft += (BITBUF_NBITS - bitsleft) & ~7;
- *
- * (where BITBUF_NBITS would be WORDBITS rather than WORDBITS - 1) would on
- * average refill an extra bit, but uses 5 operations: two +, and one each of
- * -, >>, and &. Also the - and & must be completed before 'bitsleft' can be
- * updated, while the current solution updates 'bitsleft' with no dependencies.
- */
-#define FILL_BITS_WORDWISE() \
-do { \
- /* BITBUF_NBITS must be all 1's in binary, see above */ \
- STATIC_ASSERT((BITBUF_NBITS & (BITBUF_NBITS + 1)) == 0);\
- \
- bitbuf |= get_unaligned_leword(in_next) << bitsleft; \
- in_next += (bitsleft ^ BITBUF_NBITS) >> 3; \
- bitsleft |= BITBUF_NBITS & ~7; \
+#define REFILL_BITS() \
+do { \
+ if (UNALIGNED_ACCESS_IS_FAST && \
+ likely(in_end - in_next >= sizeof(bitbuf_t))) { \
+ REFILL_BITS_BRANCHLESS(); \
+ } else { \
+ while ((u8)bitsleft < CONSUMABLE_NBITS) { \
+ if (likely(in_next != in_end)) { \
+ bitbuf |= (bitbuf_t)*in_next++ << \
+ (u8)bitsleft; \
+ } else { \
+ overread_count++; \
+ SAFETY_CHECK(overread_count <= \
+ sizeof(bitbuf_t)); \
+ } \
+ bitsleft += 8; \
+ } \
+ } \
} while (0)
/*
- * Does the bitbuffer variable currently contain at least 'n' bits?
- */
-#define HAVE_BITS(n) (bitsleft >= (n))
-
-/*
- * Load more bits from the input buffer until the specified number of bits is
- * present in the bitbuffer variable. 'n' cannot be too large; see MAX_ENSURE
- * and CAN_ENSURE().
+ * REFILL_BITS_IN_FASTLOOP() is like REFILL_BITS(), but it doesn't check for the
+ * end of the input. It can only be used in the fastloop.
*/
-#define ENSURE_BITS(n) \
-if (!HAVE_BITS(n)) { \
- if (CPU_IS_LITTLE_ENDIAN() && \
- UNALIGNED_ACCESS_IS_FAST && \
- likely(in_end - in_next >= sizeof(bitbuf_t))) \
- FILL_BITS_WORDWISE(); \
- else \
- FILL_BITS_BYTEWISE(); \
-}
+#define REFILL_BITS_IN_FASTLOOP() \
+do { \
+ STATIC_ASSERT(UNALIGNED_ACCESS_IS_FAST || \
+ FASTLOOP_PRELOADABLE_NBITS == CONSUMABLE_NBITS); \
+ if (UNALIGNED_ACCESS_IS_FAST) { \
+ REFILL_BITS_BRANCHLESS(); \
+ } else { \
+ while ((u8)bitsleft < CONSUMABLE_NBITS) { \
+ bitbuf |= (bitbuf_t)*in_next++ << (u8)bitsleft; \
+ bitsleft += 8; \
+ } \
+ } \
+} while (0)
/*
- * Return the next 'n' bits from the bitbuffer variable without removing them.
+ * This is the worst-case maximum number of output bytes that are written to
+ * during each iteration of the fastloop. The worst case is 2 literals, then a
+ * match of length DEFLATE_MAX_MATCH_LEN. Additionally, some slack space must
+ * be included for the intentional overrun in the match copy implementation.
*/
-#define BITS(n) ((u32)bitbuf & (((u32)1 << (n)) - 1))
+#define FASTLOOP_MAX_BYTES_WRITTEN \
+ (2 + DEFLATE_MAX_MATCH_LEN + (5 * WORDBYTES) - 1)
/*
- * Remove the next 'n' bits from the bitbuffer variable.
+ * This is the worst-case maximum number of input bytes that are read during
+ * each iteration of the fastloop. To get this value, we first compute the
+ * greatest number of bits that can be refilled during a loop iteration. The
+ * refill at the beginning can add at most MAX_BITSLEFT, and the amount that can
+ * be refilled later is no more than the maximum amount that can be consumed by
+ * 2 literals that don't need a subtable, then a match. We convert this value
+ * to bytes, rounding up; this gives the maximum number of bytes that 'in_next'
+ * can be advanced. Finally, we add sizeof(bitbuf_t) to account for
+ * REFILL_BITS_BRANCHLESS() reading a word past 'in_next'.
*/
-#define REMOVE_BITS(n) (bitbuf >>= (n), bitsleft -= (n))
+#define FASTLOOP_MAX_BYTES_READ \
+ (DIV_ROUND_UP(MAX_BITSLEFT + (2 * LITLEN_TABLEBITS) + \
+ LENGTH_MAXBITS + OFFSET_MAXBITS, 8) + \
+ sizeof(bitbuf_t))
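Plugging in the standard DEFLATE limits (258-byte maximum match, 5 extra length bits, 13 extra offset bits) on a hypothetical 64-bit build gives the concrete bounds below; the sketch is illustrative, not part of the patch:

    /* Worst-case fastloop I/O bounds for a 64-bit build (illustrative). */
    #include <stdio.h>

    #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

    int main(void)
    {
            const int WORDBYTES = 8, MAX_BITSLEFT = 63, LITLEN_TABLEBITS = 11;
            const int DEFLATE_MAX_MATCH_LEN = 258;
            const int LENGTH_MAXBITS = 15 + 5;   /* codeword + extra bits */
            const int OFFSET_MAXBITS = 15 + 13;  /* codeword + extra bits */

            int written = 2 + DEFLATE_MAX_MATCH_LEN + (5 * WORDBYTES) - 1;
            int read = DIV_ROUND_UP(MAX_BITSLEFT + (2 * LITLEN_TABLEBITS) +
                                    LENGTH_MAXBITS + OFFSET_MAXBITS, 8) +
                       WORDBYTES;

            printf("FASTLOOP_MAX_BYTES_WRITTEN = %d\n", written); /* 299 */
            printf("FASTLOOP_MAX_BYTES_READ    = %d\n", read);    /* 25 */
            return 0;
    }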
-/*
- * Remove and return the next 'n' bits from the bitbuffer variable.
- */
-#define POP_BITS(n) (tmp32 = BITS(n), REMOVE_BITS(n), tmp32)
+/*****************************************************************************
+ * Huffman decoding *
+ *****************************************************************************/
/*
- * Verify that the input buffer hasn't been overread, then align the input to
- * the next byte boundary, discarding any remaining bits in the current byte.
+ * The fastest way to decode Huffman-encoded data is basically to use a decode
+ * table that maps the next TABLEBITS bits of data to their symbol. Each entry
+ * decode_table[i] maps to the symbol whose codeword is a prefix of 'i'. A
+ * symbol with codeword length 'n' has '2**(TABLEBITS-n)' entries in the table.
*
- * Note that if the bitbuffer variable currently contains more than 7 bits, then
- * we must rewind 'in_next', effectively putting those bits back. Only the bits
- * in what would be the "current" byte if we were reading one byte at a time can
- * be actually discarded.
- */
-#define ALIGN_INPUT() \
-do { \
- SAFETY_CHECK(overread_count <= (bitsleft >> 3)); \
- in_next -= (bitsleft >> 3) - overread_count; \
- overread_count = 0; \
- bitbuf = 0; \
- bitsleft = 0; \
-} while(0)
-
-/*
- * Read a 16-bit value from the input. This must have been preceded by a call
- * to ALIGN_INPUT(), and the caller must have already checked for overread.
+ * Ideally, TABLEBITS and the maximum codeword length would be the same; some
+ * compression formats are designed with this goal in mind. Unfortunately, in
+ * DEFLATE, the maximum litlen and offset codeword lengths are 15 bits, which is
+ * too large for a practical TABLEBITS. It's not *that* much larger, though, so
+ * the workaround is to use a single level of subtables. In the main table,
+ * entries for prefixes of codewords longer than TABLEBITS contain a "pointer"
+ * to the appropriate subtable along with the number of bits it is indexed with.
+ *
+ * The most efficient way to allocate subtables is to allocate them dynamically
+ * after the main table. The worst-case number of table entries needed,
+ * including subtables, is precomputable; see the ENOUGH constants below.
+ *
+ * A useful optimization is to store the codeword lengths in the decode table so
+ * that they don't have to be looked up by indexing a separate table that maps
+ * symbols to their codeword lengths. We basically do this; however, for the
+ * litlen and offset codes we also implement some DEFLATE-specific optimizations
+ * that build in the consideration of the "extra bits" and the
+ * literal/length/end-of-block division. For the exact decode table entry
+ * format we use, see the definitions of the *_decode_results[] arrays below.
*/
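To make the '2**(TABLEBITS-n)' replication concrete, here is a minimal single-level lookup sketch with a hypothetical 16-bit entry layout (symbol in the high bits, codeword length in the low 4 bits); it assumes all codewords fit in TABLEBITS bits, which is exactly the case subtables exist to handle when they don't:

    /* Single-level Huffman table lookup; hypothetical entry layout. */
    #include <stdint.h>

    #define TABLEBITS 11

    static unsigned decode_one(const uint16_t table[1 << TABLEBITS],
                               uint64_t *bitbuf, unsigned *bitsleft)
    {
            /* A symbol whose codeword has length n <= TABLEBITS owns
             * 1 << (TABLEBITS - n) entries: every index whose low n bits
             * equal the (bit-reversed) codeword. */
            uint16_t entry = table[*bitbuf & ((1u << TABLEBITS) - 1)];
            unsigned len = entry & 0xF;

            *bitbuf >>= len;
            *bitsleft -= len;
            return entry >> 4;      /* the decoded symbol */
    }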
-#define READ_U16() (tmp16 = get_unaligned_le16(in_next), in_next += 2, tmp16)
-/*****************************************************************************
- * Huffman decoding *
- *****************************************************************************/
/*
- * A decode table for order TABLEBITS consists of a main table of (1 <<
- * TABLEBITS) entries followed by a variable number of subtables.
- *
- * The decoding algorithm takes the next TABLEBITS bits of compressed data and
- * uses them as an index into the decode table. The resulting entry is either a
- * "direct entry", meaning that it contains the value desired, or a "subtable
- * pointer", meaning that the entry references a subtable that must be indexed
- * using more bits of the compressed data to decode the symbol.
- *
- * Each decode table (a main table along with its subtables, if any) is
- * associated with a Huffman code. Logically, the result of a decode table
- * lookup is a symbol from the alphabet from which the corresponding Huffman
- * code was constructed. A symbol with codeword length n <= TABLEBITS is
- * associated with 2**(TABLEBITS - n) direct entries in the table, whereas a
- * symbol with codeword length n > TABLEBITS is associated with one or more
- * subtable entries.
- *
- * On top of this basic design, we implement several optimizations:
- *
- * - We store the length of each codeword directly in each of its decode table
- * entries. This allows the codeword length to be produced without indexing
- * an additional table.
- *
- * - When beneficial, we don't store the Huffman symbol itself, but instead data
- * generated from it. For example, when decoding an offset symbol in DEFLATE,
- * it's more efficient if we can decode the offset base and number of extra
- * offset bits directly rather than decoding the offset symbol and then
- * looking up both of those values in an additional table or tables.
- *
- * The size of each decode table entry is 32 bits, which provides slightly
- * better performance than 16-bit entries on 32- and 64-bit processors, provided
- * that the table doesn't get so large that it takes up too much memory and
- * starts generating cache misses. The bits of each decode table entry are
- * defined as follows:
- *
- * - Bits 30 -- 31: flags (see below)
- * - Bits 8 -- 29: decode result: a Huffman symbol or related data
- * - Bits 0 -- 7: codeword length
+ * These are the TABLEBITS values we use for each of the DEFLATE Huffman codes,
+ * along with their corresponding ENOUGH values.
+ *
+ * For the precode, we use PRECODE_TABLEBITS == 7 since this is the maximum
+ * precode codeword length. This avoids ever needing subtables.
+ *
+ * For the litlen and offset codes, we cannot realistically avoid ever needing
+ * subtables, since litlen and offset codewords can be up to 15 bits. A higher
+ * TABLEBITS reduces the number of lookups that need a subtable, which increases
+ * performance; however, it increases memory usage and makes building the table
+ * take longer, which decreases performance. We choose values that work well in
+ * practice, making subtables rarely needed without making the tables too large.
+ *
+ * Our choice of OFFSET_TABLEBITS == 8 is a bit low; without any special
+ * considerations, 9 would fit the trade-off curve better. However, there is a
+ * performance benefit to using exactly 8 bits when it is a compile-time
+ * constant, as many CPUs can take the low byte more easily than the low 9 bits.
+ *
+ * zlib treats its equivalents of TABLEBITS as maximum values; whenever it
+ * builds a table, it caps the actual table_bits to the longest codeword. This
+ * makes sense in theory, as there's no need for the table to be any larger than
+ * needed to support the longest codeword. However, having the table bits be a
+ * compile-time constant is beneficial to the performance of the decode loop, so
+ * there is a trade-off. libdeflate currently uses the dynamic table_bits
+ * strategy for the litlen table only, due to its larger maximum size.
+ * PRECODE_TABLEBITS and OFFSET_TABLEBITS are smaller, so going dynamic there
+ * isn't as useful, and OFFSET_TABLEBITS=8 is useful as mentioned above.
+ *
+ * Each TABLEBITS value has a corresponding ENOUGH value that gives the
+ * worst-case maximum number of decode table entries, including the main table
+ * and all subtables. The ENOUGH value depends on three parameters:
+ *
+ * (1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS)
+ * (2) the maximum number of main table bits (*_TABLEBITS)
+ * (3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN)
+ *
+ * The ENOUGH values were computed using the utility program 'enough' from zlib.
*/
+#define PRECODE_TABLEBITS 7
+#define PRECODE_ENOUGH 128 /* enough 19 7 7 */
+#define LITLEN_TABLEBITS 11
+#define LITLEN_ENOUGH 2342 /* enough 288 11 15 */
+#define OFFSET_TABLEBITS 8
+#define OFFSET_ENOUGH 402 /* enough 32 8 15 */
/*
- * This flag is set in all main decode table entries that represent subtable
- * pointers.
+ * make_decode_table_entry() creates a decode table entry for the given symbol
+ * by combining the static part 'decode_results[sym]' with the dynamic part
+ * 'len', which is the remaining codeword length (the codeword length for main
+ * table entries, or the codeword length minus TABLEBITS for subtable entries).
+ *
+ * In all cases, we add 'len' to each of the two low-order bytes to create the
+ * appropriately-formatted decode table entry. See the definitions of the
+ * *_decode_results[] arrays below, where the entry format is described.
*/
-#define HUFFDEC_SUBTABLE_POINTER 0x80000000
+static forceinline u32
+make_decode_table_entry(const u32 decode_results[], u32 sym, u32 len)
+{
+ return decode_results[sym] + (len << 8) + len;
+}
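A worked example (values chosen purely for illustration) using the precode format described just below, where the static part is simply the symbol shifted into bits 20-16:

    /* make_decode_table_entry() applied to a precode symbol (illustrative). */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t decode_results[19];

            for (uint32_t presym = 0; presym < 19; presym++)
                    decode_results[presym] = presym << 16;

            /* presym 5 with a 3-bit codeword:
             * bits 20-16 = 5, bits 10-8 = 3, bits 2-0 = 3 */
            uint32_t entry = decode_results[5] + (3u << 8) + 3;
            assert(entry == 0x00050303);
            return 0;
    }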
/*
- * This flag is set in all entries in the litlen decode table that represent
- * literals.
+ * Here is the format of our precode decode table entries. Bits not explicitly
+ * described contain zeroes:
+ *
+ * Bit 20-16: presym
+ * Bit 10-8: codeword length [not used]
+ * Bit 2-0: codeword length
+ *
+ * The precode decode table never has subtables, since we use
+ * PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN.
+ *
+ * precode_decode_results[] contains the static part of the entry for each
+ * symbol. make_decode_table_entry() produces the final entries.
*/
-#define HUFFDEC_LITERAL 0x40000000
-
-/* Mask for extracting the codeword length from a decode table entry. */
-#define HUFFDEC_LENGTH_MASK 0xFF
-
-/* Shift to extract the decode result from a decode table entry. */
-#define HUFFDEC_RESULT_SHIFT 8
-
-/* Shift a decode result into its position in the decode table entry. */
-#define HUFFDEC_RESULT_ENTRY(result) ((u32)(result) << HUFFDEC_RESULT_SHIFT)
-
-/* The decode result for each precode symbol. There is no special optimization
- * for the precode; the decode result is simply the symbol value. */
-static const u32 precode_decode_results[DEFLATE_NUM_PRECODE_SYMS] = {
-#define ENTRY(presym) HUFFDEC_RESULT_ENTRY(presym)
+static const u32 precode_decode_results[] = {
+#define ENTRY(presym) ((u32)presym << 16)
ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) ,
ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) ,
ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) ,
@@ -395,13 +418,97 @@ static const u32 precode_decode_results[DEFLATE_NUM_PRECODE_SYMS] = {
#undef ENTRY
};
-/* The decode result for each litlen symbol. For literals, this is the literal
- * value itself and the HUFFDEC_LITERAL flag. For lengths, this is the length
- * base and the number of extra length bits. */
-static const u32 litlen_decode_results[DEFLATE_NUM_LITLEN_SYMS] = {
+/* Litlen and offset decode table entry flags */
+
+/* Indicates a literal entry in the litlen decode table */
+#define HUFFDEC_LITERAL 0x80000000
+
+/* Indicates that HUFFDEC_SUBTABLE_POINTER or HUFFDEC_END_OF_BLOCK is set */
+#define HUFFDEC_EXCEPTIONAL 0x00008000
+
+/* Indicates a subtable pointer entry in the litlen or offset decode table */
+#define HUFFDEC_SUBTABLE_POINTER 0x00004000
- /* Literals */
-#define ENTRY(literal) (HUFFDEC_LITERAL | HUFFDEC_RESULT_ENTRY(literal))
+/* Indicates an end-of-block entry in the litlen decode table */
+#define HUFFDEC_END_OF_BLOCK 0x00002000
+
+/* Maximum number of bits that can be consumed by decoding a match length */
+#define LENGTH_MAXBITS (DEFLATE_MAX_LITLEN_CODEWORD_LEN + \
+ DEFLATE_MAX_EXTRA_LENGTH_BITS)
+#define LENGTH_MAXFASTBITS (LITLEN_TABLEBITS /* no subtable needed */ + \
+ DEFLATE_MAX_EXTRA_LENGTH_BITS)
+
+/*
+ * Here is the format of our litlen decode table entries. Bits not explicitly
+ * described contain zeroes:
+ *
+ * Literals:
+ * Bit 31: 1 (HUFFDEC_LITERAL)
+ * Bit 23-16: literal value
+ * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL)
+ * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER)
+ * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK)
+ * Bit 11-8: remaining codeword length [not used]
+ * Bit 3-0: remaining codeword length
+ * Lengths:
+ * Bit 31: 0 (!HUFFDEC_LITERAL)
+ * Bit 24-16: length base value
+ * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL)
+ * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER)
+ * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK)
+ * Bit 11-8: remaining codeword length
+ * Bit 4-0: remaining codeword length + number of extra bits
+ * End of block:
+ * Bit 31: 0 (!HUFFDEC_LITERAL)
+ * Bit 15: 1 (HUFFDEC_EXCEPTIONAL)
+ * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER)
+ * Bit 13: 1 (HUFFDEC_END_OF_BLOCK)
+ * Bit 11-8: remaining codeword length [not used]
+ * Bit 3-0: remaining codeword length
+ * Subtable pointer:
+ * Bit 31: 0 (!HUFFDEC_LITERAL)
+ * Bit 30-16: index of start of subtable
+ * Bit 15: 1 (HUFFDEC_EXCEPTIONAL)
+ * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER)
+ * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK)
+ * Bit 11-8: number of subtable bits
+ * Bit 3-0: number of main table bits
+ *
+ * This format has several desirable properties:
+ *
+ * - The codeword length, length slot base, and number of extra length bits
+ * are all built in. This eliminates the need to separately look up this
+ * information by indexing separate arrays by symbol or length slot.
+ *
+ * - The HUFFDEC_* flags enable easily distinguishing between the different
+ * types of entries. The HUFFDEC_LITERAL flag enables a fast path for
+ * literals; the high bit is used for this, as some CPUs can test the
+ * high bit more easily than other bits. The HUFFDEC_EXCEPTIONAL flag
+ * makes it possible to detect the two unlikely cases (subtable pointer
+ * and end of block) in a single bit flag test.
+ *
+ * - The low byte is the number of bits that need to be removed from the
+ * bitstream; this makes this value easily accessible, and it enables the
+ * micro-optimization of doing 'bitsleft -= entry' instead of
+ * 'bitsleft -= (u8)entry'. It also includes the number of extra bits,
+ * so they don't need to be removed separately.
+ *
+ * - The flags in bits 15-13 are arranged to be 0 when the
+ * "remaining codeword length" in bits 11-8 is needed, making this value
+ * fairly easily accessible as well via a shift and downcast.
+ *
+ * - Similarly, bits 13-12 are 0 when the "subtable bits" in bits 11-8 are
+ * needed, making it possible to extract this value with '& 0x3F' rather
+ * than '& 0xF'. This value is only used as a shift amount, so this can
+ * save an 'and' instruction as the masking by 0x3F happens implicitly.
+ *
+ * litlen_decode_results[] contains the static part of the entry for each
+ * symbol. make_decode_table_entry() produces the final entries.
+ */
+static const u32 litlen_decode_results[] = {
+
+ /* Literals */
+#define ENTRY(literal) (HUFFDEC_LITERAL | ((u32)literal << 16))
ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) ,
ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) ,
ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) ,
@@ -468,17 +575,12 @@ static const u32 litlen_decode_results[DEFLATE_NUM_LITLEN_SYMS] = {
ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) ,
#undef ENTRY
-#define HUFFDEC_EXTRA_LENGTH_BITS_MASK 0xFF
-#define HUFFDEC_LENGTH_BASE_SHIFT 8
-#define HUFFDEC_END_OF_BLOCK_LENGTH 0
-
-#define ENTRY(length_base, num_extra_bits) HUFFDEC_RESULT_ENTRY( \
- ((u32)(length_base) << HUFFDEC_LENGTH_BASE_SHIFT) | (num_extra_bits))
-
- /* End of block */
- ENTRY(HUFFDEC_END_OF_BLOCK_LENGTH, 0),
+ /* End of block */
+ HUFFDEC_EXCEPTIONAL | HUFFDEC_END_OF_BLOCK,
- /* Lengths */
+ /* Lengths */
+#define ENTRY(length_base, num_extra_bits) \
+ (((u32)(length_base) << 16) | (num_extra_bits))
ENTRY(3 , 0) , ENTRY(4 , 0) , ENTRY(5 , 0) , ENTRY(6 , 0),
ENTRY(7 , 0) , ENTRY(8 , 0) , ENTRY(9 , 0) , ENTRY(10 , 0),
ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1),
@@ -490,16 +592,35 @@ static const u32 litlen_decode_results[DEFLATE_NUM_LITLEN_SYMS] = {
#undef ENTRY
};
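Given the properties listed above, the flag tests can be arranged roughly as in this hypothetical, self-contained sketch (the real loop is in lib/decompress_template.h and differs in detail):

    /* Classify a litlen entry using the documented flag bits (sketch only). */
    #include <stdint.h>

    #define HUFFDEC_LITERAL          0x80000000
    #define HUFFDEC_EXCEPTIONAL      0x00008000
    #define HUFFDEC_SUBTABLE_POINTER 0x00004000

    enum kind { LITERAL, LENGTH, SUBTABLE_POINTER, END_OF_BLOCK };

    static enum kind classify(uint32_t entry, uint8_t *literal_out)
    {
            if (entry & HUFFDEC_LITERAL) {  /* high bit: cheap sign-style test */
                    *literal_out = (uint8_t)(entry >> 16);
                    return LITERAL;
            }
            if (entry & HUFFDEC_EXCEPTIONAL)        /* both rare cases at once */
                    return (entry & HUFFDEC_SUBTABLE_POINTER)
                            ? SUBTABLE_POINTER : END_OF_BLOCK;
            return LENGTH;          /* length base is in bits 24-16 */
    }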
-/* The decode result for each offset symbol. This is the offset base and the
- * number of extra offset bits. */
-static const u32 offset_decode_results[DEFLATE_NUM_OFFSET_SYMS] = {
+/* Maximum number of bits that can be consumed by decoding a match offset */
+#define OFFSET_MAXBITS (DEFLATE_MAX_OFFSET_CODEWORD_LEN + \
+ DEFLATE_MAX_EXTRA_OFFSET_BITS)
+#define OFFSET_MAXFASTBITS (OFFSET_TABLEBITS /* no subtable needed */ + \
+ DEFLATE_MAX_EXTRA_OFFSET_BITS)
-#define HUFFDEC_EXTRA_OFFSET_BITS_SHIFT 16
-#define HUFFDEC_OFFSET_BASE_MASK (((u32)1 << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) - 1)
-
-#define ENTRY(offset_base, num_extra_bits) HUFFDEC_RESULT_ENTRY( \
- ((u32)(num_extra_bits) << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) | \
- (offset_base))
+/*
+ * Here is the format of our offset decode table entries. Bits not explicitly
+ * described contain zeroes:
+ *
+ * Offsets:
+ * Bit 31-16: offset base value
+ * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL)
+ * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER)
+ * Bit 11-8: remaining codeword length
+ * Bit 4-0: remaining codeword length + number of extra bits
+ * Subtable pointer:
+ * Bit 31-16: index of start of subtable
+ * Bit 15: 1 (HUFFDEC_EXCEPTIONAL)
+ * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER)
+ * Bit 11-8: number of subtable bits
+ * Bit 3-0: number of main table bits
+ *
+ * These work the same way as the length entries and subtable pointer entries in
+ * the litlen decode table; see litlen_decode_results[] above.
+ */
+static const u32 offset_decode_results[] = {
+#define ENTRY(offset_base, num_extra_bits) \
+ (((u32)(offset_base) << 16) | (num_extra_bits))
ENTRY(1 , 0) , ENTRY(2 , 0) , ENTRY(3 , 0) , ENTRY(4 , 0) ,
ENTRY(5 , 1) , ENTRY(7 , 1) , ENTRY(9 , 2) , ENTRY(13 , 2) ,
ENTRY(17 , 3) , ENTRY(25 , 3) , ENTRY(33 , 4) , ENTRY(49 , 4) ,
@@ -507,10 +628,55 @@ static const u32 offset_decode_results[DEFLATE_NUM_OFFSET_SYMS] = {
ENTRY(257 , 7) , ENTRY(385 , 7) , ENTRY(513 , 8) , ENTRY(769 , 8) ,
ENTRY(1025 , 9) , ENTRY(1537 , 9) , ENTRY(2049 , 10) , ENTRY(3073 , 10) ,
ENTRY(4097 , 11) , ENTRY(6145 , 11) , ENTRY(8193 , 12) , ENTRY(12289 , 12) ,
- ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(32769 , 14) , ENTRY(49153 , 14) ,
+ ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) ,
#undef ENTRY
};
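A hypothetical sketch of turning such an entry into a match offset, following the bit layout documented above (the real template also handles subtable pointers and a saved copy of the bitbuffer, which are omitted here):

    /* Offset entry -> match offset, per the documented layout (sketch). */
    #include <stdint.h>

    typedef uint64_t bitbuf_t;
    #define BITMASK(n)      (((bitbuf_t)1 << (n)) - 1)

    static uint32_t decode_offset(uint32_t entry, bitbuf_t *bitbuf,
                                  uint32_t *bitsleft)
    {
            uint32_t codeword_len = (entry >> 8) & 0xF;  /* bits 11-8 */
            uint32_t total_bits   = entry & 0x1F;        /* len + extra bits */
            bitbuf_t extra;

            *bitbuf >>= codeword_len;                    /* drop the codeword */
            extra = *bitbuf & BITMASK(total_bits - codeword_len);
            *bitbuf >>= total_bits - codeword_len;
            *bitsleft -= total_bits;
            return (entry >> 16) + (uint32_t)extra;      /* base + extra bits */
    }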
+/*
+ * The main DEFLATE decompressor structure. Since libdeflate only supports
+ * full-buffer decompression, this structure doesn't store the entire
+ * decompression state, most of which is in stack variables. Instead, this
+ * struct just contains the decode tables and some temporary arrays used for
+ * building them, as these are too large to comfortably allocate on the stack.
+ *
+ * Storing the decode tables in the decompressor struct also allows the decode
+ * tables for the static codes to be reused whenever two static Huffman blocks
+ * are decoded without an intervening dynamic block, even across streams.
+ */
+struct libdeflate_decompressor {
+
+ /*
+ * The arrays aren't all needed at the same time. 'precode_lens' and
+ * 'precode_decode_table' are unneeded after 'lens' has been filled.
+ * Furthermore, 'lens' need not be retained after building the litlen
+ * and offset decode tables. In fact, 'lens' can be in union with
+ * 'litlen_decode_table' provided that 'offset_decode_table' is separate
+ * and is built first.
+ */
+
+ union {
+ u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS];
+
+ struct {
+ u8 lens[DEFLATE_NUM_LITLEN_SYMS +
+ DEFLATE_NUM_OFFSET_SYMS +
+ DEFLATE_MAX_LENS_OVERRUN];
+
+ u32 precode_decode_table[PRECODE_ENOUGH];
+ } l;
+
+ u32 litlen_decode_table[LITLEN_ENOUGH];
+ } u;
+
+ u32 offset_decode_table[OFFSET_ENOUGH];
+
+ /* used only during build_decode_table() */
+ u16 sorted_syms[DEFLATE_MAX_NUM_SYMS];
+
+ bool static_codes_loaded;
+ unsigned litlen_tablebits;
+};
+
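For a rough sense of scale (illustrative figures implied by the ENOUGH values and the 32-bit entry size defined above):

    /* Approximate decode-table memory implied by the ENOUGH values. */
    #include <stdio.h>

    int main(void)
    {
            printf("litlen  table: %u bytes\n", 2342u * 4u); /* 9368 */
            printf("offset  table: %u bytes\n",  402u * 4u); /* 1608 */
            printf("precode table: %u bytes\n",  128u * 4u); /*  512 */
            return 0;
    }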
/*
* Build a table for fast decoding of symbols from a Huffman code. As input,
* this function takes the codeword length of each symbol which may be used in
@@ -534,17 +700,21 @@ static const u32 offset_decode_results[DEFLATE_NUM_OFFSET_SYMS] = {
* @num_syms
* The number of symbols in the code, including all unused symbols.
* @decode_results
- * An array which provides, for each symbol, the actual value to store into
- * the decode table. This value will be directly produced as the result of
- * decoding that symbol, thereby moving the indirection out of the decode
- * loop and into the table initialization.
+ * An array which gives the incomplete decode result for each symbol. The
+ * needed values in this array will be combined with codeword lengths to
+ * make the final decode table entries using make_decode_table_entry().
* @table_bits
* The log base-2 of the number of main table entries to use.
+ * If @table_bits_ret != NULL, then @table_bits is treated as a maximum
+ * value and it will be decreased if a smaller table would be sufficient.
* @max_codeword_len
* The maximum allowed codeword length for this Huffman code.
* Must be <= DEFLATE_MAX_CODEWORD_LEN.
* @sorted_syms
* A temporary array of length @num_syms.
+ * @table_bits_ret
+ * If non-NULL, then the dynamic table_bits is enabled, and the actual
+ * table_bits value will be returned here.
*
* Returns %true if successful; %false if the codeword lengths do not form a
* valid Huffman code.
@@ -554,9 +724,10 @@ build_decode_table(u32 decode_table[],
const u8 lens[],
const unsigned num_syms,
const u32 decode_results[],
- const unsigned table_bits,
- const unsigned max_codeword_len,
- u16 *sorted_syms)
+ unsigned table_bits,
+ unsigned max_codeword_len,
+ u16 *sorted_syms,
+ unsigned *table_bits_ret)
{
unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1];
@@ -576,6 +747,17 @@ build_decode_table(u32 decode_table[],
for (sym = 0; sym < num_syms; sym++)
len_counts[lens[sym]]++;
+ /*
+ * Determine the actual maximum codeword length that was used, and
+ * decrease table_bits to it if allowed.
+ */
+ while (max_codeword_len > 1 && len_counts[max_codeword_len] == 0)
+ max_codeword_len--;
+ if (table_bits_ret != NULL) {
+ table_bits = MIN(table_bits, max_codeword_len);
+ *table_bits_ret = table_bits;
+ }
+
/*
* Sort the symbols primarily by increasing codeword length and
* secondarily by increasing symbol value; or equivalently by their
@@ -632,7 +814,7 @@ build_decode_table(u32 decode_table[],
*/
/* sym=0, len=1 (arbitrary) */
- entry = decode_results[0] | 1;
+ entry = make_decode_table_entry(decode_results, 0, 1);
} else {
/*
* Allow codes with a single used symbol, with codeword
@@ -648,7 +830,8 @@ build_decode_table(u32 decode_table[],
if (codespace_used != (1U << (max_codeword_len - 1)) ||
len_counts[1] != 1)
return false;
- entry = decode_results[*sorted_syms] | 1;
+ entry = make_decode_table_entry(decode_results,
+ *sorted_syms, 1);
}
/*
* Note: the decode table still must be fully initialized, in
@@ -696,7 +879,8 @@ build_decode_table(u32 decode_table[],
/* Fill the first entry for the current codeword. */
decode_table[codeword] =
- decode_results[*sorted_syms++] | len;
+ make_decode_table_entry(decode_results,
+ *sorted_syms++, len);
if (codeword == cur_table_end - 1) {
/* Last codeword (all 1's) */
@@ -779,19 +963,18 @@ build_decode_table(u32 decode_table[],
/*
* Create the entry that points from the main table to
- * the subtable. This entry contains the index of the
- * start of the subtable and the number of bits with
- * which the subtable is indexed (the log base 2 of the
- * number of entries it contains).
+ * the subtable.
*/
decode_table[subtable_prefix] =
+ ((u32)subtable_start << 16) |
+ HUFFDEC_EXCEPTIONAL |
HUFFDEC_SUBTABLE_POINTER |
- HUFFDEC_RESULT_ENTRY(subtable_start) |
- subtable_bits;
+ (subtable_bits << 8) | table_bits;
}
/* Fill the subtable entries for the current codeword. */
- entry = decode_results[*sorted_syms++] | (len - table_bits);
+ entry = make_decode_table_entry(decode_results, *sorted_syms++,
+ len - table_bits);
i = subtable_start + (codeword >> table_bits);
stride = 1U << (len - table_bits);
do {
@@ -818,13 +1001,17 @@ build_precode_decode_table(struct libdeflate_decompressor *d)
/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128);
+ STATIC_ASSERT(ARRAY_LEN(precode_decode_results) ==
+ DEFLATE_NUM_PRECODE_SYMS);
+
return build_decode_table(d->u.l.precode_decode_table,
d->u.precode_lens,
DEFLATE_NUM_PRECODE_SYMS,
precode_decode_results,
PRECODE_TABLEBITS,
DEFLATE_MAX_PRE_CODEWORD_LEN,
- d->sorted_syms);
+ d->sorted_syms,
+ NULL);
}
/* Build the decode table for the literal/length code. */
@@ -833,7 +1020,10 @@ build_litlen_decode_table(struct libdeflate_decompressor *d,
unsigned num_litlen_syms, unsigned num_offset_syms)
{
/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
- STATIC_ASSERT(LITLEN_TABLEBITS == 10 && LITLEN_ENOUGH == 1334);
+ STATIC_ASSERT(LITLEN_TABLEBITS == 11 && LITLEN_ENOUGH == 2342);
+
+ STATIC_ASSERT(ARRAY_LEN(litlen_decode_results) ==
+ DEFLATE_NUM_LITLEN_SYMS);
return build_decode_table(d->u.litlen_decode_table,
d->u.l.lens,
@@ -841,7 +1031,8 @@ build_litlen_decode_table(struct libdeflate_decompressor *d,
litlen_decode_results,
LITLEN_TABLEBITS,
DEFLATE_MAX_LITLEN_CODEWORD_LEN,
- d->sorted_syms);
+ d->sorted_syms,
+ &d->litlen_tablebits);
}
/* Build the decode table for the offset code. */
@@ -852,33 +1043,17 @@ build_offset_decode_table(struct libdeflate_decompressor *d,
/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402);
+ STATIC_ASSERT(ARRAY_LEN(offset_decode_results) ==
+ DEFLATE_NUM_OFFSET_SYMS);
+
return build_decode_table(d->offset_decode_table,
d->u.l.lens + num_litlen_syms,
num_offset_syms,
offset_decode_results,
OFFSET_TABLEBITS,
DEFLATE_MAX_OFFSET_CODEWORD_LEN,
- d->sorted_syms);
-}
-
-static forceinline machine_word_t
-repeat_byte(u8 b)
-{
- machine_word_t v;
-
- STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
-
- v = b;
- v |= v << 8;
- v |= v << 16;
- v |= v << ((WORDBITS == 64) ? 32 : 0);
- return v;
-}
-
-static forceinline void
-copy_word_unaligned(const void *src, void *dst)
-{
- store_word_unaligned(load_word_unaligned(src), dst);
+ d->sorted_syms,
+ NULL);
}
/*****************************************************************************
@@ -886,12 +1061,15 @@ copy_word_unaligned(const void *src, void *dst)
*****************************************************************************/
typedef enum libdeflate_result (*decompress_func_t)
- (struct libdeflate_decompressor *d,
- const void *in, size_t in_nbytes, void *out, size_t out_nbytes_avail,
+ (struct libdeflate_decompressor * restrict d,
+ const void * restrict in, size_t in_nbytes,
+ void * restrict out, size_t out_nbytes_avail,
size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);
#define FUNCNAME deflate_decompress_default
-#define ATTRIBUTES
+#undef ATTRIBUTES
+#undef EXTRACT_VARBITS
+#undef EXTRACT_VARBITS8
#include "decompress_template.h"
/* Include architecture-specific implementation(s) if available. */
=====================================
lib/x86/cpu_features.h
=====================================
@@ -156,6 +156,8 @@ typedef char __v64qi __attribute__((__vector_size__(64)));
#define HAVE_BMI2_TARGET \
(HAVE_DYNAMIC_X86_CPU_FEATURES && \
(GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pdep_di)))
+#define HAVE_BMI2_INTRIN \
+ (HAVE_BMI2_NATIVE || (HAVE_BMI2_TARGET && HAVE_TARGET_INTRINSICS))
#endif /* __i386__ || __x86_64__ */
=====================================
lib/x86/decompress_impl.h
=====================================
@@ -4,18 +4,46 @@
#include "cpu_features.h"
/* BMI2 optimized version */
-#if HAVE_BMI2_TARGET && !HAVE_BMI2_NATIVE
-# define FUNCNAME deflate_decompress_bmi2
-# define ATTRIBUTES __attribute__((target("bmi2")))
+#if HAVE_BMI2_INTRIN
+# define deflate_decompress_bmi2 deflate_decompress_bmi2
+# define FUNCNAME deflate_decompress_bmi2
+# if !HAVE_BMI2_NATIVE
+# define ATTRIBUTES __attribute__((target("bmi2")))
+# endif
+ /*
+ * Even with __attribute__((target("bmi2"))), gcc doesn't reliably use the
+ * bzhi instruction for 'word & BITMASK(count)'. So use the bzhi intrinsic
+ * explicitly. EXTRACT_VARBITS() is equivalent to 'word & BITMASK(count)';
+ * EXTRACT_VARBITS8() is equivalent to 'word & BITMASK((u8)count)'.
+ * Nevertheless, their implementation using the bzhi intrinsic is identical,
+ * as the bzhi instruction truncates the count to 8 bits implicitly.
+ */
+# ifndef __clang__
+# include <immintrin.h>
+# ifdef __x86_64__
+# define EXTRACT_VARBITS(word, count) _bzhi_u64((word), (count))
+# define EXTRACT_VARBITS8(word, count) _bzhi_u64((word), (count))
+# else
+# define EXTRACT_VARBITS(word, count) _bzhi_u32((word), (count))
+# define EXTRACT_VARBITS8(word, count) _bzhi_u32((word), (count))
+# endif
+# endif
# include "../decompress_template.h"
+#endif /* HAVE_BMI2_INTRIN */
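A standalone sanity-check sketch (compile on x86-64 with -mbmi2; illustrative only) of the equivalence relied on here, including the implicit 8-bit truncation of the count:

    /* _bzhi_u64(word, n) == word & BITMASK(n) for n < 64; count uses bits 7:0. */
    #include <assert.h>
    #include <immintrin.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t word = 0x0123456789abcdefULL;

            for (unsigned n = 0; n < 64; n++) {
                    uint64_t mask = ((uint64_t)1 << n) - 1;

                    assert(_bzhi_u64(word, n) == (word & mask));
                    /* bzhi reads only the low 8 bits of the count, which is
                     * what EXTRACT_VARBITS8() relies on. */
                    assert(_bzhi_u64(word, n) == _bzhi_u64(word, n | 0x100));
            }
            return 0;
    }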
+
+#if defined(deflate_decompress_bmi2) && HAVE_BMI2_NATIVE
+#define DEFAULT_IMPL deflate_decompress_bmi2
+#else
static inline decompress_func_t
arch_select_decompress_func(void)
{
+#ifdef deflate_decompress_bmi2
if (HAVE_BMI2(get_x86_cpu_features()))
return deflate_decompress_bmi2;
+#endif
return NULL;
}
-# define arch_select_decompress_func arch_select_decompress_func
+#define arch_select_decompress_func arch_select_decompress_func
#endif
#endif /* LIB_X86_DECOMPRESS_IMPL_H */
=====================================
lib/x86/matchfinder_impl.h
=====================================
@@ -28,7 +28,9 @@
#ifndef LIB_X86_MATCHFINDER_IMPL_H
#define LIB_X86_MATCHFINDER_IMPL_H
-#ifdef __AVX2__
+#include "cpu_features.h"
+
+#if HAVE_AVX2_NATIVE
# include <immintrin.h>
static forceinline void
matchfinder_init_avx2(mf_pos_t *data, size_t size)
@@ -73,7 +75,7 @@ matchfinder_rebase_avx2(mf_pos_t *data, size_t size)
}
#define matchfinder_rebase matchfinder_rebase_avx2
-#elif defined(__SSE2__)
+#elif HAVE_SSE2_NATIVE
# include <emmintrin.h>
static forceinline void
matchfinder_init_sse2(mf_pos_t *data, size_t size)
@@ -117,6 +119,6 @@ matchfinder_rebase_sse2(mf_pos_t *data, size_t size)
} while (size != 0);
}
#define matchfinder_rebase matchfinder_rebase_sse2
-#endif /* __SSE2__ */
+#endif /* HAVE_SSE2_NATIVE */
#endif /* LIB_X86_MATCHFINDER_IMPL_H */
=====================================
libdeflate.h
=====================================
@@ -10,8 +10,8 @@ extern "C" {
#endif
#define LIBDEFLATE_VERSION_MAJOR 1
-#define LIBDEFLATE_VERSION_MINOR 13
-#define LIBDEFLATE_VERSION_STRING "1.13"
+#define LIBDEFLATE_VERSION_MINOR 14
+#define LIBDEFLATE_VERSION_STRING "1.14"
#include <stddef.h>
#include <stdint.h>
=====================================
scripts/afl-fuzz/fuzz.sh
=====================================
@@ -119,7 +119,9 @@ fi
CFLAGS+=" -DLIBDEFLATE_ENABLE_ASSERTIONS"
sudo sh -c "echo core > /proc/sys/kernel/core_pattern"
-sudo sh -c "echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor"
+if [ -e /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor ]; then
+ sudo sh -c "echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor"
+fi
NPROC=$(getconf _NPROCESSORS_ONLN)
=====================================
scripts/run_tests.sh
=====================================
@@ -143,7 +143,7 @@ build_and_run_tests() {
features+=(avx2 avx bmi2 pclmul sse2)
;;
arm*|aarch*)
- features+=(sha3 crc32 pmull neon)
+ features+=(dotprod sha3 crc32 pmull neon)
;;
esac
fi
View it on GitLab: https://salsa.debian.org/med-team/libdeflate/-/compare/2732bee1eaffc869518dcd22936d68d726891420...7b8f0113750f86ce40a092065f27a8af712691b8