[med-svn] [libzstd] 01/07: Imported Upstream version 0.5.1

Thu Mar 17 10:55:37 UTC 2016

This is an automated email from the git hooks/post-receive script.

daube-guest pushed a commit to branch master
in repository libzstd.

commit 87f18e2e8a414c34d31bca48c8dfb296a56e11aa
Author: Kevin Murray <spam at kdmurray.id.au>
Date:   Mon Feb 22 08:59:35 2016 +1100

    Imported Upstream version 0.5.1
---
 .gitignore                                      |    6 +
 .travis.yml                                     |    3 +-
 Makefile                                        |   37 +-
 NEWS                                            |   12 +
 README.md                                       |  127 +-
 contrib/cmake/CMakeLists.txt                    |    2 +-
 contrib/cmake/lib/CMakeLists.txt                |   23 +-
 contrib/cmake/programs/CMakeLists.txt           |   13 +-
 images/CSpeed.png                               |  Bin 35874 -> 0 bytes
 images/Cspeed4.png                              |  Bin 0 -> 47376 bytes
 images/DCspeed5.png                             |  Bin 0 -> 69388 bytes
 images/DSpeed.png                               |  Bin 9143 -> 0 bytes
 images/Dspeed4.png                              |  Bin 0 -> 9499 bytes
 lib/Makefile                                    |   11 +-
 lib/README.md                                   |   56 +
 lib/bitstream.h                                 |   63 +-
 lib/divsufsort.c                                | 1913 +++++++++++++++++
 lib/divsufsort.h                                |   67 +
 lib/error_private.h                             |   59 +-
 lib/error_public.h                              |   11 +-
 lib/fse.c                                       |  478 ++---
 lib/fse.h                                       |   36 +-
 lib/fse_static.h                                |   31 +-
 lib/huff0.c                                     |  633 +++---
 lib/huff0.h                                     |    9 +-
 lib/huff0_static.h                              |   47 +-
 lib/legacy/zstd_legacy.h                        |    6 +-
 lib/legacy/zstd_v02.c                           |    7 +-
 lib/legacy/zstd_v03.c                           |    6 +-
 lib/legacy/{zstd_v03.c => zstd_v04.c}           | 2561 ++++++++++++++---------
 lib/legacy/zstd_v04.h                           |  148 ++
 lib/mem.h                                       |   61 +-
 lib/{zstd_buffered.c => zbuff.c}                |  114 +-
 lib/{zstd_buffered.h => zbuff.h}                |   90 +-
 lib/{zstd_buffered_static.h => zbuff_static.h}  |   11 +-
 lib/zdict.c                                     |  923 ++++++++
 lib/{zstd_buffered_static.h => zdict.h}         |   43 +-
 lib/zdict_static.h                              |   80 +
 lib/zstd.h                                      |   92 +-
 lib/zstd_compress.c                             | 1449 +++++++------
 lib/zstd_decompress.c                           |  678 ++++--
 lib/zstd_internal.h                             |  106 +-
 lib/zstd_opt.h                                  | 1125 ++++++++++
 lib/zstd_static.h                               |  295 +--
 programs/Makefile                               |   60 +-
 programs/bench.c                                |  114 +-
 programs/datagen.c                              |   42 +-
 programs/dibio.c                                |  277 +++
 programs/dibio.h                                |   52 +
 programs/fileio.c                               |  343 +--
 programs/fileio.h                               |   32 +-
 programs/fuzzer.c                               |   32 +-
 programs/legacy/fileio_legacy.c                 |   82 +
 programs/paramgrill.c                           |  262 +--
 programs/playTests.sh                           |   72 +-
 programs/xxhash.c                               |    2 +-
 programs/zbufftest.c                            |   14 +-
 programs/zstd.1                                 |   66 +-
 programs/zstdcli.c                              |  269 ++-
 visual/2013/fullbench/fullbench.vcxproj         |    3 -
 visual/2013/fullbench/fullbench.vcxproj.filters |    9 -
 visual/2013/zstd.sln                            |    4 +-
 visual/2013/zstd/zstd.vcxproj                   |   13 +-
 visual/2013/zstd/zstd.vcxproj.filters           |   39 +-
 visual/2013/zstdlib/zstdlib.vcxproj             |    8 +-
 visual/2013/zstdlib/zstdlib.vcxproj.filters     |   18 +-
 66 files changed, 9666 insertions(+), 3579 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8641d7f..a31ffdc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -44,3 +44,9 @@ ipch/
 
 # Other files
 .directory
+_codelite
+_zstdbench
+
+lib/zstd_opt_LZ5.c
+lib/zstd_opt_llen.c
+lib/zstd_opt_nollen.c
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
index 065e6f1..fda0b05 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,7 +2,6 @@ language: c
 
 before_install:
   - sudo apt-get update  -qq
-  - sudo apt-get install -qq gcc-arm-linux-gnueabi
   - sudo apt-get install -qq clang
   - sudo apt-get install -qq g++-multilib
   - sudo apt-get install -qq gcc-multilib
@@ -13,7 +12,7 @@ env:
   - ZSTD_TRAVIS_CI_ENV=cmaketest
   - ZSTD_TRAVIS_CI_ENV=clangtest  
   - ZSTD_TRAVIS_CI_ENV=gpptest  
-  - ZSTD_TRAVIS_CI_ENV=armtest  
+  - ZSTD_TRAVIS_CI_ENV=armtest-w-install  
   - ZSTD_TRAVIS_CI_ENV=test  
   - ZSTD_TRAVIS_CI_ENV="-C programs test32"  
   - ZSTD_TRAVIS_CI_ENV="-C programs test-zstd_nolegacy"
diff --git a/Makefile b/Makefile
index 88de5d0..93d5e05 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 # ################################################################
 # zstd - Makefile
-# Copyright (C) Yann Collet 2014-2015
+# Copyright (C) Yann Collet 2014-2016
 # All rights reserved.
 # 
 # BSD license
@@ -27,12 +27,11 @@
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 # 
 # You can contact the author at :
-#  - zstd source repository : https://github.com/Cyan4973/zstd
-#  - Public forum : https://groups.google.com/forum/#!forum/lz4c
+#  - zstd homepage : http://www.zstd.net/
 # ################################################################
 
 # force a version number : uncomment below export (otherwise, default to the one declared into zstd.h)
-#export VERSION := 0.4.6
+#export VERSION := 0.5.1
 
 PRGDIR  = programs
 ZSTDDIR = lib
@@ -90,8 +89,34 @@ gpptest: clean
 	$(MAKE) all CC=g++ CFLAGS="-O3 -Wall -Wextra -Wundef -Wshadow -Wcast-align -Werror"
 
 armtest: clean
-	$(MAKE) -C $(ZSTDDIR) all CC=arm-linux-gnueabi-gcc MOREFLAGS="-Werror"
-	$(MAKE) -C $(PRGDIR) CC=arm-linux-gnueabi-gcc MOREFLAGS="-Werror -static"
+#	$(MAKE) -C $(ZSTDDIR) all CC=arm-linux-gnueabi-gcc MOREFLAGS="-Werror"
+	$(MAKE) -C $(PRGDIR) datagen   # use native, faster
+	$(MAKE) -C $(PRGDIR) test CC=arm-linux-gnueabi-gcc ZSTDRTTEST= MOREFLAGS=-static # MOREFLAGS="-Werror -static"
+
+# for Travis CI
+arminstall: clean   
+	sudo apt-get install -q qemu  
+	sudo apt-get install -q binfmt-support
+	sudo apt-get install -q qemu-user-static
+	sudo apt-get install -q gcc-arm-linux-gnueabi
+
+# for Travis CI
+armtest-w-install: clean arminstall armtest
+
+ppctest: clean
+	$(MAKE) -C $(PRGDIR) datagen   # use native, faster
+	$(MAKE) -C $(PRGDIR) test CC=powerpc-linux-gnu-gcc ZSTDRTTEST= MOREFLAGS=-static # MOREFLAGS="-Werror -static" 
+
+# for Travis CI
+ppcinstall: clean   
+	sudo apt-get install -q qemu  
+	sudo apt-get install -q binfmt-support
+	sudo apt-get install -q qemu-user-static
+	sudo apt-get update  -q
+	sudo apt-get install -q gcc-powerpc-linux-gnu   # unfortunately, doesn't work on Travis CI (package not available)
+
+# for Travis CI
+ppctest-w-install: clean ppcinstall ppctest
 
 usan: clean
 	$(MAKE) test CC=clang MOREFLAGS="-g -fsanitize=undefined"
diff --git a/NEWS b/NEWS
index 8aaf02d..ccbf15e 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,15 @@
+v0.5.1
+New : Optimal parsing => Very high compression modes, thanks to Przemyslaw Skibinski
+Changed : Dictionary builder integrated into libzstd and zstd cli
+Changed (!) : zstd cli now uses "multiple input files" as default mode. See `zstd -h`.
+Fix : high compression modes for big-endian platforms
+New : zstd cli : `-t` | `--test` command
+
+v0.5.0
+New : dictionary builder utility
+Changed : streaming & dictionary API
+Improved : better compression of small data
+
 v0.4.7
 Improved : small compression speed improvement in HC mode
 Changed : `zstd_decompress.c` has ZSTD_LEGACY_SUPPORT to 0 by default
diff --git a/README.md b/README.md
index 4a339cc..b84d8a8 100644
--- a/README.md
+++ b/README.md
@@ -7,38 +7,133 @@ It is provided as a BSD-license package, hosted on Github.
 |master      | [![Build Status](https://travis-ci.org/Cyan4973/zstd.svg?branch=master)](https://travis-ci.org/Cyan4973/zstd) |
 |dev         | [![Build Status](https://travis-ci.org/Cyan4973/zstd.svg?branch=dev)](https://travis-ci.org/Cyan4973/zstd) |
 
-For a taste of its performance, here are a few benchmark numbers from a number of compression codecs suitable for real-time. The test was completed on a Core i7-5600U @ 2.6 GHz, using m^2's [fsbench 0.14.3](http://encode.ru/threads/1371-Filesystem-benchmark?p=34029&viewfull=1#post34029) compiled with gcc 4.8.4, on the [Silesia compression corpus](http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia).
+As a reference, several fast compression algorithms were tested and compared to [zlib] on a Core i7-3930K CPU @ 4.5GHz, using [lzbench], an open-source in-memory benchmark by @inikep compiled with gcc 5.2.1, on the [Silesia compression corpus].
+
+[lzbench]: https://github.com/inikep/lzbench
+[Silesia compression corpus]: http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia
+
 
 |Name             | Ratio | C.speed | D.speed |
 |-----------------|-------|--------:|--------:|
 |                 |       |   MB/s  |  MB/s   |
-| **zstd 0.4**    |**2.872**|**280**| **670** |
-| [zlib] 1.2.8 -1 | 2.730 |    70   |   300   | 
-| QuickLZ 1.5.1b6 | 2.237 |   370   |   415   |
-| LZO 2.06        | 2.106 |   400   |   580   |
-| [LZ4] r131      | 2.101 |   450   |  2100   |
-| Snappy 1.1.0    | 2.091 |   330   |  1100   |
-| LZF 3.6         | 2.077 |   200   |   560   |
+|**zstd 0.5.1 -1**|**2.876**|**330**| **890** |
+| [zlib] 1.2.8 -1 | 2.730 |    95   |   360   |
+| brotli -0       | 2.708 |   220   |   430   |
+| QuickLZ 1.5     | 2.237 |   510   |   605   |
+| LZO 2.09        | 2.106 |   610   |   870   |
+| [LZ4] r131      | 2.101 |   620   |  3100   |
+| Snappy 1.1.3    | 2.091 |   480   |  1600   |
+| LZF 3.6         | 2.077 |   375   |   790   |
 
 [zlib]:http://www.zlib.net/
-[LZ4]:http://www.lz4.org/
+[LZ4]: http://www.lz4.org/
 
-Zstd can also offer stronger compression ratio at the cost of compression speed. Speed / Ratio trade-off is configurable by small increment, to fit different situations. Note however that decompression speed is preserved and remain roughly the same at all settings, a property shared by most LZ compression algorithms, such as [zlib]. The following test is run on a Core i7-3930K CPU @ 4.5GHz, using [lzbench], an open-source in-memory benchmark by inikep compiled with gcc 5.2.1, on the [Sil [...]
+Zstd can also offer stronger compression ratio at the cost of compression speed. 
+Speed vs Compression trade-off is configurable in small increments. Decompression speed is preserved and remains roughly the same at all settings, a property shared by most LZ compression algorithms, such as [zlib].
 
-[lzbench]:https://github.com/inikep/lzbench
+The following test is run on a Core i7-3930K CPU @ 4.5GHz, using [lzbench], an open-source in-memory benchmark by @inikep compiled with gcc 5.2.1, on the [Silesia compression corpus].
 
 Compression Speed vs Ratio | Decompression Speed
 ---------------------------|--------------------
-![Compression Speed vs Ratio](images/CSpeed.png "Compression Speed vs Ratio") | ![Decompression Speed](images/DSpeed.png "Decompression Speed")
+![Compression Speed vs Ratio](images/Cspeed4.png "Compression Speed vs Ratio") | ![Decompression Speed](images/Dspeed4.png "Decompression Speed")
 
+Several algorithms can produce higher compression ratios at slower speeds, falling outside of the graph.
+For a larger picture including very slow modes, [click on this link](images/DCspeed5.png).
 
-Zstd entropy stage is provided by [Huff0 and FSE, from Finite State Entropy library](https://github.com/Cyan4973/FiniteStateEntropy).
 
-Its memory requirement can be configured to fit into low-memory hardware configurations, or servers handling multiple connections/contexts in parallel.
+### The case for Small Data compression
+
+The above chart provides results applicable to large-file or large-stream scenarios (200 MB in this case).
+Small data (< 64 KB) come with different perspectives.
+The smaller the amount of data to compress, the more difficult it is to achieve any significant compression.
+On reaching the 1 KB region, it becomes almost impossible to compress anything.
+This problem is common to all compression algorithms, and throwing CPU power at it achieves little gain.
+
+The reason is, compression algorithms learn from past data how to compress future data.
+But at the beginning of a new file, there is no "past" to build upon.
+
+To solve this situation, Zstd now offers a __training mode__,
+which can be used to make the algorithm fit a selected type of data, by providing it with some samples.
+The result of the training is a file called "dictionary", which can be loaded before compression and decompression.
+Using this dictionary, the compression ratio achievable on small data improves dramatically :
+
+| Collection Name    | Direct compression | Dictionary Compression | Gains      | Average unit | Range       |
+| ---------------    | ------------------ | ---------------------- | ---------  | ------------:| -----       |
+| Small JSON records | x1.331 - x1.366	  | x5.860 - x6.830        | ~ __x4.7__ | 300          | 200 - 400   |
+| Mercurial events   | x2.322 - x2.538    | x3.377 - x4.462        | ~ __x1.5__ | 1.5 KB       | 20 - 200 KB |	
+| Large JSON docs    | x3.813 - x4.043    | x8.935 - x13.366       | ~ __x2.8__ | 6 KB         | 800 - 20 KB |	
+
+These compression gains are achieved without any speed loss; in general, compression and decompression even prove a bit faster with a dictionary.
+
+Dictionaries work if there is some correlation in a family of small data (there is no _universal dictionary_).
+Hence, deploying one dictionary per type of data will provide the greatest benefits.
+
+Large documents will benefit proportionally less, since dictionary gains are mostly effective in the first few KB.
+Then, the compression algorithm will rely more and more on already decoded content to compress the rest of the file.
+
+#### Dictionary compression How To :
+
+##### _Using the Command Line Utility_ :
+
+1) Create the dictionary
+
+`zstd --train FullPathToTrainingSet/* -o dictionaryName`
+
+2) Compression with dictionary
+
+`zstd FILE -D dictionaryName`
+
+3) Decompress with dictionary
 
-Zstd has not yet reached "stable format" status. It doesn't guarantee yet that its current compressed format will remain stable and supported in future versions. During this period, it can still change to adapt new optimizations still being investigated. "Stable Format" is projected sometimes early 2016. 
+`zstd --decompress FILE.zst -D dictionaryName`
 
-That being said, the library is now fairly robust, able to withstand hazards situations, including invalid inputs. The library reliability has been tested using [Fuzz Testing](https://en.wikipedia.org/wiki/Fuzz_testing), with both [internal tools](programs/fuzzer.c) and [external ones](http://lcamtuf.coredump.cx/afl). Therefore, it seems now safe to test Zstandard even within production environments.
+##### _Using API_ :
+
+1) Create dictionary
+
+```
+#include "zdict.h"
+(...)
+/* Train a dictionary from a memory buffer `samplesBuffer`, 
+   where `nbSamples` samples have been stored concatenated. */
+size_t dictSize = ZDICT_trainFromBuffer(dictBuffer, dictBufferCapacity,
+                                        samplesBuffer, samplesSizes, nbSamples);
+```
+
+2) Compression with dictionary
+
+```
+#include "zstd.h"
+(...)
+ZSTD_CCtx* context = ZSTD_createCCtx();
+size_t compressedSize = ZSTD_compress_usingDict(context, dst, dstCapacity, src, srcSize, dict, dictSize, compressionLevel);
+```
+
+3) Decompress with dictionary
+
+```
+#include "zstd.h"
+(...)
+ZSTD_DCtx* context = ZSTD_createDCtx();
+size_t regeneratedSize = ZSTD_decompress_usingDict(context, dst, dstCapacity, cSrc, cSrcSize, dict, dictSize);
+```
+
+
+### Status
+
+Zstd has not yet reached "stable format" status. It does not yet guarantee that its current compression format will remain stable in future versions. During this period, the format can still change to accommodate new optimizations under investigation. "Stable Format" is projected for H1 2016, and will be tagged `v1.0`.
+
+That being said, the library is now fairly robust, able to withstand hazardous situations, including invalid inputs. It also features legacy support, so that documents compressed with current and previous versions of zstd can still be decoded in the future.
+Library reliability has been tested using [Fuzz Testing](https://en.wikipedia.org/wiki/Fuzz_testing), with both [internal tools](programs/fuzzer.c) and [external ones](http://lcamtuf.coredump.cx/afl). Therefore, Zstandard is considered safe for testing, even within production environments.
 
 ### Branch Policy
+
 The "dev" branch is the one where all contributions will be merged before reaching "master". If you plan to propose a patch, please commit into the "dev" branch or its own feature branch. Direct commit to "master" are not permitted.
+
+
+### Trivia
+
+Zstd entropy stage is provided by [Huff0 and FSE, from Finite State Entropy library](https://github.com/Cyan4973/FiniteStateEntropy).
+
+Its memory requirement can be configured to fit into low-memory hardware configurations, or servers handling multiple connections/contexts in parallel.
+
diff --git a/contrib/cmake/CMakeLists.txt b/contrib/cmake/CMakeLists.txt
index 3687c9e..4188337 100644
--- a/contrib/cmake/CMakeLists.txt
+++ b/contrib/cmake/CMakeLists.txt
@@ -1,6 +1,6 @@
 # ################################################################
 # zstd - Makefile
-# Copyright (C) Yann Collet 2014-2015
+# Copyright (C) Yann Collet 2014-2016
 # All rights reserved.
 # 
 # BSD license
diff --git a/contrib/cmake/lib/CMakeLists.txt b/contrib/cmake/lib/CMakeLists.txt
index a8247dd..bb2c057 100644
--- a/contrib/cmake/lib/CMakeLists.txt
+++ b/contrib/cmake/lib/CMakeLists.txt
@@ -1,6 +1,6 @@
 # ################################################################
 # zstd - Makefile
-# Copyright (C) Yann Collet 2014-2015
+# Copyright (C) Yann Collet 2014-2016
 # All rights reserved.
 # 
 # BSD license
@@ -27,8 +27,7 @@
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 # 
 # You can contact the author at :
-#  - zstd source repository : https://github.com/Cyan4973/zstd
-#  - Public forum : https://groups.google.com/forum/#!forum/lz4c
+#  - zstd homepage : http://www.zstd.net/
 # ################################################################
 
 # Get library version based on information from input content (use regular exp)
@@ -58,9 +57,11 @@ GetLibraryVersion("${HEADER_CONTENT}" LIBVER_MAJOR LIBVER_MINOR LIBVER_RELEASE)
 MESSAGE("ZSTD VERSION ${LIBVER_MAJOR}.${LIBVER_MINOR}.${LIBVER_RELEASE}")
 
 SET(Sources
+        ${LIBRARY_DIR}/divsufsort.c
         ${LIBRARY_DIR}/fse.c
         ${LIBRARY_DIR}/huff0.c
-        ${LIBRARY_DIR}/zstd_buffered.c
+        ${LIBRARY_DIR}/zbuff.c
+        ${LIBRARY_DIR}/zdict.c
         ${LIBRARY_DIR}/zstd_compress.c
         ${LIBRARY_DIR}/zstd_decompress.c)
 
@@ -73,8 +74,10 @@ SET(Headers
         ${LIBRARY_DIR}/huff0.h
         ${LIBRARY_DIR}/huff0_static.h
         ${LIBRARY_DIR}/mem.h
-        ${LIBRARY_DIR}/zstd_buffered_static.h
-        ${LIBRARY_DIR}/zstd_buffered.h
+        ${LIBRARY_DIR}/zbuff.h
+        ${LIBRARY_DIR}/zbuff_static.h
+        ${LIBRARY_DIR}/zdict.h
+        ${LIBRARY_DIR}/zdict_static.h
         ${LIBRARY_DIR}/zstd_internal.h
         ${LIBRARY_DIR}/zstd_static.h
         ${LIBRARY_DIR}/zstd.h)
@@ -86,13 +89,15 @@ IF (ZSTD_LEGACY_SUPPORT)
     SET(Sources ${Sources}
             ${LIBRARY_LEGACY_DIR}/zstd_v01.c
             ${LIBRARY_LEGACY_DIR}/zstd_v02.c
-            ${LIBRARY_LEGACY_DIR}/zstd_v03.c)
+            ${LIBRARY_LEGACY_DIR}/zstd_v03.c
+            ${LIBRARY_LEGACY_DIR}/zstd_v04.c)
 
     SET(Headers ${Headers}
             ${LIBRARY_LEGACY_DIR}/zstd_legacy.h
             ${LIBRARY_LEGACY_DIR}/zstd_v01.h
             ${LIBRARY_LEGACY_DIR}/zstd_v02.h
-            ${LIBRARY_LEGACY_DIR}/zstd_v03.h)
+            ${LIBRARY_LEGACY_DIR}/zstd_v03.h
+            ${LIBRARY_LEGACY_DIR}/zstd_v04.h)
 ENDIF (ZSTD_LEGACY_SUPPORT)
 
 IF (MSVC)
@@ -161,7 +166,7 @@ IF (UNIX)
     SET(INSTALL_INCLUDE_DIR ${PREFIX}/include)
 
     # install target
-    INSTALL(FILES ${LIBRARY_DIR}/zstd.h DESTINATION ${INSTALL_INCLUDE_DIR})
+    INSTALL(FILES ${LIBRARY_DIR}/zstd.h ${LIBRARY_DIR}/zstd_buffered.h ${LIBRARY_DIR}/dictBuilder.h DESTINATION ${INSTALL_INCLUDE_DIR})
     INSTALL(TARGETS libzstd_static DESTINATION ${INSTALL_LIBRARY_DIR})
     INSTALL(TARGETS libzstd_shared LIBRARY DESTINATION ${INSTALL_LIBRARY_DIR})
 
diff --git a/contrib/cmake/programs/CMakeLists.txt b/contrib/cmake/programs/CMakeLists.txt
index af9a057..ebee7c2 100644
--- a/contrib/cmake/programs/CMakeLists.txt
+++ b/contrib/cmake/programs/CMakeLists.txt
@@ -1,6 +1,6 @@
 # ################################################################
 # zstd - Makefile
-# Copyright (C) Yann Collet 2014-2015
+# Copyright (C) Yann Collet 2014-2016
 # All rights reserved.
 #
 # BSD license
@@ -27,8 +27,7 @@
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 # You can contact the author at :
-#  - zstd source repository : https://github.com/Cyan4973/zstd
-#  - Public forum : https://groups.google.com/forum/#!forum/lz4c
+#  - zstd homepage : http://www.zstd.net/
 # ################################################################
 
 PROJECT(programs)
@@ -59,7 +58,7 @@ IF (ZSTD_LEGACY_SUPPORT)
     SET(ZSTD_FILEIO_LEGACY ${PROGRAMS_LEGACY_DIR}/fileio_legacy.c)
 ENDIF (ZSTD_LEGACY_SUPPORT)
 
-ADD_EXECUTABLE(zstd ${PROGRAMS_DIR}/zstdcli.c ${PROGRAMS_DIR}/fileio.c ${PROGRAMS_DIR}/bench.c ${PROGRAMS_DIR}/xxhash.c ${PROGRAMS_DIR}/datagen.c ${ZSTD_FILEIO_LEGACY})
+ADD_EXECUTABLE(zstd ${PROGRAMS_DIR}/zstdcli.c ${PROGRAMS_DIR}/fileio.c ${PROGRAMS_DIR}/bench.c ${PROGRAMS_DIR}/xxhash.c ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/dibio.c ${ZSTD_FILEIO_LEGACY})
 TARGET_LINK_LIBRARIES(zstd libzstd_static)
 
 ADD_EXECUTABLE(fullbench ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/fullbench.c)
@@ -69,9 +68,9 @@ ADD_EXECUTABLE(fuzzer ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/xxhash.c ${PROGR
 TARGET_LINK_LIBRARIES(fuzzer libzstd_static)
 
 IF (UNIX)
-    ADD_EXECUTABLE(zstd-noBench ${PROGRAMS_DIR}/zstdcli.c ${PROGRAMS_DIR}/fileio.c ${ZSTD_FILEIO_LEGACY})
-    TARGET_LINK_LIBRARIES(zstd-noBench libzstd_static)
-    SET_TARGET_PROPERTIES(zstd-noBench PROPERTIES COMPILE_DEFINITIONS "ZSTD_NOBENCH")
+    ADD_EXECUTABLE(zstd-frugal ${PROGRAMS_DIR}/zstdcli.c ${PROGRAMS_DIR}/fileio.c)
+    TARGET_LINK_LIBRARIES(zstd-frugal libzstd_static)
+    SET_TARGET_PROPERTIES(zstd-frugal PROPERTIES COMPILE_DEFINITIONS "ZSTD_NOBENCH;ZSTD_NODICT")
 
     ADD_EXECUTABLE(zbufftest ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/xxhash.c ${PROGRAMS_DIR}/zbufftest.c)
     TARGET_LINK_LIBRARIES(zbufftest libzstd_static)
diff --git a/images/CSpeed.png b/images/CSpeed.png
deleted file mode 100644
index 5ba0561..0000000
Binary files a/images/CSpeed.png and /dev/null differ
diff --git a/images/Cspeed4.png b/images/Cspeed4.png
new file mode 100644
index 0000000..843e5eb
Binary files /dev/null and b/images/Cspeed4.png differ
diff --git a/images/DCspeed5.png b/images/DCspeed5.png
new file mode 100644
index 0000000..db5ef3c
Binary files /dev/null and b/images/DCspeed5.png differ
diff --git a/images/DSpeed.png b/images/DSpeed.png
deleted file mode 100644
index 1cd4713..0000000
Binary files a/images/DSpeed.png and /dev/null differ
diff --git a/images/Dspeed4.png b/images/Dspeed4.png
new file mode 100644
index 0000000..107e26c
Binary files /dev/null and b/images/Dspeed4.png differ
diff --git a/lib/Makefile b/lib/Makefile
index a7172b6..6bdf2f8 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -1,6 +1,6 @@
 # ################################################################
 # ZSTD library - Makefile
-# Copyright (C) Yann Collet 2015
+# Copyright (C) Yann Collet 2015-2016
 # All rights reserved.
 # 
 # BSD license
@@ -28,7 +28,6 @@
 # 
 # You can contact the author at :
 #  - ZSTD homepage : http://www.zstd.net
-#  - ZSTD source repository : https://github.com/Cyan4973/zstd
 # ################################################################
 
 # Version numbers
@@ -52,8 +51,8 @@ FLAGS   = $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) $(MOREFLAGS)
 LIBDIR ?= $(PREFIX)/lib
 INCLUDEDIR=$(PREFIX)/include
 
-ZSTD_FILES := zstd_compress.c zstd_decompress.c fse.c huff0.c
-ZSTD_LEGACY:= legacy/zstd_v01.c legacy/zstd_v02.c legacy/zstd_v03.c
+ZSTD_FILES := zstd_compress.c zstd_decompress.c fse.c huff0.c zdict.c divsufsort.c
+ZSTD_LEGACY:= legacy/zstd_v01.c legacy/zstd_v02.c legacy/zstd_v03.c legacy/zstd_v04.c
 
 ifeq ($(ZSTD_LEGACY_SUPPORT), 0)
 CPPFLAGS  += -DZSTD_LEGACY_SUPPORT=0
@@ -119,6 +118,8 @@ install: libzstd libzstd.pc
 	@cp -a libzstd.pc $(DESTDIR)$(LIBDIR)/pkgconfig/
 	@install -m 644 libzstd.a $(DESTDIR)$(LIBDIR)/libzstd.a
 	@install -m 644 zstd.h $(DESTDIR)$(INCLUDEDIR)/zstd.h
+	@install -m 644 zstd.h $(DESTDIR)$(INCLUDEDIR)/zbuff.h
+	@install -m 644 zstd.h $(DESTDIR)$(INCLUDEDIR)/zdict.h
 	@echo zstd static and shared library installed
 
 uninstall:
@@ -128,6 +129,8 @@ uninstall:
 	@[ -x $(DESTDIR)$(LIBDIR)/libzstd.$(SHARED_EXT_VER) ] && rm -f $(DESTDIR)$(LIBDIR)/libzstd.$(SHARED_EXT_VER)
 	@[ -f $(DESTDIR)$(LIBDIR)/libzstd.a ] && rm -f $(DESTDIR)$(LIBDIR)/libzstd.a
 	@[ -f $(DESTDIR)$(INCLUDEDIR)/zstd.h ] && rm -f $(DESTDIR)$(INCLUDEDIR)/zstd.h
+	@[ -f $(DESTDIR)$(INCLUDEDIR)/zstd.h ] && rm -f $(DESTDIR)$(INCLUDEDIR)/zbuff.h
+	@[ -f $(DESTDIR)$(INCLUDEDIR)/zstd.h ] && rm -f $(DESTDIR)$(INCLUDEDIR)/zdict.h
 	@echo zstd libraries successfully uninstalled
 
 endif
diff --git a/lib/README.md b/lib/README.md
new file mode 100644
index 0000000..a044554
--- /dev/null
+++ b/lib/README.md
@@ -0,0 +1,56 @@
+zstd - library files
+================================
+
+The __lib__ directory contains several files, but depending on target use case, some of them may not be necessary.
+
+#### Minimal library files
+
+##### Shared resources
+
+- [mem.h](mem.h)
+- [error_private.h](error_private.h)
+- [error_public.h](error_public.h)
+
+##### zstd core compression
+
+- [bitstream.h](bitstream.h)
+- fse.c
+- fse.h
+- fse_static.h
+- huff0.c
+- huff0.h
+- huff0_static.h
+- zstd_compress.c
+- zstd_decompress.c
+- zstd_internal.h
+- zstd_opt.h
+- zstd.h
+- zstd_static.h
+
+#### Buffered streaming
+
+This complementary API makes streaming integration easier.
+It is used by `zstd` command line utility :
+
+- zbuff.c
+- zbuff.h
+- zbuff_static.h
+
+#### Dictionary builder
+
+To create dictionaries from training sets :
+
+- divsufsort.c
+- divsufsort.h
+- zdict.c
+- zdict.h
+- zdict_static.h
+
+#### Miscellaneous
+
+The other files are not source code. They are :
+
+ - LICENSE : contains the BSD license text
+ - Makefile : script to compile or install zstd library (static or dynamic)
+ - libzstd.pc.in : for pkg-config (make install)
+
diff --git a/lib/bitstream.h b/lib/bitstream.h
index fbd0f3f..e0487e8 100644
--- a/lib/bitstream.h
+++ b/lib/bitstream.h
@@ -1,8 +1,8 @@
 /* ******************************************************************
    bitstream
-   Part of NewGen Entropy library
+   Part of FSE library
    header file (to include)
-   Copyright (C) 2013-2015, Yann Collet.
+   Copyright (C) 2013-2016, Yann Collet.
 
    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
@@ -31,7 +31,6 @@
 
    You can contact the author at :
    - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
-   - Public forum : https://groups.google.com/forum/#!forum/lz4c
 ****************************************************************** */
 #ifndef BITSTREAM_H_MODULE
 #define BITSTREAM_H_MODULE
@@ -47,17 +46,17 @@ extern "C" {
 *  these functions are defined into a .h to be included.
 */
 
-/******************************************
-*  Includes
+/*-****************************************
+*  Dependencies
 ******************************************/
 #include "mem.h"            /* unaligned access routines */
 #include "error_private.h"  /* error codes and messages */
 
 
-/********************************************
-*  bitStream compression API (write forward)
+/*-******************************************
+*  bitStream encoding API (write forward)
 ********************************************/
-/*
+/*!
 * bitStream can mix input from multiple sources.
 * A critical property of these streams is that they encode and decode in **reverse** direction.
 * So the first bit sequence you add will be the last to be read, like a LIFO stack.
@@ -71,32 +70,32 @@ typedef struct
     char*  endPtr;
 } BIT_CStream_t;
 
-MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* dstBuffer, size_t maxDstSize);
+MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* dstBuffer, size_t dstCapacity);
 MEM_STATIC void   BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
 MEM_STATIC void   BIT_flushBits(BIT_CStream_t* bitC);
 MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC);
 
-/*
-* Start by initCStream, providing the maximum size of write buffer to write into.
+/*!
+* Start by initCStream, providing the size of buffer to write into.
 * bitStream will never write outside of this buffer.
-* buffer must be at least as large as a size_t, otherwise function result will be an error code.
+* @dstCapacity must be >= sizeof(size_t), otherwise @return will be an error code.
 *
 * bits are first added to a local register.
 * Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems.
-* Writing data into memory is a manual operation, performed by the flushBits function.
+* Writing data into memory is an explicit operation, performed by the flushBits function.
 * Hence keep track how many bits are potentially stored into local register to avoid register overflow.
 * After a flushBits, a maximum of 7 bits might still be stored into local register.
 *
-* Avoid storing elements of more than 25 bits if you want compatibility with 32-bits bitstream readers.
+* Avoid storing elements of more than 24 bits if you want compatibility with 32-bits bitstream readers.
 *
 * Last operation is to close the bitStream.
 * The function returns the final size of CStream in bytes.
-* If data couldn't fit into dstBuffer, it will return a 0 ( == not storable)
+* If data couldn't fit into @dstBuffer, it will return a 0 ( == not storable)
 */
 
 
-/**********************************************
-*  bitStream decompression API (read backward)
+/*-********************************************
+*  bitStream decoding API (read backward)
 **********************************************/
 typedef struct
 {
@@ -118,19 +117,19 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD);
 MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD);
 
 
-/*
+/*!
 * Start by invoking BIT_initDStream().
 * A chunk of the bitStream is then stored into a local register.
 * Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
 * You can then retrieve bitFields stored into the local register, **in reverse order**.
-* Local register is manually filled from memory by the BIT_reloadDStream() method.
+* Local register is explicitly reloaded from memory by the BIT_reloadDStream() method.
 * A reload guarantee a minimum of ((8*sizeof(size_t))-7) bits when its result is BIT_DStream_unfinished.
 * Otherwise, it can be less than that, so proceed accordingly.
 * Checking if DStream has reached its end can be performed with BIT_endOfDStream()
 */
 
 
-/******************************************
+/*-****************************************
 *  unsafe API
 ******************************************/
 MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
@@ -144,7 +143,7 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
 
 
 
-/****************************************************************
+/*-**************************************************************
 *  Helper functions
 ****************************************************************/
 MEM_STATIC unsigned BIT_highbit32 (register U32 val)
@@ -170,10 +169,9 @@ MEM_STATIC unsigned BIT_highbit32 (register U32 val)
 }
 
 
-/****************************************************************
+/*-**************************************************************
 *  bitStream encoding
 ****************************************************************/
-
 MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* startPtr, size_t maxSize)
 {
     bitC->bitContainer = 0;
@@ -240,10 +238,9 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC)
 }
 
 
-/**********************************************************
+/*-********************************************************
 * bitStream decoding
 **********************************************************/
-
 /*!BIT_initDStream
 *  Initialize a BIT_DStream_t.
 *  @bitD : a pointer to an already allocated BIT_DStream_t structure
@@ -255,8 +252,7 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si
 {
     if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }
 
-    if (srcSize >=  sizeof(size_t))   /* normal case */
-    {
+    if (srcSize >=  sizeof(size_t)) {  /* normal case */
         U32 contain32;
         bitD->start = (const char*)srcBuffer;
         bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(size_t);
@@ -264,9 +260,7 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si
         contain32 = ((const BYTE*)srcBuffer)[srcSize-1];
         if (contain32 == 0) return ERROR(GENERIC);   /* endMark not present */
         bitD->bitsConsumed = 8 - BIT_highbit32(contain32);
-    }
-    else
-    {
+    } else {
         U32 contain32;
         bitD->start = (const char*)srcBuffer;
         bitD->ptr   = bitD->start;
@@ -342,23 +336,20 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
 	if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should never happen */
 		return BIT_DStream_overflow;
 
-    if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer))
-    {
+    if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer)) {
         bitD->ptr -= bitD->bitsConsumed >> 3;
         bitD->bitsConsumed &= 7;
         bitD->bitContainer = MEM_readLEST(bitD->ptr);
         return BIT_DStream_unfinished;
     }
-    if (bitD->ptr == bitD->start)
-    {
+    if (bitD->ptr == bitD->start) {
         if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;
         return BIT_DStream_completed;
     }
     {
         U32 nbBytes = bitD->bitsConsumed >> 3;
         BIT_DStream_status result = BIT_DStream_unfinished;
-        if (bitD->ptr - nbBytes < bitD->start)
-        {
+        if (bitD->ptr - nbBytes < bitD->start) {
             nbBytes = (U32)(bitD->ptr - bitD->start);  /* ptr > start */
             result = BIT_DStream_endOfBuffer;
         }
diff --git a/lib/divsufsort.c b/lib/divsufsort.c
new file mode 100644
index 0000000..60cceb0
--- /dev/null
+++ b/lib/divsufsort.c
@@ -0,0 +1,1913 @@
+/*
+ * divsufsort.c for libdivsufsort-lite
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*- Compiler specifics -*/
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wshorten-64-to-32"
+#endif
+
+#if defined(_MSC_VER)
+#  pragma warning(disable : 4244)    /* C4244 : Conversion, possible loss of data */
+#  pragma warning(disable : 4127)    /* C4127 : Condition expression is constant */
+#endif
+
+
+/*- Dependencies -*/
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "divsufsort.h"
+
+/*- Constants -*/
+#if defined(INLINE)
+# undef INLINE
+#endif
+#if !defined(INLINE)
+# define INLINE __inline
+#endif
+#if defined(ALPHABET_SIZE) && (ALPHABET_SIZE < 1)
+# undef ALPHABET_SIZE
+#endif
+#if !defined(ALPHABET_SIZE)
+# define ALPHABET_SIZE (256)
+#endif
+#define BUCKET_A_SIZE (ALPHABET_SIZE)
+#define BUCKET_B_SIZE (ALPHABET_SIZE * ALPHABET_SIZE)
+#if defined(SS_INSERTIONSORT_THRESHOLD)
+# if SS_INSERTIONSORT_THRESHOLD < 1
+#  undef SS_INSERTIONSORT_THRESHOLD
+#  define SS_INSERTIONSORT_THRESHOLD (1)
+# endif
+#else
+# define SS_INSERTIONSORT_THRESHOLD (8)
+#endif
+#if defined(SS_BLOCKSIZE)
+# if SS_BLOCKSIZE < 0
+#  undef SS_BLOCKSIZE
+#  define SS_BLOCKSIZE (0)
+# elif 32768 <= SS_BLOCKSIZE
+#  undef SS_BLOCKSIZE
+#  define SS_BLOCKSIZE (32767)
+# endif
+#else
+# define SS_BLOCKSIZE (1024)
+#endif
+/* Minimum stack size for ss_mintrosort: log(SS_BLOCKSIZE) / log(3) * 2 */
+#if SS_BLOCKSIZE == 0
+# define SS_MISORT_STACKSIZE (96)
+#elif SS_BLOCKSIZE <= 4096
+# define SS_MISORT_STACKSIZE (16)
+#else
+# define SS_MISORT_STACKSIZE (24)
+#endif
+#define SS_SMERGE_STACKSIZE (32)
+#define TR_INSERTIONSORT_THRESHOLD (8)
+#define TR_STACKSIZE (64)
+
+
+/*- Macros -*/
+#ifndef SWAP
+# define SWAP(_a, _b) do { t = (_a); (_a) = (_b); (_b) = t; } while(0)
+#endif /* SWAP */
+#ifndef MIN
+# define MIN(_a, _b) (((_a) < (_b)) ? (_a) : (_b))
+#endif /* MIN */
+#ifndef MAX
+# define MAX(_a, _b) (((_a) > (_b)) ? (_a) : (_b))
+#endif /* MAX */
+#define STACK_PUSH(_a, _b, _c, _d)\
+  do {\
+    assert(ssize < STACK_SIZE);\
+    stack[ssize].a = (_a), stack[ssize].b = (_b),\
+    stack[ssize].c = (_c), stack[ssize++].d = (_d);\
+  } while(0)
+#define STACK_PUSH5(_a, _b, _c, _d, _e)\
+  do {\
+    assert(ssize < STACK_SIZE);\
+    stack[ssize].a = (_a), stack[ssize].b = (_b),\
+    stack[ssize].c = (_c), stack[ssize].d = (_d), stack[ssize++].e = (_e);\
+  } while(0)
+#define STACK_POP(_a, _b, _c, _d)\
+  do {\
+    assert(0 <= ssize);\
+    if(ssize == 0) { return; }\
+    (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\
+    (_c) = stack[ssize].c, (_d) = stack[ssize].d;\
+  } while(0)
+#define STACK_POP5(_a, _b, _c, _d, _e)\
+  do {\
+    assert(0 <= ssize);\
+    if(ssize == 0) { return; }\
+    (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\
+    (_c) = stack[ssize].c, (_d) = stack[ssize].d, (_e) = stack[ssize].e;\
+  } while(0)
+#define BUCKET_A(_c0) bucket_A[(_c0)]
+#if ALPHABET_SIZE == 256
+#define BUCKET_B(_c0, _c1) (bucket_B[((_c1) << 8) | (_c0)])
+#define BUCKET_BSTAR(_c0, _c1) (bucket_B[((_c0) << 8) | (_c1)])
+#else
+#define BUCKET_B(_c0, _c1) (bucket_B[(_c1) * ALPHABET_SIZE + (_c0)])
+#define BUCKET_BSTAR(_c0, _c1) (bucket_B[(_c0) * ALPHABET_SIZE + (_c1)])
+#endif
+
+
+/*- Private Functions -*/
+
+static const int lg_table[256]= {
+ -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+  5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
+};
+
+#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE)
+
+static INLINE
+int
+ss_ilg(int n) {
+#if SS_BLOCKSIZE == 0
+  return (n & 0xffff0000) ?
+          ((n & 0xff000000) ?
+            24 + lg_table[(n >> 24) & 0xff] :
+            16 + lg_table[(n >> 16) & 0xff]) :
+          ((n & 0x0000ff00) ?
+             8 + lg_table[(n >>  8) & 0xff] :
+             0 + lg_table[(n >>  0) & 0xff]);
+#elif SS_BLOCKSIZE < 256
+  return lg_table[n];
+#else
+  return (n & 0xff00) ?
+          8 + lg_table[(n >> 8) & 0xff] :
+          0 + lg_table[(n >> 0) & 0xff];
+#endif
+}
+
+#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */
+
+#if SS_BLOCKSIZE != 0
+
+static const int sqq_table[256] = {
+  0,  16,  22,  27,  32,  35,  39,  42,  45,  48,  50,  53,  55,  57,  59,  61,
+ 64,  65,  67,  69,  71,  73,  75,  76,  78,  80,  81,  83,  84,  86,  87,  89,
+ 90,  91,  93,  94,  96,  97,  98,  99, 101, 102, 103, 104, 106, 107, 108, 109,
+110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
+128, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
+143, 144, 144, 145, 146, 147, 148, 149, 150, 150, 151, 152, 153, 154, 155, 155,
+156, 157, 158, 159, 160, 160, 161, 162, 163, 163, 164, 165, 166, 167, 167, 168,
+169, 170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178, 178, 179, 180,
+181, 181, 182, 183, 183, 184, 185, 185, 186, 187, 187, 188, 189, 189, 190, 191,
+192, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 199, 199, 200, 201, 201,
+202, 203, 203, 204, 204, 205, 206, 206, 207, 208, 208, 209, 209, 210, 211, 211,
+212, 212, 213, 214, 214, 215, 215, 216, 217, 217, 218, 218, 219, 219, 220, 221,
+221, 222, 222, 223, 224, 224, 225, 225, 226, 226, 227, 227, 228, 229, 229, 230,
+230, 231, 231, 232, 232, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238,
+239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247,
+247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255
+};
+
+static INLINE
+int
+ss_isqrt(int x) {
+  int y, e;
+
+  if(x >= (SS_BLOCKSIZE * SS_BLOCKSIZE)) { return SS_BLOCKSIZE; }
+  e = (x & 0xffff0000) ?
+        ((x & 0xff000000) ?
+          24 + lg_table[(x >> 24) & 0xff] :
+          16 + lg_table[(x >> 16) & 0xff]) :
+        ((x & 0x0000ff00) ?
+           8 + lg_table[(x >>  8) & 0xff] :
+           0 + lg_table[(x >>  0) & 0xff]);
+
+  if(e >= 16) {
+    y = sqq_table[x >> ((e - 6) - (e & 1))] << ((e >> 1) - 7);
+    if(e >= 24) { y = (y + 1 + x / y) >> 1; }
+    y = (y + 1 + x / y) >> 1;
+  } else if(e >= 8) {
+    y = (sqq_table[x >> ((e - 6) - (e & 1))] >> (7 - (e >> 1))) + 1;
+  } else {
+    return sqq_table[x] >> 4;
+  }
+
+  return (x < (y * y)) ? y - 1 : y;
+}
+
+#endif /* SS_BLOCKSIZE != 0 */
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Compares two substrings from offset depth, each bounded by T + (next PA entry) + 2; returns <0, 0 or >0. */
+static INLINE
+int
+ss_compare(const unsigned char *T,
+           const int *p1, const int *p2,
+           int depth) {
+  const unsigned char *U1, *U2, *U1n, *U2n;
+
+  for(U1 = T + depth + *p1,
+      U2 = T + depth + *p2,
+      U1n = T + *(p1 + 1) + 2,
+      U2n = T + *(p2 + 1) + 2;
+      (U1 < U1n) && (U2 < U2n) && (*U1 == *U2);
+      ++U1, ++U2) {
+  }
+
+  return U1 < U1n ?
+        (U2 < U2n ? *U1 - *U2 : 1) :
+        (U2 < U2n ? -1 : 0);
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+#if (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1)
+
+/* Insertion sort (via ss_compare) for small groups; equal neighbours are marked by bitwise complement (~). */
+static
+void
+ss_insertionsort(const unsigned char *T, const int *PA,
+                 int *first, int *last, int depth) {
+  int *i, *j;
+  int t;
+  int r;
+
+  for(i = last - 2; first <= i; --i) {
+    for(t = *i, j = i + 1; 0 < (r = ss_compare(T, PA + t, PA + *j, depth));) {
+      do { *(j - 1) = *j; } while((++j < last) && (*j < 0));
+      if(last <= j) { break; }
+    }
+    if(r == 0) { *j = ~*j; }
+    *(j - 1) = t;
+  }
+}
+
+#endif /* (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1) */
+
+
+/*---------------------------------------------------------------------------*/
+
+#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE)
+
+static INLINE
+void
+ss_fixdown(const unsigned char *Td, const int *PA,
+           int *SA, int i, int size) {
+  int j, k;
+  int v;
+  int c, d, e;
+
+  for(v = SA[i], c = Td[PA[v]]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) {
+    d = Td[PA[SA[k = j++]]];
+    if(d < (e = Td[PA[SA[j]]])) { k = j; d = e; }
+    if(d <= c) { break; }
+  }
+  SA[i] = v;
+}
+
+/* Simple top-down heapsort. */
+static
+void
+ss_heapsort(const unsigned char *Td, const int *PA, int *SA, int size) {
+  int i, m;
+  int t;
+
+  m = size;
+  if((size % 2) == 0) {
+    m--;
+    if(Td[PA[SA[m / 2]]] < Td[PA[SA[m]]]) { SWAP(SA[m], SA[m / 2]); }
+  }
+
+  for(i = m / 2 - 1; 0 <= i; --i) { ss_fixdown(Td, PA, SA, i, m); }
+  if((size % 2) == 0) { SWAP(SA[0], SA[m]); ss_fixdown(Td, PA, SA, 0, m); }
+  for(i = m - 1; 0 < i; --i) {
+    t = SA[0], SA[0] = SA[i];
+    ss_fixdown(Td, PA, SA, 0, i);
+    SA[i] = t;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Returns the median of three elements. */
+static INLINE
+int *
+ss_median3(const unsigned char *Td, const int *PA,
+           int *v1, int *v2, int *v3) {
+  int *t;
+  if(Td[PA[*v1]] > Td[PA[*v2]]) { SWAP(v1, v2); }
+  if(Td[PA[*v2]] > Td[PA[*v3]]) {
+    if(Td[PA[*v1]] > Td[PA[*v3]]) { return v1; }
+    else { return v3; }
+  }
+  return v2;
+}
+
+/* Returns the median of five elements. */
+static INLINE
+int *
+ss_median5(const unsigned char *Td, const int *PA,
+           int *v1, int *v2, int *v3, int *v4, int *v5) {
+  int *t;
+  if(Td[PA[*v2]] > Td[PA[*v3]]) { SWAP(v2, v3); }
+  if(Td[PA[*v4]] > Td[PA[*v5]]) { SWAP(v4, v5); }
+  if(Td[PA[*v2]] > Td[PA[*v4]]) { SWAP(v2, v4); SWAP(v3, v5); }
+  if(Td[PA[*v1]] > Td[PA[*v3]]) { SWAP(v1, v3); }
+  if(Td[PA[*v1]] > Td[PA[*v4]]) { SWAP(v1, v4); SWAP(v3, v5); }
+  if(Td[PA[*v3]] > Td[PA[*v4]]) { return v4; }
+  return v3;
+}
+
+/* Returns the pivot element. */
+static INLINE
+int *
+ss_pivot(const unsigned char *Td, const int *PA, int *first, int *last) {
+  int *middle;
+  int t;
+
+  t = last - first;
+  middle = first + t / 2;
+
+  if(t <= 512) {
+    if(t <= 32) {
+      return ss_median3(Td, PA, first, middle, last - 1);
+    } else {
+      t >>= 2;
+      return ss_median5(Td, PA, first, first + t, middle, last - 1 - t, last - 1);
+    }
+  }
+  t >>= 3;
+  first  = ss_median3(Td, PA, first, first + t, first + (t << 1));
+  middle = ss_median3(Td, PA, middle - t, middle, middle + t);
+  last   = ss_median3(Td, PA, last - 1 - (t << 1), last - 1 - t, last - 1);
+  return ss_median3(Td, PA, first, middle, last);
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Binary partition for substrings. */
+static INLINE
+int *
+ss_partition(const int *PA,
+                    int *first, int *last, int depth) {
+  int *a, *b;
+  int t;
+  for(a = first - 1, b = last;;) {
+    for(; (++a < b) && ((PA[*a] + depth) >= (PA[*a + 1] + 1));) { *a = ~*a; }
+    for(; (a < --b) && ((PA[*b] + depth) <  (PA[*b + 1] + 1));) { }
+    if(b <= a) { break; }
+    t = ~*b;
+    *b = *a;
+    *a = t;
+  }
+  if(first < a) { *first = ~*first; }
+  return a;
+}
+
+/* Multikey introsort for medium size groups. */
+static
+void
+ss_mintrosort(const unsigned char *T, const int *PA,
+              int *first, int *last,
+              int depth) {
+#define STACK_SIZE SS_MISORT_STACKSIZE
+  struct { int *a, *b, c; int d; } stack[STACK_SIZE];
+  const unsigned char *Td;
+  int *a, *b, *c, *d, *e, *f;
+  int s, t;
+  int ssize;
+  int limit;
+  int v, x = 0;
+
+  for(ssize = 0, limit = ss_ilg(last - first);;) {
+
+    if((last - first) <= SS_INSERTIONSORT_THRESHOLD) {
+#if 1 < SS_INSERTIONSORT_THRESHOLD
+      if(1 < (last - first)) { ss_insertionsort(T, PA, first, last, depth); }
+#endif
+      STACK_POP(first, last, depth, limit);
+      continue;
+    }
+
+    Td = T + depth;
+    if(limit-- == 0) { ss_heapsort(Td, PA, first, last - first); }
+    if(limit < 0) {
+      for(a = first + 1, v = Td[PA[*first]]; a < last; ++a) {
+        if((x = Td[PA[*a]]) != v) {
+          if(1 < (a - first)) { break; }
+          v = x;
+          first = a;
+        }
+      }
+      if(Td[PA[*first] - 1] < v) {
+        first = ss_partition(PA, first, a, depth);
+      }
+      if((a - first) <= (last - a)) {
+        if(1 < (a - first)) {
+          STACK_PUSH(a, last, depth, -1);
+          last = a, depth += 1, limit = ss_ilg(a - first);
+        } else {
+          first = a, limit = -1;
+        }
+      } else {
+        if(1 < (last - a)) {
+          STACK_PUSH(first, a, depth + 1, ss_ilg(a - first));
+          first = a, limit = -1;
+        } else {
+          last = a, depth += 1, limit = ss_ilg(a - first);
+        }
+      }
+      continue;
+    }
+
+    /* choose pivot */
+    a = ss_pivot(Td, PA, first, last);
+    v = Td[PA[*a]];
+    SWAP(*first, *a);
+
+    /* partition */
+    for(b = first; (++b < last) && ((x = Td[PA[*b]]) == v);) { }
+    if(((a = b) < last) && (x < v)) {
+      for(; (++b < last) && ((x = Td[PA[*b]]) <= v);) {
+        if(x == v) { SWAP(*b, *a); ++a; }
+      }
+    }
+    for(c = last; (b < --c) && ((x = Td[PA[*c]]) == v);) { }
+    if((b < (d = c)) && (x > v)) {
+      for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) {
+        if(x == v) { SWAP(*c, *d); --d; }
+      }
+    }
+    for(; b < c;) {
+      SWAP(*b, *c);
+      for(; (++b < c) && ((x = Td[PA[*b]]) <= v);) {
+        if(x == v) { SWAP(*b, *a); ++a; }
+      }
+      for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) {
+        if(x == v) { SWAP(*c, *d); --d; }
+      }
+    }
+
+    if(a <= d) {
+      c = b - 1;
+
+      if((s = a - first) > (t = b - a)) { s = t; }
+      for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+      if((s = d - c) > (t = last - d - 1)) { s = t; }
+      for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+
+      a = first + (b - a), c = last - (d - c);
+      b = (v <= Td[PA[*a] - 1]) ? a : ss_partition(PA, a, c, depth);
+
+      if((a - first) <= (last - c)) {
+        if((last - c) <= (c - b)) {
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          STACK_PUSH(c, last, depth, limit);
+          last = a;
+        } else if((a - first) <= (c - b)) {
+          STACK_PUSH(c, last, depth, limit);
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          last = a;
+        } else {
+          STACK_PUSH(c, last, depth, limit);
+          STACK_PUSH(first, a, depth, limit);
+          first = b, last = c, depth += 1, limit = ss_ilg(c - b);
+        }
+      } else {
+        if((a - first) <= (c - b)) {
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          STACK_PUSH(first, a, depth, limit);
+          first = c;
+        } else if((last - c) <= (c - b)) {
+          STACK_PUSH(first, a, depth, limit);
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          first = c;
+        } else {
+          STACK_PUSH(first, a, depth, limit);
+          STACK_PUSH(c, last, depth, limit);
+          first = b, last = c, depth += 1, limit = ss_ilg(c - b);
+        }
+      }
+    } else {
+      limit += 1;
+      if(Td[PA[*first] - 1] < v) {
+        first = ss_partition(PA, first, last, depth);
+        limit = ss_ilg(last - first);
+      }
+      depth += 1;
+    }
+  }
+#undef STACK_SIZE
+}
+
+#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */
+
+
+/*---------------------------------------------------------------------------*/
+
+#if SS_BLOCKSIZE != 0
+
+static INLINE
+void
+ss_blockswap(int *a, int *b, int n) {
+  int t;
+  for(; 0 < n; --n, ++a, ++b) {
+    t = *a, *a = *b, *b = t;
+  }
+}
+
+static INLINE
+void
+ss_rotate(int *first, int *middle, int *last) {
+  int *a, *b, t;
+  int l, r;
+  l = middle - first, r = last - middle;
+  for(; (0 < l) && (0 < r);) {
+    if(l == r) { ss_blockswap(first, middle, l); break; }
+    if(l < r) {
+      a = last - 1, b = middle - 1;
+      t = *a;
+      do {
+        *a-- = *b, *b-- = *a;
+        if(b < first) {
+          *a = t;
+          last = a;
+          if((r -= l + 1) <= l) { break; }
+          a -= 1, b = middle - 1;
+          t = *a;
+        }
+      } while(1);
+    } else {
+      a = first, b = middle;
+      t = *a;
+      do {
+        *a++ = *b, *b++ = *a;
+        if(last <= b) {
+          *a = t;
+          first = a + 1;
+          if((l -= r + 1) <= r) { break; }
+          a += 1, b = middle;
+          t = *a;
+        }
+      } while(1);
+    }
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+static
+void
+ss_inplacemerge(const unsigned char *T, const int *PA,
+                int *first, int *middle, int *last,
+                int depth) {
+  const int *p;
+  int *a, *b;
+  int len, half;
+  int q, r;
+  int x;
+
+  for(;;) {
+    if(*(last - 1) < 0) { x = 1; p = PA + ~*(last - 1); }
+    else                { x = 0; p = PA +  *(last - 1); }
+    for(a = first, len = middle - first, half = len >> 1, r = -1;
+        0 < len;
+        len = half, half >>= 1) {
+      b = a + half;
+      q = ss_compare(T, PA + ((0 <= *b) ? *b : ~*b), p, depth);
+      if(q < 0) {
+        a = b + 1;
+        half -= (len & 1) ^ 1;
+      } else {
+        r = q;
+      }
+    }
+    if(a < middle) {
+      if(r == 0) { *a = ~*a; }
+      ss_rotate(a, middle, last);
+      last -= middle - a;
+      middle = a;
+      if(first == middle) { break; }
+    }
+    --last;
+    if(x != 0) { while(*--last < 0) { } }
+    if(middle == last) { break; }
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Merge-forward with internal buffer. */
+static
+void
+ss_mergeforward(const unsigned char *T, const int *PA,
+                int *first, int *middle, int *last,
+                int *buf, int depth) {
+  int *a, *b, *c, *bufend;
+  int t;
+  int r;
+
+  bufend = buf + (middle - first) - 1;
+  ss_blockswap(buf, first, middle - first);
+
+  for(t = *(a = first), b = buf, c = middle;;) {
+    r = ss_compare(T, PA + *b, PA + *c, depth);
+    if(r < 0) {
+      do {
+        *a++ = *b;
+        if(bufend <= b) { *bufend = t; return; }
+        *b++ = *a;
+      } while(*b < 0);
+    } else if(r > 0) {
+      do {
+        *a++ = *c, *c++ = *a;
+        if(last <= c) {
+          while(b < bufend) { *a++ = *b, *b++ = *a; }
+          *a = *b, *b = t;
+          return;
+        }
+      } while(*c < 0);
+    } else {
+      *c = ~*c;
+      do {
+        *a++ = *b;
+        if(bufend <= b) { *bufend = t; return; }
+        *b++ = *a;
+      } while(*b < 0);
+
+      do {
+        *a++ = *c, *c++ = *a;
+        if(last <= c) {
+          while(b < bufend) { *a++ = *b, *b++ = *a; }
+          *a = *b, *b = t;
+          return;
+        }
+      } while(*c < 0);
+    }
+  }
+}
+
+/* Merge-backward with internal buffer. */
+static
+void
+ss_mergebackward(const unsigned char *T, const int *PA,
+                 int *first, int *middle, int *last,
+                 int *buf, int depth) {
+  const int *p1, *p2;
+  int *a, *b, *c, *bufend;
+  int t;
+  int r;
+  int x;
+
+  bufend = buf + (last - middle) - 1;
+  ss_blockswap(buf, middle, last - middle);
+
+  x = 0;
+  if(*bufend < 0)       { p1 = PA + ~*bufend; x |= 1; }
+  else                  { p1 = PA +  *bufend; }
+  if(*(middle - 1) < 0) { p2 = PA + ~*(middle - 1); x |= 2; }
+  else                  { p2 = PA +  *(middle - 1); }
+  for(t = *(a = last - 1), b = bufend, c = middle - 1;;) {
+    r = ss_compare(T, p1, p2, depth);
+    if(0 < r) {
+      if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; }
+      *a-- = *b;
+      if(b <= buf) { *buf = t; break; }
+      *b-- = *a;
+      if(*b < 0) { p1 = PA + ~*b; x |= 1; }
+      else       { p1 = PA +  *b; }
+    } else if(r < 0) {
+      if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; }
+      *a-- = *c, *c-- = *a;
+      if(c < first) {
+        while(buf < b) { *a-- = *b, *b-- = *a; }
+        *a = *b, *b = t;
+        break;
+      }
+      if(*c < 0) { p2 = PA + ~*c; x |= 2; }
+      else       { p2 = PA +  *c; }
+    } else {
+      if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; }
+      *a-- = ~*b;
+      if(b <= buf) { *buf = t; break; }
+      *b-- = *a;
+      if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; }
+      *a-- = *c, *c-- = *a;
+      if(c < first) {
+        while(buf < b) { *a-- = *b, *b-- = *a; }
+        *a = *b, *b = t;
+        break;
+      }
+      if(*b < 0) { p1 = PA + ~*b; x |= 1; }
+      else       { p1 = PA +  *b; }
+      if(*c < 0) { p2 = PA + ~*c; x |= 2; }
+      else       { p2 = PA +  *c; }
+    }
+  }
+}
+
+/* Divide-and-conquer based merge. */
+static
+void
+ss_swapmerge(const unsigned char *T, const int *PA,
+             int *first, int *middle, int *last,
+             int *buf, int bufsize, int depth) {
+#define STACK_SIZE SS_SMERGE_STACKSIZE
+#define GETIDX(a) ((0 <= (a)) ? (a) : (~(a)))
+#define MERGE_CHECK(a, b, c)\
+  do {\
+    if(((c) & 1) ||\
+       (((c) & 2) && (ss_compare(T, PA + GETIDX(*((a) - 1)), PA + *(a), depth) == 0))) {\
+      *(a) = ~*(a);\
+    }\
+    if(((c) & 4) && ((ss_compare(T, PA + GETIDX(*((b) - 1)), PA + *(b), depth) == 0))) {\
+      *(b) = ~*(b);\
+    }\
+  } while(0)
+  struct { int *a, *b, *c; int d; } stack[STACK_SIZE];
+  int *l, *r, *lm, *rm;
+  int m, len, half;
+  int ssize;
+  int check, next;
+
+  for(check = 0, ssize = 0;;) {
+    if((last - middle) <= bufsize) {
+      if((first < middle) && (middle < last)) {
+        ss_mergebackward(T, PA, first, middle, last, buf, depth);
+      }
+      MERGE_CHECK(first, last, check);
+      STACK_POP(first, middle, last, check);
+      continue;
+    }
+
+    if((middle - first) <= bufsize) {
+      if(first < middle) {
+        ss_mergeforward(T, PA, first, middle, last, buf, depth);
+      }
+      MERGE_CHECK(first, last, check);
+      STACK_POP(first, middle, last, check);
+      continue;
+    }
+
+    for(m = 0, len = MIN(middle - first, last - middle), half = len >> 1;
+        0 < len;
+        len = half, half >>= 1) {
+      if(ss_compare(T, PA + GETIDX(*(middle + m + half)),
+                       PA + GETIDX(*(middle - m - half - 1)), depth) < 0) {
+        m += half + 1;
+        half -= (len & 1) ^ 1;
+      }
+    }
+
+    if(0 < m) {
+      lm = middle - m, rm = middle + m;
+      ss_blockswap(lm, middle, m);
+      l = r = middle, next = 0;
+      if(rm < last) {
+        if(*rm < 0) {
+          *rm = ~*rm;
+          if(first < lm) { for(; *--l < 0;) { } next |= 4; }
+          next |= 1;
+        } else if(first < lm) {
+          for(; *r < 0; ++r) { }
+          next |= 2;
+        }
+      }
+
+      if((l - first) <= (last - r)) {
+        STACK_PUSH(r, rm, last, (next & 3) | (check & 4));
+        middle = lm, last = l, check = (check & 3) | (next & 4);
+      } else {
+        if((next & 2) && (r == middle)) { next ^= 6; }
+        STACK_PUSH(first, lm, l, (check & 3) | (next & 4));
+        first = r, middle = rm, check = (next & 3) | (check & 4);
+      }
+    } else {
+      if(ss_compare(T, PA + GETIDX(*(middle - 1)), PA + *middle, depth) == 0) {
+        *middle = ~*middle;
+      }
+      MERGE_CHECK(first, last, check);
+      STACK_POP(first, middle, last, check);
+    }
+  }
+#undef STACK_SIZE
+}
+
+#endif /* SS_BLOCKSIZE != 0 */
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Substring sort */
+static
+void
+sssort(const unsigned char *T, const int *PA,
+       int *first, int *last,
+       int *buf, int bufsize,
+       int depth, int n, int lastsuffix) {
+  int *a;
+#if SS_BLOCKSIZE != 0
+  int *b, *middle, *curbuf;
+  int j, k, curbufsize, limit;
+#endif
+  int i;
+
+  if(lastsuffix != 0) { ++first; }
+
+#if SS_BLOCKSIZE == 0
+  ss_mintrosort(T, PA, first, last, depth);
+#else
+  if((bufsize < SS_BLOCKSIZE) &&
+      (bufsize < (last - first)) &&
+      (bufsize < (limit = ss_isqrt(last - first)))) {
+    if(SS_BLOCKSIZE < limit) { limit = SS_BLOCKSIZE; }
+    buf = middle = last - limit, bufsize = limit;
+  } else {
+    middle = last, limit = 0;
+  }
+  for(a = first, i = 0; SS_BLOCKSIZE < (middle - a); a += SS_BLOCKSIZE, ++i) {
+#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
+    ss_mintrosort(T, PA, a, a + SS_BLOCKSIZE, depth);
+#elif 1 < SS_BLOCKSIZE
+    ss_insertionsort(T, PA, a, a + SS_BLOCKSIZE, depth);
+#endif
+    curbufsize = last - (a + SS_BLOCKSIZE);
+    curbuf = a + SS_BLOCKSIZE;
+    if(curbufsize <= bufsize) { curbufsize = bufsize, curbuf = buf; }
+    for(b = a, k = SS_BLOCKSIZE, j = i; j & 1; b -= k, k <<= 1, j >>= 1) {
+      ss_swapmerge(T, PA, b - k, b, b + k, curbuf, curbufsize, depth);
+    }
+  }
+#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
+  ss_mintrosort(T, PA, a, middle, depth);
+#elif 1 < SS_BLOCKSIZE
+  ss_insertionsort(T, PA, a, middle, depth);
+#endif
+  for(k = SS_BLOCKSIZE; i != 0; k <<= 1, i >>= 1) {
+    if(i & 1) {
+      ss_swapmerge(T, PA, a - k, a, middle, buf, bufsize, depth);
+      a -= k;
+    }
+  }
+  if(limit != 0) {
+#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
+    ss_mintrosort(T, PA, middle, last, depth);
+#elif 1 < SS_BLOCKSIZE
+    ss_insertionsort(T, PA, middle, last, depth);
+#endif
+    ss_inplacemerge(T, PA, first, middle, last, depth);
+  }
+#endif
+
+  if(lastsuffix != 0) {
+    /* Insert last type B* suffix. */
+    int PAi[2]; PAi[0] = PA[*(first - 1)], PAi[1] = n - 2;
+    for(a = first, i = *(first - 1);
+        (a < last) && ((*a < 0) || (0 < ss_compare(T, &(PAi[0]), PA + *a, depth)));
+        ++a) {
+      *(a - 1) = *a;
+    }
+    *(a - 1) = i;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+static INLINE
+int
+tr_ilg(int n) {
+  return (n & 0xffff0000) ?
+          ((n & 0xff000000) ?
+            24 + lg_table[(n >> 24) & 0xff] :
+            16 + lg_table[(n >> 16) & 0xff]) :
+          ((n & 0x0000ff00) ?
+             8 + lg_table[(n >>  8) & 0xff] :
+             0 + lg_table[(n >>  0) & 0xff]);
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Simple insertion sort for small groups, ordered by ISAd[]; equal neighbours are marked by bitwise complement (~). */
+static
+void
+tr_insertionsort(const int *ISAd, int *first, int *last) {
+  int *a, *b;
+  int t, r;
+
+  for(a = first + 1; a < last; ++a) {
+    for(t = *a, b = a - 1; 0 > (r = ISAd[t] - ISAd[*b]);) {
+      do { *(b + 1) = *b; } while((first <= --b) && (*b < 0));
+      if(b < first) { break; }
+    }
+    if(r == 0) { *b = ~*b; }
+    *(b + 1) = t;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+static INLINE
+void
+tr_fixdown(const int *ISAd, int *SA, int i, int size) {
+  int j, k;
+  int v;
+  int c, d, e;
+
+  for(v = SA[i], c = ISAd[v]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) {
+    d = ISAd[SA[k = j++]];
+    if(d < (e = ISAd[SA[j]])) { k = j; d = e; }
+    if(d <= c) { break; }
+  }
+  SA[i] = v;
+}
+
+/* Simple top-down heapsort: orders SA[0..size) by ascending ISAd[SA[.]] using tr_fixdown. */
+static
+void
+tr_heapsort(const int *ISAd, int *SA, int size) {
+  int i, m;
+  int t;
+
+  m = size;
+  if((size % 2) == 0) { /* even size: heapify only SA[0..size-1) and treat SA[m] separately */
+    m--;
+    if(ISAd[SA[m / 2]] < ISAd[SA[m]]) { SWAP(SA[m], SA[m / 2]); }
+  }
+
+  for(i = m / 2 - 1; 0 <= i; --i) { tr_fixdown(ISAd, SA, i, m); } /* build the heap bottom-up */
+  if((size % 2) == 0) { SWAP(SA[0], SA[m]); tr_fixdown(ISAd, SA, 0, m); } /* swap the heap root with SA[m], then restore the heap */
+  for(i = m - 1; 0 < i; --i) { /* repeatedly move the root to SA[i] and shrink the heap */
+    t = SA[0], SA[0] = SA[i];
+    tr_fixdown(ISAd, SA, 0, i);
+    SA[i] = t;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Picks the pointer (v1, v2 or v3) whose ISAd key is the median of the three. */
+static INLINE
+int *
+tr_median3(const int *ISAd, int *v1, int *v2, int *v3) {
+  int *lo = v1, *hi = v2;
+  /* Order the first two candidates so that key(lo) <= key(hi). */
+  if(ISAd[*lo] > ISAd[*hi]) { lo = v2; hi = v1; }
+  /* v3 at or above hi: hi is the middle element. */
+  if(ISAd[*hi] <= ISAd[*v3]) { return hi; }
+  /* v3 below hi: the median is lo if it exceeds v3, otherwise v3. */
+  return (ISAd[*lo] > ISAd[*v3]) ? lo : v3;
+}
+
+/* Returns the pointer among v1..v5 whose ISAd key is the median; only the local pointer copies are swapped, never the pointed-to entries. */
+static INLINE
+int *
+tr_median5(const int *ISAd,
+           int *v1, int *v2, int *v3, int *v4, int *v5) {
+  int *t; /* presumably the scratch variable used by SWAP (macro defined outside this excerpt) */
+  if(ISAd[*v2] > ISAd[*v3]) { SWAP(v2, v3); }
+  if(ISAd[*v4] > ISAd[*v5]) { SWAP(v4, v5); }
+  if(ISAd[*v2] > ISAd[*v4]) { SWAP(v2, v4); SWAP(v3, v5); }
+  if(ISAd[*v1] > ISAd[*v3]) { SWAP(v1, v3); }
+  if(ISAd[*v1] > ISAd[*v4]) { SWAP(v1, v4); SWAP(v3, v5); }
+  if(ISAd[*v3] > ISAd[*v4]) { return v4; }
+  return v3;
+}
+
+/* Returns the pivot element: a pointer into [first, last) chosen by sampling ISAd keys; more samples are used for longer ranges. */
+static INLINE
+int *
+tr_pivot(const int *ISAd, int *first, int *last) {
+  int *middle;
+  int t;
+
+  t = last - first;
+  middle = first + t / 2;
+
+  if(t <= 512) {
+    if(t <= 32) {
+      return tr_median3(ISAd, first, middle, last - 1); /* up to 32 entries: median of first, middle and last entry */
+    } else {
+      t >>= 2;
+      return tr_median5(ISAd, first, first + t, middle, last - 1 - t, last - 1); /* 33..512 entries: median of five samples */
+    }
+  }
+  t >>= 3;
+  first  = tr_median3(ISAd, first, first + t, first + (t << 1));
+  middle = tr_median3(ISAd, middle - t, middle, middle + t);
+  last   = tr_median3(ISAd, last - 1 - (t << 1), last - 1 - t, last - 1);
+  return tr_median3(ISAd, first, middle, last); /* longer ranges: median of three medians-of-three */
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+typedef struct _trbudget_t trbudget_t;
+struct _trbudget_t {
+  int chance; /* refills still available (decremented by trbudget_check) */
+  int remain; /* budget left before the next refill is needed */
+  int incval; /* amount added to remain on each refill */
+  int count;  /* running total of the sizes that trbudget_check refused */
+};
+
+/* Initialises *budget: `chance` refills available, `incval` per refill (and as the starting allowance), count zeroed. */
+static INLINE void
+trbudget_init(trbudget_t *budget, int chance, int incval) {
+  budget->chance = chance; budget->remain = budget->incval = incval;
+  budget->count = 0; /* was left indeterminate here; zeroed so every field is defined after init */
+}
+
+/* Charges `size` against the budget. Returns 1 when accepted (spending one refill
+   if necessary) and 0 when refused, in which case size is added to budget->count. */
+static INLINE int
+trbudget_check(trbudget_t *budget, int size) {
+  if(budget->remain >= size) { budget->remain -= size; }
+  else if(budget->chance != 0) { budget->remain += budget->incval - size; budget->chance -= 1; }
+  else { budget->count += size; return 0; }
+  return 1;
+}
+
+
+/*---------------------------------------------------------------------------*/
+/* Three-way partition of [first, last) by ISAd key around v, scanning from middle; callers treat [*pa, *pb) as the key == v group. */
+static INLINE
+void
+tr_partition(const int *ISAd,
+             int *first, int *middle, int *last,
+             int **pa, int **pb, int v) {
+  int *a, *b, *c, *d, *e, *f;
+  int t, s;
+  int x = 0;
+
+  for(b = middle - 1; (++b < last) && ((x = ISAd[*b]) == v);) { } /* skip the leading run of keys equal to v */
+  if(((a = b) < last) && (x < v)) {
+    for(; (++b < last) && ((x = ISAd[*b]) <= v);) {
+      if(x == v) { SWAP(*b, *a); ++a; } /* park equal keys at the left end */
+    }
+  }
+  for(c = last; (b < --c) && ((x = ISAd[*c]) == v);) { } /* skip the trailing run of keys equal to v */
+  if((b < (d = c)) && (x > v)) {
+    for(; (b < --c) && ((x = ISAd[*c]) >= v);) {
+      if(x == v) { SWAP(*c, *d); --d; } /* park equal keys at the right end */
+    }
+  }
+  for(; b < c;) {
+    SWAP(*b, *c);
+    for(; (++b < c) && ((x = ISAd[*b]) <= v);) {
+      if(x == v) { SWAP(*b, *a); ++a; }
+    }
+    for(; (b < --c) && ((x = ISAd[*c]) >= v);) {
+      if(x == v) { SWAP(*c, *d); --d; }
+    }
+  }
+
+  if(a <= d) { /* some key differs from v: swap the parked equal runs from both ends into the middle */
+    c = b - 1;
+    if((s = a - first) > (t = b - a)) { s = t; }
+    for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+    if((s = d - c) > (t = last - d - 1)) { s = t; }
+    for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+    first += (b - a), last -= (d - c);
+  }
+  *pa = first, *pb = last;
+}
+
+static
+void
+tr_copy(int *ISA, const int *SA,
+        int *first, int *a, int *b, int *last,
+        int depth) {
+  /* Sort suffixes of the middle partition [a, b) by using the sorted order of
+     suffixes of the left and right partitions; each member gets its own rank. */
+  int *c, *d, *e;
+  int s, v;
+
+  v = b - SA - 1; /* rank value that tr_introsort assigns to the whole middle group */
+  for(c = first, d = a - 1; c <= d; ++c) { /* left-to-right scan: fills the middle partition from its front */
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *++d = s;
+      ISA[s] = d - SA;
+    }
+  }
+  for(c = last - 1, e = d + 1, d = b; e < d; --c) { /* right-to-left scan: fills the remaining slots from the back */
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *--d = s;
+      ISA[s] = d - SA;
+    }
+  }
+}
+
+static
+void
+tr_partialcopy(int *ISA, const int *SA,
+               int *first, int *a, int *b, int *last,
+               int depth) { /* like tr_copy, but consecutive placements with equal ISA[s + depth] share one rank */
+  int *c, *d, *e;
+  int s, v;
+  int rank, lastrank, newrank = -1;
+
+  v = b - SA - 1; /* rank value that tr_introsort assigns to the whole middle group */
+  lastrank = -1;
+  for(c = first, d = a - 1; c <= d; ++c) { /* left-to-right scan: fills the middle partition from its front */
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *++d = s;
+      rank = ISA[s + depth];
+      if(lastrank != rank) { lastrank = rank; newrank = d - SA; } /* new run: its rank is the current slot index */
+      ISA[s] = newrank;
+    }
+  }
+
+  lastrank = -1;
+  for(e = d; first <= e; --e) { /* backward pass over [first, d]: rewrites each run's rank to the index of its last slot */
+    rank = ISA[*e];
+    if(lastrank != rank) { lastrank = rank; newrank = e - SA; }
+    if(newrank != rank) { ISA[*e] = newrank; }
+  }
+
+  lastrank = -1;
+  for(c = last - 1, e = d + 1, d = b; e < d; --c) { /* right-to-left scan: fills the remaining slots from the back */
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *--d = s;
+      rank = ISA[s + depth];
+      if(lastrank != rank) { lastrank = rank; newrank = d - SA; }
+      ISA[s] = newrank;
+    }
+  }
+}
+
+static
+void
+tr_introsort(int *ISA, const int *ISAd,
+             int *SA, int *first, int *last,
+             trbudget_t *budget) { /* iterative introsort of [first, last) by ISAd key, driven by an explicit stack and a work budget */
+#define STACK_SIZE TR_STACKSIZE
+  struct { const int *a; int *b, *c; int d, e; }stack[STACK_SIZE];
+  int *a, *b, *c;
+  int t;
+  int v, x = 0;
+  int incr = ISAd - ISA; /* offset between the rank array and the key array currently in use */
+  int limit, next;
+  int ssize, trlink = -1;
+
+  for(ssize = 0, limit = tr_ilg(last - first);;) {
+
+    if(limit < 0) { /* a negative limit selects a phase: -1 partition, -2 copy, anything else (-3) rank update of a sorted range */
+      if(limit == -1) {
+        /* tandem repeat partition */
+        tr_partition(ISAd - incr, first, first, last, &a, &b, last - SA - 1);
+
+        /* update ranks */
+        if(a < last) {
+          for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; }
+        }
+        if(b < last) {
+          for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; }
+        }
+
+        /* push */
+        if(1 < (b - a)) {
+          STACK_PUSH5(NULL, a, b, 0, 0);
+          STACK_PUSH5(ISAd - incr, first, last, -2, trlink);
+          trlink = ssize - 2;
+        }
+        if((a - first) <= (last - b)) {
+          if(1 < (a - first)) {
+            STACK_PUSH5(ISAd, b, last, tr_ilg(last - b), trlink);
+            last = a, limit = tr_ilg(a - first);
+          } else if(1 < (last - b)) {
+            first = b, limit = tr_ilg(last - b);
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        } else {
+          if(1 < (last - b)) {
+            STACK_PUSH5(ISAd, first, a, tr_ilg(a - first), trlink);
+            first = b, limit = tr_ilg(last - b);
+          } else if(1 < (a - first)) {
+            last = a, limit = tr_ilg(a - first);
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        }
+      } else if(limit == -2) {
+        /* tandem repeat copy */
+        a = stack[--ssize].b, b = stack[ssize].c;
+        if(stack[ssize].d == 0) {
+          tr_copy(ISA, SA, first, a, b, last, ISAd - ISA);
+        } else {
+          if(0 <= trlink) { stack[trlink].d = -1; }
+          tr_partialcopy(ISA, SA, first, a, b, last, ISAd - ISA);
+        }
+        STACK_POP5(ISAd, first, last, limit, trlink);
+      } else {
+        /* sorted partition */
+        if(0 <= *first) {
+          a = first;
+          do { ISA[*a] = a - SA; } while((++a < last) && (0 <= *a)); /* non-negative entries are unique: rank = own index */
+          first = a;
+        }
+        if(first < last) {
+          a = first; do { *a = ~*a; } while(*++a < 0); /* restore a run of complemented (tied) entries */
+          next = (ISA[*a] != ISAd[*a]) ? tr_ilg(a - first + 1) : -1;
+          if(++a < last) { for(b = first, v = a - SA - 1; b < a; ++b) { ISA[*b] = v; } }
+
+          /* push */
+          if(trbudget_check(budget, a - first)) {
+            if((a - first) <= (last - a)) {
+              STACK_PUSH5(ISAd, a, last, -3, trlink);
+              ISAd += incr, last = a, limit = next;
+            } else {
+              if(1 < (last - a)) {
+                STACK_PUSH5(ISAd + incr, first, a, next, trlink);
+                first = a, limit = -3;
+              } else {
+                ISAd += incr, last = a, limit = next;
+              }
+            }
+          } else {
+            if(0 <= trlink) { stack[trlink].d = -1; }
+            if(1 < (last - a)) {
+              first = a, limit = -3;
+            } else {
+              STACK_POP5(ISAd, first, last, limit, trlink);
+            }
+          }
+        } else {
+          STACK_POP5(ISAd, first, last, limit, trlink);
+        }
+      }
+      continue;
+    }
+
+    if((last - first) <= TR_INSERTIONSORT_THRESHOLD) {
+      tr_insertionsort(ISAd, first, last);
+      limit = -3;
+      continue;
+    }
+
+    if(limit-- == 0) { /* depth limit exhausted: fall back to heapsort */
+      tr_heapsort(ISAd, first, last - first);
+      for(a = last - 1; first < a; a = b) {
+        for(x = ISAd[*a], b = a - 1; (first <= b) && (ISAd[*b] == x); --b) { *b = ~*b; } /* complement all but the last entry of each equal-key run */
+      }
+      limit = -3;
+      continue;
+    }
+
+    /* choose pivot */
+    a = tr_pivot(ISAd, first, last);
+    SWAP(*first, *a);
+    v = ISAd[*first];
+
+    /* partition */
+    tr_partition(ISAd, first, first + 1, last, &a, &b, v);
+    if((last - first) != (b - a)) {
+      next = (ISA[*a] != v) ? tr_ilg(b - a) : -1;
+
+      /* update ranks */
+      for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; }
+      if(b < last) { for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } }
+
+      /* push */
+      if((1 < (b - a)) && (trbudget_check(budget, b - a))) {
+        if((a - first) <= (last - b)) {
+          if((last - b) <= (b - a)) {
+            if(1 < (a - first)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              STACK_PUSH5(ISAd, b, last, limit, trlink);
+              last = a;
+            } else if(1 < (last - b)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              first = b;
+            } else {
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else if((a - first) <= (b - a)) {
+            if(1 < (a - first)) {
+              STACK_PUSH5(ISAd, b, last, limit, trlink);
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              last = a;
+            } else {
+              STACK_PUSH5(ISAd, b, last, limit, trlink);
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else {
+            STACK_PUSH5(ISAd, b, last, limit, trlink);
+            STACK_PUSH5(ISAd, first, a, limit, trlink);
+            ISAd += incr, first = a, last = b, limit = next;
+          }
+        } else {
+          if((a - first) <= (b - a)) {
+            if(1 < (last - b)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              STACK_PUSH5(ISAd, first, a, limit, trlink);
+              first = b;
+            } else if(1 < (a - first)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              last = a;
+            } else {
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else if((last - b) <= (b - a)) {
+            if(1 < (last - b)) {
+              STACK_PUSH5(ISAd, first, a, limit, trlink);
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              first = b;
+            } else {
+              STACK_PUSH5(ISAd, first, a, limit, trlink);
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else {
+            STACK_PUSH5(ISAd, first, a, limit, trlink);
+            STACK_PUSH5(ISAd, b, last, limit, trlink);
+            ISAd += incr, first = a, last = b, limit = next;
+          }
+        }
+      } else {
+        if((1 < (b - a)) && (0 <= trlink)) { stack[trlink].d = -1; }
+        if((a - first) <= (last - b)) {
+          if(1 < (a - first)) {
+            STACK_PUSH5(ISAd, b, last, limit, trlink);
+            last = a;
+          } else if(1 < (last - b)) {
+            first = b;
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        } else {
+          if(1 < (last - b)) {
+            STACK_PUSH5(ISAd, first, a, limit, trlink);
+            first = b;
+          } else if(1 < (a - first)) {
+            last = a;
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        }
+      }
+    } else { /* every key in the range equals the pivot: retry with the next key offset if the budget allows */
+      if(trbudget_check(budget, last - first)) {
+        limit = tr_ilg(last - first), ISAd += incr;
+      } else {
+        if(0 <= trlink) { stack[trlink].d = -1; }
+        STACK_POP5(ISAd, first, last, limit, trlink);
+      }
+    }
+  }
+#undef STACK_SIZE
+}
+
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Tandem repeat sort: refines the ranks in ISA over SA[0..n), doubling the key offset each pass until no group is left unsorted. */
+static
+void
+trsort(int *ISA, int *SA, int n, int depth) {
+  int *ISAd;
+  int *first, *last;
+  trbudget_t budget;
+  int t, skip, unsorted;
+
+  trbudget_init(&budget, tr_ilg(n) * 2 / 3, n);
+/*  trbudget_init(&budget, tr_ilg(n) * 3 / 4, n); */
+  for(ISAd = ISA + depth; -n < *SA; ISAd += ISAd - ISA) { /* loop ends once SA[0] records a sorted run covering all n slots */
+    first = SA;
+    skip = 0;
+    unsorted = 0;
+    do {
+      if((t = *first) < 0) { first -= t; skip += t; } /* negative entry: -t already-sorted slots to step over */
+      else {
+        if(skip != 0) { *(first + skip) = skip; skip = 0; } /* store the merged run length (negated) at the run's first slot */
+        last = SA + ISA[t] + 1; /* the group ends at the slot named by its members' rank */
+        if(1 < (last - first)) {
+          budget.count = 0;
+          tr_introsort(ISA, ISAd, SA, first, last, &budget);
+          if(budget.count != 0) { unsorted += budget.count; } /* part of the group was refused by the budget */
+          else { skip = first - last; }
+        } else if((last - first) == 1) {
+          skip = -1;
+        }
+        first = last;
+      }
+    } while(first < (SA + n));
+    if(skip != 0) { *(first + skip) = skip; }
+    if(unsorted == 0) { break; }
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Sorts suffixes of type B*. Fills bucket_A/bucket_B with bucket boundaries and returns m, the number of type B* suffixes. */
+static
+int
+sort_typeBstar(const unsigned char *T, int *SA,
+               int *bucket_A, int *bucket_B,
+               int n, int openMP) {
+  int *PAb, *ISAb, *buf;
+#ifdef LIBBSC_OPENMP
+  int *curbuf;
+  int l;
+#endif
+  int i, j, k, t, m, bufsize;
+  int c0, c1;
+#ifdef LIBBSC_OPENMP
+  int d0, d1;
+#endif
+  (void)openMP; /* openMP is only consulted inside the LIBBSC_OPENMP section below */
+
+  /* Initialize bucket arrays. */
+  for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; }
+  for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; }
+
+  /* Count the number of occurrences of the first one or two characters of each
+     type A, B and B* suffix. Moreover, store the beginning position of all
+     type B* suffixes into the array SA. */
+  for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) {
+    /* type A suffix. */
+    do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1));
+    if(0 <= i) {
+      /* type B* suffix. */
+      ++BUCKET_BSTAR(c0, c1);
+      SA[--m] = i;
+      /* type B suffix. */
+      for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) {
+        ++BUCKET_B(c0, c1);
+      }
+    }
+  }
+  m = n - m; /* m is now the number of type B* suffixes stored at the end of SA */
+/*
+note:
+  A type B* suffix is lexicographically smaller than a type B suffix that
+  begins with the same first two characters.
+*/
+
+  /* Calculate the index of start/end point of each bucket. */
+  for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) {
+    t = i + BUCKET_A(c0);
+    BUCKET_A(c0) = i + j; /* start point */
+    i = t + BUCKET_B(c0, c0);
+    for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) {
+      j += BUCKET_BSTAR(c0, c1);
+      BUCKET_BSTAR(c0, c1) = j; /* end point */
+      i += BUCKET_B(c0, c1);
+    }
+  }
+
+  if(0 < m) {
+    /* Sort the type B* suffixes by their first two characters. */
+    PAb = SA + n - m; ISAb = SA + m;
+    for(i = m - 2; 0 <= i; --i) {
+      t = PAb[i], c0 = T[t], c1 = T[t + 1];
+      SA[--BUCKET_BSTAR(c0, c1)] = i;
+    }
+    t = PAb[m - 1], c0 = T[t], c1 = T[t + 1];
+    SA[--BUCKET_BSTAR(c0, c1)] = m - 1;
+
+    /* Sort the type B* substrings using sssort. */
+#ifdef LIBBSC_OPENMP
+    if (openMP)
+    {
+        buf = SA + m;
+        c0 = ALPHABET_SIZE - 2, c1 = ALPHABET_SIZE - 1, j = m;
+#pragma omp parallel default(shared) private(bufsize, curbuf, k, l, d0, d1)
+        {
+          bufsize = (n - (2 * m)) / omp_get_num_threads();
+          curbuf = buf + omp_get_thread_num() * bufsize;
+          k = 0;
+          for(;;) {
+            #pragma omp critical(sssort_lock)
+            {
+              if(0 < (l = j)) {
+                d0 = c0, d1 = c1;
+                do {
+                  k = BUCKET_BSTAR(d0, d1);
+                  if(--d1 <= d0) {
+                    d1 = ALPHABET_SIZE - 1;
+                    if(--d0 < 0) { break; }
+                  }
+                } while(((l - k) <= 1) && (0 < (l = k)));
+                c0 = d0, c1 = d1, j = k;
+              }
+            }
+            if(l == 0) { break; }
+            sssort(T, PAb, SA + k, SA + l,
+                   curbuf, bufsize, 2, n, *(SA + k) == (m - 1));
+          }
+        }
+    }
+    else
+    {
+        buf = SA + m, bufsize = n - (2 * m);
+        for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) {
+          for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) {
+            i = BUCKET_BSTAR(c0, c1);
+            if(1 < (j - i)) {
+              sssort(T, PAb, SA + i, SA + j,
+                     buf, bufsize, 2, n, *(SA + i) == (m - 1));
+            }
+          }
+        }
+    }
+#else
+    buf = SA + m, bufsize = n - (2 * m);
+    for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) {
+      for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) {
+        i = BUCKET_BSTAR(c0, c1);
+        if(1 < (j - i)) {
+          sssort(T, PAb, SA + i, SA + j,
+                 buf, bufsize, 2, n, *(SA + i) == (m - 1));
+        }
+      }
+    }
+#endif
+
+    /* Compute ranks of type B* substrings. */
+    for(i = m - 1; 0 <= i; --i) {
+      if(0 <= SA[i]) {
+        j = i;
+        do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i]));
+        SA[i + 1] = i - j;
+        if(i <= 0) { break; }
+      }
+      j = i;
+      do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0);
+      ISAb[SA[i]] = j;
+    }
+
+    /* Construct the inverse suffix array of type B* suffixes using trsort. */
+    trsort(ISAb, SA, m, 1);
+
+    /* Set the sorted order of type B* suffixes. */
+    for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) {
+      for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { }
+      if(0 <= i) {
+        t = i;
+        for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { }
+        SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t;
+      }
+    }
+
+    /* Calculate the index of start/end point of each bucket. */
+    BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */
+    for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) {
+      i = BUCKET_A(c0 + 1) - 1;
+      for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) {
+        t = i - BUCKET_B(c0, c1);
+        BUCKET_B(c0, c1) = i; /* end point */
+
+        /* Move all type B* suffixes to the correct position. */
+        for(i = t, j = BUCKET_BSTAR(c0, c1);
+            j <= k;
+            --i, --k) { SA[i] = SA[k]; }
+      }
+      BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */
+      BUCKET_B(c0, c0) = i; /* end point */
+    }
+  }
+
+  return m;
+}
+
+/* Constructs the suffix array in place in SA by using the sorted order of type B* suffixes (m of them, as returned by sort_typeBstar). */
+static
+void
+construct_SA(const unsigned char *T, int *SA,
+             int *bucket_A, int *bucket_B,
+             int n, int m) {
+  int *i, *j, *k;
+  int s;
+  int c0, c1, c2;
+
+  if(0 < m) {
+    /* Construct the sorted order of type B suffixes by using
+       the sorted order of type B* suffixes. */
+    for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
+      /* Scan the suffix array from right to left. */
+      for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
+          j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
+          i <= j;
+          --j) {
+        if(0 < (s = *j)) {
+          assert(T[s] == c1);
+          assert(((s + 1) < n) && (T[s] <= T[s + 1]));
+          assert(T[s - 1] <= T[s]);
+          *j = ~s;
+          c0 = T[--s]; /* s now names the preceding suffix, which starts with c0 */
+          if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
+          if(c0 != c2) {
+            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; } /* save the write cursor of the bucket being left */
+            k = SA + BUCKET_B(c2 = c0, c1);
+          }
+          assert(k < j);
+          *k-- = s;
+        } else {
+          assert(((s == 0) && (T[s] == c1)) || (s < 0));
+          *j = ~s;
+        }
+      }
+    }
+  }
+
+  /* Construct the suffix array by using
+     the sorted order of type B suffixes. */
+  k = SA + BUCKET_A(c2 = T[n - 1]);
+  *k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1); /* reads T[n - 2]: requires n >= 2 (divsufsort handles n <= 2 itself) */
+  /* Scan the suffix array from left to right. */
+  for(i = SA, j = SA + n; i < j; ++i) {
+    if(0 < (s = *i)) {
+      assert(T[s - 1] >= T[s]);
+      c0 = T[--s];
+      if((s == 0) || (T[s - 1] < c0)) { s = ~s; }
+      if(c0 != c2) {
+        BUCKET_A(c2) = k - SA; /* save the write cursor of the bucket being left */
+        k = SA + BUCKET_A(c2 = c0);
+      }
+      assert(i < k);
+      *k++ = s;
+    } else {
+      assert(s < 0);
+      *i = ~s; /* complemented entry: store its plain value as the final suffix index */
+    }
+  }
+}
+
+/* Constructs the Burrows-Wheeler transformed string directly in SA by using the sorted order
+   of type B* suffixes. Returns the slot (orig - SA) where suffix 0 was met in the final scan. */
+static
+int
+construct_BWT(const unsigned char *T, int *SA,
+              int *bucket_A, int *bucket_B,
+              int n, int m) {
+  int *i, *j, *k, *orig;
+  int s;
+  int c0, c1, c2;
+
+  if(0 < m) {
+    /* Construct the sorted order of type B suffixes by using
+       the sorted order of type B* suffixes. */
+    for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
+      /* Scan the suffix array from right to left. */
+      for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
+          j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
+          i <= j;
+          --j) {
+        if(0 < (s = *j)) {
+          assert(T[s] == c1);
+          assert(((s + 1) < n) && (T[s] <= T[s + 1]));
+          assert(T[s - 1] <= T[s]);
+          c0 = T[--s];
+          *j = ~((int)c0); /* the slot now holds the complemented preceding character instead of a suffix index */
+          if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
+          if(c0 != c2) {
+            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
+            k = SA + BUCKET_B(c2 = c0, c1);
+          }
+          assert(k < j);
+          *k-- = s;
+        } else if(s != 0) {
+          *j = ~s;
+#ifndef NDEBUG
+        } else {
+          assert(T[s] == c1);
+#endif
+        }
+      }
+    }
+  }
+
+  /* Construct the BWTed string by using
+     the sorted order of type B suffixes. */
+  k = SA + BUCKET_A(c2 = T[n - 1]);
+  *k++ = (T[n - 2] < c2) ? ~((int)T[n - 2]) : (n - 1); /* reads T[n - 2]: requires n >= 2 (divbwt returns early for n <= 1) */
+  /* Scan the suffix array from left to right. */
+  for(i = SA, j = SA + n, orig = SA; i < j; ++i) {
+    if(0 < (s = *i)) {
+      assert(T[s - 1] >= T[s]);
+      c0 = T[--s];
+      *i = c0; /* output character for this slot */
+      if((0 < s) && (T[s - 1] < c0)) { s = ~((int)T[s - 1]); }
+      if(c0 != c2) {
+        BUCKET_A(c2) = k - SA;
+        k = SA + BUCKET_A(c2 = c0);
+      }
+      assert(i < k);
+      *k++ = s;
+    } else if(s != 0) {
+      *i = ~s; /* complemented character stored earlier: restore it */
+    } else {
+      orig = i; /* s == 0: remember where suffix 0 landed */
+    }
+  }
+
+  return orig - SA;
+}
+
+/* Constructs the Burrows-Wheeler transformed string directly by using the sorted order of type B*
+   suffixes, and records in indexes[] the slot of every suffix whose start is a multiple of mod + 1. */
+static
+int
+construct_BWT_indexes(const unsigned char *T, int *SA,
+                      int *bucket_A, int *bucket_B,
+                      int n, int m,
+                      unsigned char * num_indexes, int * indexes) {
+  int *i, *j, *k, *orig;
+  int s;
+  int c0, c1, c2;
+
+  int mod = n / 8;
+  { /* smear the top bit of n / 8 downwards, then halve: mod becomes 2^k - 1, so (s & mod) == 0 tests divisibility by mod + 1 */
+      mod |= mod >> 1;  mod |= mod >> 2;
+      mod |= mod >> 4;  mod |= mod >> 8;
+      mod |= mod >> 16; mod >>= 1;
+
+      *num_indexes = (unsigned char)((n - 1) / (mod + 1));
+  }
+
+  if(0 < m) {
+    /* Construct the sorted order of type B suffixes by using
+       the sorted order of type B* suffixes. */
+    for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
+      /* Scan the suffix array from right to left. */
+      for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
+          j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
+          i <= j;
+          --j) {
+        if(0 < (s = *j)) {
+          assert(T[s] == c1);
+          assert(((s + 1) < n) && (T[s] <= T[s + 1]));
+          assert(T[s - 1] <= T[s]);
+
+          if ((s & mod) == 0) indexes[s / (mod + 1) - 1] = j - SA;
+
+          c0 = T[--s];
+          *j = ~((int)c0);
+          if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
+          if(c0 != c2) {
+            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
+            k = SA + BUCKET_B(c2 = c0, c1);
+          }
+          assert(k < j);
+          *k-- = s;
+        } else if(s != 0) {
+          *j = ~s;
+#ifndef NDEBUG
+        } else {
+          assert(T[s] == c1);
+#endif
+        }
+      }
+    }
+  }
+
+  /* Construct the BWTed string by using
+     the sorted order of type B suffixes. */
+  k = SA + BUCKET_A(c2 = T[n - 1]);
+  if (T[n - 2] < c2) {
+    if (((n - 1) & mod) == 0) indexes[(n - 1) / (mod + 1) - 1] = k - SA;
+    *k++ = ~((int)T[n - 2]);
+  }
+  else {
+    *k++ = n - 1;
+  }
+
+  /* Scan the suffix array from left to right. */
+  for(i = SA, j = SA + n, orig = SA; i < j; ++i) {
+    if(0 < (s = *i)) {
+      assert(T[s - 1] >= T[s]);
+
+      if ((s & mod) == 0) indexes[s / (mod + 1) - 1] = i - SA;
+
+      c0 = T[--s];
+      *i = c0; /* output character for this slot */
+      if(c0 != c2) {
+        BUCKET_A(c2) = k - SA;
+        k = SA + BUCKET_A(c2 = c0);
+      }
+      assert(i < k);
+      if((0 < s) && (T[s - 1] < c0)) {
+          if ((s & mod) == 0) indexes[s / (mod + 1) - 1] = k - SA; /* this suffix is stored as a character, so record its slot now */
+          *k++ = ~((int)T[s - 1]);
+      } else
+        *k++ = s;
+    } else if(s != 0) {
+      *i = ~s; /* complemented character stored earlier: restore it */
+    } else {
+      orig = i; /* s == 0: remember where suffix 0 landed */
+    }
+  }
+
+  return orig - SA;
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/*- Function -*/
+/* Builds the suffix array of T[0..n) in SA. Returns 0 on success, -1 on invalid arguments, -2 if a bucket table cannot be allocated. */
+int
+divsufsort(const unsigned char *T, int *SA, int n, int openMP) {
+  int *bucket_A = NULL, *bucket_B = NULL;
+  int status = -2;  /* reported when either bucket table cannot be allocated */
+
+  /* Reject bad arguments, then answer the trivial lengths directly. */
+  if((T == NULL) || (SA == NULL) || (n < 0)) { return -1; }
+  switch(n) {
+    case 0: return 0;
+    case 1: SA[0] = 0; return 0;
+    case 2: { const int asc = (T[0] < T[1]); SA[asc ^ 1] = 0; SA[asc] = 1; return 0; }
+    default: break;
+  }
+
+  bucket_A = (int *)malloc(BUCKET_A_SIZE * sizeof(int));
+  bucket_B = (int *)malloc(BUCKET_B_SIZE * sizeof(int));
+
+  /* Sort the type B* suffixes, then derive the full suffix array from them. */
+  if((bucket_A != NULL) && (bucket_B != NULL)) {
+    const int nbstar = sort_typeBstar(T, SA, bucket_A, bucket_B, n, openMP);
+    construct_SA(T, SA, bucket_A, bucket_B, n, nbstar);
+    status = 0;
+  }
+
+  free(bucket_B);
+  free(bucket_A);
+  return status;
+}
+
+/* Burrows-Wheeler transform of T[0..n) into U. A is optional scratch space (allocated here when NULL).
+   Returns the primary index, n itself when n <= 1, -1 on invalid arguments, -2 when memory is unavailable. */
+int
+divbwt(const unsigned char *T, unsigned char *U, int *A, int n, unsigned char * num_indexes, int * indexes, int openMP) {
+  int *B;
+  int *bucket_A, *bucket_B;
+  int m, pidx, i;
+
+  if((T == NULL) || (U == NULL) || (n < 0)) { return -1; }
+  else if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; }
+
+  if((B = A) == NULL) {
+    /* Count slots in size_t (n + 1 overflows int at INT_MAX) and refuse a byte count that would wrap; B stays NULL and -2 is returned. */
+    const size_t slots = (size_t)n + 1;
+    if(slots <= ((size_t)-1) / sizeof(int)) { B = (int *)malloc(slots * sizeof(int)); }
+  }
+  bucket_A = (int *)malloc(BUCKET_A_SIZE * sizeof(int));
+  bucket_B = (int *)malloc(BUCKET_B_SIZE * sizeof(int));
+
+  /* Burrows-Wheeler Transform; secondary indexes are produced only when both output pointers are supplied. */
+  if((B != NULL) && (bucket_A != NULL) && (bucket_B != NULL)) {
+    m = sort_typeBstar(T, B, bucket_A, bucket_B, n, openMP);
+    if (num_indexes == NULL || indexes == NULL) {
+        pidx = construct_BWT(T, B, bucket_A, bucket_B, n, m);
+    } else {
+        pidx = construct_BWT_indexes(T, B, bucket_A, bucket_B, n, m, num_indexes, indexes);
+    }
+    /* Copy to output string: U[0] is the last input character, and slot pidx of B is skipped. */
+    U[0] = T[n - 1];
+    for(i = 0; i < pidx; ++i) { U[i + 1] = (unsigned char)B[i]; }
+    for(i += 1; i < n; ++i) { U[i] = (unsigned char)B[i]; }
+    pidx += 1;
+  } else { pidx = -2; }
+
+  free(bucket_B);
+  free(bucket_A);
+  if(A == NULL) { free(B); } /* only release the scratch array if it was allocated here */
+  return pidx;
+}
diff --git a/lib/divsufsort.h b/lib/divsufsort.h
new file mode 100644
index 0000000..dac0936
--- /dev/null
+++ b/lib/divsufsort.h
@@ -0,0 +1,67 @@
+/*
+ * divsufsort.h for libdivsufsort-lite
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _DIVSUFSORT_H
+#define _DIVSUFSORT_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+
+/*- Prototypes -*/
+
+/**
+ * Constructs the suffix array of a given string.
+ * @param T[0..n-1] The input string.
+ * @param SA[0..n-1] The output array of suffixes.
+ * @param n The length of the given string.
+ * @param openMP enables OpenMP optimization (only consulted when built with LIBBSC_OPENMP).
+ * @return 0 if no error occurred, -1 on invalid arguments (NULL T or SA, n < 0), -2 on allocation failure.
+ */
+int
+divsufsort(const unsigned char *T, int *SA, int n, int openMP);
+
+/**
+ * Constructs the Burrows-Wheeler transformed string of a given string.
+ * @param T[0..n-1] The input string.
+ * @param U[0..n-1] The output string. (can be T)
+ * @param A[0..n-1] The temporary array. (can be NULL)
+ * @param n The length of the given string.
+ * @param num_indexes The length of secondary indexes array. (can be NULL)
+ * @param indexes The secondary indexes array. (can be NULL)
+ * @param openMP enables OpenMP optimization (only consulted when built with LIBBSC_OPENMP).
+ * @return The primary index if no error occurred (n itself when n <= 1), -1 on invalid arguments, -2 on allocation failure.
+ */
+int
+divbwt(const unsigned char *T, unsigned char *U, int *A, int n, unsigned char * num_indexes, int * indexes, int openMP);
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* _DIVSUFSORT_H */
diff --git a/lib/error_private.h b/lib/error_private.h
index e567538..c0c3f49 100644
--- a/lib/error_private.h
+++ b/lib/error_private.h
@@ -40,14 +40,14 @@ extern "C" {
 #endif
 
 
-/* *****************************************
-*  Includes
+/* ****************************************
+*  Dependencies
 ******************************************/
-#include <stddef.h>        /* size_t, ptrdiff_t */
+#include <stddef.h>        /* size_t */
 #include "error_public.h"  /* enum list */
 
 
-/* *****************************************
+/* ****************************************
 *  Compiler-specific
 ******************************************/
 #if defined(__GNUC__)
@@ -61,43 +61,52 @@ extern "C" {
 #endif
 
 
-/* *****************************************
-*  Error Codes
+/*-****************************************
+*  Customization
 ******************************************/
+typedef ZSTD_ErrorCode ERR_enum;
 #define PREFIX(name) ZSTD_error_##name
 
+
+/*-****************************************
+*  Error codes handling
+******************************************/
 #ifdef ERROR
-#  undef ERROR   /* reported already defined on VS 2015 by Rich Geldreich */
+#  undef ERROR   /* reported already defined on VS 2015 (Rich Geldreich) */
 #endif
 #define ERROR(name) (size_t)-PREFIX(name)
 
 ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }
 
+ERR_STATIC ERR_enum ERR_getError(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); }
+
 
-/* *****************************************
+/*-****************************************
 *  Error Strings
 ******************************************/
 
 ERR_STATIC const char* ERR_getErrorName(size_t code)
 {
-    static const char* codeError = "Unspecified error code";
-    switch( (size_t)(0-code) )
+    static const char* notErrorCode = "Unspecified error code";
+    switch( ERR_getError(code) )
     {
-    case ZSTD_error_No_Error: return "No error detected";
-    case ZSTD_error_GENERIC:  return "Error (generic)";
-    case ZSTD_error_prefix_unknown: return "Unknown frame descriptor";
-    case ZSTD_error_frameParameter_unsupported: return "Unsupported frame parameter";
-    case ZSTD_error_frameParameter_unsupportedBy32bitsImplementation: return "Frame parameter unsupported in 32-bits mode";
-    case ZSTD_error_init_missing: return "Context should be init first";
-    case ZSTD_error_memory_allocation: return "Allocation error : not enough memory";
-    case ZSTD_error_dstSize_tooSmall: return "Destination buffer is too small";
-    case ZSTD_error_srcSize_wrong: return "Src size incorrect";
-    case ZSTD_error_corruption_detected: return "Corrupted block detected";
-    case ZSTD_error_tableLog_tooLarge: return "tableLog requires too much memory";
-    case ZSTD_error_maxSymbolValue_tooLarge: return "Unsupported max possible Symbol Value : too large";
-    case ZSTD_error_maxSymbolValue_tooSmall: return "Specified maxSymbolValue is too small";
-    case ZSTD_error_maxCode:
-    default: return codeError;
+    case PREFIX(no_error): return "No error detected";
+    case PREFIX(GENERIC):  return "Error (generic)";
+    case PREFIX(prefix_unknown): return "Unknown frame descriptor";
+    case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter";
+    case PREFIX(frameParameter_unsupportedBy32bits): return "Frame parameter unsupported in 32-bits mode";
+    case PREFIX(init_missing): return "Context should be init first";
+    case PREFIX(memory_allocation): return "Allocation error : not enough memory";
+    case PREFIX(stage_wrong): return "Operation not authorized at current processing stage";
+    case PREFIX(dstSize_tooSmall): return "Destination buffer is too small";
+    case PREFIX(srcSize_wrong): return "Src size incorrect";
+    case PREFIX(corruption_detected): return "Corrupted block detected";
+    case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory";
+    case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max possible Symbol Value : too large";
+    case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small";
+    case PREFIX(dictionary_corrupted): return "Dictionary is corrupted";
+    case PREFIX(maxCode):
+    default: return notErrorCode;   /* should be impossible, due to ERR_getError() */
     }
 }
 
diff --git a/lib/error_public.h b/lib/error_public.h
index 78b0e80..655e28e 100644
--- a/lib/error_public.h
+++ b/lib/error_public.h
@@ -39,14 +39,14 @@ extern "C" {
 
 
 /* ****************************************
-*  error list
+*  error codes list
 ******************************************/
-enum {
-  ZSTD_error_No_Error,
+typedef enum {
+  ZSTD_error_no_error,
   ZSTD_error_GENERIC,
   ZSTD_error_prefix_unknown,
   ZSTD_error_frameParameter_unsupported,
-  ZSTD_error_frameParameter_unsupportedBy32bitsImplementation,
+  ZSTD_error_frameParameter_unsupportedBy32bits,
   ZSTD_error_init_missing,
   ZSTD_error_memory_allocation,
   ZSTD_error_stage_wrong,
@@ -56,8 +56,9 @@ enum {
   ZSTD_error_tableLog_tooLarge,
   ZSTD_error_maxSymbolValue_tooLarge,
   ZSTD_error_maxSymbolValue_tooSmall,
+  ZSTD_error_dictionary_corrupted,
   ZSTD_error_maxCode
-};
+} ZSTD_ErrorCode;
 
 /* note : functions provide error codes in reverse negative order,
           so compare with (size_t)(0-enum) */
diff --git a/lib/fse.c b/lib/fse.c
index e74c1e8..986a0da 100644
--- a/lib/fse.c
+++ b/lib/fse.c
@@ -141,102 +141,6 @@ typedef U32 DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
 
 
 /* Function templates */
-size_t FSE_count_generic(unsigned* count, unsigned* maxSymbolValuePtr, const FSE_FUNCTION_TYPE* source, size_t sourceSize, unsigned safe)
-{
-    const FSE_FUNCTION_TYPE* ip = source;
-    const FSE_FUNCTION_TYPE* const iend = ip+sourceSize;
-    unsigned maxSymbolValue = *maxSymbolValuePtr;
-    unsigned max=0;
-    int s;
-
-    U32 Counting1[FSE_MAX_SYMBOL_VALUE+1] = { 0 };
-    U32 Counting2[FSE_MAX_SYMBOL_VALUE+1] = { 0 };
-    U32 Counting3[FSE_MAX_SYMBOL_VALUE+1] = { 0 };
-    U32 Counting4[FSE_MAX_SYMBOL_VALUE+1] = { 0 };
-
-    /* safety checks */
-    if (!sourceSize)
-    {
-        memset(count, 0, (maxSymbolValue + 1) * sizeof(FSE_FUNCTION_TYPE));
-        *maxSymbolValuePtr = 0;
-        return 0;
-    }
-    if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(GENERIC);   /* maxSymbolValue too large : unsupported */
-    if (!maxSymbolValue) maxSymbolValue = FSE_MAX_SYMBOL_VALUE;            /* 0 == default */
-
-    if ((safe) || (sizeof(FSE_FUNCTION_TYPE)>1))
-    {
-        /* check input values, to avoid count table overflow */
-        while (ip < iend-3)
-        {
-            if (*ip>maxSymbolValue) return ERROR(GENERIC); Counting1[*ip++]++;
-            if (*ip>maxSymbolValue) return ERROR(GENERIC); Counting2[*ip++]++;
-            if (*ip>maxSymbolValue) return ERROR(GENERIC); Counting3[*ip++]++;
-            if (*ip>maxSymbolValue) return ERROR(GENERIC); Counting4[*ip++]++;
-        }
-    }
-    else
-    {
-        U32 cached = MEM_read32(ip); ip += 4;
-        while (ip < iend-15)
-        {
-            U32 c = cached; cached = MEM_read32(ip); ip += 4;
-            Counting1[(BYTE) c     ]++;
-            Counting2[(BYTE)(c>>8) ]++;
-            Counting3[(BYTE)(c>>16)]++;
-            Counting4[       c>>24 ]++;
-            c = cached; cached = MEM_read32(ip); ip += 4;
-            Counting1[(BYTE) c     ]++;
-            Counting2[(BYTE)(c>>8) ]++;
-            Counting3[(BYTE)(c>>16)]++;
-            Counting4[       c>>24 ]++;
-            c = cached; cached = MEM_read32(ip); ip += 4;
-            Counting1[(BYTE) c     ]++;
-            Counting2[(BYTE)(c>>8) ]++;
-            Counting3[(BYTE)(c>>16)]++;
-            Counting4[       c>>24 ]++;
-            c = cached; cached = MEM_read32(ip); ip += 4;
-            Counting1[(BYTE) c     ]++;
-            Counting2[(BYTE)(c>>8) ]++;
-            Counting3[(BYTE)(c>>16)]++;
-            Counting4[       c>>24 ]++;
-        }
-        ip-=4;
-    }
-
-    /* finish last symbols */
-    while (ip<iend) { if ((safe) && (*ip>maxSymbolValue)) return ERROR(GENERIC); Counting1[*ip++]++; }
-
-    for (s=0; s<=(int)maxSymbolValue; s++)
-    {
-        count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s];
-        if (count[s] > max) max = count[s];
-    }
-
-    while (!count[maxSymbolValue]) maxSymbolValue--;
-    *maxSymbolValuePtr = maxSymbolValue;
-    return (size_t)max;
-}
-
-/* hidden fast variant (unsafe) */
-size_t FSE_FUNCTION_NAME(FSE_countFast, FSE_FUNCTION_EXTENSION)
-(unsigned* count, unsigned* maxSymbolValuePtr, const FSE_FUNCTION_TYPE* source, size_t sourceSize)
-{
-    return FSE_count_generic(count, maxSymbolValuePtr, source, sourceSize, 0);
-}
-
-size_t FSE_FUNCTION_NAME(FSE_count, FSE_FUNCTION_EXTENSION)
-(unsigned* count, unsigned* maxSymbolValuePtr, const FSE_FUNCTION_TYPE* source, size_t sourceSize)
-{
-    if ((sizeof(FSE_FUNCTION_TYPE)==1) && (*maxSymbolValuePtr >= 255))
-    {
-        *maxSymbolValuePtr = 255;
-        return FSE_count_generic(count, maxSymbolValuePtr, source, sourceSize, 0);
-    }
-    return FSE_count_generic(count, maxSymbolValuePtr, source, sourceSize, 1);
-}
-
-
 static U32 FSE_tableStep(U32 tableSize) { return (tableSize>>1) + (tableSize>>3) + 3; }
 
 size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
@@ -264,35 +168,28 @@ size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned
 
     /* symbol start positions */
     cumul[0] = 0;
-    for (i=1; i<=maxSymbolValue+1; i++)
-    {
-        if (normalizedCounter[i-1]==-1)   /* Low proba symbol */
-        {
+    for (i=1; i<=maxSymbolValue+1; i++) {
+        if (normalizedCounter[i-1]==-1) {  /* Low proba symbol */
             cumul[i] = cumul[i-1] + 1;
             tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(i-1);
-        }
-        else
+        } else {
             cumul[i] = cumul[i-1] + normalizedCounter[i-1];
-    }
+    }   }
     cumul[maxSymbolValue+1] = tableSize+1;
 
     /* Spread symbols */
-    for (symbol=0; symbol<=maxSymbolValue; symbol++)
-    {
+    for (symbol=0; symbol<=maxSymbolValue; symbol++) {
         int nbOccurences;
-        for (nbOccurences=0; nbOccurences<normalizedCounter[symbol]; nbOccurences++)
-        {
+        for (nbOccurences=0; nbOccurences<normalizedCounter[symbol]; nbOccurences++) {
             tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol;
             position = (position + step) & tableMask;
             while (position > highThreshold) position = (position + step) & tableMask;   /* Low proba area */
-        }
-    }
+    }   }
 
     if (position!=0) return ERROR(GENERIC);   /* Must have gone through all positions */
 
     /* Build table */
-    for (i=0; i<tableSize; i++)
-    {
+    for (i=0; i<tableSize; i++) {
         FSE_FUNCTION_TYPE s = tableSymbol[i];   /* note : static analyzer may not understand tableSymbol is properly initialized */
         tableU16[cumul[s]++] = (U16) (tableSize+i);   /* TableU16 : sorted by symbol order; gives next state value */
     }
@@ -301,15 +198,14 @@ size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned
     {
         unsigned s;
         unsigned total = 0;
-        for (s=0; s<=maxSymbolValue; s++)
-        {
+        for (s=0; s<=maxSymbolValue; s++) {
             switch (normalizedCounter[s])
             {
             case  0:
                 break;
             case -1:
             case  1:
-                symbolTT[s].deltaNbBits = tableLog << 16;
+                symbolTT[s].deltaNbBits = (tableLog << 16) - (1<<tableLog);
                 symbolTT[s].deltaFindState = total - 1;
                 total ++;
                 break;
@@ -320,10 +216,7 @@ size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned
                     symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
                     symbolTT[s].deltaFindState = total - normalizedCounter[s];
                     total +=  normalizedCounter[s];
-                }
-            }
-        }
-    }
+    }   }   }   }
 
     return 0;
 }
@@ -361,45 +254,35 @@ size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned
 
     /* Init, lay down lowprob symbols */
     DTableH.tableLog = (U16)tableLog;
-    for (s=0; s<=maxSymbolValue; s++)
-    {
-        if (normalizedCounter[s]==-1)
-        {
+    for (s=0; s<=maxSymbolValue; s++) {
+        if (normalizedCounter[s]==-1) {
             tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
             symbolNext[s] = 1;
-        }
-        else
-        {
+        } else {
             if (normalizedCounter[s] >= largeLimit) noLarge=0;
             symbolNext[s] = normalizedCounter[s];
-        }
-    }
+    }   }
 
     /* Spread symbols */
-    for (s=0; s<=maxSymbolValue; s++)
-    {
+    for (s=0; s<=maxSymbolValue; s++) {
         int i;
-        for (i=0; i<normalizedCounter[s]; i++)
-        {
+        for (i=0; i<normalizedCounter[s]; i++) {
             tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
             position = (position + step) & tableMask;
             while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
-        }
-    }
+    }   }
 
     if (position!=0) return ERROR(GENERIC);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
 
     /* Build Decoding table */
     {
         U32 i;
-        for (i=0; i<tableSize; i++)
-        {
+        for (i=0; i<tableSize; i++) {
             FSE_FUNCTION_TYPE symbol = (FSE_FUNCTION_TYPE)(tableDecode[i].symbol);
             U16 nextState = symbolNext[symbol]++;
             tableDecode[i].nbBits = (BYTE) (tableLog - BIT_highbit32 ((U32)nextState) );
             tableDecode[i].newState = (U16) ( (nextState << tableDecode[i].nbBits) - tableSize);
-        }
-    }
+    }   }
 
     DTableH.fastMode = (U16)noLarge;
     memcpy(dt, &DTableH, sizeof(DTableH));
@@ -408,7 +291,7 @@ size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned
 
 
 #ifndef FSE_COMMONDEFS_ONLY
-/******************************************
+/*-****************************************
 *  FSE helper functions
 ******************************************/
 unsigned FSE_isError(size_t code) { return ERR_isError(code); }
@@ -416,7 +299,7 @@ unsigned FSE_isError(size_t code) { return ERR_isError(code); }
 const char* FSE_getErrorName(size_t code) { return ERR_getErrorName(code); }
 
 
-/****************************************************************
+/*-**************************************************************
 *  FSE NCount encoding-decoding
 ****************************************************************/
 size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
@@ -425,10 +308,7 @@ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
     return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND;  /* maxSymbolValue==0 ? use default */
 }
 
-static short FSE_abs(short a)
-{
-    return a<0 ? -a : a;
-}
+static short FSE_abs(short a) { return a<0 ? -a : a; }
 
 static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
                                        const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
@@ -457,14 +337,11 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
     threshold = tableSize;
     nbBits = tableLog+1;
 
-    while (remaining>1)   /* stops at 1 */
-    {
-        if (previous0)
-        {
+    while (remaining>1) {  /* stops at 1 */
+        if (previous0) {
             unsigned start = charnum;
             while (!normalizedCounter[charnum]) charnum++;
-            while (charnum >= start+24)
-            {
+            while (charnum >= start+24) {
                 start+=24;
                 bitStream += 0xFFFFU << bitCount;
                 if ((!writeIsSafe) && (out > oend-2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
@@ -473,24 +350,21 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
                 out+=2;
                 bitStream>>=16;
             }
-            while (charnum >= start+3)
-            {
+            while (charnum >= start+3) {
                 start+=3;
                 bitStream += 3 << bitCount;
                 bitCount += 2;
             }
             bitStream += (charnum-start) << bitCount;
             bitCount += 2;
-            if (bitCount>16)
-            {
+            if (bitCount>16) {
                 if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
                 out[0] = (BYTE)bitStream;
                 out[1] = (BYTE)(bitStream>>8);
                 out += 2;
                 bitStream >>= 16;
                 bitCount -= 16;
-            }
-        }
+        }   }
         {
             short count = normalizedCounter[charnum++];
             const short max = (short)((2*threshold-1)-remaining);
@@ -504,16 +378,14 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
             previous0 = (count==1);
             while (remaining<threshold) nbBits--, threshold>>=1;
         }
-        if (bitCount>16)
-        {
+        if (bitCount>16) {
             if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
             out[0] = (BYTE)bitStream;
             out[1] = (BYTE)(bitStream>>8);
             out += 2;
             bitStream >>= 16;
             bitCount -= 16;
-        }
-    }
+    }   }
 
     /* flush remaining bitStream */
     if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
@@ -564,27 +436,19 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t
     threshold = 1<<nbBits;
     nbBits++;
 
-    while ((remaining>1) && (charnum<=*maxSVPtr))
-    {
-        if (previous0)
-        {
+    while ((remaining>1) && (charnum<=*maxSVPtr)) {
+        if (previous0) {
             unsigned n0 = charnum;
-            while ((bitStream & 0xFFFF) == 0xFFFF)
-            {
+            while ((bitStream & 0xFFFF) == 0xFFFF) {
                 n0+=24;
-                if (ip < iend-5)
-                {
+                if (ip < iend-5) {
                     ip+=2;
                     bitStream = MEM_readLE32(ip) >> bitCount;
-                }
-                else
-                {
+                } else {
                     bitStream >>= 16;
                     bitCount+=16;
-                }
-            }
-            while ((bitStream & 3) == 3)
-            {
+            }   }
+            while ((bitStream & 3) == 3) {
                 n0+=3;
                 bitStream>>=2;
                 bitCount+=2;
@@ -593,8 +457,7 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t
             bitCount += 2;
             if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
             while (charnum < n0) normalizedCounter[charnum++] = 0;
-            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4))
-            {
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
                 ip += bitCount>>3;
                 bitCount &= 7;
                 bitStream = MEM_readLE32(ip) >> bitCount;
@@ -606,13 +469,10 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t
             const short max = (short)((2*threshold-1)-remaining);
             short count;
 
-            if ((bitStream & (threshold-1)) < (U32)max)
-            {
+            if ((bitStream & (threshold-1)) < (U32)max) {
                 count = (short)(bitStream & (threshold-1));
                 bitCount   += nbBits-1;
-            }
-            else
-            {
+            } else {
                 count = (short)(bitStream & (2*threshold-1));
                 if (count >= threshold) count -= max;
                 bitCount   += nbBits;
@@ -622,27 +482,20 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t
             remaining -= FSE_abs(count);
             normalizedCounter[charnum++] = count;
             previous0 = !count;
-            while (remaining < threshold)
-            {
+            while (remaining < threshold) {
                 nbBits--;
                 threshold >>= 1;
             }
 
-            {
-                if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4))
-                {
-                    ip += bitCount>>3;
-                    bitCount &= 7;
-                }
-                else
-                {
-                    bitCount -= (int)(8 * (iend - 4 - ip));
-					ip = iend - 4;
-				}
-                bitStream = MEM_readLE32(ip) >> (bitCount & 31);
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                ip += bitCount>>3;
+                bitCount &= 7;
+            } else {
+                bitCount -= (int)(8 * (iend - 4 - ip));
+                ip = iend - 4;
             }
-        }
-    }
+            bitStream = MEM_readLE32(ip) >> (bitCount & 31);
+    }   }
     if (remaining != 1) return ERROR(GENERIC);
     *maxSVPtr = charnum-1;
 
@@ -652,10 +505,130 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t
 }
 
 
-/****************************************************************
+/*-**************************************************************
+*  Counting histogram
+****************************************************************/
+/*! FSE_count_simple
+    Counts the occurrences of each byte value within @src, and stores the histogram into @count.
+    In : *maxSymbolValuePtr+1 cells of @count are cleared; out : *maxSymbolValuePtr is lowered to the last non-empty cell (0 if srcSize==0).
+    This function is unsafe : it doesn't check that all values within @src can fit into @count.
+    For this reason, prefer using a table @count with 256 elements.
+    @return : highest count for a single element (0 if srcSize==0)
+*/
+static size_t FSE_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
+                               const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* const end = ip + srcSize;
+    unsigned maxSymbolValue = *maxSymbolValuePtr;
+    unsigned max=0;
+    U32 s;
+
+    memset(count, 0, (maxSymbolValue+1)*sizeof(*count));
+    if (srcSize==0) { *maxSymbolValuePtr = 0; return 0; }
+
+    while (ip<end) count[*ip++]++;   /* unchecked : a byte > maxSymbolValue indexes past the cleared cells */
+
+    while (!count[maxSymbolValue]) maxSymbolValue--;   /* drop trailing empty cells */
+    *maxSymbolValuePtr = maxSymbolValue;
+
+    for (s=0; s<=maxSymbolValue; s++) if (count[s] > max) max = count[s];
+
+    return (size_t)max;
+}
+
+
+static size_t FSE_count_parallel(unsigned* count, unsigned* maxSymbolValuePtr,
+                                const void* source, size_t sourceSize,
+                                unsigned checkMax)
+{   /* Byte histogram of @source into @count; @return : highest count for a single symbol, or an error code */
+    const BYTE* ip = (const BYTE*)source;
+    const BYTE* const iend = ip+sourceSize;
+    unsigned maxSymbolValue = *maxSymbolValuePtr;   /* in : last cell of @count to fill; out : last non-empty cell */
+    unsigned max=0;
+    U32 s;
+    /* 4 tables, one per byte lane of each 32-bits word read below; they are summed at the end */
+    U32 Counting1[256] = { 0 };
+    U32 Counting2[256] = { 0 };
+    U32 Counting3[256] = { 0 };
+    U32 Counting4[256] = { 0 };
+
+    /* safety checks */
+    if (!sourceSize) {
+        memset(count, 0, (maxSymbolValue + 1) * sizeof(*count));   /* memset takes a size in bytes : clear whole cells */
+        *maxSymbolValuePtr = 0;
+        return 0;
+    }
+    if (!maxSymbolValue) maxSymbolValue = 255;            /* 0 == default */
+
+    if (sourceSize >= 4) {   /* by stripes of 16 bytes; shorter inputs cannot feed the initial 4-bytes read */
+        U32 cached = MEM_read32(ip); ip += 4;
+        while ((size_t)(iend-ip) > 15) {   /* same test as ip < iend-15, without forming a pointer before the buffer */
+            U32 c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+        }
+        ip-=4;   /* the word still held in cached was never counted : leave it to the tail loop */
+    }
+
+    /* finish last symbols */
+    while (ip<iend) Counting1[*ip++]++;
+
+    if (checkMax) {   /* verify stats will fit into destination table */
+        for (s=255; s>maxSymbolValue; s--) {
+            Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s];
+            if (Counting1[s]) return ERROR(maxSymbolValue_tooSmall);
+    }   }
+
+    for (s=0; s<=maxSymbolValue; s++) {
+        count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s];
+        if (count[s] > max) max = count[s];
+    }
+
+    while (!count[maxSymbolValue]) maxSymbolValue--;
+    *maxSymbolValuePtr = maxSymbolValue;
+    return (size_t)max;
+}
+
+/* fast variant (unsafe : won't check if src contains values beyond count[] limit); picks a counter from sourceSize */
+size_t FSE_countFast(unsigned* count, unsigned* maxSymbolValuePtr,
+                     const void* source, size_t sourceSize)
+{
+    if (sourceSize < 1500) return FSE_count_simple(count, maxSymbolValuePtr, source, sourceSize);   /* short input : single-table loop */
+    return FSE_count_parallel(count, maxSymbolValuePtr, source, sourceSize, 0);   /* checkMax==0 : no range verification */
+}
+
+size_t FSE_count(unsigned* count, unsigned* maxSymbolValuePtr,
+                 const void* source, size_t sourceSize)
+{
+    if (*maxSymbolValuePtr <255)   /* limit below 255 : count with range verification (checkMax==1) */
+        return FSE_count_parallel(count, maxSymbolValuePtr, source, sourceSize, 1);
+    *maxSymbolValuePtr = 255;   /* 255 or more : clamp to the largest value a byte can take */
+    return FSE_countFast(count, maxSymbolValuePtr, source, sourceSize);
+}
+
+
+/*-**************************************************************
 *  FSE Compression Code
 ****************************************************************/
-/*
+/*!
 FSE_CTable is a variable size structure which contains :
     U16 tableLog;
     U16 maxSymbolValue;
@@ -686,7 +659,6 @@ void  FSE_freeCTable (FSE_CTable* ct)
     free(ct);
 }
 
-
 /* provides the minimum logSize to safely represent a distribution */
 static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
 {
@@ -723,22 +695,18 @@ static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count,
     U32 lowThreshold = (U32)(total >> tableLog);
     U32 lowOne = (U32)((total * 3) >> (tableLog + 1));
 
-    for (s=0; s<=maxSymbolValue; s++)
-    {
-        if (count[s] == 0)
-        {
+    for (s=0; s<=maxSymbolValue; s++) {
+        if (count[s] == 0) {
             norm[s]=0;
             continue;
         }
-        if (count[s] <= lowThreshold)
-        {
+        if (count[s] <= lowThreshold) {
             norm[s] = -1;
             distributed++;
             total -= count[s];
             continue;
         }
-        if (count[s] <= lowOne)
-        {
+        if (count[s] <= lowOne) {
             norm[s] = 1;
             distributed++;
             total -= count[s];
@@ -748,25 +716,20 @@ static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count,
     }
     ToDistribute = (1 << tableLog) - distributed;
 
-    if ((total / ToDistribute) > lowOne)
-    {
+    if ((total / ToDistribute) > lowOne) {
         /* risk of rounding to zero */
         lowOne = (U32)((total * 3) / (ToDistribute * 2));
-        for (s=0; s<=maxSymbolValue; s++)
-        {
-            if ((norm[s] == -2) && (count[s] <= lowOne))
-            {
+        for (s=0; s<=maxSymbolValue; s++) {
+            if ((norm[s] == -2) && (count[s] <= lowOne)) {
                 norm[s] = 1;
                 distributed++;
                 total -= count[s];
                 continue;
-            }
-        }
+        }   }
         ToDistribute = (1 << tableLog) - distributed;
     }
 
-    if (distributed == maxSymbolValue+1)
-    {
+    if (distributed == maxSymbolValue+1) {
         /* all values are pretty poor;
            probably incompressible data (should have already been detected);
            find max, then give all remaining points to max */
@@ -782,10 +745,8 @@ static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count,
         U64 const mid = (1ULL << (vStepLog-1)) - 1;
         U64 const rStep = ((((U64)1<<vStepLog) * ToDistribute) + mid) / total;   /* scale on remaining */
         U64 tmpTotal = mid;
-        for (s=0; s<=maxSymbolValue; s++)
-        {
-            if (norm[s]==-2)
-            {
+        for (s=0; s<=maxSymbolValue; s++) {
+            if (norm[s]==-2) {
                 U64 end = tmpTotal + (count[s] * rStep);
                 U32 sStart = (U32)(tmpTotal >> vStepLog);
                 U32 sEnd = (U32)(end >> vStepLog);
@@ -794,9 +755,7 @@ static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count,
                     return ERROR(GENERIC);
                 norm[s] = (short)weight;
                 tmpTotal = end;
-            }
-        }
-    }
+    }   }   }
 
     return 0;
 }
@@ -809,7 +768,7 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
     /* Sanity checks */
     if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
     if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC);   /* Unsupported size */
-    if (tableLog > FSE_MAX_TABLELOG) return ERROR(GENERIC);   /* Unsupported size */
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);   /* Unsupported size */
     if (tableLog < FSE_minTableLog(total, maxSymbolValue)) return ERROR(GENERIC);   /* Too small tableLog, compression potentially impossible */
 
     {
@@ -823,38 +782,23 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
         short largestP=0;
         U32 lowThreshold = (U32)(total >> tableLog);
 
-        for (s=0; s<=maxSymbolValue; s++)
-        {
-            if (count[s] == total) return 0;
-            if (count[s] == 0)
-            {
-                normalizedCounter[s]=0;
-                continue;
-            }
-            if (count[s] <= lowThreshold)
-            {
+        for (s=0; s<=maxSymbolValue; s++) {
+            if (count[s] == total) return 0;   /* rle special case */
+            if (count[s] == 0) { normalizedCounter[s]=0; continue; }
+            if (count[s] <= lowThreshold) {
                 normalizedCounter[s] = -1;
                 stillToDistribute--;
-            }
-            else
-            {
+            } else {
                 short proba = (short)((count[s]*step) >> scale);
-                if (proba<8)
-                {
+                if (proba<8) {
                     U64 restToBeat = vStep * rtbTable[proba];
                     proba += (count[s]*step) - ((U64)proba<<scale) > restToBeat;
                 }
-                if (proba > largestP)
-                {
-                    largestP=proba;
-                    largest=s;
-                }
+                if (proba > largestP) largestP=proba, largest=s;
                 normalizedCounter[s] = proba;
                 stillToDistribute -= proba;
-            }
-        }
-        if (-stillToDistribute >= (normalizedCounter[largest] >> 1))
-        {
+        }   }
+        if (-stillToDistribute >= (normalizedCounter[largest] >> 1)) {
             /* corner case, need another normalization method */
             size_t errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue);
             if (FSE_isError(errorCode)) return errorCode;
@@ -904,10 +848,12 @@ size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits)
         tableU16[s] = (U16)(tableSize + s);
 
     /* Build Symbol Transformation Table */
-    for (s=0; s<=maxSymbolValue; s++)
     {
-        symbolTT[s].deltaNbBits = nbBits << 16;
-        symbolTT[s].deltaFindState = s-1;
+        const U32 deltaNbBits = (nbBits << 16) - (1 << nbBits);
+        for (s=0; s<=maxSymbolValue; s++) {
+            symbolTT[s].deltaNbBits = deltaNbBits;
+            symbolTT[s].deltaFindState = s-1;
+        }
     }
 
     return 0;
@@ -930,10 +876,8 @@ size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue)
     tableU16[1] = 0;   /* just in case */
 
     /* Build Symbol Transformation Table */
-    {
-        symbolTT[symbolValue].deltaNbBits = 0;
-        symbolTT[symbolValue].deltaFindState = 0;
-    }
+    symbolTT[symbolValue].deltaNbBits = 0;
+    symbolTT[symbolValue].deltaFindState = 0;
 
     return 0;
 }
@@ -963,15 +907,13 @@ static size_t FSE_compress_usingCTable_generic (void* dst, size_t dstSize,
 #define FSE_FLUSHBITS(s)  (fast ? BIT_flushBitsFast(s) : BIT_flushBits(s))
 
     /* join to even */
-    if (srcSize & 1)
-    {
+    if (srcSize & 1) {
         FSE_encodeSymbol(&bitC, &CState1, *--ip);
         FSE_FLUSHBITS(&bitC);
     }
 
     /* join to mod 4 */
-    if ((sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) && (srcSize & 2))   /* test bit 2 */
-    {
+    if ((sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) && (srcSize & 2)) {  /* test bit 2 */
         FSE_encodeSymbol(&bitC, &CState2, *--ip);
         FSE_encodeSymbol(&bitC, &CState1, *--ip);
         FSE_FLUSHBITS(&bitC);
@@ -987,8 +929,7 @@ static size_t FSE_compress_usingCTable_generic (void* dst, size_t dstSize,
 
         FSE_encodeSymbol(&bitC, &CState1, *--ip);
 
-        if (sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 )   /* this test must be static */
-        {
+        if (sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) {  /* this test must be static */
             FSE_encodeSymbol(&bitC, &CState2, *--ip);
             FSE_encodeSymbol(&bitC, &CState1, *--ip);
         }
@@ -1071,7 +1012,7 @@ size_t FSE_compress (void* dst, size_t dstSize, const void* src, size_t srcSize)
 }
 
 
-/*********************************************************
+/*-*******************************************************
 *  Decompression (Byte symbols)
 *********************************************************/
 size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
@@ -1109,8 +1050,7 @@ size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
     /* Build Decoding Table */
     DTableH->tableLog = (U16)nbBits;
     DTableH->fastMode = 1;
-    for (s=0; s<=maxSymbolValue; s++)
-    {
+    for (s=0; s<=maxSymbolValue; s++) {
         dinfo[s].newState = 0;
         dinfo[s].symbol = (BYTE)s;
         dinfo[s].nbBits = (BYTE)nbBits;
@@ -1144,8 +1084,7 @@ FORCE_INLINE size_t FSE_decompress_usingDTable_generic(
 #define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)
 
     /* 4 symbols per loop */
-    for ( ; (BIT_reloadDStream(&bitD)==BIT_DStream_unfinished) && (op<olimit) ; op+=4)
-    {
+    for ( ; (BIT_reloadDStream(&bitD)==BIT_DStream_unfinished) && (op<olimit) ; op+=4) {
         op[0] = FSE_GETSYMBOL(&state1);
 
         if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
@@ -1166,8 +1105,7 @@ FORCE_INLINE size_t FSE_decompress_usingDTable_generic(
 
     /* tail */
     /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
-    while (1)
-    {
+    while (1) {
         if ( (BIT_reloadDStream(&bitD)>BIT_DStream_completed) || (op==omax) || (BIT_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state1))) )
             break;
 
diff --git a/lib/fse.h b/lib/fse.h
index dd1190f..db6f49c 100644
--- a/lib/fse.h
+++ b/lib/fse.h
@@ -46,7 +46,7 @@ extern "C" {
 #include <stddef.h>    /* size_t, ptrdiff_t */
 
 
-/* *****************************************
+/*-****************************************
 *  FSE simple functions
 ******************************************/
 size_t FSE_compress(void* dst, size_t maxDstSize,
@@ -124,13 +124,13 @@ or to save and provide normalized distribution using external method.
 
 /*!
 FSE_count():
-   Provides the precise count of each symbol within a table 'count'
-   'count' is a table of unsigned int, of minimum size (maxSymbolValuePtr[0]+1).
-   maxSymbolValuePtr[0] will be updated if detected smaller than initially expected
-   return : the count of the most frequent symbol (which is not identified)
-            if return == srcSize, there is only one symbol.
-            if FSE_isError(return), it's an error code. */
-size_t FSE_count(unsigned* count, unsigned* maxSymbolValuePtr, const unsigned char* src, size_t srcSize);
+   Provides the precise count of each byte within a table 'count'
+   'count' is a table of unsigned int, of minimum size (*maxSymbolValuePtr+1).
+   *maxSymbolValuePtr will be updated if detected smaller than initial value.
+   @return : the count of the most frequent symbol (which is not identified)
+             if return == srcSize, there is only one symbol.
+             Can also return an error code, which can be tested with FSE_isError() */
+size_t FSE_count(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);
 
 /*!
 FSE_optimalTableLog():
@@ -170,18 +170,18 @@ void        FSE_freeCTable (FSE_CTable* ct);
 
 /*!
 FSE_buildCTable():
-   Builds 'ct', which must be already allocated, using FSE_createCTable()
+   Builds @ct, which must be already allocated, using FSE_createCTable()
    return : 0
             or an errorCode, which can be tested using FSE_isError() */
 size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
 
 /*!
 FSE_compress_usingCTable():
-   Compress 'src' using 'ct' into 'dst' which must be already allocated
-   return : size of compressed data (<= maxDstSize)
-            or 0 if compressed data could not fit into 'dst'
+   Compress @src using @ct into @dst which must be already allocated
+   return : size of compressed data (<= @dstCapacity)
+            or 0 if compressed data could not fit into @dst
             or an errorCode, which can be tested using FSE_isError() */
-size_t FSE_compress_usingCTable (void* dst, size_t maxDstSize, const void* src, size_t srcSize, const FSE_CTable* ct);
+size_t FSE_compress_usingCTable (void* dst, size_t dstCapacity, const void* src, size_t srcSize, const FSE_CTable* ct);
 
 /*!
 Tutorial :
@@ -221,7 +221,7 @@ If there is an error, both functions will return an ErrorCode (which can be test
 
 'CTable' can then be used to compress 'src', with FSE_compress_usingCTable().
 Similar to FSE_count(), the convention is that 'src' is assumed to be a table of char of size 'srcSize'
-The function returns the size of compressed data (without header), necessarily <= maxDstSize.
+The function returns the size of compressed data (without header), necessarily <= @dstCapacity.
 If it returns '0', compressed data could not fit into 'dst'.
 If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
 */
@@ -253,11 +253,11 @@ size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned
 
 /*!
 FSE_decompress_usingDTable():
-   Decompress compressed source 'cSrc' of size 'cSrcSize' using 'dt'
-   into 'dst' which must be already allocated.
-   return : size of regenerated data (necessarily <= maxDstSize)
+   Decompress compressed source @cSrc of size @cSrcSize using @dt
+   into @dst which must be already allocated.
+   return : size of regenerated data (necessarily <= @dstCapacity)
             or an errorCode, which can be tested using FSE_isError() */
-size_t FSE_decompress_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt);
+size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt);
 
 /*!
 Tutorial :
diff --git a/lib/fse_static.h b/lib/fse_static.h
index a881e35..ca303db 100644
--- a/lib/fse_static.h
+++ b/lib/fse_static.h
@@ -63,8 +63,8 @@ extern "C" {
 /* *****************************************
 *  FSE advanced API
 *******************************************/
-size_t FSE_countFast(unsigned* count, unsigned* maxSymbolValuePtr, const unsigned char* src, size_t srcSize);
-/* same as FSE_count(), but blindly trust that all values within src are <= *maxSymbolValuePtr  */
+size_t FSE_countFast(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);
+/* same as FSE_count(), but blindly trusts that all byte values within src are <= *maxSymbolValuePtr  */
 
 size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits);
 /* build a fake FSE_CTable, designed to not compress an input, where each symbol uses nbBits */
@@ -223,8 +223,7 @@ static unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t
 /* *****************************************
 *  Implementation of inlined functions
 *******************************************/
-typedef struct
-{
+typedef struct {
     int deltaFindState;
     U32 deltaNbBits;
 } FSE_symbolCompressionTransform; /* total 8 bytes */
@@ -240,6 +239,19 @@ MEM_STATIC void FSE_initCState(FSE_CState_t* statePtr, const FSE_CTable* ct)
     statePtr->stateLog = tableLog;
 }
 
+MEM_STATIC void FSE_initCState2(FSE_CState_t* statePtr, const FSE_CTable* ct, U32 symbol)
+{   /* Initializes *statePtr from ct, then sets statePtr->value from the symbolTT / stateTable entries of `symbol`; takes no bitstream argument, so nothing is written out here. */
+    FSE_initCState(statePtr, ct);   /* common setup of the state from ct */
+    {
+        const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];   /* transform entry of `symbol` */
+        const U16* stateTable = (const U16*)(statePtr->stateTable);
+        U32 nbBitsOut  = (U32)((symbolTT.deltaNbBits + (1<<15)) >> 16);   /* deltaNbBits / 2^16, rounded to nearest */
+        statePtr->value = (nbBitsOut << 16) - symbolTT.deltaNbBits;   /* intermediate value; NOTE(review): presumably the lowest state that outputs nbBitsOut bits - confirm against FSE_encodeSymbol() */
+        statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];   /* final state value, looked up in the state table */
+
+    }
+}
+
 MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, U32 symbol)
 {
     const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
@@ -278,6 +290,17 @@ MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, con
     DStatePtr->table = dt + 1;
 }
 
+MEM_STATIC size_t FSE_getStateValue(FSE_DState_t* DStatePtr)
+{
+    return DStatePtr->state;   /* plain accessor : returns the current state field, DStatePtr is not modified */
+}
+
+MEM_STATIC BYTE FSE_peakSymbol(FSE_DState_t* DStatePtr)   /* NOTE(review): "peak" is presumably meant as "peek" */
+{
+    const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];   /* table entry of the current state */
+    return DInfo.symbol;   /* symbol of the current state; the state is left unchanged and no bitstream is involved */
+}
+
 MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
 {
     const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
diff --git a/lib/huff0.c b/lib/huff0.c
index 26a7639..929bc87 100644
--- a/lib/huff0.c
+++ b/lib/huff0.c
@@ -49,7 +49,6 @@
 #  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
 #else
 #  ifdef __GNUC__
-#    define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
 #    define FORCE_INLINE static inline __attribute__((always_inline))
 #  else
 #    define FORCE_INLINE static inline
@@ -107,7 +106,8 @@ typedef struct nodeElt_s {
     @dst : destination buffer
     @CTable : huffman tree to save, using huff0 representation
     @return : size of saved CTable */
-size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, U32 maxSymbolValue, U32 huffLog)
+size_t HUF_writeCTable (void* dst, size_t maxDstSize,
+                        const HUF_CElt* CTable, U32 maxSymbolValue, U32 huffLog)
 {
     BYTE bitsToWeight[HUF_MAX_TABLELOG + 1];
     BYTE huffWeight[HUF_MAX_SYMBOL_VALUE + 1];
@@ -129,10 +129,8 @@ size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, U3
     size = FSE_compress(op+1, maxDstSize-1, huffWeight, maxSymbolValue);   /* don't need last symbol stat : implied */
     if (HUF_isError(size)) return size;
     if (size >= 128) return ERROR(GENERIC);   /* should never happen, since maxSymbolValue <= 255 */
-    if ((size <= 1) || (size >= maxSymbolValue/2))
-    {
-        if (size==1)   /* RLE */
-        {
+    if ((size <= 1) || (size >= maxSymbolValue/2)) {
+        if (size==1) {  /* RLE */
             /* only possible case : serie of 1 (because there are at least 2) */
             /* can only be 2^n or (2^n-1), otherwise not an huffman tree */
             BYTE code;
@@ -173,6 +171,66 @@ size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, U3
 }
 
 
+static size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                            U32* nbSymbolsPtr, U32* tableLogPtr,
+                            const void* src, size_t srcSize);
+
+
+size_t HUF_readCTable (HUF_CElt* CTable, U32 maxSymbolValue, const void* src, size_t srcSize)
+{
+    /* Rebuilds CTable (nbBits and val of each symbol) from the weight header in src.
+     * @return : the value reported by HUF_readStats() (presumably the header size
+     *           read from src - confirm), or an error code testable with HUF_isError().
+     * Only CTable[0 .. nbSymbols-1] is written. */
+    BYTE huffWeight[HUF_MAX_SYMBOL_VALUE + 1];
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];   /* large enough for values from 0 to 16 */
+    U32 tableLog = 0;
+    size_t iSize;
+    U32 nbSymbols = 0;
+    U32 n;
+
+    /* get symbol weights (huffWeight needs no prior memset, even though some analyzers complain) */
+    iSize = HUF_readStats(huffWeight, HUF_MAX_SYMBOL_VALUE+1, rankVal, &nbSymbols, &tableLog, src, srcSize);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > HUF_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+    if (nbSymbols > maxSymbolValue+1) return ERROR(maxSymbolValue_tooSmall);
+
+    /* fill nbBits : a weight of 0 yields nbBits == tableLog+1 */
+    for (n=0; n<nbSymbols; n++) {
+        const U32 w = huffWeight[n];
+        CTable[n].nbBits = (BYTE)(tableLog + 1 - w);
+    }
+
+    /* fill val */
+    {
+        /* +2 : nbBits ranges from 0 to tableLog+1 <= HUF_MAX_TABLELOG+1 */
+        U16 nbPerRank[HUF_MAX_TABLELOG+2] = {0};
+        U16 valPerRank[HUF_MAX_TABLELOG+2] = {0};
+        for (n=0; n<nbSymbols; n++)
+            nbPerRank[CTable[n].nbBits]++;
+        {
+            /* determine starting value per rank;
+             * start at tableLog (weight 1) so that weight-0 entries
+             * (counted at tableLog+1) do not shift the values of the other ranks */
+            U16 min = 0;
+            for (n=tableLog; n>0; n--) {
+                valPerRank[n] = min;      /* get starting value within each rank */
+                min += nbPerRank[n];
+                min >>= 1;
+        }   }
+        /* assign value within rank, symbol order;
+         * limited to the nbSymbols entries whose nbBits was set above,
+         * so no unset nbBits is ever used as an index */
+        for (n=0; n<nbSymbols; n++)
+            CTable[n].val = valPerRank[CTable[n].nbBits]++;
+    }
+
+    return iSize;
+}
+
+
 static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
 {
     int totalCost = 0;
@@ -186,8 +244,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
         const U32 baseCost = 1 << (largestBits - maxNbBits);
         U32 n = lastNonNull;
 
-        while (huffNode[n].nbBits > maxNbBits)
-        {
+        while (huffNode[n].nbBits > maxNbBits) {
             totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits));
             huffNode[n].nbBits = (BYTE)maxNbBits;
             n --;
@@ -206,18 +263,15 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
 
             /* Get pos of last (smallest) symbol per rank */
             memset(rankLast, 0xF0, sizeof(rankLast));
-            for (pos=n ; pos >= 0; pos--)
-            {
+            for (pos=n ; pos >= 0; pos--) {
                 if (huffNode[pos].nbBits >= currentNbBits) continue;
                 currentNbBits = huffNode[pos].nbBits;   /* < maxNbBits */
                 rankLast[maxNbBits-currentNbBits] = pos;
             }
 
-            while (totalCost > 0)
-            {
+            while (totalCost > 0) {
                 U32 nBitsToDecrease = BIT_highbit32(totalCost) + 1;
-                for ( ; nBitsToDecrease > 1; nBitsToDecrease--)
-                {
+                for ( ; nBitsToDecrease > 1; nBitsToDecrease--) {
                     U32 highPos = rankLast[nBitsToDecrease];
                     U32 lowPos = rankLast[nBitsToDecrease-1];
                     if (highPos == noSymbol) continue;
@@ -226,8 +280,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
                         U32 highTotal = huffNode[highPos].count;
                         U32 lowTotal = 2 * huffNode[lowPos].count;
                         if (highTotal <= lowTotal) break;
-                    }
-                }
+                }   }
                 /* only triggered when no more rank 1 symbol left => find closest one (note : there is necessarily at least one !) */
                 while ((nBitsToDecrease<=HUF_MAX_TABLELOG) && (rankLast[nBitsToDecrease] == noSymbol))  /* HUF_MAX_TABLELOG test just to please gcc 5+; but it should not be necessary */
                     nBitsToDecrease ++;
@@ -237,18 +290,14 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
                 huffNode[rankLast[nBitsToDecrease]].nbBits ++;
                 if (rankLast[nBitsToDecrease] == 0)    /* special case, reached largest symbol */
                     rankLast[nBitsToDecrease] = noSymbol;
-                else
-                {
+                else {
                     rankLast[nBitsToDecrease]--;
                     if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease)
                         rankLast[nBitsToDecrease] = noSymbol;   /* this rank is now empty */
-                }
-            }
+            }   }
 
-            while (totalCost < 0)   /* Sometimes, cost correction overshoot */
-            {
-                if (rankLast[1] == noSymbol)   /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */
-                {
+            while (totalCost < 0) {  /* Sometimes, cost correction overshoot */
+                if (rankLast[1] == noSymbol) {  /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */
                     while (huffNode[n].nbBits == maxNbBits) n--;
                     huffNode[n+1].nbBits--;
                     rankLast[1] = n+1;
@@ -258,9 +307,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
                 huffNode[ rankLast[1] + 1 ].nbBits--;
                 rankLast[1]++;
                 totalCost ++;
-            }
-        }
-    }
+    }   }   }
 
     return maxNbBits;
 }
@@ -277,15 +324,13 @@ static void HUF_sort(nodeElt* huffNode, const U32* count, U32 maxSymbolValue)
     U32 n;
 
     memset(rank, 0, sizeof(rank));
-    for (n=0; n<=maxSymbolValue; n++)
-    {
+    for (n=0; n<=maxSymbolValue; n++) {
         U32 r = BIT_highbit32(count[n] + 1);
         rank[r].base ++;
     }
     for (n=30; n>0; n--) rank[n-1].base += rank[n].base;
     for (n=0; n<32; n++) rank[n].current = rank[n].base;
-    for (n=0; n<=maxSymbolValue; n++)
-    {
+    for (n=0; n<=maxSymbolValue; n++) {
         U32 c = count[n];
         U32 r = BIT_highbit32(c+1) + 1;
         U32 pos = rank[r].current++;
@@ -325,8 +370,7 @@ size_t HUF_buildCTable (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U3
     huffNode0[0].count = (U32)(1U<<31);
 
     /* create parents */
-    while (nodeNb <= nodeRoot)
-    {
+    while (nodeNb <= nodeRoot) {
         U32 n1 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
         U32 n2 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
         huffNode[nodeNb].count = huffNode[n1].count + huffNode[n2].count;
@@ -354,8 +398,7 @@ size_t HUF_buildCTable (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U3
         {
             /* determine stating value per rank */
             U16 min = 0;
-            for (n=maxNbBits; n>0; n--)
-            {
+            for (n=maxNbBits; n>0; n--) {
                 valPerRank[n] = min;      /* get starting value within each rank */
                 min += nbPerRank[n];
                 min >>= 1;
@@ -385,7 +428,7 @@ size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }
 #define HUF_FLUSHBITS_2(stream) \
     if (sizeof((stream)->bitContainer)*8 < HUF_MAX_TABLELOG*4+7) HUF_FLUSHBITS(stream)
 
-size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
 {
     const BYTE* ip = (const BYTE*) src;
     BYTE* const ostart = (BYTE*)dst;
@@ -414,8 +457,7 @@ size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* src, size
         default: ;
     }
 
-    for (; n>0; n-=4)   /* note : n&3==0 at this stage */
-    {
+    for (; n>0; n-=4) {  /* note : n&3==0 at this stage */
         HUF_encodeSymbol(&bitC, ip[n- 1], CTable);
         HUF_FLUSHBITS_1(&bitC);
         HUF_encodeSymbol(&bitC, ip[n- 2], CTable);
@@ -430,7 +472,7 @@ size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* src, size
 }
 
 
-static size_t HUF_compress_into4Segments(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
 {
     size_t segmentSize = (srcSize+3)/4;   /* first 3 segments */
     size_t errorCode;
@@ -444,28 +486,28 @@ static size_t HUF_compress_into4Segments(void* dst, size_t dstSize, const void*
     if (srcSize < 12) return 0;   /* no saving possible : too small input */
     op += 6;   /* jumpTable */
 
-    errorCode = HUF_compress_usingCTable(op, oend-op, ip, segmentSize, CTable);
+    errorCode = HUF_compress1X_usingCTable(op, oend-op, ip, segmentSize, CTable);
     if (HUF_isError(errorCode)) return errorCode;
     if (errorCode==0) return 0;
     MEM_writeLE16(ostart, (U16)errorCode);
 
     ip += segmentSize;
     op += errorCode;
-    errorCode = HUF_compress_usingCTable(op, oend-op, ip, segmentSize, CTable);
+    errorCode = HUF_compress1X_usingCTable(op, oend-op, ip, segmentSize, CTable);
     if (HUF_isError(errorCode)) return errorCode;
     if (errorCode==0) return 0;
     MEM_writeLE16(ostart+2, (U16)errorCode);
 
     ip += segmentSize;
     op += errorCode;
-    errorCode = HUF_compress_usingCTable(op, oend-op, ip, segmentSize, CTable);
+    errorCode = HUF_compress1X_usingCTable(op, oend-op, ip, segmentSize, CTable);
     if (HUF_isError(errorCode)) return errorCode;
     if (errorCode==0) return 0;
     MEM_writeLE16(ostart+4, (U16)errorCode);
 
     ip += segmentSize;
     op += errorCode;
-    errorCode = HUF_compress_usingCTable(op, oend-op, ip, iend-ip, CTable);
+    errorCode = HUF_compress1X_usingCTable(op, oend-op, ip, iend-ip, CTable);
     if (HUF_isError(errorCode)) return errorCode;
     if (errorCode==0) return 0;
 
@@ -474,9 +516,11 @@ static size_t HUF_compress_into4Segments(void* dst, size_t dstSize, const void*
 }
 
 
-size_t HUF_compress2 (void* dst, size_t dstSize,
+static size_t HUF_compress_internal (
+                void* dst, size_t dstSize,
                 const void* src, size_t srcSize,
-                unsigned maxSymbolValue, unsigned huffLog)
+                unsigned maxSymbolValue, unsigned huffLog,
+                unsigned singleStream)
 {
     BYTE* const ostart = (BYTE*)dst;
     BYTE* op = ostart;
@@ -487,7 +531,7 @@ size_t HUF_compress2 (void* dst, size_t dstSize,
     size_t errorCode;
 
     /* checks & inits */
-    if (srcSize < 1) return 0;  /* Uncompressed */
+    if (srcSize < 1) return 0;  /* Uncompressed - note : 1 means rle, so first byte must be correct */
     if (dstSize < 1) return 0;  /* not compressible within dst budget */
     if (srcSize > 128 * 1024) return ERROR(srcSize_wrong);   /* current block size limit */
     if (huffLog > HUF_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
@@ -512,8 +556,10 @@ size_t HUF_compress2 (void* dst, size_t dstSize,
     op += errorCode;
 
     /* Compress */
-    //errorCode = HUF_compress_usingCTable(op, oend - op, src, srcSize, CTable);   /* single segment */
-    errorCode = HUF_compress_into4Segments(op, oend - op, src, srcSize, CTable);
+    if (singleStream)
+        errorCode = HUF_compress1X_usingCTable(op, oend - op, src, srcSize, CTable);   /* single segment */
+    else
+        errorCode = HUF_compress4X_usingCTable(op, oend - op, src, srcSize, CTable);
     if (HUF_isError(errorCode)) return errorCode;
     if (errorCode==0) return 0;
     op += errorCode;
@@ -525,13 +571,29 @@ size_t HUF_compress2 (void* dst, size_t dstSize,
     return op-ostart;
 }
 
+
+size_t HUF_compress1X (void* dst, size_t dstSize,
+                const void* src, size_t srcSize,
+                unsigned maxSymbolValue, unsigned huffLog)
+{
+    return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 1);   /* singleStream = 1 : payload goes through HUF_compress1X_usingCTable() */
+}
+
+size_t HUF_compress2 (void* dst, size_t dstSize,
+                const void* src, size_t srcSize,
+                unsigned maxSymbolValue, unsigned huffLog)
+{
+    return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 0);   /* singleStream = 0 : payload goes through HUF_compress4X_usingCTable() */
+}
+
+
 size_t HUF_compress (void* dst, size_t maxDstSize, const void* src, size_t srcSize)
 {
     return HUF_compress2(dst, maxDstSize, src, (U32)srcSize, 255, HUF_DEFAULT_TABLELOG);
 }
 
 
-/*********************************************************
+/* *******************************************************
 *  Huff0 : Huffman block decompression
 *********************************************************/
 typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX2;   /* single-symbol decoding */
@@ -558,31 +620,24 @@ static size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
 
     //memset(huffWeight, 0, hwSize);   /* is not necessary, even though some analyzer complain ... */
 
-    if (iSize >= 128)  /* special header */
-    {
-        if (iSize >= (242))   /* RLE */
-        {
+    if (iSize >= 128)  { /* special header */
+        if (iSize >= (242)) {  /* RLE */
             static int l[14] = { 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 };
             oSize = l[iSize-242];
             memset(huffWeight, 1, hwSize);
             iSize = 0;
         }
-        else   /* Incompressible */
-        {
+        else {   /* Incompressible */
             oSize = iSize - 127;
             iSize = ((oSize+1)/2);
             if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
             if (oSize >= hwSize) return ERROR(corruption_detected);
             ip += 1;
-            for (n=0; n<oSize; n+=2)
-            {
+            for (n=0; n<oSize; n+=2) {
                 huffWeight[n]   = ip[n/2] >> 4;
                 huffWeight[n+1] = ip[n/2] & 15;
-            }
-        }
-    }
-    else  /* header compressed with FSE (normal case) */
-    {
+    }   }   }
+    else  {   /* header compressed with FSE (normal case) */
         if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
         oSize = FSE_decompress(huffWeight, hwSize-1, ip+1, iSize);   /* max (hwSize-1) values decoded, as last one is implied */
         if (FSE_isError(oSize)) return oSize;
@@ -591,8 +646,7 @@ static size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
     /* collect weight stats */
     memset(rankStats, 0, (HUF_ABSOLUTEMAX_TABLELOG + 1) * sizeof(U32));
     weightTotal = 0;
-    for (n=0; n<oSize; n++)
-    {
+    for (n=0; n<oSize; n++) {
         if (huffWeight[n] >= HUF_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected);
         rankStats[huffWeight[n]]++;
         weightTotal += (1 << huffWeight[n]) >> 1;
@@ -601,7 +655,7 @@ static size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
     /* get last non-null symbol weight (implied, total must be 2^n) */
     tableLog = BIT_highbit32(weightTotal) + 1;
     if (tableLog > HUF_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected);
-    {
+    {   /* determine last weight */
         U32 total = 1 << tableLog;
         U32 rest = total - weightTotal;
         U32 verif = 1 << BIT_highbit32(rest);
@@ -621,9 +675,9 @@ static size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
 }
 
 
-/**************************/
-/* single-symbol decoding */
-/**************************/
+/*-***************************/
+/*  single-symbol decoding   */
+/*-***************************/
 
 size_t HUF_readDTableX2 (U16* DTable, const void* src, size_t srcSize)
 {
@@ -645,20 +699,18 @@ size_t HUF_readDTableX2 (U16* DTable, const void* src, size_t srcSize)
 
     /* check result */
     if (tableLog > DTable[0]) return ERROR(tableLog_tooLarge);   /* DTable is too small */
-    DTable[0] = (U16)tableLog;   /* maybe should separate sizeof DTable, as allocated, from used size of DTable, in case of DTable re-use */
+    DTable[0] = (U16)tableLog;   /* maybe should separate sizeof allocated DTable, from used size of DTable, in case of re-use */
 
     /* Prepare ranks */
     nextRankStart = 0;
-    for (n=1; n<=tableLog; n++)
-    {
+    for (n=1; n<=tableLog; n++) {
         U32 current = nextRankStart;
         nextRankStart += (rankVal[n] << (n-1));
         rankVal[n] = current;
     }
 
     /* fill DTable */
-    for (n=0; n<nbSymbols; n++)
-    {
+    for (n=0; n<nbSymbols; n++) {
         const U32 w = huffWeight[n];
         const U32 length = (1 << w) >> 1;
         U32 i;
@@ -696,8 +748,7 @@ static inline size_t HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* const bitDPtr, B
     BYTE* const pStart = p;
 
     /* up to 4 symbols at a time */
-    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-4))
-    {
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-4)) {
         HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
         HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
         HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
@@ -759,94 +810,91 @@ size_t HUF_decompress4X2_usingDTable(
     const void* cSrc, size_t cSrcSize,
     const U16* DTable)
 {
-    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
-
-    {
-        const BYTE* const istart = (const BYTE*) cSrc;
-        BYTE* const ostart = (BYTE*) dst;
-        BYTE* const oend = ostart + dstSize;
-        const void* const dtPtr = DTable;
-        const HUF_DEltX2* const dt = ((const HUF_DEltX2*)dtPtr) +1;
-        const U32 dtLog = DTable[0];
-        size_t errorCode;
+    const BYTE* const istart = (const BYTE*) cSrc;
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* const oend = ostart + dstSize;
+    const void* const dtPtr = DTable;
+    const HUF_DEltX2* const dt = ((const HUF_DEltX2*)dtPtr) +1;
+    const U32 dtLog = DTable[0];
+    size_t errorCode;
 
-        /* Init */
-        BIT_DStream_t bitD1;
-        BIT_DStream_t bitD2;
-        BIT_DStream_t bitD3;
-        BIT_DStream_t bitD4;
-        const size_t length1 = MEM_readLE16(istart);
-        const size_t length2 = MEM_readLE16(istart+2);
-        const size_t length3 = MEM_readLE16(istart+4);
-        size_t length4;
-        const BYTE* const istart1 = istart + 6;  /* jumpTable */
-        const BYTE* const istart2 = istart1 + length1;
-        const BYTE* const istart3 = istart2 + length2;
-        const BYTE* const istart4 = istart3 + length3;
-        const size_t segmentSize = (dstSize+3) / 4;
-        BYTE* const opStart2 = ostart + segmentSize;
-        BYTE* const opStart3 = opStart2 + segmentSize;
-        BYTE* const opStart4 = opStart3 + segmentSize;
-        BYTE* op1 = ostart;
-        BYTE* op2 = opStart2;
-        BYTE* op3 = opStart3;
-        BYTE* op4 = opStart4;
-        U32 endSignal;
+    /* Check */
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
 
-        length4 = cSrcSize - (length1 + length2 + length3 + 6);
-        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
-        errorCode = BIT_initDStream(&bitD1, istart1, length1);
-        if (HUF_isError(errorCode)) return errorCode;
-        errorCode = BIT_initDStream(&bitD2, istart2, length2);
-        if (HUF_isError(errorCode)) return errorCode;
-        errorCode = BIT_initDStream(&bitD3, istart3, length3);
-        if (HUF_isError(errorCode)) return errorCode;
-        errorCode = BIT_initDStream(&bitD4, istart4, length4);
-        if (HUF_isError(errorCode)) return errorCode;
+    /* Init */
+    BIT_DStream_t bitD1;
+    BIT_DStream_t bitD2;
+    BIT_DStream_t bitD3;
+    BIT_DStream_t bitD4;
+    const size_t length1 = MEM_readLE16(istart);
+    const size_t length2 = MEM_readLE16(istart+2);
+    const size_t length3 = MEM_readLE16(istart+4);
+    size_t length4;
+    const BYTE* const istart1 = istart + 6;  /* jumpTable */
+    const BYTE* const istart2 = istart1 + length1;
+    const BYTE* const istart3 = istart2 + length2;
+    const BYTE* const istart4 = istart3 + length3;
+    const size_t segmentSize = (dstSize+3) / 4;
+    BYTE* const opStart2 = ostart + segmentSize;
+    BYTE* const opStart3 = opStart2 + segmentSize;
+    BYTE* const opStart4 = opStart3 + segmentSize;
+    BYTE* op1 = ostart;
+    BYTE* op2 = opStart2;
+    BYTE* op3 = opStart3;
+    BYTE* op4 = opStart4;
+    U32 endSignal;
+
+    length4 = cSrcSize - (length1 + length2 + length3 + 6);
+    if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+    errorCode = BIT_initDStream(&bitD1, istart1, length1);
+    if (HUF_isError(errorCode)) return errorCode;
+    errorCode = BIT_initDStream(&bitD2, istart2, length2);
+    if (HUF_isError(errorCode)) return errorCode;
+    errorCode = BIT_initDStream(&bitD3, istart3, length3);
+    if (HUF_isError(errorCode)) return errorCode;
+    errorCode = BIT_initDStream(&bitD4, istart4, length4);
+    if (HUF_isError(errorCode)) return errorCode;
 
-        /* 16-32 symbols per loop (4-8 symbols per stream) */
+    /* 16-32 symbols per loop (4-8 symbols per stream) */
+    endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+    for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; ) {
+        HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+        HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+        HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+        HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+        HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
+        HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
+        HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
+        HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
+        HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+        HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+        HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+        HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+        HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
+        HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
+        HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
+        HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
         endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
-        for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; )
-        {
-            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
-            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
-            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
-            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
-            HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
-            HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
-            HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
-            HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
-            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
-            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
-            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
-            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
-            HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
-            HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
-            HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
-            HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
-
-            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
-        }
+    }
 
-        /* check corruption */
-        if (op1 > opStart2) return ERROR(corruption_detected);
-        if (op2 > opStart3) return ERROR(corruption_detected);
-        if (op3 > opStart4) return ERROR(corruption_detected);
-        /* note : op4 supposed already verified within main loop */
+    /* check corruption */
+    if (op1 > opStart2) return ERROR(corruption_detected);
+    if (op2 > opStart3) return ERROR(corruption_detected);
+    if (op3 > opStart4) return ERROR(corruption_detected);
+    /* note : op4 supposed already verified within main loop */
 
-        /* finish bitStreams one by one */
-        HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
-        HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
-        HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
-        HUF_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);
+    /* finish bitStreams one by one */
+    HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+    HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+    HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+    HUF_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);
 
-        /* check */
-        endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
-        if (!endSignal) return ERROR(corruption_detected);
+    /* check */
+    endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+    if (!endSignal) return ERROR(corruption_detected);
 
-        /* decoded size */
-        return dstSize;
-    }
+    /* decoded size */
+    return dstSize;
 }
 
 
@@ -866,9 +914,9 @@ size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cS
 }
 
 
-/***************************/
+/* *************************/
 /* double-symbols decoding */
-/***************************/
+/* *************************/
 
 static void HUF_fillDTableX4Level2(HUF_DEltX4* DTable, U32 sizeLog, const U32 consumed,
                            const U32* rankValOrigin, const int minWeight,
@@ -883,8 +931,7 @@ static void HUF_fillDTableX4Level2(HUF_DEltX4* DTable, U32 sizeLog, const U32 co
     memcpy(rankVal, rankValOrigin, sizeof(rankVal));
 
     /* fill skipped values */
-    if (minWeight>1)
-    {
+    if (minWeight>1) {
         U32 i, skipSize = rankVal[minWeight];
         MEM_writeLE16(&(DElt.sequence), baseSeq);
         DElt.nbBits   = (BYTE)(consumed);
@@ -894,8 +941,7 @@ static void HUF_fillDTableX4Level2(HUF_DEltX4* DTable, U32 sizeLog, const U32 co
     }
 
     /* fill DTable */
-    for (s=0; s<sortedListSize; s++)   /* note : sortedSymbols already skipped */
-    {
+    for (s=0; s<sortedListSize; s++) {   /* note : sortedSymbols already skipped */
         const U32 symbol = sortedSymbols[s].symbol;
         const U32 weight = sortedSymbols[s].weight;
         const U32 nbBits = nbBitsBaseline - weight;
@@ -928,16 +974,14 @@ static void HUF_fillDTableX4(HUF_DEltX4* DTable, const U32 targetLog,
     memcpy(rankVal, rankValOrigin, sizeof(rankVal));
 
     /* fill DTable */
-    for (s=0; s<sortedListSize; s++)
-    {
+    for (s=0; s<sortedListSize; s++) {
         const U16 symbol = sortedList[s].symbol;
         const U32 weight = sortedList[s].weight;
         const U32 nbBits = nbBitsBaseline - weight;
         const U32 start = rankVal[weight];
         const U32 length = 1 << (targetLog-nbBits);
 
-        if (targetLog-nbBits >= minBits)   /* enough room for a second symbol */
-        {
+        if (targetLog-nbBits >= minBits) {   /* enough room for a second symbol */
             U32 sortedRank;
             int minWeight = nbBits + scaleLog;
             if (minWeight < 1) minWeight = 1;
@@ -946,9 +990,7 @@ static void HUF_fillDTableX4(HUF_DEltX4* DTable, const U32 targetLog,
                            rankValOrigin[nbBits], minWeight,
                            sortedList+sortedRank, sortedListSize-sortedRank,
                            nbBitsBaseline, symbol);
-        }
-        else
-        {
+        } else {
             U32 i;
             const U32 end = start + length;
             HUF_DEltX4 DElt;
@@ -993,8 +1035,7 @@ size_t HUF_readDTableX4 (U32* DTable, const void* src, size_t srcSize)
     /* Get start index of each weight */
     {
         U32 w, nextRankStart = 0;
-        for (w=1; w<=maxW; w++)
-        {
+        for (w=1; w<=maxW; w++) {
             U32 current = nextRankStart;
             nextRankStart += rankStats[w];
             rankStart[w] = current;
@@ -1006,8 +1047,7 @@ size_t HUF_readDTableX4 (U32* DTable, const void* src, size_t srcSize)
     /* sort symbols by weight */
     {
         U32 s;
-        for (s=0; s<nbSymbols; s++)
-        {
+        for (s=0; s<nbSymbols; s++) {
             U32 w = weightList[s];
             U32 r = rankStart[w]++;
             sortedSymbol[r].symbol = (BYTE)s;
@@ -1023,21 +1063,16 @@ size_t HUF_readDTableX4 (U32* DTable, const void* src, size_t srcSize)
         U32 w, consumed;
         const int rescale = (memLog-tableLog) - 1;   /* tableLog <= memLog */
         U32* rankVal0 = rankVal[0];
-        for (w=1; w<=maxW; w++)
-        {
+        for (w=1; w<=maxW; w++) {
             U32 current = nextRankVal;
             nextRankVal += rankStats[w] << (w+rescale);
             rankVal0[w] = current;
         }
-        for (consumed = minBits; consumed <= memLog - minBits; consumed++)
-        {
+        for (consumed = minBits; consumed <= memLog - minBits; consumed++) {
             U32* rankValPtr = rankVal[consumed];
-            for (w = 1; w <= maxW; w++)
-            {
+            for (w = 1; w <= maxW; w++) {
                 rankValPtr[w] = rankVal0[w] >> consumed;
-            }
-        }
-    }
+    }   }   }
 
     HUF_fillDTableX4(dt, memLog,
                    sortedSymbol, sizeOfSort,
@@ -1061,15 +1096,12 @@ static U32 HUF_decodeLastSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DE
     const size_t val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
     memcpy(op, dt+val, 1);
     if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
-    else
-    {
-        if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8))
-        {
+    else {
+        if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
             BIT_skipBits(DStream, dt[val].nbBits);
             if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
                 DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);   /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
-        }
-    }
+    }   }
     return 1;
 }
 
@@ -1090,8 +1122,7 @@ static inline size_t HUF_decodeStreamX4(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* c
     BYTE* const pStart = p;
 
     /* up to 8 symbols at a time */
-    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd-7))
-    {
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd-7)) {
         HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
         HUF_DECODE_SYMBOLX4_1(p, bitDPtr);
         HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
@@ -1207,8 +1238,7 @@ size_t HUF_decompress4X4_usingDTable(
 
         /* 16-32 symbols per loop (4-8 symbols per stream) */
         endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
-        for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; )
-        {
+        for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; ) {
             HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
             HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
             HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
@@ -1266,9 +1296,9 @@ size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cS
 }
 
 
-/**********************************/
+/* ********************************/
 /* quad-symbol decoding           */
-/**********************************/
+/* ********************************/
 typedef struct { BYTE nbBits; BYTE nbBytes; } HUF_DDescX6;
 typedef union { BYTE byte[4]; U32 sequence; } HUF_DSeqX6;
 
@@ -1288,22 +1318,18 @@ static void HUF_fillDTableX6LevelN(HUF_DDescX6* DDescription, HUF_DSeqX6* DSeque
     memcpy(rankVal, rankValOrigin[consumed], sizeof(rankVal));
 
     /* fill skipped values */
-    if (minWeight>1)
-    {
+    if (minWeight>1) {
         U32 i;
         const U32 skipSize = rankVal[minWeight];
-        for (i = 0; i < skipSize; i++)
-        {
+        for (i = 0; i < skipSize; i++) {
             DSequence[i] = baseSeq;
             DDescription[i] = DDesc;
-        }
-    }
+    }   }
 
     /* fill DTable */
     DDesc.nbBytes++;
     symbolStartPos = rankStart[minWeight];
-    for (s=symbolStartPos; s<sortedListSize; s++)
-    {
+    for (s=symbolStartPos; s<sortedListSize; s++) {
         const BYTE symbol = sortedSymbols[s].symbol;
         const U32  weight = sortedSymbols[s].weight;   /* >= 1 (sorted) */
         const int  nbBits = nbBitsBaseline - weight;   /* >= 1 (by construction) */
@@ -1313,25 +1339,20 @@ static void HUF_fillDTableX6LevelN(HUF_DDescX6* DDescription, HUF_DSeqX6* DSeque
         baseSeq.byte[level] = symbol;
         DDesc.nbBits = (BYTE)totalBits;
 
-        if ((level<3) && (sizeLog-totalBits >= minBits))   /* enough room for another symbol */
-        {
+        if ((level<3) && (sizeLog-totalBits >= minBits)) {  /* enough room for another symbol */
             int nextMinWeight = totalBits + scaleLog;
             if (nextMinWeight < 1) nextMinWeight = 1;
             HUF_fillDTableX6LevelN(DDescription+start, DSequence+start, sizeLog-nbBits,
                            rankValOrigin, totalBits, nextMinWeight, maxWeight,
                            sortedSymbols, sortedListSize, rankStart,
                            nbBitsBaseline, baseSeq, DDesc);   /* recursive (max : level 3) */
-        }
-        else
-        {
+        } else {
             U32 i;
             const U32 end = start + length;
-            for (i = start; i < end; i++)
-            {
+            for (i = start; i < end; i++) {
                 DDescription[i] = DDesc;
                 DSequence[i] = baseSeq;
-            }
-        }
+        }   }
         rankVal[weight] += length;
     }
 }
@@ -1365,8 +1386,7 @@ size_t HUF_readDTableX6 (U32* DTable, const void* src, size_t srcSize)
     /* Get start index of each weight */
     {
         U32 w, nextRankStart = 0;
-        for (w=1; w<=maxW; w++)
-        {
+        for (w=1; w<=maxW; w++) {
             U32 current = nextRankStart;
             nextRankStart += rankStats[w];
             rankStart[w] = current;
@@ -1378,8 +1398,7 @@ size_t HUF_readDTableX6 (U32* DTable, const void* src, size_t srcSize)
     /* sort symbols by weight */
     {
         U32 s;
-        for (s=0; s<nbSymbols; s++)
-        {
+        for (s=0; s<nbSymbols; s++) {
             U32 w = weightList[s];
             U32 r = rankStart[w]++;
             sortedSymbol[r].symbol = (BYTE)s;
@@ -1395,21 +1414,16 @@ size_t HUF_readDTableX6 (U32* DTable, const void* src, size_t srcSize)
         U32 w, consumed;
         const int rescale = (memLog-tableLog) - 1;   /* tableLog <= memLog */
         U32* rankVal0 = rankVal[0];
-        for (w=1; w<=maxW; w++)
-        {
+        for (w=1; w<=maxW; w++) {
             U32 current = nextRankVal;
             nextRankVal += rankStats[w] << (w+rescale);
             rankVal0[w] = current;
         }
-        for (consumed = minBits; consumed <= memLog - minBits; consumed++)
-        {
+        for (consumed = minBits; consumed <= memLog - minBits; consumed++) {
             U32* rankValPtr = rankVal[consumed];
-            for (w = 1; w <= maxW; w++)
-            {
+            for (w = 1; w <= maxW; w++) {
                 rankValPtr[w] = rankVal0[w] >> consumed;
-            }
-        }
-    }
+    }   }   }
 
     /* fill tables */
     {
@@ -1445,15 +1459,13 @@ static U32 HUF_decodeLastSymbolsX6(void* op, const U32 maxL, BIT_DStream_t* DStr
 {
     const size_t val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
     U32 length = dd[val].nbBytes;
-    if (length <= maxL)
-    {
+    if (length <= maxL) {
         memcpy(op, ds+val, length);
         BIT_skipBits(DStream, dd[val].nbBits);
         return length;
     }
     memcpy(op, ds+val, maxL);
-    if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8))
-    {
+    if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
         BIT_skipBits(DStream, dd[val].nbBits);
         if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
             DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);   /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
@@ -1482,8 +1494,7 @@ static inline size_t HUF_decodeStreamX6(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* c
     BYTE* const pStart = p;
 
     /* up to 16 symbols at a time */
-    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-16))
-    {
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-16)) {
         HUF_DECODE_SYMBOLX6_2(p, bitDPtr);
         HUF_DECODE_SYMBOLX6_1(p, bitDPtr);
         HUF_DECODE_SYMBOLX6_2(p, bitDPtr);
@@ -1548,97 +1559,95 @@ size_t HUF_decompress4X6_usingDTable(
     const void* cSrc, size_t cSrcSize,
     const U32* DTable)
 {
-    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+    const BYTE* const istart = (const BYTE*) cSrc;
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* const oend = ostart + dstSize;
 
-    {
-        const BYTE* const istart = (const BYTE*) cSrc;
-        BYTE* const ostart = (BYTE*) dst;
-        BYTE* const oend = ostart + dstSize;
+    const U32 dtLog = DTable[0];
+    const void* const ddPtr = DTable+1;
+    const HUF_DDescX6* dd = (const HUF_DDescX6*)ddPtr;
+    const void* const dsPtr = DTable + 1 + ((size_t)1<<(dtLog-1));
+    const HUF_DSeqX6* ds = (const HUF_DSeqX6*)dsPtr;
+    size_t errorCode;
 
-        const U32 dtLog = DTable[0];
-        const void* const ddPtr = DTable+1;
-        const HUF_DDescX6* dd = (const HUF_DDescX6*)ddPtr;
-        const void* const dsPtr = DTable + 1 + ((size_t)1<<(dtLog-1));
-        const HUF_DSeqX6* ds = (const HUF_DSeqX6*)dsPtr;
-        size_t errorCode;
+    /* Check */
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
 
-        /* Init */
-        BIT_DStream_t bitD1;
-        BIT_DStream_t bitD2;
-        BIT_DStream_t bitD3;
-        BIT_DStream_t bitD4;
-        const size_t length1 = MEM_readLE16(istart);
-        const size_t length2 = MEM_readLE16(istart+2);
-        const size_t length3 = MEM_readLE16(istart+4);
-        size_t length4;
-        const BYTE* const istart1 = istart + 6;  /* jumpTable */
-        const BYTE* const istart2 = istart1 + length1;
-        const BYTE* const istart3 = istart2 + length2;
-        const BYTE* const istart4 = istart3 + length3;
-        const size_t segmentSize = (dstSize+3) / 4;
-        BYTE* const opStart2 = ostart + segmentSize;
-        BYTE* const opStart3 = opStart2 + segmentSize;
-        BYTE* const opStart4 = opStart3 + segmentSize;
-        BYTE* op1 = ostart;
-        BYTE* op2 = opStart2;
-        BYTE* op3 = opStart3;
-        BYTE* op4 = opStart4;
-        U32 endSignal;
+    /* Init */
+    BIT_DStream_t bitD1;
+    BIT_DStream_t bitD2;
+    BIT_DStream_t bitD3;
+    BIT_DStream_t bitD4;
+    const size_t length1 = MEM_readLE16(istart);
+    const size_t length2 = MEM_readLE16(istart+2);
+    const size_t length3 = MEM_readLE16(istart+4);
+    size_t length4;
+    const BYTE* const istart1 = istart + 6;  /* jumpTable */
+    const BYTE* const istart2 = istart1 + length1;
+    const BYTE* const istart3 = istart2 + length2;
+    const BYTE* const istart4 = istart3 + length3;
+    const size_t segmentSize = (dstSize+3) / 4;
+    BYTE* const opStart2 = ostart + segmentSize;
+    BYTE* const opStart3 = opStart2 + segmentSize;
+    BYTE* const opStart4 = opStart3 + segmentSize;
+    BYTE* op1 = ostart;
+    BYTE* op2 = opStart2;
+    BYTE* op3 = opStart3;
+    BYTE* op4 = opStart4;
+    U32 endSignal;
+
+    length4 = cSrcSize - (length1 + length2 + length3 + 6);
+    if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+    errorCode = BIT_initDStream(&bitD1, istart1, length1);
+    if (HUF_isError(errorCode)) return errorCode;
+    errorCode = BIT_initDStream(&bitD2, istart2, length2);
+    if (HUF_isError(errorCode)) return errorCode;
+    errorCode = BIT_initDStream(&bitD3, istart3, length3);
+    if (HUF_isError(errorCode)) return errorCode;
+    errorCode = BIT_initDStream(&bitD4, istart4, length4);
+    if (HUF_isError(errorCode)) return errorCode;
 
-        length4 = cSrcSize - (length1 + length2 + length3 + 6);
-        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
-        errorCode = BIT_initDStream(&bitD1, istart1, length1);
-        if (HUF_isError(errorCode)) return errorCode;
-        errorCode = BIT_initDStream(&bitD2, istart2, length2);
-        if (HUF_isError(errorCode)) return errorCode;
-        errorCode = BIT_initDStream(&bitD3, istart3, length3);
-        if (HUF_isError(errorCode)) return errorCode;
-        errorCode = BIT_initDStream(&bitD4, istart4, length4);
-        if (HUF_isError(errorCode)) return errorCode;
+    /* 16-64 symbols per loop (4-16 symbols per stream) */
+    endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+    for ( ; (op3 <= opStart4) && (endSignal==BIT_DStream_unfinished) && (op4<=(oend-16)) ; ) {
+        HUF_DECODE_SYMBOLX6_2(op1, &bitD1);
+        HUF_DECODE_SYMBOLX6_2(op2, &bitD2);
+        HUF_DECODE_SYMBOLX6_2(op3, &bitD3);
+        HUF_DECODE_SYMBOLX6_2(op4, &bitD4);
+        HUF_DECODE_SYMBOLX6_1(op1, &bitD1);
+        HUF_DECODE_SYMBOLX6_1(op2, &bitD2);
+        HUF_DECODE_SYMBOLX6_1(op3, &bitD3);
+        HUF_DECODE_SYMBOLX6_1(op4, &bitD4);
+        HUF_DECODE_SYMBOLX6_2(op1, &bitD1);
+        HUF_DECODE_SYMBOLX6_2(op2, &bitD2);
+        HUF_DECODE_SYMBOLX6_2(op3, &bitD3);
+        HUF_DECODE_SYMBOLX6_2(op4, &bitD4);
+        HUF_DECODE_SYMBOLX6_0(op1, &bitD1);
+        HUF_DECODE_SYMBOLX6_0(op2, &bitD2);
+        HUF_DECODE_SYMBOLX6_0(op3, &bitD3);
+        HUF_DECODE_SYMBOLX6_0(op4, &bitD4);
 
-        /* 16-64 symbols per loop (4-16 symbols per stream) */
         endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
-        for ( ; (op3 <= opStart4) && (endSignal==BIT_DStream_unfinished) && (op4<=(oend-16)) ; )
-        {
-            HUF_DECODE_SYMBOLX6_2(op1, &bitD1);
-            HUF_DECODE_SYMBOLX6_2(op2, &bitD2);
-            HUF_DECODE_SYMBOLX6_2(op3, &bitD3);
-            HUF_DECODE_SYMBOLX6_2(op4, &bitD4);
-            HUF_DECODE_SYMBOLX6_1(op1, &bitD1);
-            HUF_DECODE_SYMBOLX6_1(op2, &bitD2);
-            HUF_DECODE_SYMBOLX6_1(op3, &bitD3);
-            HUF_DECODE_SYMBOLX6_1(op4, &bitD4);
-            HUF_DECODE_SYMBOLX6_2(op1, &bitD1);
-            HUF_DECODE_SYMBOLX6_2(op2, &bitD2);
-            HUF_DECODE_SYMBOLX6_2(op3, &bitD3);
-            HUF_DECODE_SYMBOLX6_2(op4, &bitD4);
-            HUF_DECODE_SYMBOLX6_0(op1, &bitD1);
-            HUF_DECODE_SYMBOLX6_0(op2, &bitD2);
-            HUF_DECODE_SYMBOLX6_0(op3, &bitD3);
-            HUF_DECODE_SYMBOLX6_0(op4, &bitD4);
-
-            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
-        }
+    }
 
-        /* check corruption */
-        if (op1 > opStart2) return ERROR(corruption_detected);
-        if (op2 > opStart3) return ERROR(corruption_detected);
-        if (op3 > opStart4) return ERROR(corruption_detected);
-        /* note : op4 supposed already verified within main loop */
+    /* check corruption */
+    if (op1 > opStart2) return ERROR(corruption_detected);
+    if (op2 > opStart3) return ERROR(corruption_detected);
+    if (op3 > opStart4) return ERROR(corruption_detected);
+    /* note : op4 supposed already verified within main loop */
 
-        /* finish bitStreams one by one */
-        HUF_decodeStreamX6(op1, &bitD1, opStart2, DTable, dtLog);
-        HUF_decodeStreamX6(op2, &bitD2, opStart3, DTable, dtLog);
-        HUF_decodeStreamX6(op3, &bitD3, opStart4, DTable, dtLog);
-        HUF_decodeStreamX6(op4, &bitD4, oend,     DTable, dtLog);
+    /* finish bitStreams one by one */
+    HUF_decodeStreamX6(op1, &bitD1, opStart2, DTable, dtLog);
+    HUF_decodeStreamX6(op2, &bitD2, opStart3, DTable, dtLog);
+    HUF_decodeStreamX6(op3, &bitD3, opStart4, DTable, dtLog);
+    HUF_decodeStreamX6(op4, &bitD4, oend,     DTable, dtLog);
 
-        /* check */
-        endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
-        if (!endSignal) return ERROR(corruption_detected);
+    /* check */
+    endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+    if (!endSignal) return ERROR(corruption_detected);
 
-        /* decoded size */
-        return dstSize;
-    }
+    /* decoded size */
+    return dstSize;
 }
 
 
@@ -1657,9 +1666,9 @@ size_t HUF_decompress4X6 (void* dst, size_t dstSize, const void* cSrc, size_t cS
 }
 
 
-/**********************************/
+/* ********************************/
 /* Generic decompression selector */
-/**********************************/
+/* ********************************/
 
 typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
 static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
diff --git a/lib/huff0.h b/lib/huff0.h
index 2ebd5cf..fe28d7b 100644
--- a/lib/huff0.h
+++ b/lib/huff0.h
@@ -1,7 +1,7 @@
 /* ******************************************************************
    Huff0 : Huffman coder, part of New Generation Entropy library
    header file
-   Copyright (C) 2013-2015, Yann Collet.
+   Copyright (C) 2013-2016, Yann Collet.
 
    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
@@ -30,7 +30,6 @@
 
    You can contact the author at :
    - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
-   - Public forum : https://groups.google.com/forum/#!forum/lz4c
 ****************************************************************** */
 #ifndef HUFF0_H
 #define HUFF0_H
@@ -66,8 +65,10 @@ HUF_compress():
 HUF_decompress():
     Decompress Huff0 data from buffer 'cSrc', of size 'cSrcSize',
     into already allocated destination buffer 'dst', of size 'dstSize'.
-    'dstSize' must be the exact size of original (uncompressed) data.
-    Note : in contrast with FSE, HUF_decompress can regenerate RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, because it knows size to regenerate.
+    @dstSize : must be the **exact** size of original (uncompressed) data.
+    Note : in contrast with FSE, HUF_decompress can regenerate
+           RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data,
+           because it knows size to regenerate.
     @return : size of regenerated data (== dstSize)
               or an error code, which can be tested using HUF_isError()
 */
diff --git a/lib/huff0_static.h b/lib/huff0_static.h
index 5df0727..8403396 100644
--- a/lib/huff0_static.h
+++ b/lib/huff0_static.h
@@ -1,7 +1,7 @@
 /* ******************************************************************
-   Huff0 : Huffman coder, part of New Generation Entropy library
-   header file for static linking (only)
-   Copyright (C) 2013-2015, Yann Collet
+   Huff0 : Huffman codec, part of New Generation Entropy library
+   header file, for static linking only
+   Copyright (C) 2013-2016, Yann Collet
 
    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
@@ -30,7 +30,6 @@
 
    You can contact the author at :
    - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
-   - Public forum : https://groups.google.com/forum/#!forum/lz4c
 ****************************************************************** */
 #ifndef HUFF0_STATIC_H
 #define HUFF0_STATIC_H
@@ -47,15 +46,21 @@ extern "C" {
 
 
 /* ****************************************
-*  Static allocation macros
+*  Static allocation
 ******************************************/
 /* Huff0 buffer bounds */
 #define HUF_CTABLEBOUND 129
 #define HUF_BLOCKBOUND(size) (size + (size>>8) + 8)   /* only true if incompressible pre-filtered with fast heuristic */
 #define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
 
+/* static allocation of Huff0's Compression Table */
+#define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \
+    U32 name##hb[maxSymbolValue+1]; \
+    void* name##hv = &(name##hb); \
+    HUF_CElt* name = (HUF_CElt*)(name##hv)   /* no final ; */
+
 /* static allocation of Huff0's DTable */
-#define HUF_DTABLE_SIZE(maxTableLog)   (1 + (1<<maxTableLog))  /* nb Cells; use unsigned short for X2, unsigned int for X4 */
+#define HUF_DTABLE_SIZE(maxTableLog)   (1 + (1<<maxTableLog))
 #define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
         unsigned short DTable[HUF_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
 #define HUF_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) \
@@ -86,13 +91,11 @@ The following API allows targeting specific sub-functions for advanced tasks.
 For example, it's possible to compress several blocks using the same 'CTable',
 or to save and regenerate 'CTable' using external methods.
 */
-
 /* FSE_count() : find it within "fse.h" */
-
 typedef struct HUF_CElt_s HUF_CElt;   /* incomplete type */
-size_t HUF_buildCTable (HUF_CElt* tree, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits);
-size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* tree, unsigned maxSymbolValue, unsigned huffLog);
-size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
+size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits);
+size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog);
+size_t HUF_compress4X_into4Segments(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
 
 
 /*!
@@ -100,19 +103,33 @@ HUF_decompress() does the following:
 1. select the decompression algorithm (X2, X4, X6) based on pre-computed heuristics
 2. build Huffman table from save, using HUF_readDTableXn()
 3. decode 1 or 4 segments in parallel using HUF_decompressSXn_usingDTable
-
 */
 size_t HUF_readDTableX2 (unsigned short* DTable, const void* src, size_t srcSize);
 size_t HUF_readDTableX4 (unsigned* DTable, const void* src, size_t srcSize);
 size_t HUF_readDTableX6 (unsigned* DTable, const void* src, size_t srcSize);
 
+size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned short* DTable);
+size_t HUF_decompress4X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned* DTable);
+size_t HUF_decompress4X6_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned* DTable);
+
+
+/* single stream variants */
+
+size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
+
 size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
 size_t HUF_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbol decoder */
 size_t HUF_decompress1X6 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* quad-symbol decoder */
 
-size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned short* DTable);
-size_t HUF_decompress4X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned* DTable);
-size_t HUF_decompress4X6_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned* DTable);
+size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned short* DTable);
+size_t HUF_decompress1X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned* DTable);
+size_t HUF_decompress1X6_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned* DTable);
+
+
+/* Loading a CTable saved with HUF_writeCTable() */
+
+size_t HUF_readCTable (HUF_CElt* CTable, unsigned maxSymbolValue, const void* src, size_t srcSize);
 
 
 #if defined (__cplusplus)
diff --git a/lib/legacy/zstd_legacy.h b/lib/legacy/zstd_legacy.h
index b3e5eb2..4ae1deb 100644
--- a/lib/legacy/zstd_legacy.h
+++ b/lib/legacy/zstd_legacy.h
@@ -45,6 +45,7 @@ extern "C" {
 #include "zstd_v01.h"
 #include "zstd_v02.h"
 #include "zstd_v03.h"
+#include "zstd_v04.h"
 
 MEM_STATIC unsigned ZSTD_isLegacy (U32 magicNumberLE)
 {
@@ -52,7 +53,8 @@ MEM_STATIC unsigned ZSTD_isLegacy (U32 magicNumberLE)
 	{
 		case ZSTDv01_magicNumberLE :
 		case ZSTDv02_magicNumber :
-		case ZSTDv03_magicNumber : return 1;
+		case ZSTDv03_magicNumber : 
+		case ZSTDv04_magicNumber : return 1;
 		default : return 0;
 	}
 }
@@ -71,6 +73,8 @@ MEM_STATIC size_t ZSTD_decompressLegacy(
 			return ZSTDv02_decompress(dst, maxOriginalSize, src, compressedSize);
 		case ZSTDv03_magicNumber :
 			return ZSTDv03_decompress(dst, maxOriginalSize, src, compressedSize);
+		case ZSTDv04_magicNumber :
+			return ZSTDv04_decompress(dst, maxOriginalSize, src, compressedSize);
 		default :
 		    return ERROR(prefix_unknown);
 	}
diff --git a/lib/legacy/zstd_v02.c b/lib/legacy/zstd_v02.c
index 73136f8..860df94 100644
--- a/lib/legacy/zstd_v02.c
+++ b/lib/legacy/zstd_v02.c
@@ -2133,7 +2133,8 @@ static size_t HUF_readDTableX4 (U32* DTable, const void* src, size_t srcSize)
     if (tableLog > memLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
 
     /* find maxWeight */
-    for (maxW = tableLog; rankStats[maxW]==0; maxW--) {}  /* necessarily finds a solution before 0 */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--)
+        {if (!maxW) return ERROR(GENERIC); }  /* necessarily finds a solution before maxW==0 */
 
     /* Get start index of each weight */
     {
@@ -2465,7 +2466,9 @@ static size_t HUF_readDTableX6 (U32* DTable, const void* src, size_t srcSize)
     if (tableLog > memLog) return ERROR(tableLog_tooLarge);   /* DTable is too small */
 
     /* find maxWeight */
-    for (maxW = tableLog; rankStats[maxW]==0; maxW--) {}  /* necessarily finds a solution before 0 */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--)
+        { if (!maxW) return ERROR(GENERIC); }  /* necessarily finds a solution before maxW==0 */
+
 
     /* Get start index of each weight */
     {
diff --git a/lib/legacy/zstd_v03.c b/lib/legacy/zstd_v03.c
index 5c75eb4..6a048fd 100644
--- a/lib/legacy/zstd_v03.c
+++ b/lib/legacy/zstd_v03.c
@@ -2133,7 +2133,8 @@ static size_t HUF_readDTableX4 (U32* DTable, const void* src, size_t srcSize)
     if (tableLog > memLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
 
     /* find maxWeight */
-    for (maxW = tableLog; rankStats[maxW]==0; maxW--) {}  /* necessarily finds a solution before 0 */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--)
+        { if (!maxW) return ERROR(GENERIC); }  /* necessarily finds a solution before maxW==0 */
 
     /* Get start index of each weight */
     {
@@ -2465,7 +2466,8 @@ static size_t HUF_readDTableX6 (U32* DTable, const void* src, size_t srcSize)
     if (tableLog > memLog) return ERROR(tableLog_tooLarge);   /* DTable is too small */
 
     /* find maxWeight */
-    for (maxW = tableLog; rankStats[maxW]==0; maxW--) {}  /* necessarily finds a solution before 0 */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--)
+        { if (!maxW) return ERROR(GENERIC); }  /* necessarily finds a solution before maxW==0 */
 
     /* Get start index of each weight */
     {
diff --git a/lib/legacy/zstd_v03.c b/lib/legacy/zstd_v04.c
similarity index 70%
copy from lib/legacy/zstd_v03.c
copy to lib/legacy/zstd_v04.c
index 5c75eb4..57d724c 100644
--- a/lib/legacy/zstd_v03.c
+++ b/lib/legacy/zstd_v04.c
@@ -1,6 +1,7 @@
 /* ******************************************************************
-   Error codes and messages
-   Copyright (C) 2013-2015, Yann Collet
+   zstd_v04.c
+   Decompression module for ZSTD v0.4 legacy format
+   Copyright (C) 2016, Yann Collet.
 
    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
@@ -27,63 +28,12 @@
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-   You can contact the author at :
-   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
-   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+    You can contact the author at :
+    - Homepage : http://www.zstd.net/
 ****************************************************************** */
-#ifndef ERROR_H_MODULE
-#define ERROR_H_MODULE
-
-#if defined (__cplusplus)
-extern "C" {
-#endif
-
-#include <stddef.h>    /* size_t, ptrdiff_t */
-#include "zstd_v03.h"
-
-/******************************************
-*  Compiler-specific
-******************************************/
-#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
-#  define ERR_STATIC static inline
-#elif defined(_MSC_VER)
-#  define ERR_STATIC static __inline
-#elif defined(__GNUC__)
-#  define ERR_STATIC static __attribute__((unused))
-#else
-#  define ERR_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
-#endif
 
-
-/******************************************
-*  Error Management
-******************************************/
-#define PREFIX(name) ZSTD_error_##name
-
-#define ERROR(name) (size_t)-PREFIX(name)
-
-#define ERROR_LIST(ITEM) \
-        ITEM(PREFIX(No_Error)) ITEM(PREFIX(GENERIC)) \
-        ITEM(PREFIX(memory_allocation)) \
-        ITEM(PREFIX(dstSize_tooSmall)) ITEM(PREFIX(srcSize_wrong)) \
-        ITEM(PREFIX(prefix_unknown)) ITEM(PREFIX(corruption_detected)) \
-        ITEM(PREFIX(tableLog_tooLarge)) ITEM(PREFIX(maxSymbolValue_tooLarge)) ITEM(PREFIX(maxSymbolValue_tooSmall)) \
-        ITEM(PREFIX(maxCode))
-
-#define ERROR_GENERATE_ENUM(ENUM) ENUM,
-typedef enum { ERROR_LIST(ERROR_GENERATE_ENUM) } ERR_codes;  /* enum is exposed, to detect & handle specific errors; compare function result to -enum value */
-
-#define ERROR_CONVERTTOSTRING(STRING) #STRING,
-#define ERROR_GENERATE_STRING(EXPR) ERROR_CONVERTTOSTRING(EXPR)
-
-ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }
-
-
-#if defined (__cplusplus)
-}
-#endif
-
-#endif /* ERROR_H_MODULE */
+/*- Dependencies -*/
+#include "zstd_v04.h"
 
 
 /* ******************************************************************
@@ -373,7 +323,602 @@ MEM_STATIC void MEM_writeLEST(void* memPtr, size_t val)
 }
 #endif
 
-#endif /* MEM_H_MODULE */
+#endif /* MEM_H_MODULE */
+
+/* ******************************************************************
+   Error codes list
+   Copyright (C) 2016, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/zstd
+****************************************************************** */
+#ifndef ERROR_PUBLIC_H_MODULE
+#define ERROR_PUBLIC_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* ****************************************
+*  error list
+******************************************/
+enum {
+  ZSTD_error_No_Error,
+  ZSTD_error_GENERIC,
+  ZSTD_error_prefix_unknown,
+  ZSTD_error_frameParameter_unsupported,
+  ZSTD_error_frameParameter_unsupportedBy32bitsImplementation,
+  ZSTD_error_init_missing,
+  ZSTD_error_memory_allocation,
+  ZSTD_error_stage_wrong,
+  ZSTD_error_dstSize_tooSmall,
+  ZSTD_error_srcSize_wrong,
+  ZSTD_error_corruption_detected,
+  ZSTD_error_tableLog_tooLarge,
+  ZSTD_error_maxSymbolValue_tooLarge,
+  ZSTD_error_maxSymbolValue_tooSmall,
+  ZSTD_error_maxCode
+};
+
+/* note : functions provide error codes in reverse negative order,
+          so compare with (size_t)(0-enum) */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ERROR_PUBLIC_H_MODULE */
+
+
+
+/*
+    zstd - standard compression library
+    Header File for static linking only
+    Copyright (C) 2014-2015, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - zstd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+#ifndef ZSTD_STATIC_H
+#define ZSTD_STATIC_H
+
+/* The objects defined in this file shall be considered experimental.
+ * They are not considered stable, as their prototype may change in the future.
+ * You can use them for tests, provide feedback, or if you can endure risks of future changes.
+ */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+*  Types
+***************************************/
+#define ZSTD_WINDOWLOG_MAX 26
+#define ZSTD_WINDOWLOG_MIN 18
+#define ZSTD_WINDOWLOG_ABSOLUTEMIN 11
+#define ZSTD_CONTENTLOG_MAX (ZSTD_WINDOWLOG_MAX+1)
+#define ZSTD_CONTENTLOG_MIN 4
+#define ZSTD_HASHLOG_MAX 28
+#define ZSTD_HASHLOG_MIN 4
+#define ZSTD_SEARCHLOG_MAX (ZSTD_CONTENTLOG_MAX-1)
+#define ZSTD_SEARCHLOG_MIN 1
+#define ZSTD_SEARCHLENGTH_MAX 7
+#define ZSTD_SEARCHLENGTH_MIN 4
+
+/** from faster to stronger */
+typedef enum { ZSTD_fast, ZSTD_greedy, ZSTD_lazy, ZSTD_lazy2, ZSTD_btlazy2 } ZSTD_strategy;
+
+typedef struct
+{
+    U64 srcSize;       /* optional : tells how many bytes are present in the frame. Use 0 if not known. */
+    U32 windowLog;     /* largest match distance : larger == more compression, more memory needed during decompression */
+    U32 contentLog;    /* full search segment : larger == more compression, slower, more memory (useless for fast) */
+    U32 hashLog;       /* dispatch table : larger == more memory, faster */
+    U32 searchLog;     /* nb of searches : larger == more compression, slower */
+    U32 searchLength;  /* size of matches : larger == faster decompression, sometimes less compression */
+    ZSTD_strategy strategy;
+} ZSTD_parameters;
+
+typedef ZSTDv04_Dctx ZSTD_DCtx;
+
+/* *************************************
+*  Advanced functions
+***************************************/
+/** ZSTD_decompress_usingDict
+*   Same as ZSTD_decompressDCtx, using a Dictionary content as prefix
+*   Note : dict can be NULL, in which case, it's equivalent to ZSTD_decompressDCtx() */
+static size_t ZSTD_decompress_usingDict(ZSTD_DCtx* ctx,
+                                             void* dst, size_t maxDstSize,
+                                       const void* src, size_t srcSize,
+                                       const void* dict,size_t dictSize);
+
+
+/* **************************************
+*  Streaming functions (direct mode)
+****************************************/
+static size_t ZSTD_resetDCtx(ZSTD_DCtx* dctx);
+static size_t ZSTD_getFrameParams(ZSTD_parameters* params, const void* src, size_t srcSize);
+static void   ZSTD_decompress_insertDictionary(ZSTD_DCtx* ctx, const void* src, size_t srcSize);
+
+static size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx);
+static size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+
+/**
+  Streaming decompression, bufferless mode
+
+  A ZSTD_DCtx object is required to track streaming operations.
+  Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
+  A ZSTD_DCtx object can be re-used multiple times. Use ZSTD_resetDCtx() to return to fresh status.
+
+  First operation is to retrieve frame parameters, using ZSTD_getFrameParams().
+  This function doesn't consume its input. It needs enough input data to properly decode the frame header.
+  Objective is to retrieve params->windowLog, to know minimum amount of memory required during decoding.
+  Result : 0 when successful, it means the ZSTD_parameters structure has been filled.
+           >0 : means there is not enough data in src. Provides the expected size to successfully decode header.
+           errorCode, which can be tested using ZSTD_isError() (For example, if it's not a ZSTD header)
+
+  Then, you can optionally insert a dictionary.
+  This operation must mimic the compressor behavior, otherwise decompression will fail or be corrupted.
+
+  Then it's possible to start decompression.
+  Use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternately.
+  ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+  ZSTD_decompressContinue() requires this exact amount of bytes, or it will fail.
+  ZSTD_decompressContinue() needs previous data blocks during decompression, up to (1 << windowLog).
+  They should preferably be located contiguously, prior to current block. Alternatively, a round buffer is also possible.
+
+  @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst'.
+  It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header.
+
+  A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero.
+  Context can then be reset to start a new decompression.
+*/
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+/* ******************************************************************
+   Error codes and messages
+   Copyright (C) 2013-2016, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/zstd
+****************************************************************** */
+/* Note : this module is expected to remain private, do not expose it */
+
+#ifndef ERROR_H_MODULE
+#define ERROR_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* *****************************************
+*  Includes
+******************************************/
+#include <stddef.h>        /* size_t, ptrdiff_t */
+
+
+/* *****************************************
+*  Compiler-specific
+******************************************/
+#if defined(__GNUC__)
+#  define ERR_STATIC static __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define ERR_STATIC static inline
+#elif defined(_MSC_VER)
+#  define ERR_STATIC static __inline
+#else
+#  define ERR_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/* *****************************************
+*  Error Codes
+******************************************/
+#define PREFIX(name) ZSTD_error_##name
+
+#ifdef ERROR
+#  undef ERROR   /* reported already defined on VS 2015 by Rich Geldreich */
+#endif
+#define ERROR(name) (size_t)-PREFIX(name)
+
+ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }
+
+
+/* *****************************************
+*  Error Strings
+******************************************/
+
+ERR_STATIC const char* ERR_getErrorName(size_t code)
+{
+    static const char* codeError = "Unspecified error code";
+    switch( (size_t)(0-code) )
+    {
+    case ZSTD_error_No_Error: return "No error detected";
+    case ZSTD_error_GENERIC:  return "Error (generic)";
+    case ZSTD_error_prefix_unknown: return "Unknown frame descriptor";
+    case ZSTD_error_frameParameter_unsupported: return "Unsupported frame parameter";
+    case ZSTD_error_frameParameter_unsupportedBy32bitsImplementation: return "Frame parameter unsupported in 32-bits mode";
+    case ZSTD_error_init_missing: return "Context should be init first";
+    case ZSTD_error_memory_allocation: return "Allocation error : not enough memory";
+    case ZSTD_error_dstSize_tooSmall: return "Destination buffer is too small";
+    case ZSTD_error_srcSize_wrong: return "Src size incorrect";
+    case ZSTD_error_corruption_detected: return "Corrupted block detected";
+    case ZSTD_error_tableLog_tooLarge: return "tableLog requires too much memory";
+    case ZSTD_error_maxSymbolValue_tooLarge: return "Unsupported max possible Symbol Value : too large";
+    case ZSTD_error_maxSymbolValue_tooSmall: return "Specified maxSymbolValue is too small";
+    case ZSTD_error_maxCode:
+    default: return codeError;
+    }
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ERROR_H_MODULE */
+
+
+#endif  /* ZSTD_STATIC_H */
+
+
+/*
+    zstd_internal - common functions to include
+    Header File for include
+    Copyright (C) 2014-2015, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - zstd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+#ifndef ZSTD_CCOMMON_H_MODULE
+#define ZSTD_CCOMMON_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+*  Common macros
+***************************************/
+#define MIN(a,b) ((a)<(b) ? (a) : (b))
+#define MAX(a,b) ((a)>(b) ? (a) : (b))
+
+
+/* *************************************
+*  Common constants
+***************************************/
+#define ZSTD_MAGICNUMBER 0xFD2FB524   /* v0.4 */
+
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define BLOCKSIZE (128 KB)                 /* define, for static allocation */
+
+static const size_t ZSTD_blockHeaderSize = 3;
+static const size_t ZSTD_frameHeaderSize_min = 5;
+#define ZSTD_frameHeaderSize_max 5         /* define, for static allocation */
+
+#define BIT7 128
+#define BIT6  64
+#define BIT5  32
+#define BIT4  16
+#define BIT1   2
+#define BIT0   1
+
+#define IS_RAW BIT0
+#define IS_RLE BIT1
+
+#define MINMATCH 4
+#define REPCODE_STARTVALUE 4
+
+#define MLbits   7
+#define LLbits   6
+#define Offbits  5
+#define MaxML  ((1<<MLbits) - 1)
+#define MaxLL  ((1<<LLbits) - 1)
+#define MaxOff ((1<<Offbits)- 1)
+#define MLFSELog   10
+#define LLFSELog   10
+#define OffFSELog   9
+#define MaxSeq MAX(MaxLL, MaxML)
+
+#define MIN_SEQUENCES_SIZE (2 /*seqNb*/ + 2 /*dumps*/ + 3 /*seqTables*/ + 1 /*bitStream*/)
+#define MIN_CBLOCK_SIZE (3 /*litCSize*/ + MIN_SEQUENCES_SIZE)
+
+typedef enum { bt_compressed, bt_raw, bt_rle, bt_end } blockType_t;
+
+
+/* ******************************************
+*  Shared functions to include for inlining
+********************************************/
+static void ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
+
+#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
+
+/*! ZSTD_wildcopy : custom version of memcpy(), can copy up to 7-8 bytes too many */
+static void ZSTD_wildcopy(void* dst, const void* src, size_t length)
+{
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + length;
+    do
+        COPY8(op, ip)
+    while (op < oend);
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+
+/* ******************************************************************
+   FSE : Finite State Entropy coder
+   header file
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef FSE_H
+#define FSE_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* *****************************************
+*  Includes
+******************************************/
+#include <stddef.h>    /* size_t, ptrdiff_t */
+
+
+/* *****************************************
+*  FSE simple functions
+******************************************/
+static size_t FSE_decompress(void* dst,  size_t maxDstSize,
+                const void* cSrc, size_t cSrcSize);
+/*!
+FSE_decompress():
+    Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated destination buffer 'dst', of size 'maxDstSize'.
+    return : size of regenerated data (<= maxDstSize)
+             or an error code, which can be tested using FSE_isError()
+
+    ** Important ** : FSE_decompress() doesn't decompress non-compressible nor RLE data !!!
+    Why ? : making this distinction requires a header.
+    Header management is intentionally delegated to the user layer, which can better manage special cases.
+*/
+
+
+/* *****************************************
+*  Tool functions
+******************************************/
+/* Error Management */
+static unsigned    FSE_isError(size_t code);        /* tells if a return value is an error code */
+
+
+
+/* *****************************************
+*  FSE detailed API
+******************************************/
+/*!
+FSE_compress() does the following:
+1. count symbol occurrence from source[] into table count[]
+2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog)
+3. save normalized counters to memory buffer using writeNCount()
+4. build encoding table 'CTable' from normalized counters
+5. encode the data stream using encoding table 'CTable'
+
+FSE_decompress() does the following:
+1. read normalized counters with readNCount()
+2. build decoding table 'DTable' from normalized counters
+3. decode the data stream using decoding table 'DTable'
+
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and provide normalized distribution using external method.
+*/
+
+
+/* *** DECOMPRESSION *** */
+
+/*!
+FSE_readNCount():
+   Read compactly saved 'normalizedCounter' from 'rBuffer'.
+   return : size read from 'rBuffer'
+            or an errorCode, which can be tested using FSE_isError()
+            maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */
+static  size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* rBuffer, size_t rBuffSize);
+
+/*!
+Constructor and Destructor of type FSE_DTable
+    Note that its size depends on 'tableLog' */
+typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+
+/*!
+FSE_buildDTable():
+   Builds 'dt', which must be already allocated, using FSE_createDTable()
+   return : 0,
+            or an errorCode, which can be tested using FSE_isError() */
+static size_t FSE_buildDTable ( FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*!
+FSE_decompress_usingDTable():
+   Decompress compressed source 'cSrc' of size 'cSrcSize' using 'dt'
+   into 'dst' which must be already allocated.
+   return : size of regenerated data (necessarily <= maxDstSize)
+            or an errorCode, which can be tested using FSE_isError() */
+static  size_t FSE_decompress_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt);
+
+/*!
+Tutorial :
+----------
+(Note : these functions only decompress FSE-compressed blocks.
+ If block is uncompressed, use memcpy() instead
+ If block is a single repeated byte, use memset() instead )
+
+The first step is to obtain the normalized frequencies of symbols.
+This can be performed by FSE_readNCount() if it was saved using FSE_writeNCount().
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short.
+In practice, that means it's necessary to know 'maxSymbolValue' beforehand,
+or size the table to handle worst case situations (typically 256).
+FSE_readNCount() will provide 'tableLog' and 'maxSymbolValue'.
+The result of FSE_readNCount() is the number of bytes read from 'rBuffer'.
+Note that 'rBufferSize' must be at least 4 bytes, even if useful information is less than that.
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+The next step is to build the decompression tables 'FSE_DTable' from 'normalizedCounter'.
+This is performed by the function FSE_buildDTable().
+The space required by 'FSE_DTable' must be already allocated using FSE_createDTable().
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+'FSE_DTable' can then be used to decompress 'cSrc', with FSE_decompress_usingDTable().
+'cSrcSize' must be strictly correct, otherwise decompression will fail.
+FSE_decompress_usingDTable() result will tell how many bytes were regenerated (<=maxDstSize).
+If there is an error, the function will return an error code, which can be tested using FSE_isError(). (ex: dst buffer too small)
+*/
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* FSE_H */
 
 
 /* ******************************************************************
@@ -425,7 +970,6 @@ extern "C" {
 *  these functions are defined into a .h to be included.
 */
 
-
 /**********************************************
 *  bitStream decompression API (read backward)
 **********************************************/
@@ -495,7 +1039,6 @@ MEM_STATIC unsigned BIT_highbit32 (register U32 val)
 }
 
 
-
 /**********************************************************
 * bitStream decoding
 **********************************************************/
@@ -638,102 +1181,7 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream)
 #endif
 
 #endif /* BITSTREAM_H_MODULE */
-/* ******************************************************************
-   Error codes and messages
-   Copyright (C) 2013-2015, Yann Collet
-
-   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-
-       * Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above
-   copyright notice, this list of conditions and the following disclaimer
-   in the documentation and/or other materials provided with the
-   distribution.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-   You can contact the author at :
-   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
-   - Public forum : https://groups.google.com/forum/#!forum/lz4c
-****************************************************************** */
-#ifndef ERROR_H_MODULE
-#define ERROR_H_MODULE
-
-#if defined (__cplusplus)
-extern "C" {
-#endif
-
-
-/******************************************
-*  Compiler-specific
-******************************************/
-#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
-#  define ERR_STATIC static inline
-#elif defined(_MSC_VER)
-#  define ERR_STATIC static __inline
-#elif defined(__GNUC__)
-#  define ERR_STATIC static __attribute__((unused))
-#else
-#  define ERR_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
-#endif
-
-
-/******************************************
-*  Error Management
-******************************************/
-#define PREFIX(name) ZSTD_error_##name
 
-#define ERROR(name) (size_t)-PREFIX(name)
-
-#define ERROR_LIST(ITEM) \
-        ITEM(PREFIX(No_Error)) ITEM(PREFIX(GENERIC)) \
-        ITEM(PREFIX(dstSize_tooSmall)) ITEM(PREFIX(srcSize_wrong)) \
-        ITEM(PREFIX(prefix_unknown)) ITEM(PREFIX(corruption_detected)) \
-        ITEM(PREFIX(tableLog_tooLarge)) ITEM(PREFIX(maxSymbolValue_tooLarge)) ITEM(PREFIX(maxSymbolValue_tooSmall)) \
-        ITEM(PREFIX(maxCode))
-
-#define ERROR_GENERATE_ENUM(ENUM) ENUM,
-typedef enum { ERROR_LIST(ERROR_GENERATE_ENUM) } ERR_codes;  /* enum is exposed, to detect & handle specific errors; compare function result to -enum value */
-
-#define ERROR_CONVERTTOSTRING(STRING) #STRING,
-#define ERROR_GENERATE_STRING(EXPR) ERROR_CONVERTTOSTRING(EXPR)
-static const char* ERR_strings[] = { ERROR_LIST(ERROR_GENERATE_STRING) };
-
-ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }
-
-ERR_STATIC const char* ERR_getErrorName(size_t code)
-{
-    static const char* codeError = "Unspecified error code";
-    if (ERR_isError(code)) return ERR_strings[-(int)(code)];
-    return codeError;
-}
-
-
-#if defined (__cplusplus)
-}
-#endif
-
-#endif /* ERROR_H_MODULE */
-/*
-Constructor and Destructor of type FSE_CTable
-    Note that its size depends on 'tableLog' and 'maxSymbolValue' */
-typedef unsigned FSE_CTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
-typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
 
 
 /* ******************************************************************
@@ -770,27 +1218,30 @@ typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be mor
    - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
    - Public forum : https://groups.google.com/forum/#!forum/lz4c
 ****************************************************************** */
+#ifndef FSE_STATIC_H
+#define FSE_STATIC_H
+
 #if defined (__cplusplus)
 extern "C" {
 #endif
 
 
-/******************************************
+/* *****************************************
 *  Static allocation
-******************************************/
+*******************************************/
 /* FSE buffer bounds */
 #define FSE_NCOUNTBOUND 512
 #define FSE_BLOCKBOUND(size) (size + (size>>7))
 #define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
 
-/* You can statically allocate FSE CTable/DTable as a table of unsigned using below macro */
+/* It is possible to statically allocate an FSE CTable/DTable as a table of unsigned, using the macros below */
 #define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue)   (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2))
 #define FSE_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<maxTableLog))
 
 
-/******************************************
+/* *****************************************
 *  FSE advanced API
-******************************************/
+*******************************************/
 static size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits);
 /* build a fake FSE_DTable, designed to read an uncompressed bitstream where each symbol uses nbBits */
 
@@ -798,9 +1249,10 @@ static size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
 /* build a fake FSE_DTable, designed to always generate the same symbolValue */
 
 
-/******************************************
+
+/* *****************************************
 *  FSE symbol decompression API
-******************************************/
+*******************************************/
 typedef struct
 {
     size_t      state;
@@ -814,7 +1266,7 @@ static unsigned char FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bi
 
 static unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr);
 
-/*
+/*!
 Let's now decompose FSE_decompress_usingDTable() into its unitary components.
 You will decode FSE-encoded symbols from the bitStream,
 and also any other bitFields you put in, **in reverse order**.
@@ -864,265 +1316,73 @@ Check also the states. There might be some symbols left there, if some high prob
 */
 
 
-/******************************************
-*  FSE unsafe API
-******************************************/
-static unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
-/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
-
-
-/******************************************
-*  Implementation of inline functions
-******************************************/
-
-/* decompression */
-
-typedef struct {
-    U16 tableLog;
-    U16 fastMode;
-} FSE_DTableHeader;   /* sizeof U32 */
-
-typedef struct
-{
-    unsigned short newState;
-    unsigned char  symbol;
-    unsigned char  nbBits;
-} FSE_decode_t;   /* size == U32 */
-
-MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt)
-{
-    FSE_DTableHeader DTableH;
-    memcpy(&DTableH, dt, sizeof(DTableH));
-    DStatePtr->state = BIT_readBits(bitD, DTableH.tableLog);
-    BIT_reloadDStream(bitD);
-    DStatePtr->table = dt + 1;
-}
-
-MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
-{
-    const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
-    const U32  nbBits = DInfo.nbBits;
-    BYTE symbol = DInfo.symbol;
-    size_t lowBits = BIT_readBits(bitD, nbBits);
-
-    DStatePtr->state = DInfo.newState + lowBits;
-    return symbol;
-}
-
-MEM_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
-{
-    const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
-    const U32 nbBits = DInfo.nbBits;
-    BYTE symbol = DInfo.symbol;
-    size_t lowBits = BIT_readBitsFast(bitD, nbBits);
-
-    DStatePtr->state = DInfo.newState + lowBits;
-    return symbol;
-}
-
-MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)
-{
-    return DStatePtr->state == 0;
-}
-
-
-#if defined (__cplusplus)
-}
-#endif
-/* ******************************************************************
-   Huff0 : Huffman coder, part of New Generation Entropy library
-   header file for static linking (only)
-   Copyright (C) 2013-2015, Yann Collet
-
-   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-
-       * Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above
-   copyright notice, this list of conditions and the following disclaimer
-   in the documentation and/or other materials provided with the
-   distribution.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-   You can contact the author at :
-   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
-   - Public forum : https://groups.google.com/forum/#!forum/lz4c
-****************************************************************** */
-
-#if defined (__cplusplus)
-extern "C" {
-#endif
-
-/******************************************
-*  Static allocation macros
-******************************************/
-/* Huff0 buffer bounds */
-#define HUF_CTABLEBOUND 129
-#define HUF_BLOCKBOUND(size) (size + (size>>8) + 8)   /* only true if incompressible pre-filtered with fast heuristic */
-#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
-
-/* static allocation of Huff0's DTable */
-#define HUF_DTABLE_SIZE(maxTableLog)   (1 + (1<<maxTableLog))  /* nb Cells; use unsigned short for X2, unsigned int for X4 */
-#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
-        unsigned short DTable[HUF_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
-#define HUF_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) \
-        unsigned int DTable[HUF_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
-#define HUF_CREATE_STATIC_DTABLEX6(DTable, maxTableLog) \
-        unsigned int DTable[HUF_DTABLE_SIZE(maxTableLog) * 3 / 2] = { maxTableLog }
-
-
-/******************************************
-*  Advanced functions
-******************************************/
-static size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
-static size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbols decoder */
-static size_t HUF_decompress4X6 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* quad-symbols decoder */
-
-
-#if defined (__cplusplus)
-}
-#endif
-
-/*
-    zstd - standard compression library
-    Header File
-    Copyright (C) 2014-2015, Yann Collet.
-
-    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-
-    Redistribution and use in source and binary forms, with or without
-    modification, are permitted provided that the following conditions are
-    met:
-    * Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above
-    copyright notice, this list of conditions and the following disclaimer
-    in the documentation and/or other materials provided with the
-    distribution.
-    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-    You can contact the author at :
-    - zstd source repository : https://github.com/Cyan4973/zstd
-    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
-*/
-
-#if defined (__cplusplus)
-extern "C" {
-#endif
-
-/* *************************************
-*  Includes
-***************************************/
-#include <stddef.h>   /* size_t */
+/* *****************************************
+*  FSE unsafe API
+*******************************************/
+static unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
 
 
-/* *************************************
-*  Version
-***************************************/
-#define ZSTD_VERSION_MAJOR    0    /* for breaking interface changes  */
-#define ZSTD_VERSION_MINOR    2    /* for new (non-breaking) interface capabilities */
-#define ZSTD_VERSION_RELEASE  2    /* for tweaks, bug-fixes, or development */
-#define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
+/* *****************************************
+*  Implementation of inlined functions
+*******************************************/
+/* decompression */
 
+typedef struct {
+    U16 tableLog;
+    U16 fastMode;
+} FSE_DTableHeader;   /* sizeof U32 */
 
-/* *************************************
-*  Advanced functions
-***************************************/
-typedef struct ZSTD_CCtx_s ZSTD_CCtx;   /* incomplete type */
+typedef struct
+{
+    unsigned short newState;
+    unsigned char  symbol;
+    unsigned char  nbBits;
+} FSE_decode_t;   /* size == U32 */
 
-#if defined (__cplusplus)
+MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt)
+{
+    FSE_DTableHeader DTableH;
+    memcpy(&DTableH, dt, sizeof(DTableH));
+    DStatePtr->state = BIT_readBits(bitD, DTableH.tableLog);
+    BIT_reloadDStream(bitD);
+    DStatePtr->table = dt + 1;
 }
-#endif
-/*
-    zstd - standard compression library
-    Header File for static linking only
-    Copyright (C) 2014-2015, Yann Collet.
-
-    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-
-    Redistribution and use in source and binary forms, with or without
-    modification, are permitted provided that the following conditions are
-    met:
-    * Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above
-    copyright notice, this list of conditions and the following disclaimer
-    in the documentation and/or other materials provided with the
-    distribution.
-    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-    You can contact the author at :
-    - zstd source repository : https://github.com/Cyan4973/zstd
-    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
-*/
-
-/* The objects defined into this file should be considered experimental.
- * They are not labelled stable, as their prototype may change in the future.
- * You can use them for tests, provide feedback, or if you can endure risk of future changes.
- */
 
-#if defined (__cplusplus)
-extern "C" {
-#endif
+MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    const U32  nbBits = DInfo.nbBits;
+    BYTE symbol = DInfo.symbol;
+    size_t lowBits = BIT_readBits(bitD, nbBits);
 
-/* *************************************
-*  Streaming functions
-***************************************/
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
 
-typedef struct ZSTD_DCtx_s ZSTD_DCtx;
+MEM_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    const U32 nbBits = DInfo.nbBits;
+    BYTE symbol = DInfo.symbol;
+    size_t lowBits = BIT_readBitsFast(bitD, nbBits);
 
-/*
-  Use above functions alternatively.
-  ZSTD_nextSrcSizeToDecompress() tells how much bytes to provide as 'srcSize' to ZSTD_decompressContinue().
-  ZSTD_decompressContinue() will use previous data blocks to improve compression if they are located prior to current block.
-  Result is the number of bytes regenerated within 'dst'.
-  It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header.
-*/
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
 
-/* *************************************
-*  Prefix - version detection
-***************************************/
-#define ZSTD_magicNumber 0xFD2FB523   /* v0.3 */
+MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)
+{
+    return DStatePtr->state == 0;
+}
 
 
 #if defined (__cplusplus)
 }
 #endif
+
+#endif  /* FSE_STATIC_H */
+
 /* ******************************************************************
    FSE : Finite State Entropy coder
    Copyright (C) 2013-2015, Yann Collet.
@@ -1159,10 +1419,10 @@ typedef struct ZSTD_DCtx_s ZSTD_DCtx;
 
 #ifndef FSE_COMMONDEFS_ONLY
 
-/****************************************************************
+/* **************************************************************
 *  Tuning parameters
 ****************************************************************/
-/* MEMORY_USAGE :
+/*!MEMORY_USAGE :
 *  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
 *  Increasing memory usage improves compression ratio
 *  Reduced memory usage can improve speed, due to cache effect
@@ -1170,26 +1430,23 @@ typedef struct ZSTD_DCtx_s ZSTD_DCtx;
 #define FSE_MAX_MEMORY_USAGE 14
 #define FSE_DEFAULT_MEMORY_USAGE 13
 
-/* FSE_MAX_SYMBOL_VALUE :
+/*!FSE_MAX_SYMBOL_VALUE :
 *  Maximum symbol value authorized.
 *  Required for proper stack allocation */
 #define FSE_MAX_SYMBOL_VALUE 255
 
 
-/****************************************************************
+/* **************************************************************
 *  template functions type & suffix
 ****************************************************************/
 #define FSE_FUNCTION_TYPE BYTE
 #define FSE_FUNCTION_EXTENSION
+#define FSE_DECODE_TYPE FSE_decode_t
 
 
-/****************************************************************
-*  Byte symbol type
-****************************************************************/
 #endif   /* !FSE_COMMONDEFS_ONLY */
 
-
-/****************************************************************
+/* **************************************************************
 *  Compiler specifics
 ****************************************************************/
 #ifdef _MSC_VER    /* Visual Studio */
@@ -1207,14 +1464,15 @@ typedef struct ZSTD_DCtx_s ZSTD_DCtx;
 #endif
 
 
-/****************************************************************
-*  Includes
+/* **************************************************************
+*  Dependencies
 ****************************************************************/
 #include <stdlib.h>     /* malloc, free, qsort */
 #include <string.h>     /* memcpy, memset */
 #include <stdio.h>      /* printf (debug) */
 
-/****************************************************************
+
+/* ***************************************************************
 *  Constants
 *****************************************************************/
 #define FSE_MAX_TABLELOG  (FSE_MAX_MEMORY_USAGE-2)
@@ -1229,19 +1487,19 @@ typedef struct ZSTD_DCtx_s ZSTD_DCtx;
 #endif
 
 
-/****************************************************************
+/* **************************************************************
 *  Error Management
 ****************************************************************/
 #define FSE_STATIC_ASSERT(c) { enum { FSE_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
 
 
-/****************************************************************
+/* **************************************************************
 *  Complex types
 ****************************************************************/
 typedef U32 DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
 
 
-/****************************************************************
+/*-**************************************************************
 *  Templates
 ****************************************************************/
 /*
@@ -1263,19 +1521,14 @@ typedef U32 DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
 #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
 #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
 
-
-/* Function templates */
-
-#define FSE_DECODE_TYPE FSE_TYPE_NAME(FSE_decode_t, FSE_FUNCTION_EXTENSION)
-
 static U32 FSE_tableStep(U32 tableSize) { return (tableSize>>1) + (tableSize>>3) + 3; }
 
-static size_t FSE_FUNCTION_NAME(FSE_buildDTable, FSE_FUNCTION_EXTENSION)
-(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+
+static size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
 {
-    void* ptr = dt+1;
     FSE_DTableHeader DTableH;
-    FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*)ptr;
+    void* const tdPtr = dt+1;   /* dt points to unsigned cells, so dt+1 keeps the alignment of unsigned (32-bits where unsigned is 32-bits) */
+    FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr);
     const U32 tableSize = 1 << tableLog;
     const U32 tableMask = tableSize-1;
     const U32 step = FSE_tableStep(tableSize);
@@ -1473,7 +1726,8 @@ static size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
 {
     void* ptr = dt;
     FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
-    FSE_decode_t* const cell = (FSE_decode_t*)(ptr) + 1;
+    void* dPtr = dt + 1;
+    FSE_decode_t* const cell = (FSE_decode_t*)dPtr;
 
     DTableH->tableLog = 0;
     DTableH->fastMode = 0;
@@ -1490,7 +1744,8 @@ static size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
 {
     void* ptr = dt;
     FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
-    FSE_decode_t* const dinfo = (FSE_decode_t*)(ptr) + 1;
+    void* dPtr = dt + 1;
+    FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr;
     const unsigned tableSize = 1 << nbBits;
     const unsigned tableMask = tableSize - 1;
     const unsigned maxSymbolValue = tableMask;
@@ -1544,86 +1799,265 @@ FORCE_INLINE size_t FSE_decompress_usingDTable_generic(
         if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
             BIT_reloadDStream(&bitD);
 
-        op[1] = FSE_GETSYMBOL(&state2);
+        op[1] = FSE_GETSYMBOL(&state2);
+
+        if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) { op+=2; break; } }
+
+        op[2] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[3] = FSE_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
+    while (1)
+    {
+        if ( (BIT_reloadDStream(&bitD)>BIT_DStream_completed) || (op==omax) || (BIT_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state1))) )
+            break;
+
+        *op++ = FSE_GETSYMBOL(&state1);
+
+        if ( (BIT_reloadDStream(&bitD)>BIT_DStream_completed) || (op==omax) || (BIT_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state2))) )
+            break;
+
+        *op++ = FSE_GETSYMBOL(&state2);
+    }
+
+    /* end ? */
+    if (BIT_endOfDStream(&bitD) && FSE_endOfDState(&state1) && FSE_endOfDState(&state2))
+        return op-ostart;
+
+    if (op==omax) return ERROR(dstSize_tooSmall);   /* dst buffer is full, but cSrc unfinished */
+
+    return ERROR(corruption_detected);
+}
+
+
+static size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
+                            const void* cSrc, size_t cSrcSize,
+                            const FSE_DTable* dt)
+{
+    FSE_DTableHeader DTableH;
+    U32 fastMode;
+
+    memcpy(&DTableH, dt, sizeof(DTableH));
+    fastMode = DTableH.fastMode;
+
+    /* select fast mode (static) */
+    if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+}
+
+
+static size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize)
+{
+    const BYTE* const istart = (const BYTE*)cSrc;
+    const BYTE* ip = istart;
+    short counting[FSE_MAX_SYMBOL_VALUE+1];
+    DTable_max_t dt;   /* Static analyzer seems unable to understand this table will be properly initialized later */
+    unsigned tableLog;
+    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+    size_t errorCode;
+
+    if (cSrcSize<2) return ERROR(srcSize_wrong);   /* too small input size */
+
+    /* normal FSE decoding mode */
+    errorCode = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
+    if (FSE_isError(errorCode)) return errorCode;
+    if (errorCode >= cSrcSize) return ERROR(srcSize_wrong);   /* too small input size */
+    ip += errorCode;
+    cSrcSize -= errorCode;
+
+    errorCode = FSE_buildDTable (dt, counting, maxSymbolValue, tableLog);
+    if (FSE_isError(errorCode)) return errorCode;
+
+    /* always return, even if it is an error code */
+    return FSE_decompress_usingDTable (dst, maxDstSize, ip, cSrcSize, dt);
+}
+
+
+
+#endif   /* FSE_COMMONDEFS_ONLY */
+
+
+/* ******************************************************************
+   Huff0 : Huffman coder, part of New Generation Entropy library
+   header file
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef HUFF0_H
+#define HUFF0_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* ****************************************
+*  Dependency
+******************************************/
+#include <stddef.h>    /* size_t */
+
+
+/* ****************************************
+*  Huff0 simple functions
+******************************************/
+static size_t HUF_decompress(void* dst,  size_t dstSize,
+                const void* cSrc, size_t cSrcSize);
+/*!
+HUF_decompress():
+    Decompress Huff0 data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated destination buffer 'dst', of size 'dstSize'.
+    'dstSize' must be the exact size of original (uncompressed) data.
+    Note : in contrast with FSE, HUF_decompress can regenerate RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, because it knows the size to regenerate.
+    @return : size of regenerated data (== dstSize)
+              or an error code, which can be tested using HUF_isError()
+*/
+
+
+/* ****************************************
+*  Tool functions
+******************************************/
+/* Error Management */
+static unsigned    HUF_isError(size_t code);        /* tells if a return value is an error code */
+
+
+#if defined (__cplusplus)
+}
+#endif
 
-        if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
-            { if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) { op+=2; break; } }
+#endif   /* HUFF0_H */
 
-        op[2] = FSE_GETSYMBOL(&state1);
 
-        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
-            BIT_reloadDStream(&bitD);
+/* ******************************************************************
+   Huff0 : Huffman coder, part of New Generation Entropy library
+   header file for static linking (only)
+   Copyright (C) 2013-2015, Yann Collet
 
-        op[3] = FSE_GETSYMBOL(&state2);
-    }
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
-    /* tail */
-    /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
-    while (1)
-    {
-        if ( (BIT_reloadDStream(&bitD)>BIT_DStream_completed) || (op==omax) || (BIT_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state1))) )
-            break;
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
 
-        *op++ = FSE_GETSYMBOL(&state1);
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
 
-        if ( (BIT_reloadDStream(&bitD)>BIT_DStream_completed) || (op==omax) || (BIT_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state2))) )
-            break;
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-        *op++ = FSE_GETSYMBOL(&state2);
-    }
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef HUFF0_STATIC_H
+#define HUFF0_STATIC_H
 
-    /* end ? */
-    if (BIT_endOfDStream(&bitD) && FSE_endOfDState(&state1) && FSE_endOfDState(&state2))
-        return op-ostart;
+#if defined (__cplusplus)
+extern "C" {
+#endif
 
-    if (op==omax) return ERROR(dstSize_tooSmall);   /* dst buffer is full, but cSrc unfinished */
 
-    return ERROR(corruption_detected);
-}
+/* ****************************************
+*  Dependency
+******************************************/
+#include "huff0.h"
 
 
-static size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
-                            const void* cSrc, size_t cSrcSize,
-                            const FSE_DTable* dt)
-{
-    FSE_DTableHeader DTableH;
-    memcpy(&DTableH, dt, sizeof(DTableH));
+/* ****************************************
+*  Static allocation macros
+******************************************/
+/* static allocation of Huff0's DTable */
+#define HUF_DTABLE_SIZE(maxTableLog)   (1 + (1<<maxTableLog))  /* nb Cells; use unsigned short for X2, unsigned int for X4 */
+#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
+        unsigned short DTable[HUF_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
+#define HUF_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) \
+        unsigned int DTable[HUF_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
+#define HUF_CREATE_STATIC_DTABLEX6(DTable, maxTableLog) \
+        unsigned int DTable[HUF_DTABLE_SIZE(maxTableLog) * 3 / 2] = { maxTableLog }
 
-    /* select fast mode (static) */
-    if (DTableH.fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
-    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
-}
 
+/* ****************************************
+*  Advanced decompression functions
+******************************************/
+static size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+static size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbols decoder */
+static size_t HUF_decompress4X6 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* quad-symbols decoder */
 
-static size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize)
-{
-    const BYTE* const istart = (const BYTE*)cSrc;
-    const BYTE* ip = istart;
-    short counting[FSE_MAX_SYMBOL_VALUE+1];
-    DTable_max_t dt;   /* Static analyzer seems unable to understand this table will be properly initialized later */
-    unsigned tableLog;
-    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
-    size_t errorCode;
 
-    if (cSrcSize<2) return ERROR(srcSize_wrong);   /* too small input size */
+/* ****************************************
+*  Huff0 detailed API
+******************************************/
+/*!
+HUF_decompress() does the following:
+1. select the decompression algorithm (X2, X4, X6) based on pre-computed heuristics
+2. rebuild the Huffman table from its saved form in the source buffer, using HUF_readDTableXn()
+3. decode 1 or 4 segments in parallel using HUF_decompressSXn_usingDTable
 
-    /* normal FSE decoding mode */
-    errorCode = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
-    if (FSE_isError(errorCode)) return errorCode;
-    if (errorCode >= cSrcSize) return ERROR(srcSize_wrong);   /* too small input size */
-    ip += errorCode;
-    cSrcSize -= errorCode;
+*/
+static size_t HUF_readDTableX2 (unsigned short* DTable, const void* src, size_t srcSize);
+static size_t HUF_readDTableX4 (unsigned* DTable, const void* src, size_t srcSize);
+static size_t HUF_readDTableX6 (unsigned* DTable, const void* src, size_t srcSize);
 
-    errorCode = FSE_buildDTable (dt, counting, maxSymbolValue, tableLog);
-    if (FSE_isError(errorCode)) return errorCode;
+static size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned short* DTable);
+static size_t HUF_decompress4X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned* DTable);
+static size_t HUF_decompress4X6_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned* DTable);
 
-    /* always return, even if it is an error code */
-    return FSE_decompress_usingDTable (dst, maxDstSize, ip, cSrcSize, dt);
+
+#if defined (__cplusplus)
 }
+#endif
+
+#endif /* HUFF0_STATIC_H */
 
 
 
-#endif   /* FSE_COMMONDEFS_ONLY */
 /* ******************************************************************
    Huff0 : Huffman coder, part of New Generation Entropy library
    Copyright (C) 2013-2015, Yann Collet.
@@ -1655,10 +2089,9 @@ static size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, siz
 
     You can contact the author at :
     - FSE+Huff0 source repository : https://github.com/Cyan4973/FiniteStateEntropy
-    - Public forum : https://groups.google.com/forum/#!forum/lz4c
 ****************************************************************** */
 
-/****************************************************************
+/* **************************************************************
 *  Compiler specifics
 ****************************************************************/
 #if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
@@ -1683,24 +2116,17 @@ static size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, siz
 #endif
 
 
-/****************************************************************
+/* **************************************************************
 *  Includes
 ****************************************************************/
 #include <stdlib.h>     /* malloc, free, qsort */
 #include <string.h>     /* memcpy, memset */
 #include <stdio.h>      /* printf (debug) */
 
-/****************************************************************
-*  Error Management
-****************************************************************/
-#define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
-
-
-/******************************************
-*  Helper functions
-******************************************/
-static unsigned HUF_isError(size_t code) { return ERR_isError(code); }
 
+/* **************************************************************
+*  Constants
+****************************************************************/
 #define HUF_ABSOLUTEMAX_TABLELOG  16   /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */
 #define HUF_MAX_TABLELOG  12           /* max configured tableLog (for static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */
 #define HUF_DEFAULT_TABLELOG  HUF_MAX_TABLELOG   /* tableLog by default, when not specified */
@@ -1710,8 +2136,15 @@ static unsigned HUF_isError(size_t code) { return ERR_isError(code); }
 #endif
 
 
+/* **************************************************************
+*  Error Management
+****************************************************************/
+static unsigned HUF_isError(size_t code) { return ERR_isError(code); }
+#define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
 
-/*********************************************************
+
+/*-*******************************************************
 *  Huff0 : Huffman block decompression
 *********************************************************/
 typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX2;   /* single-symbol decoding */
@@ -1810,13 +2243,12 @@ static size_t HUF_readDTableX2 (U16* DTable, const void* src, size_t srcSize)
     BYTE huffWeight[HUF_MAX_SYMBOL_VALUE + 1];
     U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];   /* large enough for values from 0 to 16 */
     U32 tableLog = 0;
-    const BYTE* ip = (const BYTE*) src;
-    size_t iSize = ip[0];
+    size_t iSize;
     U32 nbSymbols = 0;
     U32 n;
     U32 nextRankStart;
-    void* ptr = DTable+1;
-    HUF_DEltX2* const dt = (HUF_DEltX2*)(ptr);
+    void* const dtPtr = DTable + 1;
+    HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
 
     HUF_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U16));   /* if compilation fails here, assertion is false */
     //memset(huffWeight, 0, sizeof(huffWeight));   /* is not necessary, even though some analyzer complain ... */
@@ -1908,9 +2340,8 @@ static size_t HUF_decompress4X2_usingDTable(
         const BYTE* const istart = (const BYTE*) cSrc;
         BYTE* const ostart = (BYTE*) dst;
         BYTE* const oend = ostart + dstSize;
-
-        const void* ptr = DTable;
-        const HUF_DEltX2* const dt = ((const HUF_DEltX2*)ptr) +1;
+        const void* const dtPtr = DTable;
+        const HUF_DEltX2* const dt = ((const HUF_DEltX2*)dtPtr) +1;
         const U32 dtLog = DTable[0];
         size_t errorCode;
 
@@ -2117,10 +2548,9 @@ static size_t HUF_readDTableX4 (U32* DTable, const void* src, size_t srcSize)
     rankVal_t rankVal;
     U32 tableLog, maxW, sizeOfSort, nbSymbols;
     const U32 memLog = DTable[0];
-    const BYTE* ip = (const BYTE*) src;
-    size_t iSize = ip[0];
-    void* ptr = DTable;
-    HUF_DEltX4* const dt = ((HUF_DEltX4*)ptr) + 1;
+    size_t iSize;
+    void* dtPtr = DTable;
+    HUF_DEltX4* const dt = ((HUF_DEltX4*)dtPtr) + 1;
 
     HUF_STATIC_ASSERT(sizeof(HUF_DEltX4) == sizeof(U32));   /* if compilation fails here, assertion is false */
     if (memLog > HUF_ABSOLUTEMAX_TABLELOG) return ERROR(tableLog_tooLarge);
@@ -2133,7 +2563,8 @@ static size_t HUF_readDTableX4 (U32* DTable, const void* src, size_t srcSize)
     if (tableLog > memLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
 
     /* find maxWeight */
-    for (maxW = tableLog; rankStats[maxW]==0; maxW--) {}  /* necessarily finds a solution before 0 */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--)
+        { if (!maxW) return ERROR(GENERIC); }  /* necessarily finds a solution before maxW==0 */
 
     /* Get start index of each weight */
     {
@@ -2161,7 +2592,7 @@ static size_t HUF_readDTableX4 (U32* DTable, const void* src, size_t srcSize)
         rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
     }
 
-	/* Build rankVal */
+    /* Build rankVal */
     {
         const U32 minBits = tableLog+1 - maxW;
         U32 nextRankVal = 0;
@@ -2256,8 +2687,6 @@ static inline size_t HUF_decodeStreamX4(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* c
     return p-pStart;
 }
 
-
-
 static size_t HUF_decompress4X4_usingDTable(
           void* dst,  size_t dstSize,
     const void* cSrc, size_t cSrcSize,
@@ -2269,9 +2698,8 @@ static size_t HUF_decompress4X4_usingDTable(
         const BYTE* const istart = (const BYTE*) cSrc;
         BYTE* const ostart = (BYTE*) dst;
         BYTE* const oend = ostart + dstSize;
-
-        const void* ptr = DTable;
-        const HUF_DEltX4* const dt = ((const HUF_DEltX4*)ptr) +1;
+        const void* const dtPtr = DTable;
+        const HUF_DEltX4* const dt = ((const HUF_DEltX4*)dtPtr) +1;
         const U32 dtLog = DTable[0];
         size_t errorCode;
 
@@ -2452,8 +2880,7 @@ static size_t HUF_readDTableX6 (U32* DTable, const void* src, size_t srcSize)
     U32 tableLog, maxW, sizeOfSort, nbSymbols;
     rankVal_t rankVal;
     const U32 memLog = DTable[0];
-    const BYTE* ip = (const BYTE*) src;
-    size_t iSize = ip[0];
+    size_t iSize;
 
     if (memLog > HUF_ABSOLUTEMAX_TABLELOG) return ERROR(tableLog_tooLarge);
     //memset(weightList, 0, sizeof(weightList));   /* is not necessary, even though some analyzer complain ... */
@@ -2465,7 +2892,8 @@ static size_t HUF_readDTableX6 (U32* DTable, const void* src, size_t srcSize)
     if (tableLog > memLog) return ERROR(tableLog_tooLarge);   /* DTable is too small */
 
     /* find maxWeight */
-    for (maxW = tableLog; rankStats[maxW]==0; maxW--) {}  /* necessarily finds a solution before 0 */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--)
+        { if (!maxW) return ERROR(GENERIC); }  /* necessarily finds a solution before maxW==0 */
 
     /* Get start index of each weight */
     {
@@ -2493,7 +2921,7 @@ static size_t HUF_readDTableX6 (U32* DTable, const void* src, size_t srcSize)
         rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
     }
 
-	/* Build rankVal */
+    /* Build rankVal */
     {
         const U32 minBits = tableLog+1 - maxW;
         U32 nextRankVal = 0;
@@ -2516,13 +2944,12 @@ static size_t HUF_readDTableX6 (U32* DTable, const void* src, size_t srcSize)
         }
     }
 
-
     /* fill tables */
     {
         void* ddPtr = DTable+1;
-        HUF_DDescX6* DDescription = (HUF_DDescX6*)(ddPtr);
+        HUF_DDescX6* DDescription = (HUF_DDescX6*)ddPtr;
         void* dsPtr = DTable + 1 + ((size_t)1<<(memLog-1));
-        HUF_DSeqX6* DSequence = (HUF_DSeqX6*)(dsPtr);
+        HUF_DSeqX6* DSequence = (HUF_DSeqX6*)dsPtr;
         HUF_DSeqX6 DSeq;
         HUF_DDescX6 DDesc;
         DSeq.sequence = 0;
@@ -2581,10 +3008,10 @@ static U32 HUF_decodeLastSymbolsX6(void* op, const U32 maxL, BIT_DStream_t* DStr
 
 static inline size_t HUF_decodeStreamX6(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, const U32* DTable, const U32 dtLog)
 {
-    const void* ddPtr = DTable+1;
-    const HUF_DDescX6* dd = (const HUF_DDescX6*)(ddPtr);
-    const void* dsPtr = DTable + 1 + ((size_t)1<<(dtLog-1));
-    const HUF_DSeqX6* ds = (const HUF_DSeqX6*)(dsPtr);
+    const void* const ddPtr = DTable+1;
+    const HUF_DDescX6* dd = (const HUF_DDescX6*)ddPtr;
+    const void* const dsPtr = DTable + 1 + ((size_t)1<<(dtLog-1));
+    const HUF_DSeqX6* ds = (const HUF_DSeqX6*)dsPtr;
     BYTE* const pStart = p;
 
     /* up to 16 symbols at a time */
@@ -2600,17 +3027,13 @@ static inline size_t HUF_decodeStreamX6(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* c
     while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-4))
         HUF_DECODE_SYMBOLX6_0(p, bitDPtr);
 
-    while (p <= pEnd-4)
-        HUF_DECODE_SYMBOLX6_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
-
-    while (p < pEnd)
+    while ((BIT_reloadDStream(bitDPtr) <= BIT_DStream_endOfBuffer) && (p < pEnd))
         p += HUF_decodeLastSymbolsX6(p, (U32)(pEnd-p), bitDPtr, dd, ds, dtLog);
 
     return p-pStart;
 }
 
 
-
 static size_t HUF_decompress4X6_usingDTable(
           void* dst,  size_t dstSize,
     const void* cSrc, size_t cSrcSize,
@@ -2624,10 +3047,10 @@ static size_t HUF_decompress4X6_usingDTable(
         BYTE* const oend = ostart + dstSize;
 
         const U32 dtLog = DTable[0];
-        const void* ddPtr = DTable+1;
-        const HUF_DDescX6* dd = (const HUF_DDescX6*)(ddPtr);
-        const void* dsPtr = DTable + 1 + ((size_t)1<<(dtLog-1));
-        const HUF_DSeqX6* ds = (const HUF_DSeqX6*)(dsPtr);
+        const void* const ddPtr = DTable+1;
+        const HUF_DDescX6* dd = (const HUF_DDescX6*)ddPtr;
+        const void* const dsPtr = DTable + 1 + ((size_t)1<<(dtLog-1));
+        const HUF_DSeqX6* ds = (const HUF_DSeqX6*)dsPtr;
         size_t errorCode;
 
         /* Init */
@@ -2785,9 +3208,15 @@ static size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_
     //return HUF_decompress4X4(dst, dstSize, cSrc, cSrcSize);   /* multi-streams double-symbols decoding */
     //return HUF_decompress4X6(dst, dstSize, cSrc, cSrcSize);   /* multi-streams quad-symbols decoding */
 }
+
+
+
+#endif   /* ZSTD_CCOMMON_H_MODULE */
+
+
 /*
-    zstd - standard compression library
-    Copyright (C) 2014-2015, Yann Collet.
+    zstd - decompression module for v0.4 legacy format
+    Copyright (C) 2015-2016, Yann Collet.
 
     BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
@@ -2821,29 +3250,12 @@ static size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_
 *  Tuning parameters
 *****************************************************************/
 /*!
-*  MEMORY_USAGE :
-*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
-*  Increasing memory usage improves compression ratio
-*  Reduced memory usage can improve speed, due to cache effect
-*/
-#define ZSTD_MEMORY_USAGE 17
-
-/*!
  * HEAPMODE :
- * Select how default compression functions will allocate memory for their hash table,
- * in memory stack (0, fastest), or in memory heap (1, requires malloc())
- * Note that compression context is fairly large, as a consequence heap memory is recommended.
+ * Select how default decompression function ZSTD_decompress() will allocate memory,
+ * in memory stack (0), or in memory heap (1, requires malloc())
  */
 #ifndef ZSTD_HEAPMODE
 #  define ZSTD_HEAPMODE 1
-#endif /* ZSTD_HEAPMODE */
-
-/*!
-*  LEGACY_SUPPORT :
-*  decompressor can decode older formats (starting from Zstd 0.1+)
-*/
-#ifndef ZSTD_LEGACY_SUPPORT
-#  define ZSTD_LEGACY_SUPPORT 1
 #endif
 
 
@@ -2858,10 +3270,6 @@ static size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_
 /* *******************************************************
 *  Compiler specifics
 *********************************************************/
-#ifdef __AVX2__
-#  include <immintrin.h>   /* AVX2 intrinsics */
-#endif
-
 #ifdef _MSC_VER    /* Visual Studio */
 #  define FORCE_INLINE static __forceinline
 #  include <intrin.h>                    /* For Visual 2005 */
@@ -2877,220 +3285,126 @@ static size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_
 #endif
 
 
-/* *******************************************************
-*  Constants
-*********************************************************/
-#define HASH_LOG (ZSTD_MEMORY_USAGE - 2)
-#define HASH_TABLESIZE (1 << HASH_LOG)
-#define HASH_MASK (HASH_TABLESIZE - 1)
-
-#define KNUTH 2654435761
-
-#define BIT7 128
-#define BIT6  64
-#define BIT5  32
-#define BIT4  16
-#define BIT1   2
-#define BIT0   1
-
-#define KB *(1 <<10)
-#define MB *(1 <<20)
-#define GB *(1U<<30)
-
-#define BLOCKSIZE (128 KB)                 /* define, for static allocation */
-#define MIN_SEQUENCES_SIZE (2 /*seqNb*/ + 2 /*dumps*/ + 3 /*seqTables*/ + 1 /*bitStream*/)
-#define MIN_CBLOCK_SIZE (3 /*litCSize*/ + MIN_SEQUENCES_SIZE)
-#define IS_RAW BIT0
-#define IS_RLE BIT1
-
-#define WORKPLACESIZE (BLOCKSIZE*3)
-#define MINMATCH 4
-#define MLbits   7
-#define LLbits   6
-#define Offbits  5
-#define MaxML  ((1<<MLbits )-1)
-#define MaxLL  ((1<<LLbits )-1)
-#define MaxOff   31
-#define LitFSELog  11
-#define MLFSELog   10
-#define LLFSELog   10
-#define OffFSELog   9
-#define MAX(a,b) ((a)<(b)?(b):(a))
-#define MaxSeq MAX(MaxLL, MaxML)
-
-#define LITERAL_NOENTROPY 63
-#define COMMAND_NOENTROPY 7   /* to remove */
-
-static const size_t ZSTD_blockHeaderSize = 3;
-static const size_t ZSTD_frameHeaderSize = 4;
-
-
-/* *******************************************************
-*  Memory operations
-**********************************************************/
-static void   ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
-
-static void   ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
-
-#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
-
-/*! ZSTD_wildcopy : custom version of memcpy(), can copy up to 7-8 bytes too many */
-static void ZSTD_wildcopy(void* dst, const void* src, size_t length)
-{
-    const BYTE* ip = (const BYTE*)src;
-    BYTE* op = (BYTE*)dst;
-    BYTE* const oend = op + length;
-    do COPY8(op, ip) while (op < oend);
-}
-
-
-/* **************************************
-*  Local structures
-****************************************/
-typedef enum { bt_compressed, bt_raw, bt_rle, bt_end } blockType_t;
-
+/* *************************************
+*  Local types
+***************************************/
 typedef struct
 {
     blockType_t blockType;
     U32 origSize;
 } blockProperties_t;
 
-typedef struct {
-    void* buffer;
-    U32*  offsetStart;
-    U32*  offset;
-    BYTE* offCodeStart;
-    BYTE* offCode;
-    BYTE* litStart;
-    BYTE* lit;
-    BYTE* litLengthStart;
-    BYTE* litLength;
-    BYTE* matchLengthStart;
-    BYTE* matchLength;
-    BYTE* dumpsStart;
-    BYTE* dumps;
-} seqStore_t;
-
-
-/* *************************************
-*  Error Management
-***************************************/
-/*! ZSTD_isError
-*   tells if a return value is an error code */
-static unsigned ZSTD_isError(size_t code) { return ERR_isError(code); }
-
-
-/* *************************************
-*  Function body to include
-***************************************/
-static size_t ZSTD_read_ARCH(const void* p) { size_t r; memcpy(&r, p, sizeof(r)); return r; }
-
-MEM_STATIC unsigned ZSTD_NbCommonBytes (register size_t val)
-{
-    if (MEM_isLittleEndian())
-    {
-        if (MEM_64bits())
-        {
-#       if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            unsigned long r = 0;
-            _BitScanForward64( &r, (U64)val );
-            return (int)(r>>3);
-#       elif defined(__GNUC__) && (__GNUC__ >= 3) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            return (__builtin_ctzll((U64)val) >> 3);
-#       else
-            static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
-            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
-#       endif
-        }
-        else /* 32 bits */
-        {
-#       if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            unsigned long r;
-            _BitScanForward( &r, (U32)val );
-            return (int)(r>>3);
-#       elif defined(__GNUC__) && (__GNUC__ >= 3) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            return (__builtin_ctz((U32)val) >> 3);
-#       else
-            static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
-            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
-#       endif
-        }
-    }
-    else   /* Big Endian CPU */
-    {
-        if (MEM_32bits())
-        {
-#       if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            unsigned long r = 0;
-            _BitScanReverse64( &r, val );
-            return (unsigned)(r>>3);
-#       elif defined(__GNUC__) && (__GNUC__ >= 3) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            return (__builtin_clzll(val) >> 3);
-#       else
-            unsigned r;
-            const unsigned n32 = sizeof(size_t)*4;   /* calculate this way due to compiler complaining in 32-bits mode */
-            if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
-            if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
-            r += (!val);
-            return r;
-#       endif
-        }
-        else /* 32 bits */
-        {
-#       if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            unsigned long r = 0;
-            _BitScanReverse( &r, (unsigned long)val );
-            return (unsigned)(r>>3);
-#       elif defined(__GNUC__) && (__GNUC__ >= 3) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            return (__builtin_clz((U32)val) >> 3);
-#       else
-            unsigned r;
-            if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
-            r += (!val);
-            return r;
-#       endif
-        }
-    }
-}
 
+/* *******************************************************
+*  Memory operations
+**********************************************************/
+static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
 
-MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit)
-{
-    const BYTE* const pStart = pIn;
 
-    while ((pIn<pInLimit-(sizeof(size_t)-1)))
-    {
-        size_t diff = ZSTD_read_ARCH(pMatch) ^ ZSTD_read_ARCH(pIn);
-        if (!diff) { pIn+=sizeof(size_t); pMatch+=sizeof(size_t); continue; }
-        pIn += ZSTD_NbCommonBytes(diff);
-        return (size_t)(pIn - pStart);
-    }
+/* *************************************
+*  Error Management
+***************************************/
 
-    if (MEM_32bits()) if ((pIn<(pInLimit-3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { pIn+=4; pMatch+=4; }
-    if ((pIn<(pInLimit-1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { pIn+=2; pMatch+=2; }
-    if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++;
-    return (size_t)(pIn - pStart);
-}
+/*! ZSTD_isError
+*   tells if a return value is an error code */
+static unsigned ZSTD_isError(size_t code) { return ERR_isError(code); }
 
 
 /* *************************************************************
-*   Decompression section
+*   Context management
 ***************************************************************/
-struct ZSTD_DCtx_s
+typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
+               ZSTDds_decodeBlockHeader, ZSTDds_decompressBlock } ZSTD_dStage;
+
+struct ZSTDv04_Dctx_s
 {
     U32 LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)];
     U32 OffTable[FSE_DTABLE_SIZE_U32(OffFSELog)];
     U32 MLTable[FSE_DTABLE_SIZE_U32(MLFSELog)];
-    void* previousDstEnd;
-    void* base;
+    const void* previousDstEnd;
+    const void* base;
+    const void* vBase;
+    const void* dictEnd;
     size_t expected;
+    size_t headerSize;
+    ZSTD_parameters params;
     blockType_t bType;
-    U32 phase;
+    ZSTD_dStage stage;
     const BYTE* litPtr;
     size_t litBufSize;
     size_t litSize;
     BYTE litBuffer[BLOCKSIZE + 8 /* margin for wildcopy */];
-};   /* typedef'd to ZSTD_Dctx within "zstd_static.h" */
+    BYTE headerBuffer[ZSTD_frameHeaderSize_max];
+};  /* typedef'd to ZSTD_DCtx within "zstd_static.h" */
+
+static size_t ZSTD_resetDCtx(ZSTD_DCtx* dctx)
+{
+    dctx->expected = ZSTD_frameHeaderSize_min;
+    dctx->stage = ZSTDds_getFrameHeaderSize;
+    dctx->previousDstEnd = NULL;
+    dctx->base = NULL;
+    dctx->vBase = NULL;
+    dctx->dictEnd = NULL;
+    return 0;
+}
+
+static ZSTD_DCtx* ZSTD_createDCtx(void)
+{
+    ZSTD_DCtx* dctx = (ZSTD_DCtx*)malloc(sizeof(ZSTD_DCtx));
+    if (dctx==NULL) return NULL;
+    ZSTD_resetDCtx(dctx);
+    return dctx;
+}
+
+static size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx)
+{
+    free(dctx);
+    return 0;
+}
+
+
+/* *************************************************************
+*   Decompression section
+***************************************************************/
+/** ZSTD_decodeFrameHeader_Part1
+*   decode the 1st part of the Frame Header, which tells Frame Header size.
+*   srcSize must be == ZSTD_frameHeaderSize_min
+*   @return : the full size of the Frame Header */
+static size_t ZSTD_decodeFrameHeader_Part1(ZSTD_DCtx* zc, const void* src, size_t srcSize)
+{
+    U32 magicNumber;
+    if (srcSize != ZSTD_frameHeaderSize_min) return ERROR(srcSize_wrong);
+    magicNumber = MEM_readLE32(src);
+    if (magicNumber != ZSTD_MAGICNUMBER) return ERROR(prefix_unknown);
+    zc->headerSize = ZSTD_frameHeaderSize_min;
+    return zc->headerSize;
+}
+
+
+static size_t ZSTD_getFrameParams(ZSTD_parameters* params, const void* src, size_t srcSize)
+{
+    U32 magicNumber;
+    if (srcSize < ZSTD_frameHeaderSize_min) return ZSTD_frameHeaderSize_max;
+    magicNumber = MEM_readLE32(src);
+    if (magicNumber != ZSTD_MAGICNUMBER) return ERROR(prefix_unknown);
+    memset(params, 0, sizeof(*params));
+    params->windowLog = (((const BYTE*)src)[4] & 15) + ZSTD_WINDOWLOG_ABSOLUTEMIN;
+    if ((((const BYTE*)src)[4] >> 4) != 0) return ERROR(frameParameter_unsupported);   /* reserved bits */
+    return 0;
+}
+
+/** ZSTD_decodeFrameHeader_Part2
+*   decode the full Frame Header
+*   srcSize must be the size provided by ZSTD_decodeFrameHeader_Part1
+*   @return : 0, or an error code, which can be tested using ZSTD_isError() */
+static size_t ZSTD_decodeFrameHeader_Part2(ZSTD_DCtx* zc, const void* src, size_t srcSize)
+{
+    size_t result;
+    if (srcSize != zc->headerSize) return ERROR(srcSize_wrong);
+    result = ZSTD_getFrameParams(&(zc->params), src, srcSize);
+    if ((MEM_32bits()) && (zc->params.windowLog > 25)) return ERROR(frameParameter_unsupportedBy32bitsImplementation);
+    return result;
+}
 
 
 static size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
@@ -3112,7 +3426,7 @@ static size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockPropertie
     return cSize;
 }
 
-static size_t ZSTD_copyUncompressedBlock(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+static size_t ZSTD_copyRawBlock(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
 {
     if (srcSize > maxDstSize) return ERROR(dstSize_tooSmall);
     memcpy(dst, src, srcSize);
@@ -3141,25 +3455,24 @@ static size_t ZSTD_decompressLiterals(void* dst, size_t* maxDstSizePtr,
 
 
 /** ZSTD_decodeLiteralsBlock
-    @return : nb of bytes read from src (< srcSize )*/
-static size_t ZSTD_decodeLiteralsBlock(void* ctx,
-                          const void* src, size_t srcSize)
+    @return : nb of bytes read from src (< srcSize ) */
+static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+                          const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
 {
-    ZSTD_DCtx* dctx = (ZSTD_DCtx*)ctx;
-    const BYTE* const istart = (const BYTE* const)src;
+    const BYTE* const istart = (const BYTE*) src;
 
     /* any compressed block with literals segment must be at least this size */
     if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected);
 
     switch(*istart & 3)
     {
-    default:
+    /* compressed */
     case 0:
         {
             size_t litSize = BLOCKSIZE;
             const size_t readSize = ZSTD_decompressLiterals(dctx->litBuffer, &litSize, src, srcSize);
             dctx->litPtr = dctx->litBuffer;
-            dctx->litBufSize = BLOCKSIZE;
+            dctx->litBufSize = BLOCKSIZE+8;
             dctx->litSize = litSize;
             return readSize;   /* works if it's an error too */
         }
@@ -3168,29 +3481,30 @@ static size_t ZSTD_decodeLiteralsBlock(void* ctx,
             const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
             if (litSize > srcSize-11)   /* risk of reading too far with wildcopy */
             {
-				if (litSize > srcSize-3) return ERROR(corruption_detected);
-				memcpy(dctx->litBuffer, istart, litSize);
-				dctx->litPtr = dctx->litBuffer;
-				dctx->litBufSize = BLOCKSIZE;
-				dctx->litSize = litSize;
-				return litSize+3;
-			}
-			/* direct reference into compressed stream */
+                if (litSize > srcSize-3) return ERROR(corruption_detected);
+                memcpy(dctx->litBuffer, istart, litSize);
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litBufSize = BLOCKSIZE+8;
+                dctx->litSize = litSize;
+                return litSize+3;
+            }
+            /* direct reference into compressed stream */
             dctx->litPtr = istart+3;
             dctx->litBufSize = srcSize-3;
             dctx->litSize = litSize;
-            return litSize+3;
-        }
+            return litSize+3;        }
     case IS_RLE:
         {
             const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
             if (litSize > BLOCKSIZE) return ERROR(corruption_detected);
             memset(dctx->litBuffer, istart[3], litSize);
             dctx->litPtr = dctx->litBuffer;
-            dctx->litBufSize = BLOCKSIZE;
+            dctx->litBufSize = BLOCKSIZE+8;
             dctx->litSize = litSize;
             return 4;
         }
+    default:
+        return ERROR(corruption_detected);   /* forbidden nominal case */
     }
 }
 
@@ -3235,7 +3549,7 @@ static size_t ZSTD_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, size_t* d
 
     /* sequences */
     {
-        S16 norm[MaxML+1];    /* assumption : MaxML >= MaxLL and MaxOff */
+        S16 norm[MaxML+1];    /* assumption : MaxML >= MaxLL >= MaxOff */
         size_t headerSize;
 
         /* Build DTables */
@@ -3330,7 +3644,6 @@ static void ZSTD_decodeSequence(seq_t* seq, seqState_t* seqState)
     /* Literal length */
     litLength = FSE_decodeSymbol(&(seqState->stateLL), &(seqState->DStream));
     prevOffset = litLength ? seq->offset : seqState->prevOffset;
-    seqState->prevOffset = seq->offset;
     if (litLength == MaxLL)
     {
         U32 add = *dumps++;
@@ -3345,7 +3658,7 @@ static void ZSTD_decodeSequence(seq_t* seq, seqState_t* seqState)
 
     /* Offset */
     {
-        static const size_t offsetPrefix[MaxOff+1] = {  /* note : size_t faster than U32 */
+        static const U32 offsetPrefix[MaxOff+1] = {
                 1 /*fake*/, 1, 2, 4, 8, 16, 32, 64, 128, 256,
                 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144,
                 524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, /*fake*/ 1, 1, 1, 1, 1 };
@@ -3357,6 +3670,7 @@ static void ZSTD_decodeSequence(seq_t* seq, seqState_t* seqState)
         offset = offsetPrefix[offsetCode] + BIT_readBits(&(seqState->DStream), nbBits);
         if (MEM_32bits()) BIT_reloadDStream(&(seqState->DStream));
         if (offsetCode==0) offset = prevOffset;   /* cmove */
+        if (offsetCode | !litLength) seqState->prevOffset = seq->offset;   /* cmove */
     }
 
     /* MatchLength */
@@ -3383,22 +3697,23 @@ static void ZSTD_decodeSequence(seq_t* seq, seqState_t* seqState)
 
 
 static size_t ZSTD_execSequence(BYTE* op,
-                                seq_t sequence,
-                                const BYTE** litPtr, const BYTE* const litLimit,
-                                BYTE* const base, BYTE* const oend)
+                                BYTE* const oend, seq_t sequence,
+                                const BYTE** litPtr, const BYTE* const litLimit_8,
+                                const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd)
 {
-    static const int dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4};   /* added */
-    static const int dec64table[] = {8, 8, 8, 7, 8, 9,10,11};   /* substracted */
-    const BYTE* const ostart = op;
+    static const int dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
+    static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* substracted */
     BYTE* const oLitEnd = op + sequence.litLength;
-    BYTE* const oMatchEnd = op + sequence.litLength + sequence.matchLength;   /* risk : address space overflow (32-bits) */
+    const size_t sequenceLength = sequence.litLength + sequence.matchLength;
+    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
     BYTE* const oend_8 = oend-8;
     const BYTE* const litEnd = *litPtr + sequence.litLength;
+    const BYTE* match = oLitEnd - sequence.offset;
 
-    /* checks */
+    /* check */
     if (oLitEnd > oend_8) return ERROR(dstSize_tooSmall);   /* last match must start at a minimum distance of 8 from oend */
     if (oMatchEnd > oend) return ERROR(dstSize_tooSmall);   /* overwrite beyond dst buffer */
-    if (litEnd > litLimit-8) return ERROR(corruption_detected);   /* overRead beyond lit buffer */
+    if (litEnd > litLimit_8) return ERROR(corruption_detected);   /* risk read beyond lit buffer */
 
     /* copy Literals */
     ZSTD_wildcopy(op, *litPtr, sequence.litLength);   /* note : oLitEnd <= oend-8 : no risk of overwrite beyond oend */
@@ -3406,57 +3721,69 @@ static size_t ZSTD_execSequence(BYTE* op,
     *litPtr = litEnd;   /* update for next sequence */
 
     /* copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - base))
     {
-        const BYTE* match = op - sequence.offset;
-
-        /* check */
-        if (sequence.offset > (size_t)op) return ERROR(corruption_detected);   /* address space overflow test (this test seems kept by clang optimizer) */
-        //if (match > op) return ERROR(corruption_detected);   /* address space overflow test (is clang optimizer removing this test ?) */
-        if (match < base) return ERROR(corruption_detected);
-
-        /* close range match, overlap */
-        if (sequence.offset < 8)
+        /* offset beyond prefix */
+        if (sequence.offset > (size_t)(oLitEnd - vBase))
+            return ERROR(corruption_detected);
+        match = dictEnd - (base-match);
+        if (match + sequence.matchLength <= dictEnd)
         {
-            const int dec64 = dec64table[sequence.offset];
-            op[0] = match[0];
-            op[1] = match[1];
-            op[2] = match[2];
-            op[3] = match[3];
-            match += dec32table[sequence.offset];
-            ZSTD_copy4(op+4, match);
-            match -= dec64;
+            memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
         }
-        else
+        /* span extDict & currentPrefixSegment */
         {
-            ZSTD_copy8(op, match);
+            size_t length1 = dictEnd - match;
+            memmove(oLitEnd, match, length1);
+            op = oLitEnd + length1;
+            sequence.matchLength -= length1;
+            match = base;
         }
-        op += 8; match += 8;
+    }
 
-        if (oMatchEnd > oend-12)
-        {
-            if (op < oend_8)
-            {
-                ZSTD_wildcopy(op, match, oend_8 - op);
-                match += oend_8 - op;
-                op = oend_8;
-            }
-            while (op < oMatchEnd) *op++ = *match++;
-        }
-        else
+    /* match within prefix */
+    if (sequence.offset < 8)
+    {
+        /* close range match, overlap */
+        const int sub2 = dec64table[sequence.offset];
+        op[0] = match[0];
+        op[1] = match[1];
+        op[2] = match[2];
+        op[3] = match[3];
+        match += dec32table[sequence.offset];
+        ZSTD_copy4(op+4, match);
+        match -= sub2;
+    }
+    else
+    {
+        ZSTD_copy8(op, match);
+    }
+    op += 8; match += 8;
+
+    if (oMatchEnd > oend-12)
+    {
+        if (op < oend_8)
         {
-            ZSTD_wildcopy(op, match, sequence.matchLength-8);   /* works even if matchLength < 8 */
+            ZSTD_wildcopy(op, match, oend_8 - op);
+            match += oend_8 - op;
+            op = oend_8;
         }
+        while (op < oMatchEnd) *op++ = *match++;
     }
-
-    return oMatchEnd - ostart;
+    else
+    {
+        ZSTD_wildcopy(op, match, sequence.matchLength-8);   /* works even if matchLength < 8 */
+    }
+    return sequenceLength;
 }
 
+
 static size_t ZSTD_decompressSequences(
-                               void* ctx,
+                               ZSTD_DCtx* dctx,
                                void* dst, size_t maxDstSize,
                          const void* seqStart, size_t seqSize)
 {
-    ZSTD_DCtx* dctx = (ZSTD_DCtx*)ctx;
     const BYTE* ip = (const BYTE*)seqStart;
     const BYTE* const iend = ip + seqSize;
     BYTE* const ostart = (BYTE* const)dst;
@@ -3464,14 +3791,16 @@ static size_t ZSTD_decompressSequences(
     BYTE* const oend = ostart + maxDstSize;
     size_t errorCode, dumpsLength;
     const BYTE* litPtr = dctx->litPtr;
-    const BYTE* const litMax = litPtr + dctx->litBufSize;
+    const BYTE* const litLimit_8 = litPtr + dctx->litBufSize - 8;
     const BYTE* const litEnd = litPtr + dctx->litSize;
     int nbSeq;
     const BYTE* dumps;
     U32* DTableLL = dctx->LLTable;
     U32* DTableML = dctx->MLTable;
     U32* DTableOffb = dctx->OffTable;
-    BYTE* const base = (BYTE*) (dctx->base);
+    const BYTE* const base = (const BYTE*) (dctx->base);
+    const BYTE* const vBase = (const BYTE*) (dctx->vBase);
+    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
 
     /* Build Decoding Tables */
     errorCode = ZSTD_decodeSeqHeaders(&nbSeq, &dumps, &dumpsLength,
@@ -3486,35 +3815,35 @@ static size_t ZSTD_decompressSequences(
         seqState_t seqState;
 
         memset(&sequence, 0, sizeof(sequence));
+        sequence.offset = 4;
         seqState.dumps = dumps;
         seqState.dumpsEnd = dumps + dumpsLength;
-        seqState.prevOffset = sequence.offset = 4;
+        seqState.prevOffset = 4;
         errorCode = BIT_initDStream(&(seqState.DStream), ip, iend-ip);
         if (ERR_isError(errorCode)) return ERROR(corruption_detected);
         FSE_initDState(&(seqState.stateLL), &(seqState.DStream), DTableLL);
         FSE_initDState(&(seqState.stateOffb), &(seqState.DStream), DTableOffb);
         FSE_initDState(&(seqState.stateML), &(seqState.DStream), DTableML);
 
-        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (nbSeq>0) ; )
+        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq ; )
         {
             size_t oneSeqSize;
             nbSeq--;
             ZSTD_decodeSequence(&sequence, &seqState);
-            oneSeqSize = ZSTD_execSequence(op, sequence, &litPtr, litMax, base, oend);
+            oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litLimit_8, base, vBase, dictEnd);
             if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
             op += oneSeqSize;
         }
 
         /* check if reached exact end */
-        if ( !BIT_endOfDStream(&(seqState.DStream)) ) return ERROR(corruption_detected);   /* requested too much : data is corrupted */
-        if (nbSeq<0) return ERROR(corruption_detected);   /* requested too many sequences : data is corrupted */
+        if ( !BIT_endOfDStream(&(seqState.DStream)) ) return ERROR(corruption_detected);   /* DStream should be entirely and exactly consumed; otherwise data is corrupted */
 
         /* last literal segment */
         {
             size_t lastLLSize = litEnd - litPtr;
             if (litPtr > litEnd) return ERROR(corruption_detected);
             if (op+lastLLSize > oend) return ERROR(dstSize_tooSmall);
-            if (op != litPtr) memmove(op, litPtr, lastLLSize);
+            if (op != litPtr) memcpy(op, litPtr, lastLLSize);
             op += lastLLSize;
         }
     }
@@ -3523,8 +3852,19 @@ static size_t ZSTD_decompressSequences(
 }
 
 
-static size_t ZSTD_decompressBlock(
-                            void* ctx,
+static void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst)
+{   /* If dst does not directly follow the previous output, the previous segment becomes the external (dictionary) segment and a new prefix starts at dst */
+    if (dst != dctx->previousDstEnd)   /* not contiguous */
+    {
+        dctx->dictEnd = dctx->previousDstEnd;   /* end of the data written so far */
+        dctx->vBase = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->base));   /* virtual base : dst minus the size of the previous segment */
+        dctx->base = dst;   /* new prefix starts at dst */
+        dctx->previousDstEnd = dst;   /* so the same dst is considered contiguous from now on */
+    }
+}
+
+
+static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
                             void* dst, size_t maxDstSize,
                       const void* src, size_t srcSize)
 {
@@ -3532,16 +3872,19 @@ static size_t ZSTD_decompressBlock(
     const BYTE* ip = (const BYTE*)src;
 
     /* Decode literals sub-block */
-    size_t litCSize = ZSTD_decodeLiteralsBlock(ctx, src, srcSize);
+    size_t litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
     if (ZSTD_isError(litCSize)) return litCSize;
     ip += litCSize;
     srcSize -= litCSize;
 
-    return ZSTD_decompressSequences(ctx, dst, maxDstSize, ip, srcSize);
+    return ZSTD_decompressSequences(dctx, dst, maxDstSize, ip, srcSize);
 }
 
 
-static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+static size_t ZSTD_decompress_usingDict(ZSTD_DCtx* ctx,
+                                 void* dst, size_t maxDstSize,
+                                 const void* src, size_t srcSize,
+                                 const void* dict, size_t dictSize)
 {
     const BYTE* ip = (const BYTE*)src;
     const BYTE* iend = ip + srcSize;
@@ -3549,14 +3892,33 @@ static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const
     BYTE* op = ostart;
     BYTE* const oend = ostart + maxDstSize;
     size_t remainingSize = srcSize;
-    U32 magicNumber;
     blockProperties_t blockProperties;
 
+    /* init */
+    ZSTD_resetDCtx(ctx);
+    if (dict)
+    {
+        ZSTD_decompress_insertDictionary(ctx, dict, dictSize);
+        ctx->dictEnd = ctx->previousDstEnd;
+        ctx->vBase = (const char*)dst - ((const char*)(ctx->previousDstEnd) - (const char*)(ctx->base));
+        ctx->base = dst;
+    }
+    else
+    {
+        ctx->vBase = ctx->base = ctx->dictEnd = dst;
+    }
+
     /* Frame Header */
-    if (srcSize < ZSTD_frameHeaderSize+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
-    magicNumber = MEM_readLE32(src);
-    if (magicNumber != ZSTD_magicNumber) return ERROR(prefix_unknown);
-    ip += ZSTD_frameHeaderSize; remainingSize -= ZSTD_frameHeaderSize;
+    {
+        size_t frameHeaderSize;
+        if (srcSize < ZSTD_frameHeaderSize_min+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+        frameHeaderSize = ZSTD_decodeFrameHeader_Part1(ctx, src, ZSTD_frameHeaderSize_min);
+        if (ZSTD_isError(frameHeaderSize)) return frameHeaderSize;
+        if (srcSize < frameHeaderSize+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+        ip += frameHeaderSize; remainingSize -= frameHeaderSize;
+        frameHeaderSize = ZSTD_decodeFrameHeader_Part2(ctx, src, frameHeaderSize);
+        if (ZSTD_isError(frameHeaderSize)) return frameHeaderSize;
+    }
 
     /* Loop on each block */
     while (1)
@@ -3572,10 +3934,10 @@ static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const
         switch(blockProperties.blockType)
         {
         case bt_compressed:
-            decodedSize = ZSTD_decompressBlock(ctx, op, oend-op, ip, cBlockSize);
+            decodedSize = ZSTD_decompressBlock_internal(ctx, op, oend-op, ip, cBlockSize);
             break;
         case bt_raw :
-            decodedSize = ZSTD_copyUncompressedBlock(op, oend-op, ip, cBlockSize);
+            decodedSize = ZSTD_copyRawBlock(op, oend-op, ip, cBlockSize);
             break;
         case bt_rle :
             return ERROR(GENERIC);   /* not yet supported */
@@ -3598,148 +3960,469 @@ static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const
     return op-ostart;
 }
 
-static size_t ZSTD_decompress(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+
+/* ******************************
+*  Streaming Decompression API
+********************************/
+static size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx)
 {
-    ZSTD_DCtx ctx;
-    ctx.base = dst;
-    return ZSTD_decompressDCtx(&ctx, dst, maxDstSize, src, srcSize);
+    return dctx->expected;   /* exact srcSize the next ZSTD_decompressContinue() call must receive ; any other size is rejected there */
 }
 
+static size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{   /* One step of streaming decoding. @return : nb of bytes written into dst (0 for header steps), or an error code */
+    /* Sanity check : srcSize must be exactly the value reported by ZSTD_nextSrcSizeToDecompress() */
+    if (srcSize != ctx->expected) return ERROR(srcSize_wrong);
+    ZSTD_checkContinuity(ctx, dst);   /* re-anchors base / vBase / dictEnd when dst does not follow the previous output */
 
-/*******************************
-*  Streaming Decompression API
-*******************************/
+    /* State machine : the step to run is selected by ctx->stage */
+    switch (ctx->stage)
+    {
+    case ZSTDds_getFrameHeaderSize :
+        {
+            /* get frame header size */
+            if (srcSize != ZSTD_frameHeaderSize_min) return ERROR(srcSize_wrong);   /* impossible */
+            ctx->headerSize = ZSTD_decodeFrameHeader_Part1(ctx, src, ZSTD_frameHeaderSize_min);
+            if (ZSTD_isError(ctx->headerSize)) return ctx->headerSize;
+            memcpy(ctx->headerBuffer, src, ZSTD_frameHeaderSize_min);   /* keep the header start : the full header is decoded from headerBuffer */
+            if (ctx->headerSize > ZSTD_frameHeaderSize_min)
+            {
+                ctx->expected = ctx->headerSize - ZSTD_frameHeaderSize_min;   /* remaining header bytes are requested for the next call */
+                ctx->stage = ZSTDds_decodeFrameHeader;
+                return 0;
+            }
+            ctx->expected = 0;   /* not necessary to copy more ; execution falls through into ZSTDds_decodeFrameHeader */
+        }
+    case ZSTDds_decodeFrameHeader:
+        {
+            /* get frame header */
+            size_t result;
+            memcpy(ctx->headerBuffer + ZSTD_frameHeaderSize_min, src, ctx->expected);   /* appends the rest of the header (0 bytes after a fall-through) */
+            result = ZSTD_decodeFrameHeader_Part2(ctx, ctx->headerBuffer, ctx->headerSize);
+            if (ZSTD_isError(result)) return result;
+            ctx->expected = ZSTD_blockHeaderSize;
+            ctx->stage = ZSTDds_decodeBlockHeader;
+            return 0;   /* header steps write nothing into dst */
+        }
+    case ZSTDds_decodeBlockHeader:
+        {
+            /* Decode block header */
+            blockProperties_t bp;
+            size_t blockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp);
+            if (ZSTD_isError(blockSize)) return blockSize;
+            if (bp.blockType == bt_end)
+            {
+                ctx->expected = 0;   /* end of frame : no more input is requested */
+                ctx->stage = ZSTDds_getFrameHeaderSize;
+            }
+            else
+            {
+                ctx->expected = blockSize;   /* next call must bring the whole block content */
+                ctx->bType = bp.blockType;
+                ctx->stage = ZSTDds_decompressBlock;
+            }
+            return 0;
+        }
+    case ZSTDds_decompressBlock:
+        {
+            /* Decompress : block content */
+            size_t rSize;
+            switch(ctx->bType)
+            {
+            case bt_compressed:
+                rSize = ZSTD_decompressBlock_internal(ctx, dst, maxDstSize, src, srcSize);
+                break;
+            case bt_raw :
+                rSize = ZSTD_copyRawBlock(dst, maxDstSize, src, srcSize);
+                break;
+            case bt_rle :
+                return ERROR(GENERIC);   /* not yet handled */
+                break;
+            case bt_end :   /* should never happen (filtered in stage ZSTDds_decodeBlockHeader) */
+                rSize = 0;
+                break;
+            default:
+                return ERROR(GENERIC);
+            }
+            ctx->stage = ZSTDds_decodeBlockHeader;
+            ctx->expected = ZSTD_blockHeaderSize;
+            ctx->previousDstEnd = (char*)dst + rSize;   /* remembered for ZSTD_checkContinuity() ; NOTE(review): also executed when rSize is an error code */
+            return rSize;
+        }
+    default:
+        return ERROR(GENERIC);   /* impossible */
+    }
+}
 
-static size_t ZSTD_resetDCtx(ZSTD_DCtx* dctx)
+
+static void ZSTD_decompress_insertDictionary(ZSTD_DCtx* ctx, const void* dict, size_t dictSize)   /* registers dict as the current prefix, as if dictSize bytes had just been decoded at dict */
 {
-    dctx->expected = ZSTD_frameHeaderSize;
-    dctx->phase = 0;
-    dctx->previousDstEnd = NULL;
-    dctx->base = NULL;
-    return 0;
+    ctx->dictEnd = ctx->previousDstEnd;   /* whatever was the prefix so far becomes the external segment */
+    ctx->vBase = (const char*)dict - ((const char*)(ctx->previousDstEnd) - (const char*)(ctx->base));   /* virtual base : dict minus the size of the previous segment */
+    ctx->base = dict;   /* prefix now starts at the dictionary */
+    ctx->previousDstEnd = (const char*)dict + dictSize;   /* output written right after the dictionary would be considered contiguous */
 }
 
-static ZSTD_DCtx* ZSTD_createDCtx(void)
+
+
+/*
+    Buffered version of Zstd compression library
+    Copyright (C) 2015, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - zstd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+/* The objects defined into this file should be considered experimental.
+ * They are not labelled stable, as their prototype may change in the future.
+ * You can use them for tests, provide feedback, or if you can endure risk of future changes.
+ */
+
+/* *************************************
+*  Includes
+***************************************/
+#include <stdlib.h>
+
+
+/** ************************************************
+*  Streaming decompression
+*
+*  A ZBUFF_DCtx object is required to track streaming operation.
+*  Use ZBUFF_createDCtx() and ZBUFF_freeDCtx() to create/release resources.
+*  Use ZBUFF_decompressInit() to start a new decompression operation.
+*  ZBUFF_DCtx objects can be reused multiple times.
+*
+*  Use ZBUFF_decompressContinue() repetitively to consume your input.
+*  *srcSizePtr and *maxDstSizePtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *maxDstSizePtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to call again the function with remaining input.
+*  The content of dst will be overwritten (up to *maxDstSizePtr) at each function call, so save its content if it matters or change dst .
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency)
+*            or 0 when a frame is completely decoded
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory)
+*  output : 128 KB block size is the internal unit, it ensures it's always possible to write a full block when it's decoded.
+*  input : just follow indications from ZBUFF_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* **************************************************/
+
+typedef enum { ZBUFFds_init, ZBUFFds_readHeader, ZBUFFds_loadHeader, ZBUFFds_decodeHeader,
+               ZBUFFds_read, ZBUFFds_load, ZBUFFds_flush } ZBUFF_dStage;   /* stages of the ZBUFF_decompressContinue() state machine ; in ZBUFFds_init, decoding is refused until ZBUFF_decompressInit() is called */
+
+/* *** Resource management *** */
+
+#define ZSTD_frameHeaderSize_max 5   /* too magical, should come from reference ; also sizes headerBuffer below */
+struct ZBUFFv04_DCtx_s {
+    ZSTD_DCtx* zc;                 /* underlying ZSTD decoding context, created in ZBUFF_createDCtx() */
+    ZSTD_parameters params;        /* frame parameters, filled by ZSTD_getFrameParams() */
+    char* inBuff;                  /* input staging buffer, (re)allocated once the frame header is known */
+    size_t inBuffSize;             /* recorded size of inBuff */
+    size_t inPos;                  /* nb of bytes currently stored in inBuff */
+    char* outBuff;                 /* buffer receiving decoded data from ZSTD_decompressContinue() */
+    size_t outBuffSize;            /* recorded size of outBuff ; raised to at least 1 << windowLog when the header is applied */
+    size_t outStart;               /* decoded data occupies outBuff[outStart, outEnd) */
+    size_t outEnd;                 /* presumably consumed by the flush stage - TODO confirm, not visible here */
+    size_t hPos;                   /* nb of bytes currently stored in headerBuffer */
+    const char* dict;              /* dictionary registered by ZBUFF_decompressWithDictionary() ; the pointer is stored, content is not copied */
+    size_t dictSize;               /* 0 means no dictionary */
+    ZBUFF_dStage stage;            /* current state of the streaming state machine */
+    unsigned char headerBuffer[ZSTD_frameHeaderSize_max];   /* accumulates a frame header received in several pieces */
+};   /* typedef'd to ZBUFF_DCtx within "zstd_buffered.h" */
+
+typedef ZBUFFv04_DCtx ZBUFF_DCtx;
+
+
+static ZBUFF_DCtx* ZBUFF_createDCtx(void)   /* allocates a zero-initialized streaming context ; @return : NULL when its own allocation fails */
 {
-    ZSTD_DCtx* dctx = (ZSTD_DCtx*)malloc(sizeof(ZSTD_DCtx));
-    if (dctx==NULL) return NULL;
-    ZSTD_resetDCtx(dctx);
-    return dctx;
+    ZBUFF_DCtx* zbc = (ZBUFF_DCtx*)malloc(sizeof(ZBUFF_DCtx));
+    if (zbc==NULL) return NULL;
+    memset(zbc, 0, sizeof(*zbc));   /* buffers start as NULL with size 0 : they are allocated later, once a frame header is decoded */
+    zbc->zc = ZSTD_createDCtx();   /* NOTE(review): result is not checked for NULL here */
+    zbc->stage = ZBUFFds_init;   /* ZBUFF_decompressInit() must be called before decoding */
+    return zbc;
 }
 
-static size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx)
+static size_t ZBUFF_freeDCtx(ZBUFF_DCtx* zbc)   /* releases the context, its ZSTD context and both buffers ; always returns 0 */
 {
-    free(dctx);
+    if (zbc==NULL) return 0;   /* support free on null */
+    ZSTD_freeDCtx(zbc->zc);
+    free(zbc->inBuff);   /* may still be NULL if no frame header was ever decoded */
+    free(zbc->outBuff);   /* same remark as inBuff */
+    free(zbc);   /* zbc->dict is not released : only the pointer was stored */
     return 0;
 }
 
-static size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx)
+
+/* *** Initialization *** */
+
+static size_t ZBUFF_decompressInit(ZBUFF_DCtx* zbc)   /* prepares zbc for a new frame ; allocated buffers are kept for reuse */
 {
-    return dctx->expected;
+    zbc->stage = ZBUFFds_readHeader;   /* next ZBUFF_decompressContinue() call starts by reading a frame header */
+    zbc->hPos = zbc->inPos = zbc->outStart = zbc->outEnd = zbc->dictSize = 0;   /* dictSize is cleared too : a dictionary must be registered after this call */
+    return ZSTD_resetDCtx(zbc->zc);   /* @return : result of ZSTD_resetDCtx() */
 }
 
-static size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+
+static size_t ZBUFF_decompressWithDictionary(ZBUFF_DCtx* zbc, const void* src, size_t srcSize)   /* registers src (srcSize bytes) as the dictionary to use ; always returns 0 */
 {
-    /* Sanity check */
-    if (srcSize != ctx->expected) return ERROR(srcSize_wrong);
-    if (dst != ctx->previousDstEnd)  /* not contiguous */
-        ctx->base = dst;
+    zbc->dict = (const char*)src;   /* pointer only : content is not copied, so src must remain valid while it is in use */
+    zbc->dictSize = srcSize;   /* a non-zero size makes the header stage insert the dictionary into the ZSTD context */
+    return 0;
+}
 
-    /* Decompress : frame header */
-    if (ctx->phase == 0)
-    {
-        /* Check frame magic header */
-        U32 magicNumber = MEM_readLE32(src);
-        if (magicNumber != ZSTD_magicNumber) return ERROR(prefix_unknown);
-        ctx->phase = 1;
-        ctx->expected = ZSTD_blockHeaderSize;
-        return 0;
-    }
+static size_t ZBUFF_limitCopy(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{   /* Copies as much of src as fits into dst ; @return : nb of bytes actually copied */
+    size_t length = MIN(maxDstSize, srcSize);   /* never more than the destination capacity, nor more than what is available */
+    memcpy(dst, src, length);   /* memcpy : regions are expected not to overlap */
+    return length;
+}
+
+/* *** Decompression *** */
 
-    /* Decompress : block header */
-    if (ctx->phase == 1)
+static size_t ZBUFF_decompressContinue(ZBUFF_DCtx* zbc, void* dst, size_t* maxDstSizePtr, const void* src, size_t* srcSizePtr)
+{
+    const char* const istart = (const char*)src;
+    const char* ip = istart;
+    const char* const iend = istart + *srcSizePtr;
+    char* const ostart = (char*)dst;
+    char* op = ostart;
+    char* const oend = ostart + *maxDstSizePtr;
+    U32 notDone = 1;
+
+    while (notDone)
     {
-        blockProperties_t bp;
-        size_t blockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp);
-        if (ZSTD_isError(blockSize)) return blockSize;
-        if (bp.blockType == bt_end)
-        {
-            ctx->expected = 0;
-            ctx->phase = 0;
-        }
-        else
+        switch(zbc->stage)
         {
-            ctx->expected = blockSize;
-            ctx->bType = bp.blockType;
-            ctx->phase = 2;
-        }
 
-        return 0;
-    }
+        case ZBUFFds_init :
+            return ERROR(init_missing);
 
-    /* Decompress : block content */
-    {
-        size_t rSize;
-        switch(ctx->bType)
-        {
-        case bt_compressed:
-            rSize = ZSTD_decompressBlock(ctx, dst, maxDstSize, src, srcSize);
-            break;
-        case bt_raw :
-            rSize = ZSTD_copyUncompressedBlock(dst, maxDstSize, src, srcSize);
-            break;
-        case bt_rle :
-            return ERROR(GENERIC);   /* not yet handled */
-            break;
-        case bt_end :   /* should never happen (filtered at phase 1) */
-            rSize = 0;
-            break;
-        default:
-            return ERROR(GENERIC);
+        case ZBUFFds_readHeader :
+            /* read header from src */
+            {
+                size_t headerSize = ZSTD_getFrameParams(&(zbc->params), src, *srcSizePtr);
+                if (ZSTD_isError(headerSize)) return headerSize;
+                if (headerSize)
+                {
+                    /* not enough input to decode header : tell how many bytes would be necessary */
+                    memcpy(zbc->headerBuffer+zbc->hPos, src, *srcSizePtr);
+                    zbc->hPos += *srcSizePtr;
+                    *maxDstSizePtr = 0;
+                    zbc->stage = ZBUFFds_loadHeader;
+                    return headerSize - zbc->hPos;
+                }
+                zbc->stage = ZBUFFds_decodeHeader;
+                break;
+            }
+
+        case ZBUFFds_loadHeader:
+            /* complete header from src */
+            {
+                size_t headerSize = ZBUFF_limitCopy(
+                    zbc->headerBuffer + zbc->hPos, ZSTD_frameHeaderSize_max - zbc->hPos,
+                    src, *srcSizePtr);
+                zbc->hPos += headerSize;
+                ip += headerSize;
+                headerSize = ZSTD_getFrameParams(&(zbc->params), zbc->headerBuffer, zbc->hPos);
+                if (ZSTD_isError(headerSize)) return headerSize;
+                if (headerSize) {
+                    /* not enough input to decode header : tell how many bytes would be necessary */
+                    *maxDstSizePtr = 0;
+                    return headerSize - zbc->hPos;
+            }   }
+
+        case ZBUFFds_decodeHeader:
+                /* apply header to create / resize buffers */
+                {
+                    size_t neededOutSize = (size_t)1 << zbc->params.windowLog;
+                    size_t neededInSize = BLOCKSIZE;   /* a block is never > BLOCKSIZE */
+                    if (zbc->inBuffSize < neededInSize) {
+                        free(zbc->inBuff);
+                        zbc->inBuffSize = neededInSize;
+                        zbc->inBuff = (char*)malloc(neededInSize);
+                        if (zbc->inBuff == NULL) return ERROR(memory_allocation);
+                    }
+                    if (zbc->outBuffSize < neededOutSize) {
+                        free(zbc->outBuff);
+                        zbc->outBuffSize = neededOutSize;
+                        zbc->outBuff = (char*)malloc(neededOutSize);
+                        if (zbc->outBuff == NULL) return ERROR(memory_allocation);
+                }   }
+                if (zbc->dictSize)
+                    ZSTD_decompress_insertDictionary(zbc->zc, zbc->dict, zbc->dictSize);
+                if (zbc->hPos) {
+                    /* some data already loaded into headerBuffer : transfer into inBuff */
+                    memcpy(zbc->inBuff, zbc->headerBuffer, zbc->hPos);
+                    zbc->inPos = zbc->hPos;
+                    zbc->hPos = 0;
+                    zbc->stage = ZBUFFds_load;
+                    break;
+                }
+                zbc->stage = ZBUFFds_read;
+
+        case ZBUFFds_read:
+            {
+                size_t neededInSize = ZSTD_nextSrcSizeToDecompress(zbc->zc);
+                if (neededInSize==0)   /* end of frame */
+                {
+                    zbc->stage = ZBUFFds_init;
+                    notDone = 0;
+                    break;
+                }
+                if ((size_t)(iend-ip) >= neededInSize)
+                {
+                    /* directly decode from src */
+                    size_t decodedSize = ZSTD_decompressContinue(zbc->zc,
+                        zbc->outBuff + zbc->outStart, zbc->outBuffSize - zbc->outStart,
+                        ip, neededInSize);
+                    if (ZSTD_isError(decodedSize)) return decodedSize;
+                    ip += neededInSize;
+                    if (!decodedSize) break;   /* this was just a header */
+                    zbc->outEnd = zbc->outStart +  decodedSize;
+                    zbc->stage = ZBUFFds_flush;
+                    break;
+                }
+                if (ip==iend) { notDone = 0; break; }   /* no more input */
+                zbc->stage = ZBUFFds_load;
+            }
+
+        case ZBUFFds_load:
+            {
+                size_t neededInSize = ZSTD_nextSrcSizeToDecompress(zbc->zc);
+                size_t toLoad = neededInSize - zbc->inPos;   /* should always be <= remaining space within inBuff */
+                size_t loadedSize;
+                if (toLoad > zbc->inBuffSize - zbc->inPos) return ERROR(corruption_detected);   /* should never happen */
+                loadedSize = ZBUFF_limitCopy(zbc->inBuff + zbc->inPos, toLoad, ip, iend-ip);
+                ip += loadedSize;
+                zbc->inPos += loadedSize;
+                if (loadedSize < toLoad) { notDone = 0; break; }   /* not enough input, wait for more */
+                {
+                    size_t decodedSize = ZSTD_decompressContinue(zbc->zc,
+                        zbc->outBuff + zbc->outStart, zbc->outBuffSize - zbc->outStart,
+                        zbc->inBuff, neededInSize);
+                    if (ZSTD_isError(decodedSize)) return decodedSize;
+                    zbc->inPos = 0;   /* input is consumed */
+                    if (!decodedSize) { zbc->stage = ZBUFFds_read; break; }   /* this was just a header */
+                    zbc->outEnd = zbc->outStart +  decodedSize;
+                    zbc->stage = ZBUFFds_flush;
+                    // break; /* ZBUFFds_flush follows */
+                }
+            }
+        case ZBUFFds_flush:
+            {
+                size_t toFlushSize = zbc->outEnd - zbc->outStart;
+                size_t flushedSize = ZBUFF_limitCopy(op, oend-op, zbc->outBuff + zbc->outStart, toFlushSize);
+                op += flushedSize;
+                zbc->outStart += flushedSize;
+                if (flushedSize == toFlushSize)
+                {
+                    zbc->stage = ZBUFFds_read;
+                    if (zbc->outStart + BLOCKSIZE > zbc->outBuffSize)
+                        zbc->outStart = zbc->outEnd = 0;
+                    break;
+                }
+                /* cannot flush everything */
+                notDone = 0;
+                break;
+            }
+        default: return ERROR(GENERIC);   /* impossible */
         }
-        ctx->phase = 1;
-        ctx->expected = ZSTD_blockHeaderSize;
-        ctx->previousDstEnd = (void*)( ((char*)dst) + rSize);
-        return rSize;
     }
 
+    *srcSizePtr = ip-istart;
+    *maxDstSizePtr = op-ostart;
+
+    {
+        size_t nextSrcSizeHint = ZSTD_nextSrcSizeToDecompress(zbc->zc);
+        if (nextSrcSizeHint > 3) nextSrcSizeHint+= 3;   /* get the next block header while at it */
+        nextSrcSizeHint -= zbc->inPos;   /* already loaded*/
+        return nextSrcSizeHint;
+    }
 }
 
 
-/* wrapper layer */
+/* *************************************
+*  Tool functions
+***************************************/
+unsigned ZBUFFv04_isError(size_t errorCode) { return ERR_isError(errorCode); }
+const char* ZBUFFv04_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
+
+size_t ZBUFFv04_recommendedDInSize(void)  { return BLOCKSIZE + 3; }   /* one block plus 3 bytes, so the next block header can be fetched in the same read */
+size_t ZBUFFv04_recommendedDOutSize(void) { return BLOCKSIZE; }       /* one block */
 
-unsigned ZSTDv03_isError(size_t code)
-{
-	return ZSTD_isError(code);
-}
 
-size_t ZSTDv03_decompress( void* dst, size_t maxOriginalSize,
-                     const void* src, size_t compressedSize)
-{
-	return ZSTD_decompress(dst, maxOriginalSize, src, compressedSize);
-}
 
-ZSTDv03_Dctx* ZSTDv03_createDCtx(void)
+/*- ========================================================================= -*/
+
+/* final wrapping stage */
+
+size_t ZSTDv04_decompressDCtx(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
 {
-	return (ZSTDv03_Dctx*)ZSTD_createDCtx();
+    return ZSTD_decompress_usingDict(dctx, dst, maxDstSize, src, srcSize, NULL, 0);
 }
 
-size_t ZSTDv03_freeDCtx(ZSTDv03_Dctx* dctx)
+size_t ZSTDv04_decompress(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
 {
-	return ZSTD_freeDCtx((ZSTD_DCtx*)dctx);
+#if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE==1)
+    size_t regenSize;
+    ZSTD_DCtx* dctx = ZSTD_createDCtx();
+    if (dctx==NULL) return ERROR(memory_allocation);
+    regenSize = ZSTDv04_decompressDCtx(dctx, dst, maxDstSize, src, srcSize);
+    ZSTD_freeDCtx(dctx);
+    return regenSize;
+#else
+    ZSTD_DCtx dctx;
+    return ZSTD_decompressDCtx(&dctx, dst, maxDstSize, src, srcSize);
+#endif
 }
 
-size_t ZSTDv03_resetDCtx(ZSTDv03_Dctx* dctx)
+
+size_t ZSTDv04_resetDCtx(ZSTDv04_Dctx* dctx) { return ZSTD_resetDCtx(dctx); }
+
+size_t ZSTDv04_nextSrcSizeToDecompress(ZSTDv04_Dctx* dctx)
 {
-	return ZSTD_resetDCtx((ZSTD_DCtx*)dctx);
+    return ZSTD_nextSrcSizeToDecompress(dctx);
 }
 
-size_t ZSTDv03_nextSrcSizeToDecompress(ZSTDv03_Dctx* dctx)
+size_t ZSTDv04_decompressContinue(ZSTDv04_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
 {
-	return ZSTD_nextSrcSizeToDecompress((ZSTD_DCtx*)dctx);
+    return ZSTD_decompressContinue(dctx, dst, maxDstSize, src, srcSize);
 }
 
-size_t ZSTDv03_decompressContinue(ZSTDv03_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+
+
+ZBUFFv04_DCtx* ZBUFFv04_createDCtx(void) { return ZBUFF_createDCtx(); }
+size_t      ZBUFFv04_freeDCtx(ZBUFFv04_DCtx* dctx) { return ZBUFF_freeDCtx(dctx); }
+
+size_t ZBUFFv04_decompressInit(ZBUFFv04_DCtx* dctx) { return ZBUFF_decompressInit(dctx); }
+size_t ZBUFFv04_decompressWithDictionary(ZBUFFv04_DCtx* dctx, const void* src, size_t srcSize)
+{ return ZBUFF_decompressWithDictionary(dctx, src, srcSize); }
+
+size_t ZBUFFv04_decompressContinue(ZBUFFv04_DCtx* dctx, void* dst, size_t* maxDstSizePtr, const void* src, size_t* srcSizePtr)
 {
-	return ZSTD_decompressContinue((ZSTD_DCtx*)dctx, dst, maxDstSize, src, srcSize);
+    return ZBUFF_decompressContinue(dctx, dst, maxDstSizePtr, src, srcSizePtr);
 }
diff --git a/lib/legacy/zstd_v04.h b/lib/legacy/zstd_v04.h
new file mode 100644
index 0000000..a612982
--- /dev/null
+++ b/lib/legacy/zstd_v04.h
@@ -0,0 +1,148 @@
+/*
+    zstd_v04 - decoder for 0.4 format
+    Header File
+    Copyright (C) 2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - zstd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+#pragma once
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+*  Includes
+***************************************/
+#include <stddef.h>   /* size_t */
+
+
+/* *************************************
+*  Simple one-step function
+***************************************/
+/**
+ZSTDv04_decompress() : decompress ZSTD frames compliant with v0.4.x format
+    compressedSize : is the exact source size
+    maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated.
+                      It must be equal or larger than originalSize, otherwise decompression will fail.
+    return : the number of bytes decompressed into destination buffer (originalSize)
+             or an errorCode if it fails (which can be tested using ZSTDv04_isError())
+*/
+size_t ZSTDv04_decompress( void* dst, size_t maxOriginalSize,
+                     const void* src, size_t compressedSize);
+
+/**
+ZSTDv04_isError() : tells if the result of ZSTDv04_decompress() is an error
+*/
+unsigned ZSTDv04_isError(size_t code);
+
+
+/* *************************************
+*  Advanced functions
+***************************************/
+typedef struct ZSTDv04_Dctx_s ZSTDv04_Dctx;
+ZSTDv04_Dctx* ZSTDv04_createDCtx(void);
+size_t ZSTDv04_freeDCtx(ZSTDv04_Dctx* dctx);
+
+size_t ZSTDv04_decompressDCtx(ZSTDv04_Dctx* dctx,
+                              void* dst, size_t maxOriginalSize,
+                        const void* src, size_t compressedSize);
+
+
+/* *************************************
+*  Direct Streaming
+***************************************/
+size_t ZSTDv04_resetDCtx(ZSTDv04_Dctx* dctx);
+
+size_t ZSTDv04_nextSrcSizeToDecompress(ZSTDv04_Dctx* dctx);
+size_t ZSTDv04_decompressContinue(ZSTDv04_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+/**
+  Use the above functions alternately.
+  ZSTDv04_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTDv04_decompressContinue().
+  ZSTDv04_decompressContinue() will use previous data blocks to improve compression if they are located prior to the current block.
+  Result is the number of bytes regenerated within 'dst'.
+  It can be zero, which is not an error; it just means ZSTDv04_decompressContinue() has decoded some header.
+*/
+
+
+/* *************************************
+*  Buffered Streaming
+***************************************/
+typedef struct ZBUFFv04_DCtx_s ZBUFFv04_DCtx;
+ZBUFFv04_DCtx* ZBUFFv04_createDCtx(void);
+size_t      ZBUFFv04_freeDCtx(ZBUFFv04_DCtx* dctx);
+
+size_t ZBUFFv04_decompressInit(ZBUFFv04_DCtx* dctx);
+size_t ZBUFFv04_decompressWithDictionary(ZBUFFv04_DCtx* dctx, const void* dict, size_t dictSize);
+
+size_t ZBUFFv04_decompressContinue(ZBUFFv04_DCtx* dctx, void* dst, size_t* maxDstSizePtr, const void* src, size_t* srcSizePtr);
+
+/** ************************************************
+*  Streaming decompression
+*
+*  A ZBUFFv04_DCtx object is required to track streaming operation.
+*  Use ZBUFFv04_createDCtx() and ZBUFFv04_freeDCtx() to create/release resources.
+*  Use ZBUFFv04_decompressInit() to start a new decompression operation.
+*  ZBUFFv04_DCtx objects can be reused multiple times.
+*
+*  Optionally, a reference to a static dictionary can be set, using ZBUFFv04_decompressWithDictionary()
+*  It must be the same content as the one set during compression phase.
+*  Dictionary content must remain accessible during the decompression process.
+*
+*  Use ZBUFFv04_decompressContinue() repeatedly to consume your input.
+*  *srcSizePtr and *maxDstSizePtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *maxDstSizePtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+*  The content of dst will be overwritten (up to *maxDstSizePtr) at each function call, so save its content if it matters or change dst.
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency)
+*            or 0 when a frame is completely decoded
+*            or an error code, which can be tested using ZBUFFv04_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory) : ZBUFFv04_recommendedDInSize() / ZBUFFv04_recommendedDOutSize()
+*  output : ZBUFFv04_recommendedDOutSize()==128 KB block size is the internal unit, it ensures it's always possible to write a full block when it's decoded.
+*  input : ZBUFFv04_recommendedDInSize()==128 KB + 3; just follow indications from ZBUFFv04_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* **************************************************/
+unsigned ZBUFFv04_isError(size_t errorCode);
+const char* ZBUFFv04_getErrorName(size_t errorCode);
+
+
+/** The below functions provide recommended buffer sizes for Decompression operations.
+*   These sizes are not compulsory, they just tend to offer better latency */
+size_t ZBUFFv04_recommendedDInSize(void);
+size_t ZBUFFv04_recommendedDOutSize(void);
+
+
+/* *************************************
+*  Prefix - version detection
+***************************************/
+#define ZSTDv04_magicNumber 0xFD2FB524   /* v0.4 */
+
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/lib/mem.h b/lib/mem.h
index 8ac56ed..0e357e5 100644
--- a/lib/mem.h
+++ b/lib/mem.h
@@ -39,15 +39,15 @@
 extern "C" {
 #endif
 
-/******************************************
-*  Includes
+/*-****************************************
+*  Dependencies
 ******************************************/
 #include <stddef.h>    /* size_t, ptrdiff_t */
 #include <string.h>    /* memcpy */
 
 
-/******************************************
-*  Compiler-specific
+/*-****************************************
+*  Compiler specifics
 ******************************************/
 #if defined(__GNUC__)
 #  define MEM_STATIC static __attribute__((unused))
@@ -60,7 +60,7 @@ extern "C" {
 #endif
 
 
-/****************************************************************
+/*-**************************************************************
 *  Basic Types
 *****************************************************************/
 #if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
@@ -83,10 +83,10 @@ extern "C" {
 #endif
 
 
-/****************************************************************
+/*-**************************************************************
 *  Memory I/O
 *****************************************************************/
-/* MEM_FORCE_MEMORY_ACCESS
+/* MEM_FORCE_MEMORY_ACCESS :
  * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
  * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
  * The below switch allow to select different access method for improved performance.
@@ -94,8 +94,8 @@ extern "C" {
  * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
  *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
  * Method 2 : direct access. This method is portable but violate C standard.
- *            It can generate buggy code on targets generating assembly depending on alignment.
- *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ *            It can generate buggy code on targets depending on alignment.
+ *            In some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
  * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
  * Prefer these methods in priority order (0 > 1 > 2)
  */
@@ -119,11 +119,12 @@ MEM_STATIC unsigned MEM_isLittleEndian(void)
 
 #if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
 
-/* violates C standard on structure alignment.
+/* violates C standard, by lying about structure alignment.
 Only use if no other choice to achieve best performance on target platform */
 MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; }
 MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; }
 MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; }
+MEM_STATIC size_t MEM_readST(const void* memPtr) { return *(const size_t*) memPtr; }   /* direct size_t read; returns size_t, matching the memcpy-based variant */
 
 MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
 MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; }
@@ -133,11 +134,12 @@ MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; }
 
 /* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
 /* currently only defined for gcc and icc */
-typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign;
+typedef union { U16 u16; U32 u32; U64 u64; size_t st; } __attribute__((packed)) unalign;
 
 MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
 MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
 MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+MEM_STATIC size_t MEM_readST(const void* ptr) { return ((const unalign*)ptr)->st; }   /* size_t read through the packed union; returns size_t, matching the memcpy-based variant */
 
 MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
 MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; }
@@ -163,6 +165,11 @@ MEM_STATIC U64 MEM_read64(const void* memPtr)
     U64 val; memcpy(&val, memPtr, sizeof(val)); return val;
 }
 
+MEM_STATIC size_t MEM_readST(const void* memPtr)
+{
+    size_t val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
 MEM_STATIC void MEM_write16(void* memPtr, U16 value)
 {
     memcpy(memPtr, &value, sizeof(value));
@@ -178,15 +185,14 @@ MEM_STATIC void MEM_write64(void* memPtr, U64 value)
     memcpy(memPtr, &value, sizeof(value));
 }
 
-#endif // MEM_FORCE_MEMORY_ACCESS
+#endif /* MEM_FORCE_MEMORY_ACCESS */
 
 
 MEM_STATIC U16 MEM_readLE16(const void* memPtr)
 {
     if (MEM_isLittleEndian())
         return MEM_read16(memPtr);
-    else
-    {
+    else {
         const BYTE* p = (const BYTE*)memPtr;
         return (U16)(p[0] + (p[1]<<8));
     }
@@ -194,12 +200,9 @@ MEM_STATIC U16 MEM_readLE16(const void* memPtr)
 
 MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val)
 {
-    if (MEM_isLittleEndian())
-    {
+    if (MEM_isLittleEndian()) {
         MEM_write16(memPtr, val);
-    }
-    else
-    {
+    } else {
         BYTE* p = (BYTE*)memPtr;
         p[0] = (BYTE)val;
         p[1] = (BYTE)(val>>8);
@@ -210,8 +213,7 @@ MEM_STATIC U32 MEM_readLE32(const void* memPtr)
 {
     if (MEM_isLittleEndian())
         return MEM_read32(memPtr);
-    else
-    {
+    else {
         const BYTE* p = (const BYTE*)memPtr;
         return (U32)((U32)p[0] + ((U32)p[1]<<8) + ((U32)p[2]<<16) + ((U32)p[3]<<24));
     }
@@ -219,12 +221,9 @@ MEM_STATIC U32 MEM_readLE32(const void* memPtr)
 
 MEM_STATIC void MEM_writeLE32(void* memPtr, U32 val32)
 {
-    if (MEM_isLittleEndian())
-    {
+    if (MEM_isLittleEndian()) {
         MEM_write32(memPtr, val32);
-    }
-    else
-    {
+    } else {
         BYTE* p = (BYTE*)memPtr;
         p[0] = (BYTE)val32;
         p[1] = (BYTE)(val32>>8);
@@ -237,8 +236,7 @@ MEM_STATIC U64 MEM_readLE64(const void* memPtr)
 {
     if (MEM_isLittleEndian())
         return MEM_read64(memPtr);
-    else
-    {
+    else {
         const BYTE* p = (const BYTE*)memPtr;
         return (U64)((U64)p[0] + ((U64)p[1]<<8) + ((U64)p[2]<<16) + ((U64)p[3]<<24)
                      + ((U64)p[4]<<32) + ((U64)p[5]<<40) + ((U64)p[6]<<48) + ((U64)p[7]<<56));
@@ -247,12 +245,9 @@ MEM_STATIC U64 MEM_readLE64(const void* memPtr)
 
 MEM_STATIC void MEM_writeLE64(void* memPtr, U64 val64)
 {
-    if (MEM_isLittleEndian())
-    {
+    if (MEM_isLittleEndian()) {
         MEM_write64(memPtr, val64);
-    }
-    else
-    {
+    } else {
         BYTE* p = (BYTE*)memPtr;
         p[0] = (BYTE)val64;
         p[1] = (BYTE)(val64>>8);
diff --git a/lib/zstd_buffered.c b/lib/zbuff.c
similarity index 89%
rename from lib/zstd_buffered.c
rename to lib/zbuff.c
index 48721d6..4c1eb2c 100644
--- a/lib/zstd_buffered.c
+++ b/lib/zbuff.c
@@ -1,6 +1,6 @@
 /*
     Buffered version of Zstd compression library
-    Copyright (C) 2015, Yann Collet.
+    Copyright (C) 2015-2016, Yann Collet.
 
     BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
@@ -36,14 +36,20 @@
  */
 
 /* *************************************
-*  Includes
+*  Dependencies
 ***************************************/
 #include <stdlib.h>
 #include "error_private.h"
 #include "zstd_static.h"
-#include "zstd_buffered_static.h"
+#include "zbuff_static.h"
 
 
+/* *************************************
+*  Constants
+***************************************/
+static size_t ZBUFF_blockHeaderSize = 3;
+static size_t ZBUFF_endFrameSize = 3;
+
 /** ************************************************
 *  Streaming compression
 *
@@ -119,7 +125,7 @@ size_t ZBUFF_freeCCtx(ZBUFF_CCtx* zbc)
 
 #define MIN(a,b)    ( ((a)<(b)) ? (a) : (b) )
 #define BLOCKSIZE   (128 * 1024)   /* a bit too "magic", should come from reference */
-size_t ZBUFF_compressInit_advanced(ZBUFF_CCtx* zbc, ZSTD_parameters params)
+size_t ZBUFF_compressInit_advanced(ZBUFF_CCtx* zbc, const void* dict, size_t dictSize, ZSTD_parameters params)
 {
     size_t neededInBuffSize;
 
@@ -127,23 +133,21 @@ size_t ZBUFF_compressInit_advanced(ZBUFF_CCtx* zbc, ZSTD_parameters params)
     neededInBuffSize = (size_t)1 << params.windowLog;
 
     /* allocate buffers */
-    if (zbc->inBuffSize < neededInBuffSize)
-    {
+    if (zbc->inBuffSize < neededInBuffSize) {
         zbc->inBuffSize = neededInBuffSize;
         free(zbc->inBuff);   /* should not be necessary */
         zbc->inBuff = (char*)malloc(neededInBuffSize);
         if (zbc->inBuff == NULL) return ERROR(memory_allocation);
     }
     zbc->blockSize = MIN(BLOCKSIZE, zbc->inBuffSize);
-    if (zbc->outBuffSize < ZSTD_compressBound(zbc->blockSize)+1)
-    {
+    if (zbc->outBuffSize < ZSTD_compressBound(zbc->blockSize)+1) {
         zbc->outBuffSize = ZSTD_compressBound(zbc->blockSize)+1;
         free(zbc->outBuff);   /* should not be necessary */
         zbc->outBuff = (char*)malloc(zbc->outBuffSize);
         if (zbc->outBuff == NULL) return ERROR(memory_allocation);
     }
 
-    zbc->outBuffContentSize = ZSTD_compressBegin_advanced(zbc->zc, params);
+    zbc->outBuffContentSize = ZSTD_compressBegin_advanced(zbc->zc, dict, dictSize, params);
     if (ZSTD_isError(zbc->outBuffContentSize)) return zbc->outBuffContentSize;
 
     zbc->inToCompress = 0;
@@ -156,14 +160,13 @@ size_t ZBUFF_compressInit_advanced(ZBUFF_CCtx* zbc, ZSTD_parameters params)
 
 size_t ZBUFF_compressInit(ZBUFF_CCtx* zbc, int compressionLevel)
 {
-    return ZBUFF_compressInit_advanced(zbc, ZSTD_getParams(compressionLevel, 0));
+    return ZBUFF_compressInit_advanced(zbc, NULL, 0, ZSTD_getParams(compressionLevel, 0));
 }
 
 
-ZSTDLIB_API size_t ZBUFF_compressWithDictionary(ZBUFF_CCtx* zbc, const void* src, size_t srcSize)
+ZSTDLIB_API size_t ZBUFF_compressInitDictionary(ZBUFF_CCtx* zbc, const void* dict, size_t dictSize, int compressionLevel)
 {
-    ZSTD_compress_insertDictionary(zbc->zc, src, srcSize);
-    return 0;
+    return ZBUFF_compressInit_advanced(zbc, dict, dictSize, ZSTD_getParams(compressionLevel, 0));
 }
 
 
@@ -189,8 +192,7 @@ static size_t ZBUFF_compressContinue_generic(ZBUFF_CCtx* zbc,
     char* op = ostart;
     char* const oend = ostart + *maxDstSizePtr;
 
-    while (notDone)
-    {
+    while (notDone) {
         switch(zbc->stage)
         {
         case ZBUFFcs_init: return ERROR(init_missing);   /* call ZBUFF_compressInit() first ! */
@@ -202,9 +204,9 @@ static size_t ZBUFF_compressContinue_generic(ZBUFF_CCtx* zbc,
                 size_t loaded = ZBUFF_limitCopy(zbc->inBuff + zbc->inBuffPos, toLoad, ip, iend-ip);
                 zbc->inBuffPos += loaded;
                 ip += loaded;
-                if ( (zbc->inBuffPos==zbc->inToCompress) || (!flush && (toLoad != loaded)) )
-                    { notDone = 0; break; }  /* not enough input to get a full block : stop there, wait for more */
-            }
+                if ( (zbc->inBuffPos==zbc->inToCompress) || (!flush && (toLoad != loaded)) ) {
+                    notDone = 0; break;  /* not enough input to get a full block : stop there, wait for more */
+            }   }
             /* compress current block (note : this stage cannot be stopped in the middle) */
             {
                 void* cDst;
@@ -236,8 +238,7 @@ static size_t ZBUFF_compressContinue_generic(ZBUFF_CCtx* zbc,
                 size_t flushed = ZBUFF_limitCopy(op, oend-op, zbc->outBuff + zbc->outBuffFlushedSize, toFlush);
                 op += flushed;
                 zbc->outBuffFlushedSize += flushed;
-                if (toFlush!=flushed)
-                    { notDone = 0; break; } /* not enough space within dst to store compressed block : stop there */
+                if (toFlush!=flushed) { notDone = 0; break; } /* not enough space within dst to store compressed block : stop there */
                 zbc->outBuffContentSize = 0;
                 zbc->outBuffFlushedSize = 0;
                 zbc->stage = ZBUFFcs_load;
@@ -260,7 +261,9 @@ static size_t ZBUFF_compressContinue_generic(ZBUFF_CCtx* zbc,
 size_t ZBUFF_compressContinue(ZBUFF_CCtx* zbc,
                               void* dst, size_t* maxDstSizePtr,
                         const void* src, size_t* srcSizePtr)
-{ return ZBUFF_compressContinue_generic(zbc, dst, maxDstSizePtr, src, srcSizePtr, 0); }
+{
+    return ZBUFF_compressContinue_generic(zbc, dst, maxDstSizePtr, src, srcSizePtr, 0);
+}
 
 
 
@@ -335,8 +338,6 @@ struct ZBUFF_DCtx_s {
     size_t outStart;
     size_t outEnd;
     size_t hPos;
-    const char* dict;
-    size_t dictSize;
     ZBUFF_dStage stage;
     unsigned char headerBuffer[ZSTD_frameHeaderSize_max];
 };   /* typedef'd to ZBUFF_DCtx within "zstd_buffered.h" */
@@ -365,19 +366,16 @@ size_t ZBUFF_freeDCtx(ZBUFF_DCtx* zbc)
 
 /* *** Initialization *** */
 
-size_t ZBUFF_decompressInit(ZBUFF_DCtx* zbc)
+size_t ZBUFF_decompressInitDictionary(ZBUFF_DCtx* zbc, const void* dict, size_t dictSize)
 {
     zbc->stage = ZBUFFds_readHeader;
-    zbc->hPos = zbc->inPos = zbc->outStart = zbc->outEnd = zbc->dictSize = 0;
-    return ZSTD_resetDCtx(zbc->zc);
+    zbc->hPos = zbc->inPos = zbc->outStart = zbc->outEnd = 0;
+    return ZSTD_decompressBegin_usingDict(zbc->zc, dict, dictSize);
 }
 
-
-size_t ZBUFF_decompressWithDictionary(ZBUFF_DCtx* zbc, const void* src, size_t srcSize)
+size_t ZBUFF_decompressInit(ZBUFF_DCtx* zbc)
 {
-    zbc->dict = (const char*)src;
-    zbc->dictSize = srcSize;
-    return 0;
+    return ZBUFF_decompressInitDictionary(zbc, NULL, 0);
 }
 
 
@@ -393,11 +391,9 @@ size_t ZBUFF_decompressContinue(ZBUFF_DCtx* zbc, void* dst, size_t* maxDstSizePt
     char* const oend = ostart + *maxDstSizePtr;
     U32 notDone = 1;
 
-    while (notDone)
-    {
+    while (notDone) {
         switch(zbc->stage)
         {
-
         case ZBUFFds_init :
             return ERROR(init_missing);
 
@@ -406,8 +402,7 @@ size_t ZBUFF_decompressContinue(ZBUFF_DCtx* zbc, void* dst, size_t* maxDstSizePt
             {
                 size_t headerSize = ZSTD_getFrameParams(&(zbc->params), src, *srcSizePtr);
                 if (ZSTD_isError(headerSize)) return headerSize;
-                if (headerSize)
-                {
+                if (headerSize) {
                     /* not enough input to decode header : tell how many bytes would be necessary */
                     memcpy(zbc->headerBuffer+zbc->hPos, src, *srcSizePtr);
                     zbc->hPos += *srcSizePtr;
@@ -429,8 +424,7 @@ size_t ZBUFF_decompressContinue(ZBUFF_DCtx* zbc, void* dst, size_t* maxDstSizePt
                 ip += headerSize;
                 headerSize = ZSTD_getFrameParams(&(zbc->params), zbc->headerBuffer, zbc->hPos);
                 if (ZSTD_isError(headerSize)) return headerSize;
-                if (headerSize)
-                {
+                if (headerSize) {
                     /* not enough input to decode header : tell how many bytes would be necessary */
                     *maxDstSizePtr = 0;
                     return headerSize - zbc->hPos;
@@ -443,25 +437,19 @@ size_t ZBUFF_decompressContinue(ZBUFF_DCtx* zbc, void* dst, size_t* maxDstSizePt
                 {
                     size_t neededOutSize = (size_t)1 << zbc->params.windowLog;
                     size_t neededInSize = BLOCKSIZE;   /* a block is never > BLOCKSIZE */
-                    if (zbc->inBuffSize < neededInSize)
-                    {
+                    if (zbc->inBuffSize < neededInSize) {
                         free(zbc->inBuff);
                         zbc->inBuffSize = neededInSize;
                         zbc->inBuff = (char*)malloc(neededInSize);
                         if (zbc->inBuff == NULL) return ERROR(memory_allocation);
                     }
-                    if (zbc->outBuffSize < neededOutSize)
-                    {
+                    if (zbc->outBuffSize < neededOutSize) {
                         free(zbc->outBuff);
                         zbc->outBuffSize = neededOutSize;
                         zbc->outBuff = (char*)malloc(neededOutSize);
                         if (zbc->outBuff == NULL) return ERROR(memory_allocation);
-                    }
-                }
-                if (zbc->dictSize)
-                    ZSTD_decompress_insertDictionary(zbc->zc, zbc->dict, zbc->dictSize);
-                if (zbc->hPos)
-                {
+                }   }
+                if (zbc->hPos) {
                     /* some data already loaded into headerBuffer : transfer into inBuff */
                     memcpy(zbc->inBuff, zbc->headerBuffer, zbc->hPos);
                     zbc->inPos = zbc->hPos;
@@ -474,14 +462,12 @@ size_t ZBUFF_decompressContinue(ZBUFF_DCtx* zbc, void* dst, size_t* maxDstSizePt
         case ZBUFFds_read:
             {
                 size_t neededInSize = ZSTD_nextSrcSizeToDecompress(zbc->zc);
-                if (neededInSize==0)   /* end of frame */
-                {
+                if (neededInSize==0) {  /* end of frame */
                     zbc->stage = ZBUFFds_init;
                     notDone = 0;
                     break;
                 }
-                if ((size_t)(iend-ip) >= neededInSize)
-                {
+                if ((size_t)(iend-ip) >= neededInSize) {
                     /* directly decode from src */
                     size_t decodedSize = ZSTD_decompressContinue(zbc->zc,
                         zbc->outBuff + zbc->outStart, zbc->outBuffSize - zbc->outStart,
@@ -517,16 +503,14 @@ size_t ZBUFF_decompressContinue(ZBUFF_DCtx* zbc, void* dst, size_t* maxDstSizePt
                     zbc->outEnd = zbc->outStart +  decodedSize;
                     zbc->stage = ZBUFFds_flush;
                     // break; /* ZBUFFds_flush follows */
-                }
-            }
+            }   }
         case ZBUFFds_flush:
             {
                 size_t toFlushSize = zbc->outEnd - zbc->outStart;
                 size_t flushedSize = ZBUFF_limitCopy(op, oend-op, zbc->outBuff + zbc->outStart, toFlushSize);
                 op += flushedSize;
                 zbc->outStart += flushedSize;
-                if (flushedSize == toFlushSize)
-                {
+                if (flushedSize == toFlushSize) {
                     zbc->stage = ZBUFFds_read;
                     if (zbc->outStart + BLOCKSIZE > zbc->outBuffSize)
                         zbc->outStart = zbc->outEnd = 0;
@@ -537,15 +521,14 @@ size_t ZBUFF_decompressContinue(ZBUFF_DCtx* zbc, void* dst, size_t* maxDstSizePt
                 break;
             }
         default: return ERROR(GENERIC);   /* impossible */
-        }
-    }
+    }   }
 
     *srcSizePtr = ip-istart;
     *maxDstSizePtr = op-ostart;
 
     {
         size_t nextSrcSizeHint = ZSTD_nextSrcSizeToDecompress(zbc->zc);
-        if (nextSrcSizeHint > 3) nextSrcSizeHint+= 3;   /* get the next block header while at it */
+        if (nextSrcSizeHint > ZBUFF_blockHeaderSize) nextSrcSizeHint+= ZBUFF_blockHeaderSize;   /* get next block header too */
         nextSrcSizeHint -= zbc->inPos;   /* already loaded*/
         return nextSrcSizeHint;
     }
@@ -553,18 +536,13 @@ size_t ZBUFF_decompressContinue(ZBUFF_DCtx* zbc, void* dst, size_t* maxDstSizePt
 
 
 
-
-
-
-
-
 /* *************************************
 *  Tool functions
 ***************************************/
 unsigned ZBUFF_isError(size_t errorCode) { return ERR_isError(errorCode); }
 const char* ZBUFF_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
 
-size_t ZBUFF_recommendedCInSize()  { return BLOCKSIZE; }
-size_t ZBUFF_recommendedCOutSize() { return ZSTD_compressBound(BLOCKSIZE) + 6; }
-size_t ZBUFF_recommendedDInSize()  { return BLOCKSIZE + 3; }
-size_t ZBUFF_recommendedDOutSize() { return BLOCKSIZE; }
+size_t ZBUFF_recommendedCInSize(void)  { return BLOCKSIZE; }   /* compression input : one full block */
+size_t ZBUFF_recommendedCOutSize(void) { return ZSTD_compressBound(BLOCKSIZE) + ZBUFF_blockHeaderSize + ZBUFF_endFrameSize; }   /* compression output : bound of one compressed block + block header + frame end */
+size_t ZBUFF_recommendedDInSize(void)  { return BLOCKSIZE + ZBUFF_blockHeaderSize /* block header size*/ ; }   /* decompression input : one block + the header of the next one */
+size_t ZBUFF_recommendedDOutSize(void) { return BLOCKSIZE; }   /* decompression output : one full block */
diff --git a/lib/zstd_buffered.h b/lib/zbuff.h
similarity index 66%
rename from lib/zstd_buffered.h
rename to lib/zbuff.h
index d2316a8..d3275b7 100644
--- a/lib/zstd_buffered.h
+++ b/lib/zbuff.h
@@ -1,6 +1,6 @@
 /*
     Buffered version of Zstd compression library
-    Copyright (C) 2015, Yann Collet.
+    Copyright (C) 2015-2016, Yann Collet.
 
     BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
@@ -26,14 +26,13 @@
     OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
     You can contact the author at :
-    - zstd source repository : https://github.com/Cyan4973/zstd
-    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
+    - zstd homepage : http://www.zstd.net/
 */
 #ifndef ZSTD_BUFFERED_H
 #define ZSTD_BUFFERED_H
 
 /* The objects defined into this file should be considered experimental.
- * They are not labelled stable, as their prototype may change in the future.
+ * They are not considered stable, as their prototype may change in the future.
  * You can use them for tests, provide feedback, or if you can endure risk of future changes.
  */
 
@@ -42,13 +41,13 @@ extern "C" {
 #endif
 
 /* *************************************
-*  Includes
+*  Dependencies
 ***************************************/
 #include <stddef.h>   /* size_t */
 
 
 /* ***************************************************************
-*  Tuning parameters
+*  Compiler specifics
 *****************************************************************/
 /*!
 *  ZSTD_DLL_EXPORT :
@@ -69,48 +68,50 @@ ZSTDLIB_API ZBUFF_CCtx* ZBUFF_createCCtx(void);
 ZSTDLIB_API size_t      ZBUFF_freeCCtx(ZBUFF_CCtx* cctx);
 
 ZSTDLIB_API size_t ZBUFF_compressInit(ZBUFF_CCtx* cctx, int compressionLevel);
-ZSTDLIB_API size_t ZBUFF_compressWithDictionary(ZBUFF_CCtx* cctx, const void* src, size_t srcSize);
-ZSTDLIB_API size_t ZBUFF_compressContinue(ZBUFF_CCtx* cctx, void* dst, size_t* maxDstSizePtr, const void* src, size_t* srcSizePtr);
-ZSTDLIB_API size_t ZBUFF_compressFlush(ZBUFF_CCtx* cctx, void* dst, size_t* maxDstSizePtr);
-ZSTDLIB_API size_t ZBUFF_compressEnd(ZBUFF_CCtx* cctx, void* dst, size_t* maxDstSizePtr);
+ZSTDLIB_API size_t ZBUFF_compressInitDictionary(ZBUFF_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
 
-/** ************************************************
+ZSTDLIB_API size_t ZBUFF_compressContinue(ZBUFF_CCtx* cctx, void* dst, size_t* dstCapacityPtr, const void* src, size_t* srcSizePtr);
+ZSTDLIB_API size_t ZBUFF_compressFlush(ZBUFF_CCtx* cctx, void* dst, size_t* dstCapacityPtr);
+ZSTDLIB_API size_t ZBUFF_compressEnd(ZBUFF_CCtx* cctx, void* dst, size_t* dstCapacityPtr);
+
+/*-*************************************************
 *  Streaming compression
 *
 *  A ZBUFF_CCtx object is required to track streaming operation.
 *  Use ZBUFF_createCCtx() and ZBUFF_freeCCtx() to create/release resources.
-*  Use ZBUFF_compressInit() to start a new compression operation.
 *  ZBUFF_CCtx objects can be reused multiple times.
 *
-*  Optionally, a reference to a static dictionary can be created with ZBUFF_compressWithDictionary()
-*  Note that the dictionary content must remain accessible during the compression process.
+*  Start by initializing ZBUFF_CCtx.
+*  Use ZBUFF_compressInit() to start a new compression operation.
+*  Use ZBUFF_compressInitDictionary() for a compression which requires a dictionary.
 *
 *  Use ZBUFF_compressContinue() repetitively to consume input stream.
-*  *srcSizePtr and *maxDstSizePtr can be any size.
-*  The function will report how many bytes were read or written within *srcSizePtr and *maxDstSizePtr.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written within *srcSizePtr and *dstCapacityPtr.
 *  Note that it may not consume the entire input, in which case it's up to the caller to present again remaining data.
-*  The content of dst will be overwritten (up to *maxDstSizePtr) at each function call, so save its content if it matters or move dst .
-*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency)
+*  The content of @dst will be overwritten (up to *dstCapacityPtr) at each call, so save its content if it matters or change @dst .
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's just a hint, to improve latency)
 *            or an error code, which can be tested using ZBUFF_isError().
 *
-*  ZBUFF_compressFlush() can be used to instruct ZBUFF to compress and output whatever remains within its buffer.
-*  Note that it will not output more than *maxDstSizePtr.
-*  Therefore, some content might still be left into its internal buffer if dst buffer is too small.
+*  At any moment, it's possible to flush whatever data remains within buffer, using ZBUFF_compressFlush().
+*  The nb of bytes written into @dst will be reported into *dstCapacityPtr.
+*  Note that the function cannot output more than *dstCapacityPtr,
+*  therefore, some content might still be left into internal buffer if *dstCapacityPtr is too small.
 *  @return : nb of bytes still present into internal buffer (0 if it's empty)
 *            or an error code, which can be tested using ZBUFF_isError().
 *
 *  ZBUFF_compressEnd() instructs to finish a frame.
 *  It will perform a flush and write frame epilogue.
-*  Note that the epilogue is necessary for decoders to consider a frame completed.
-*  Similar to ZBUFF_compressFlush(), it may not be able to output the entire internal buffer content if *maxDstSizePtr is too small.
+*  The epilogue is required for decoders to consider a frame completed.
+*  Similar to ZBUFF_compressFlush(), it may not be able to output the entire internal buffer content if *dstCapacityPtr is too small.
 *  In which case, call again ZBUFF_compressFlush() to complete the flush.
 *  @return : nb of bytes still present into internal buffer (0 if it's empty)
 *            or an error code, which can be tested using ZBUFF_isError().
 *
 *  Hint : recommended buffer sizes (not compulsory) : ZBUFF_recommendedCInSize / ZBUFF_recommendedCOutSize
-*  input : ZBUFF_recommendedCInSize==128 KB block size is the internal unit, it improves latency to use this value.
+*  input : ZBUFF_recommendedCInSize==128 KB block size is the internal unit, it improves latency to use this value (skipped buffering).
 *  output : ZBUFF_recommendedCOutSize==ZSTD_compressBound(128 KB) + 3 + 3 : ensures it's always possible to write/flush/end a full block. Skip some buffering.
-*  By using both, you ensure that input will be entirely consumed, and output will always contain the result.
+*  By using both, it ensures that input will be entirely consumed, and output will always contain the result, reducing intermediate buffering.
 * **************************************************/
 
 
@@ -119,35 +120,34 @@ ZSTDLIB_API ZBUFF_DCtx* ZBUFF_createDCtx(void);
 ZSTDLIB_API size_t      ZBUFF_freeDCtx(ZBUFF_DCtx* dctx);
 
 ZSTDLIB_API size_t ZBUFF_decompressInit(ZBUFF_DCtx* dctx);
-ZSTDLIB_API size_t ZBUFF_decompressWithDictionary(ZBUFF_DCtx* dctx, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZBUFF_decompressInitDictionary(ZBUFF_DCtx* dctx, const void* dict, size_t dictSize);
 
-ZSTDLIB_API size_t ZBUFF_decompressContinue(ZBUFF_DCtx* dctx, void* dst, size_t* maxDstSizePtr, const void* src, size_t* srcSizePtr);
+ZSTDLIB_API size_t ZBUFF_decompressContinue(ZBUFF_DCtx* dctx,
+                                            void* dst, size_t* dstCapacityPtr,
+                                      const void* src, size_t* srcSizePtr);
 
-/** ************************************************
+/*-***************************************************************************
 *  Streaming decompression
 *
-*  A ZBUFF_DCtx object is required to track streaming operation.
+*  A ZBUFF_DCtx object is required to track streaming operations.
 *  Use ZBUFF_createDCtx() and ZBUFF_freeDCtx() to create/release resources.
-*  Use ZBUFF_decompressInit() to start a new decompression operation.
-*  ZBUFF_DCtx objects can be reused multiple times.
-*
-*  Optionally, a reference to a static dictionary can be set, using ZBUFF_decompressWithDictionary()
-*  It must be the same content as the one set during compression phase.
-*  Dictionary content must remain accessible during the decompression process.
+*  Use ZBUFF_decompressInit() to start a new decompression operation,
+*   or ZBUFF_decompressInitDictionary() if decompression requires a dictionary.
+*  Note that ZBUFF_DCtx objects can be reused multiple times.
 *
 *  Use ZBUFF_decompressContinue() repetitively to consume your input.
-*  *srcSizePtr and *maxDstSizePtr can be any size.
-*  The function will report how many bytes were read or written by modifying *srcSizePtr and *maxDstSizePtr.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
 *  Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
-*  The content of dst will be overwritten (up to *maxDstSizePtr) at each function call, so save its content if it matters or change dst.
-*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency)
+*  The content of @dst will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters or change @dst.
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency)
 *            or 0 when a frame is completely decoded
 *            or an error code, which can be tested using ZBUFF_isError().
 *
-*  Hint : recommended buffer sizes (not compulsory) : ZBUFF_recommendedDInSize / ZBUFF_recommendedDOutSize
-*  output : ZBUFF_recommendedDOutSize==128 KB block size is the internal unit, it ensures it's always possible to write a full block when it's decoded.
-*  input : ZBUFF_recommendedDInSize==128Kb+3; just follow indications from ZBUFF_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
-* **************************************************/
+*  Hint : recommended buffer sizes (not compulsory) : ZBUFF_recommendedDInSize() / ZBUFF_recommendedDOutSize()
+*  output : ZBUFF_recommendedDOutSize==128 KB block size is the internal unit, it ensures it's always possible to write a full block when decoded.
+*  input  : ZBUFF_recommendedDInSize==128 KB + 3; just follow indications from ZBUFF_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* *******************************************************************************/
 
 
 /* *************************************
@@ -156,8 +156,8 @@ ZSTDLIB_API size_t ZBUFF_decompressContinue(ZBUFF_DCtx* dctx, void* dst, size_t*
 ZSTDLIB_API unsigned ZBUFF_isError(size_t errorCode);
 ZSTDLIB_API const char* ZBUFF_getErrorName(size_t errorCode);
 
-/** The below functions provide recommended buffer sizes for Compression or Decompression operations.
-*   These sizes are not compulsory, they just tend to offer better latency */
+/** Functions below provide recommended buffer sizes for Compression or Decompression operations.
+*   These sizes are just hints, and tend to offer better latency */
 ZSTDLIB_API size_t ZBUFF_recommendedCInSize(void);
 ZSTDLIB_API size_t ZBUFF_recommendedCOutSize(void);
 ZSTDLIB_API size_t ZBUFF_recommendedDInSize(void);
diff --git a/lib/zstd_buffered_static.h b/lib/zbuff_static.h
similarity index 90%
copy from lib/zstd_buffered_static.h
copy to lib/zbuff_static.h
index 7d9ee27..4055089 100644
--- a/lib/zstd_buffered_static.h
+++ b/lib/zbuff_static.h
@@ -1,7 +1,7 @@
 /*
     zstd - buffered version of compression library
     experimental complementary API, for static linking only
-    Copyright (C) 2015, Yann Collet.
+    Copyright (C) 2015-2016, Yann Collet.
 
     BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
@@ -27,8 +27,7 @@
     OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
     You can contact the author at :
-    - zstd source repository : https://github.com/Cyan4973/zstd
-    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
+    - zstd homepage : http://www.zstd.net
 */
 #ifndef ZSTD_BUFFERED_STATIC_H
 #define ZSTD_BUFFERED_STATIC_H
@@ -45,14 +44,14 @@ extern "C" {
 /* *************************************
 *  Includes
 ***************************************/
-#include "zstd_static.h"
-#include "zstd_buffered.h"
+#include "zstd_static.h"     /* ZSTD_parameters */
+#include "zbuff.h"
 
 
 /* *************************************
 *  Advanced Streaming functions
 ***************************************/
-ZSTDLIB_API size_t ZBUFF_compressInit_advanced(ZBUFF_CCtx* cctx, ZSTD_parameters params);
+ZSTDLIB_API size_t ZBUFF_compressInit_advanced(ZBUFF_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params);
 
 
 #if defined (__cplusplus)
diff --git a/lib/zdict.c b/lib/zdict.c
new file mode 100644
index 0000000..d3d5784
--- /dev/null
+++ b/lib/zdict.c
@@ -0,0 +1,923 @@
+/*
+    dictBuilder - dictionary builder for zstd
+    Copyright (C) Yann Collet 2016
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - Zstd homepage : https://www.zstd.net
+*/
+
+/*-**************************************
+*  Compiler Options
+****************************************/
+/* Disable some Visual warning messages */
+#ifdef _MSC_VER
+#  pragma warning(disable : 4127)                /* disable: C4127: conditional expression is constant */
+#endif
+
+/* Unix Large Files support (>4GB) */
+#define _FILE_OFFSET_BITS 64
+#if (defined(__sun__) && (!defined(__LP64__)))   /* Sun Solaris 32-bits requires specific definitions */
+#  define _LARGEFILE_SOURCE
+#elif ! defined(__LP64__)                        /* No point defining Large file for 64 bit */
+#  define _LARGEFILE64_SOURCE
+#endif
+
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdlib.h>        /* malloc, free */
+#include <string.h>        /* memset */
+#include <stdio.h>         /* fprintf, fopen, ftello64 */
+#include <sys/types.h>     /* stat64 */
+#include <sys/stat.h>      /* stat64 */
+#include <time.h>          /* clock */
+
+#include "mem.h"           /* read */
+#include "error_private.h"
+#include "fse.h"
+#include "huff0_static.h"
+#include "zstd_internal.h"
+#include "divsufsort.h"
+#include "zdict_static.h"
+
+
+/*-*************************************
+*  Compiler specifics
+***************************************/
+#if !defined(S_ISREG)
+#  define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
+#endif
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define DICTLISTSIZE 10000
+
+#define NOISELENGTH 32
+#define PRIME1   2654435761U
+#define PRIME2   2246822519U
+
+#define MINRATIO 4
+static const U32 g_compressionLevel_default = 5;
+static const U32 g_selectivity_default = 9;
+static const size_t g_provision_entropySize = 200;
+static const size_t g_min_fast_dictContent = 192;
+
+
+/*-*************************************
+*  Console display
+***************************************/
+#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)   /* every message goes to stderr */
+#define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }   /* print only when verbosity is >= l */
+static unsigned g_displayLevel = 0;   /* 0 : no display;   1: errors;   2: default;  4: full information */
+/* DISPLAYUPDATE() : same as DISPLAYLEVEL(), but rate-limited : prints only when ZDICT_GetMilliSpan(g_time) exceeds refreshRate */
+#define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
+            if (ZDICT_GetMilliSpan(g_time) > refreshRate)  \
+            { g_time = clock(); DISPLAY(__VA_ARGS__); \
+            if (g_displayLevel>=4) fflush(stderr); } }   /* flush the stream DISPLAY() actually writes to */
+static const unsigned refreshRate = 300;   /* minimum span between two DISPLAYUPDATE() prints, in ZDICT_GetMilliSpan() units (ms) */
+static clock_t g_time = 0;                 /* clock() value captured at the last DISPLAYUPDATE() print */
+
+/* ZDICT_printHex() : despite its name, prints the @length bytes at @ptr as characters (not hex digits), through DISPLAYLEVEL(@dlevel). */
+static void ZDICT_printHex(U32 dlevel, const void* ptr, size_t length)
+{
+    const BYTE* const bytes = (const BYTE*)ptr;
+    size_t n = 0;
+    while (n < length) {
+        BYTE const raw = bytes[n++];
+        BYTE const shown = ((raw >= 32) && (raw <= 126)) ? raw : (BYTE)'.';   /* mask non-printable bytes */
+        DISPLAYLEVEL(dlevel, "%c", shown);
+    }
+}
+
+
+/*-********************************************************
+*  Helper functions
+**********************************************************/
+static unsigned ZDICT_GetMilliSpan(clock_t nPrevious)   /* @return : clock() time elapsed since @nPrevious, converted to milliseconds */
+{
+    clock_t const ticksElapsed = clock() - nPrevious;
+    /* ticks -> milliseconds */
+    return (unsigned)((ticksElapsed * 1000) / CLOCKS_PER_SEC);
+}
+
+unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }   /* forwards to ERR_isError() */
+/* ZDICT_getErrorName() : forwards to ERR_getErrorName() */
+const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
+
+
+/*-********************************************************
+*  Dictionary training functions
+**********************************************************/
+static unsigned ZDICT_NbCommonBytes (register size_t val)   /* val : XOR of two machine words, must be != 0 (ZDICT_count() only calls it then) */
+{   /* @return : index of the first non-zero byte of val in memory order, i.e. nb of bytes the two XORed words had in common */
+    if (MEM_isLittleEndian()) {   /* first byte in memory == least significant byte : count trailing zero bits, then /8 */
+        if (MEM_64bits()) {
+#       if defined(_MSC_VER) && defined(_WIN64)
+            unsigned long r = 0;
+            _BitScanForward64( &r, (U64)val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_ctzll((U64)val) >> 3);
+#       else
+            static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
+            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];   /* (val & -val) isolates the lowest set bit ; the multiply + shift turns it into a table index */
+#       endif
+        } else { /* 32 bits */
+#       if defined(_MSC_VER)
+            unsigned long r=0;
+            _BitScanForward( &r, (U32)val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_ctz((U32)val) >> 3);
+#       else
+            static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
+            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];   /* same lowest-set-bit trick, 32-bit flavour */
+#       endif
+        }
+    } else {  /* Big Endian CPU */
+        if (MEM_64bits()) {   /* first byte in memory == most significant byte : count leading zero bits, then /8 */
+#       if defined(_MSC_VER) && defined(_WIN64)
+            unsigned long r = 0;
+            _BitScanReverse64( &r, val );
+            return (unsigned)(r>>3);   /* NOTE(review): r looks like the index of the highest set bit, not a leading-zero count ; presumably harmless because MSVC targets are little-endian - confirm */
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_clzll(val) >> 3);
+#       else
+            unsigned r;
+            const unsigned n32 = sizeof(size_t)*4;   /* calculate this way due to compiler complaining in 32-bits mode */
+            if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }   /* upper half all zero : 4 common bytes already */
+            if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
+            r += (!val);
+            return r;
+#       endif
+        } else { /* 32 bits */
+#       if defined(_MSC_VER)
+            unsigned long r = 0;
+            _BitScanReverse( &r, (unsigned long)val );
+            return (unsigned)(r>>3);   /* NOTE(review): same remark as the 64-bit MSVC big-endian branch - confirm */
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_clz((U32)val) >> 3);
+#       else
+            unsigned r;
+            if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
+            r += (!val);
+            return r;
+#       endif
+    }   }
+}
+
+
+/*! ZDICT_count() :
+    @return : nb of leading bytes shared by the sequences starting at @pIn and @pMatch.
+    Note : reads proceed word by word with no bound check, so the end of buffer
+    is presumed to be followed by a noisy guard band. */
+static size_t ZDICT_count(const void* pIn, const void* pMatch)
+{
+    const char* ip = (const char*)pIn;
+    const char* mp = (const char*)pMatch;
+    for (;;) {
+        size_t const diff = MEM_readST(mp) ^ MEM_readST(ip);
+        if (diff) return (size_t)(ip - (const char*)pIn) + ZDICT_NbCommonBytes(diff);   /* whole words matched so far + common bytes inside this one */
+        ip += sizeof(size_t); mp += sizeof(size_t);
+    }
+}
+
+
+typedef struct {
+    U32 pos;       /* start of the segment (ZDICT_analyzePos() stores an offset into the analyzed buffer) ; in slot 0 of a table : nb of slots in use */
+    U32 length;    /* length of the segment, in bytes */
+    U32 savings;   /* estimated gain of the segment ; table entries are kept ordered on it, largest first */
+} dictItem;
+
+static void ZDICT_initDictItem(dictItem* d)   /* presets *d to { pos=1, length=0, savings=max U32 } ; presumably meant for slot 0 of a table - confirm against caller */
+{
+    d->savings = (U32)(-1);
+    d->length = 0;
+    d->pos = 1;
+}
+
+
+#define LLIMIT 64          /* heuristic determined experimentally */
+#define MINMATCHLENGTH 7   /* heuristic determined experimentally */
+static dictItem ZDICT_analyzePos(   /* looks for a repeated segment around suffix[start] ; @return : its {pos, length, savings}, or an all-zero item if none qualifies */
+                       BYTE* doneMarks,                 /* doneMarks[p] is set to 1 for every buffer position p handled here */
+                       const int* suffix, U32 start,    /* suffix : table of buffer positions (presumably a sorted suffix array - confirm against caller) ; start : entry to analyze */
+                       const void* buffer, U32 minRatio)   /* minRatio : minimum nb of occurrences required to keep a segment */
+{
+    U32 lengthList[LLIMIT] = {0};    /* lengthList[n] : nb of neighbours sharing n bytes with pos (n capped to LLIMIT-1) */
+    U32 cumulLength[LLIMIT] = {0};   /* cumulLength[n] : nb of neighbours sharing at least n bytes */
+    U32 savings[LLIMIT] = {0};       /* savings[n] : cumulated gain estimate for a segment of length n */
+    const BYTE* b = (const BYTE*)buffer;
+    size_t length;
+    size_t maxLength = LLIMIT;
+    size_t pos = suffix[start];
+    U32 end = start;
+    dictItem solution;
+
+    /* init */
+    memset(&solution, 0, sizeof(solution));
+    doneMarks[pos] = 1;
+
+    /* trivial repetition cases */
+    if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2))
+       ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
+       ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
+        /* skip and mark segment */
+        U16 u16 = MEM_read16(b+pos+4);
+        U32 u, e = 6;
+        while (MEM_read16(b+pos+e) == u16) e+=2 ;   /* NOTE(review): no bound check visible ; presumably stopped by the guard band mentioned in ZDICT_count() - confirm */
+        if (b[pos+e] == b[pos+e-1]) e++;
+        for (u=1; u<e; u++)
+            doneMarks[pos+u] = 1;
+        return solution;   /* still all-zero */
+    }
+
+    /* look forward */
+    do {   /* widen [start, end) while the next entries share >= MINMATCHLENGTH bytes with pos */
+        end++;
+        length = ZDICT_count(b + pos, b + suffix[end]);   /* NOTE(review): suffix[end] is read without any visible upper bound - confirm the caller guarantees termination */
+    } while (length >=MINMATCHLENGTH);
+
+    /* look backward */
+    do {
+        length = ZDICT_count(b + pos, b + *(suffix+start-1));   /* NOTE(review): reads suffix[start-1], i.e. suffix[-1] when start==0 - confirm the caller provides that entry */
+        if (length >=MINMATCHLENGTH) start--;
+    } while(length >= MINMATCHLENGTH);
+
+    /* exit if not found a minimum nb of repetitions */
+    if (end-start < minRatio) {
+        U32 idx;
+        for(idx=start; idx<end; idx++)
+            doneMarks[suffix[idx]] = 1;
+        return solution;   /* still all-zero */
+    }
+
+    {
+        int i;
+        U32 searchLength;
+        U32 refinedStart = start;
+        U32 refinedEnd = end;
+
+        DISPLAYLEVEL(4, "\n");
+        DISPLAYLEVEL(4, "found %3u matches of length >= %u at pos %7u  ", (U32)(end-start), MINMATCHLENGTH, (U32)pos);
+        DISPLAYLEVEL(4, "\n");
+
+        for (searchLength = MINMATCHLENGTH ; ; searchLength++) {   /* refinement : at each extra byte offset, keep the longest run of consecutive entries holding the same byte */
+            BYTE currentChar = 0;
+            U32 currentCount = 0;
+            U32 currentID = refinedStart;
+            U32 id;
+            U32 selectedCount = 0;
+            U32 selectedID = currentID;
+            for (id =refinedStart; id < refinedEnd; id++) {
+                if (b[ suffix[id] + searchLength] != currentChar) {   /* run ends here : keep it if it is the longest so far */
+                    if (currentCount > selectedCount) {
+                        selectedCount = currentCount;
+                        selectedID = currentID;
+                    }
+                    currentID = id;
+                    currentChar = b[ suffix[id] + searchLength];
+                    currentCount = 0;
+                }
+                currentCount ++;
+            }
+            if (currentCount > selectedCount) {  /* for last */
+                selectedCount = currentCount;
+                selectedID = currentID;
+            }
+
+            if (selectedCount < minRatio)   /* too few entries agree on this byte : stop refining */
+                break;
+            refinedStart = selectedID;
+            refinedEnd = refinedStart + selectedCount;
+        }
+
+        /* evaluate gain based on new ref */
+        start = refinedStart;
+        pos = suffix[refinedStart];
+        end = start;
+        memset(lengthList, 0, sizeof(lengthList));
+
+        /* look forward */
+        do {   /* same scan as the first one, now also filling the histogram of match lengths */
+            end++;
+            length = ZDICT_count(b + pos, b + suffix[end]);
+            if (length >= LLIMIT) length = LLIMIT-1;
+            lengthList[length]++;
+        } while (length >=MINMATCHLENGTH);
+
+        /* look backward */
+        do {
+            length = ZDICT_count(b + pos, b + suffix[start-1]);
+            if (length >= LLIMIT) length = LLIMIT-1;
+            lengthList[length]++;
+            if (length >=MINMATCHLENGTH) start--;
+        } while(length >= MINMATCHLENGTH);
+
+        /* largest useful length */
+        memset(cumulLength, 0, sizeof(cumulLength));
+        cumulLength[maxLength-1] = lengthList[maxLength-1];   /* maxLength still equals LLIMIT here */
+        for (i=(int)(maxLength-2); i>=0; i--)
+            cumulLength[i] = cumulLength[i+1] + lengthList[i];
+
+        for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break;
+        maxLength = i;   /* ends at MINMATCHLENGTH-1 when no length reaches minRatio ; rejected by the test below */
+
+        /* trim maxLength when the segment ends inside a run of identical bytes */
+        {
+            U32 l = (U32)maxLength;
+            BYTE c = b[pos + maxLength-1];   /* last byte of the candidate segment */
+            while (b[pos+l-2]==c) l--;
+            maxLength = l;
+        }
+        if (maxLength < MINMATCHLENGTH) return solution;   /* skip : no long-enough solution */
+
+        /* calculate savings */
+        savings[5] = 0;   /* NOTE(review): the loop below first reads savings[MINMATCHLENGTH-1] (index 6), which is 0 from the initializer ; this store looks redundant - confirm */
+        for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
+            savings[i] = savings[i-1] + (lengthList[i] * (i-3));
+
+        DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f)  \n",
+                     (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
+
+        solution.pos = (U32)pos;
+        solution.length = (U32)maxLength;
+        solution.savings = savings[maxLength];
+
+        /* mark positions done */
+        {
+            U32 id;
+            U32 testedPos;
+            for (id=start; id<end; id++) {
+                U32 p, pEnd;
+                testedPos = suffix[id];
+                if (testedPos == pos)
+                    length = solution.length;
+                else {
+                    length = ZDICT_count(b+pos, b+testedPos);
+                    if (length > solution.length) length = solution.length;   /* never mark beyond the selected segment length */
+                }
+                pEnd = (U32)(testedPos + length);
+                for (p=testedPos; p<pEnd; p++)
+                    doneMarks[p] = 1;
+    }   }   }
+
+    return solution;
+}
+
+
+/* ZDICT_bubbleUp() : helper of ZDICT_checkMerge() ; moves table[id] towards the front while its predecessor
+   has strictly smaller savings (slot 0 is never displaced). @return : final index of the element */
+static U32 ZDICT_bubbleUp(dictItem* table, U32 id)
+{
+    dictItem const moved = table[id];
+    for ( ; (id>1) && (table[id-1].savings < moved.savings); id--) table[id] = table[id-1];
+    table[id] = moved;
+    return id;
+}
+
+/*! ZDICT_checkMerge() :
+    look for a table entry overlapping @elt (slot @eltNbToSkip is ignored) ; grow it to cover @elt and re-sort it.
+    @return : id of destination elt, 0 if not merged */
+static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
+{
+    const U32 nbSlots = table->pos;   /* convention : slot 0 stores the nb of slots in use */
+    const U32 eltLast = elt.pos + (elt.length-1);
+    U32 id;
+    /* tail overlap : an entry begins inside elt => pull its start back to elt.pos */
+    for (id=1; id<nbSlots; id++) {
+        dictItem* const cur = table + id;
+        if (id==eltNbToSkip) continue;
+        if ((cur->pos <= elt.pos) || (cur->pos >= eltLast)) continue;   /* no overlap */
+        {   U32 const addedLength = cur->pos - elt.pos;
+            cur->length += addedLength;
+            cur->pos = elt.pos;
+            cur->savings += elt.savings * addedLength / elt.length;   /* rough approx */
+            cur->savings += elt.length / 8;    /* rough approx */
+            return ZDICT_bubbleUp(table, id);
+    }   }
+
+    /* front overlap : elt begins inside an entry => push its end forward if elt runs past it */
+    for (id=1; id<nbSlots; id++) {
+        dictItem* const cur = table + id;
+        if (id==eltNbToSkip) continue;
+        if ((cur->pos + cur->length <= elt.pos) || (cur->pos >= elt.pos)) continue;   /* no overlap */
+        {   int const addedLength = (int)((elt.pos + elt.length) - (cur->pos + cur->length));
+            cur->savings += elt.length / 8;    /* rough approx */
+            if (addedLength > 0) {   /* otherwise, already included */
+                cur->length += addedLength;
+                cur->savings += elt.savings * addedLength / elt.length;   /* rough approx */
+            }
+            return ZDICT_bubbleUp(table, id);
+    }   }
+    return 0;
+}
+
+
+static void ZDICT_removeDictItem(dictItem* table, U32 id)
+{
+    /* Deletes entry `id` : following entries move down by one slot. Convention : first element is nb of elts (table[0].pos), and is never removed */
+    U32 max = table->pos;
+    U32 u;
+    if (!id) return;   /* protection, should never happen */
+    for (u=id; u<max-1; u++)
+        table[u] = table[u+1];
+    table->pos--;   /* one used slot less */
+}
+
+
+static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
+{
+    /* Adds elt to table : merge if possible, then keep merging the resulting entry for as long as it overlaps another one */
+    U32 mergeId = ZDICT_checkMerge(table, elt, 0);
+    if (mergeId) {
+        U32 newMerge = 1;
+        while (newMerge) {
+            newMerge = ZDICT_checkMerge(table, table[mergeId], mergeId);   /* try to merge the merged entry into any other one */
+            if (newMerge) ZDICT_removeDictItem(table, mergeId);   /* its content now lives in the destination entry */
+            mergeId = newMerge;   /* NOTE(review): removal shifts entries above mergeId down by one; confirm this index is still right when newMerge > mergeId */
+        }
+        return;
+    }
+
+    /* insert : no overlap, elt is placed by decreasing savings; when the table is full, the last entry is overwritten */
+    {
+        U32 current;
+        U32 nextElt = table->pos;
+        if (nextElt >= maxSize) nextElt = maxSize-1;
+        current = nextElt-1;
+        while (table[current].savings < elt.savings) {   /* no lower bound on current : presumably table[0].savings acts as a sentinel - TODO confirm in ZDICT_initDictItem() */
+            table[current+1] = table[current];
+            current--;
+        }
+        table[current+1] = elt;
+        table->pos = nextElt+1;
+    }
+}
+
+
+static U32 ZDICT_dictSize(const dictItem* dictList)
+{
+    U32 u, dictSize = 0;   /* @return : sum of the lengths of all used entries, i.e. slots 1 to dictList[0].pos-1 */
+    for (u=1; u<dictList[0].pos; u++)
+        dictSize += dictList[u].length;
+    return dictSize;
+}
+
+
+static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
+                            const void* const buffer, const size_t bufferSize,   /* buffer must end with noisy guard band */
+                            const size_t* fileSizes, unsigned nbFiles,
+                            U32 shiftRatio, unsigned maxDictSize)
+{
+    int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));   /* suffix array, plus one extra cell before it and one after it */
+    int* const suffix = suffix0+1;
+    U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));   /* buffer position => index within suffix[] */
+    BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks));   /* +16 for overflow security */
+    U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
+    U32 minRatio = nbFiles >> shiftRatio;   /* threshold handed to ZDICT_analyzePos(); raised to MINRATIO below when smaller */
+    int divSuftSortResult;
+    size_t result = 0;   /* returned value : 0, or an error code */
+
+    /* init */
+    DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
+    if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) {
+        result = ERROR(memory_allocation);
+        goto _cleanup;
+    }
+    if (minRatio < MINRATIO) minRatio = MINRATIO;
+    memset(doneMarks, 0, bufferSize+16);
+
+    /* sort */
+    DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
+    divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
+    if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
+    suffix[bufferSize] = (int)bufferSize;   /* leads into noise */
+    suffix0[0] = (int)bufferSize;           /* leads into noise */
+    {
+        /* build reverse suffix sort */
+        size_t pos;
+        for (pos=0; pos < bufferSize; pos++)
+            reverseSuffix[suffix[pos]] = (U32)pos;
+        /* build file pos : start offset of each file within buffer (not read again within this function) */
+        filePos[0] = 0;
+        for (pos=1; pos<nbFiles; pos++)
+            filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
+    }
+
+    DISPLAYLEVEL(2, "finding patterns ... \n");
+    DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio);
+
+    {
+        U32 cursor; for (cursor=0; cursor < bufferSize; ) {   /* scan the buffer; each selected segment is stored into dictList */
+            dictItem solution;
+            if (doneMarks[cursor]) { cursor++; continue; }   /* position already marked done by ZDICT_analyzePos() */
+            solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio);
+            if (solution.length==0) { cursor++; continue; }   /* nothing selected at this position */
+            ZDICT_insertDictItem(dictList, dictListSize, solution);
+            cursor += solution.length;
+            DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
+    }   }
+
+    /* limit dictionary size : keep only the leading entries whose cumulated length does not exceed maxDictSize */
+    {
+        U32 max = dictList->pos;   /* convention : nb of useful elts within dictList */
+        U32 currentSize = 0;
+        U32 n; for (n=1; n<max; n++) {
+            currentSize += dictList[n].length;
+            if (currentSize > maxDictSize) break;
+        }
+        dictList->pos = n;
+    }
+
+_cleanup:
+    free(suffix0);
+    free(reverseSuffix);
+    free(doneMarks);
+    free(filePos);
+    return result;
+}
+
+
+static void ZDICT_fillNoise(void* buffer, size_t length)
+{
+    unsigned acc = PRIME1;   /* Fills `length` bytes of buffer with a deterministic byte sequence : accumulator seeded with PRIME1, multiplied by PRIME2 at each step */
+    size_t p=0;;
+
+    for (p=0; p<length; p++) {
+        acc *= PRIME2;
+        ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);   /* output byte = bits 21 and above of the accumulator, truncated to 8 bits */
+    }
+}
+
+
+typedef struct
+{
+    ZSTD_CCtx* ref;    /* reference context : set up once by ZDICT_analyzeEntropy(), then copied into zc before each sample */
+    ZSTD_CCtx* zc;     /* working context : overwritten from ref, then used to compress one sample */
+    void* workPlace;   /* must be BLOCKSIZE allocated */
+} EStats_ress_t;
+
+
+static void ZDICT_countEStats(EStats_ress_t esr,   /* compression contexts + BLOCKSIZE scratch buffer */
+                            U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount,   /* histograms, updated in place */
+                            const void* src, size_t srcSize)   /* one sample; only its first BLOCKSIZE bytes are analysed */
+{
+    const BYTE* bytePtr;
+    const U32* u32Ptr;
+    seqStore_t seqStore;
+
+    if (srcSize > BLOCKSIZE) srcSize = BLOCKSIZE;   /* protection vs large samples */
+    ZSTD_copyCCtx(esr.zc, esr.ref);   /* restart from the reference context for every sample */
+    ZSTD_compressBlock(esr.zc, esr.workPlace, BLOCKSIZE, src, srcSize);   /* compressed output is discarded (return value not checked) : only the sequences matter */
+    seqStore = ZSTD_copySeqStore(esr.zc);
+
+    /* count stats : add literals, offset codes, match lengths and literal lengths of this block to the histograms */
+    for(bytePtr = seqStore.litStart; bytePtr < seqStore.lit; bytePtr++)
+        countLit[*bytePtr]++;
+    for(u32Ptr = seqStore.offsetStart; u32Ptr < seqStore.offset; u32Ptr++) {
+        /* offset 0 maps to code 0 : test it first, so that ZSTD_highbit() is never evaluated on 0 */
+        BYTE const offcode = (*u32Ptr==0) ? 0 : (BYTE)(ZSTD_highbit(*u32Ptr) + 1);
+        offsetcodeCount[offcode]++;
+    }
+    for(bytePtr = seqStore.matchLengthStart; bytePtr < seqStore.matchLength; bytePtr++)
+        matchlengthCount[*bytePtr]++;
+    for(bytePtr = seqStore.litLengthStart; bytePtr < seqStore.litLength; bytePtr++)
+        litlengthCount[*bytePtr]++;
+}
+
+
+#define OFFCODE_MAX 18  /* only applicable to first block */
+static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
+                                 unsigned compressionLevel,
+                           const void*  srcBuffer, const size_t* fileSizes, unsigned nbFiles,
+                           const void* dictBuffer, size_t  dictBufferSize)
+{
+    U32 countLit[256];
+    U32 offcodeCount[MaxOff+1];
+    HUF_CREATE_STATIC_CTABLE(hufTable, 255);
+    short offcodeNCount[MaxOff+1];
+    U32 matchLengthCount[MaxML+1];
+    short matchLengthNCount[MaxML+1];
+    U32 litlengthCount[MaxLL+1];
+    short litlengthNCount[MaxLL+1];
+    EStats_ress_t esr;
+    ZSTD_parameters params;
+    U32 u, huffLog = 12, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
+    size_t pos = 0, errorCode;
+    size_t eSize = 0;   /* returned value : nb of bytes written into dstBuffer, or an error code */
+
+    /* init : every counter starts at 1, so that no symbol ends up with a zero count */
+    for (u=0; u<256; u++) countLit[u]=1;   /* any character must be described */
+    for (u=0; u<=OFFCODE_MAX; u++) offcodeCount[u]=1;   /* only codes 0..OFFCODE_MAX are initialised, and only those are summed and normalized below */
+    for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
+    for (u=0; u<=MaxLL; u++) litlengthCount[u]=1;
+    esr.ref = ZSTD_createCCtx();
+    esr.zc = ZSTD_createCCtx();
+    esr.workPlace = malloc(BLOCKSIZE);
+    if (!esr.ref || !esr.zc || !esr.workPlace) {
+            eSize = ERROR(memory_allocation);
+            DISPLAYLEVEL(1, "Not enough memory");
+            goto _cleanup;
+    }
+    if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
+    params = ZSTD_getParams(compressionLevel, dictBufferSize + 15 KB);
+    params.strategy = ZSTD_greedy;   /* the strategy coming with the level is replaced : statistics are always collected with ZSTD_greedy */
+    ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params);   /* reference context starts from the dictionary content (return value not checked) */
+
+    /* collect stats on all files : samples are stored back to back within srcBuffer */
+    for (u=0; u<nbFiles; u++) {
+        ZDICT_countEStats(esr,
+                        countLit, offcodeCount, matchLengthCount, litlengthCount,
+           (const char*)srcBuffer + pos, fileSizes[u]);
+        pos += fileSizes[u];
+    }
+
+    /* analyze : build the Huffman table for literals, then normalize the 3 other distributions; each log variable is replaced by the value its call returns */
+    errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
+    if (HUF_isError(errorCode)) {
+        eSize = ERROR(GENERIC);
+        DISPLAYLEVEL(1, "HUF_buildCTable error");
+        goto _cleanup;
+    }
+    huffLog = (U32)errorCode;
+
+    total=0; for (u=0; u<=OFFCODE_MAX; u++) total+=offcodeCount[u];
+    errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, OFFCODE_MAX);
+    if (FSE_isError(errorCode)) {
+        eSize = ERROR(GENERIC);
+        DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount");
+        goto _cleanup;
+    }
+    Offlog = (U32)errorCode;
+
+    total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
+    errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
+    if (FSE_isError(errorCode)) {
+        eSize = ERROR(GENERIC);
+        DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount");
+        goto _cleanup;
+    }
+    mlLog = (U32)errorCode;
+
+    total=0; for (u=0; u<=MaxLL; u++) total+=litlengthCount[u];
+    errorCode = FSE_normalizeCount(litlengthNCount, llLog, litlengthCount, total, MaxLL);
+    if (FSE_isError(errorCode)) {
+        eSize = ERROR(GENERIC);
+        DISPLAYLEVEL(1, "FSE_normalizeCount error with litlengthCount");
+        goto _cleanup;
+    }
+    llLog = (U32)errorCode;
+
+    /* write result to buffer : Huffman table, then offset codes, match lengths and literal lengths, back to back; eSize accumulates the bytes written */
+    errorCode = HUF_writeCTable(dstBuffer, maxDstSize, hufTable, 255, huffLog);
+    if (HUF_isError(errorCode)) {
+        eSize = ERROR(GENERIC);
+        DISPLAYLEVEL(1, "HUF_writeCTable error");
+        goto _cleanup;
+    }
+    dstBuffer = (char*)dstBuffer + errorCode;
+    maxDstSize -= errorCode;
+    eSize += errorCode;
+
+    errorCode = FSE_writeNCount(dstBuffer, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
+    if (FSE_isError(errorCode)) {
+        eSize = ERROR(GENERIC);
+        DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount");
+        goto _cleanup;
+    }
+    dstBuffer = (char*)dstBuffer + errorCode;
+    maxDstSize -= errorCode;
+    eSize += errorCode;
+
+    errorCode = FSE_writeNCount(dstBuffer, maxDstSize, matchLengthNCount, MaxML, mlLog);
+    if (FSE_isError(errorCode)) {
+        eSize = ERROR(GENERIC);
+        DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount");
+        goto _cleanup;
+    }
+    dstBuffer = (char*)dstBuffer + errorCode;
+    maxDstSize -= errorCode;
+    eSize += errorCode;
+
+    errorCode = FSE_writeNCount(dstBuffer, maxDstSize, litlengthNCount, MaxLL, llLog);
+    if (FSE_isError(errorCode)) {
+        eSize = ERROR(GENERIC);
+        DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount");
+        goto _cleanup;
+    }
+    dstBuffer = (char*)dstBuffer + errorCode;
+    maxDstSize -= errorCode;
+    eSize += errorCode;
+
+_cleanup:   /* NOTE(review): also reached with NULL contexts when an allocation failed - confirm ZSTD_freeCCtx() accepts NULL */
+    ZSTD_freeCCtx(esr.ref);
+    ZSTD_freeCCtx(esr.zc);
+    free(esr.workPlace);
+
+    return eSize;
+}
+
+
+#define DIB_FASTSEGMENTSIZE 64
+/*! ZDICT_fastSampling()  (based on an idea proposed by Giuseppe Ottaviano) :
+    Fill `dictBuffer` with stripes of size DIB_FASTSEGMENTSIZE from `samplesBuffer`,
+    up to `dictSize`.
+    Filling starts from the end of `dictBuffer`, down to maximum possible.
+    if `dictSize` is not a multiple of DIB_FASTSEGMENTSIZE, some bytes at beginning of `dictBuffer` won't be used.
+    @return : amount of data written into `dictBuffer`,
+              or an error code (fewer than 3 stripes fit into `dictSize`, or `samplesSize` < `dictSize`)
+*/
+static size_t ZDICT_fastSampling(void* dictBuffer, size_t dictSize,
+                         const void* samplesBuffer, size_t samplesSize)
+{
+    char* dstPtr = (char*)dictBuffer + dictSize;   /* write cursor : starts at the end of dictBuffer and moves backward */
+    const char* srcPtr = (const char*)samplesBuffer;
+    size_t nbSegments = dictSize / DIB_FASTSEGMENTSIZE;
+    size_t segNb, interSize;
+
+    if (nbSegments <= 2) return ERROR(srcSize_wrong);
+    if (samplesSize < dictSize) return ERROR(srcSize_wrong);
+
+    /* first and last segments are part of dictionary, in case they contain interesting header/footer */
+    dstPtr -= DIB_FASTSEGMENTSIZE;
+    memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
+    dstPtr -= DIB_FASTSEGMENTSIZE;
+    memcpy(dstPtr, srcPtr+samplesSize-DIB_FASTSEGMENTSIZE, DIB_FASTSEGMENTSIZE);
+
+    /* regularly copy a segment */
+    interSize = (samplesSize - nbSegments*DIB_FASTSEGMENTSIZE) / (nbSegments-1);   /* nb of source bytes skipped before each copied segment */
+    srcPtr += DIB_FASTSEGMENTSIZE;
+    for (segNb=2; segNb < nbSegments; segNb++) {   /* segments 0 and 1 are the head and tail copied above */
+        srcPtr += interSize;
+        dstPtr -= DIB_FASTSEGMENTSIZE;
+        memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
+        srcPtr += DIB_FASTSEGMENTSIZE;
+    }
+
+    return nbSegments * DIB_FASTSEGMENTSIZE;
+}
+
+
+size_t ZDICT_trainFromBuffer_unsafe(
+                            void* dictBuffer, size_t maxDictSize,   /* destination buffer, and its capacity */
+                            const void* samplesBuffer, const size_t* sampleSizes, unsigned nbSamples,   /* concatenated samples; "unsafe" : they must be followed by a noisy guard band */
+                            ZDICT_params_t params)   /* selectivityLevel / compressionLevel set to 0 select the defaults */
+{   /* @return : size of the dictionary written at the start of dictBuffer, or an error code. dictList is released on every exit path. */
+    const U32 dictListSize = MAX( MAX(DICTLISTSIZE, nbSamples), (U32)(maxDictSize/16));
+    dictItem* dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
+    unsigned selectivity = params.selectivityLevel;
+    unsigned compressionLevel = params.compressionLevel;
+    size_t targetDictSize = maxDictSize - g_provision_entropySize;   /* only meaningful once the check below has passed */
+    size_t sBuffSize;
+    size_t dictSize = 0;
+
+    /* checks */
+    if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); }   /* free(NULL) is a no-op */
+
+    /* init */
+    { unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += sampleSizes[u]; }
+    if (!dictList) return ERROR(memory_allocation);
+    ZDICT_initDictItem(dictList);
+    g_displayLevel = params.notificationLevel;
+    if (selectivity==0) selectivity = g_selectivity_default;
+    if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
+
+    /* build dictionary */
+    if (selectivity>1) {  /* selectivity == 1 => fast mode */
+        ZDICT_trainBuffer(dictList, dictListSize,
+                        samplesBuffer, sBuffSize,
+                        sampleSizes, nbSamples,
+                        selectivity, (U32)targetDictSize);   /* NOTE(review): result ignored; when it fails, dictList is left as initialised */
+
+        /* display best matches */
+        if (g_displayLevel>= 3) {
+            const U32 nb = 25;
+            U32 u;
+            U32 dictContentSize = ZDICT_dictSize(dictList);
+            DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
+            DISPLAYLEVEL(3, "list %u best segments \n", nb);
+            for (u=1; (u<=nb) && (u<dictList[0].pos); u++) {   /* stop at the last filled entry : the following slots are uninitialised */
+                U32 p = dictList[u].pos;
+                U32 l = dictList[u].length;
+                U32 d = MIN(40, l);
+                DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
+                             u, l, p, dictList[u].savings);
+                ZDICT_printHex(3, (const char*)samplesBuffer+p, d);
+                DISPLAYLEVEL(3, "| \n");
+    }   }   }
+
+    /* create dictionary */
+    {
+        U32 dictContentSize = ZDICT_dictSize(dictList);
+        size_t hSize, eSize;   /* eSize : result of a sub-step, tested before being used as a size */
+        BYTE* ptr = (BYTE*)dictBuffer + maxDictSize;   /* content is laid out backward, from the end of dictBuffer */
+        U32 u;
+
+        /* build dict content */
+        for (u=1; u<dictList->pos; u++) {
+            U32 l = dictList[u].length;
+            ptr -= l;
+            if (ptr<(BYTE*)dictBuffer) { free(dictList); return ERROR(GENERIC); }   /* should not happen */
+            memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
+        }
+
+        /* fast mode dict content */
+        if (selectivity==1) {  /* note could also be used to complete a dictionary, but not necessarily better */
+            DISPLAYLEVEL(3, "\r%70s\r", "");   /* clean display line */
+            DISPLAYLEVEL(3, "Adding %u KB with fast sampling \n", (U32)(targetDictSize>>10));
+            eSize = ZDICT_fastSampling((char*)dictBuffer + g_provision_entropySize, targetDictSize, samplesBuffer, sBuffSize);
+            if (ZDICT_isError(eSize)) { free(dictList); return eSize; }   /* e.g. fewer sample bytes than targetDictSize */
+            dictContentSize = (U32)eSize;
+        }
+
+        /* dictionary header */
+        MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
+        hSize = 4;
+
+        /* entropic tables */
+        DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
+        DISPLAYLEVEL(2, "statistics ... \n");
+        eSize = ZDICT_analyzeEntropy((char*)dictBuffer+4, maxDictSize-4, compressionLevel,
+                                    samplesBuffer, sampleSizes, nbSamples,
+                                    (char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize);
+        if (ZDICT_isError(eSize)) { free(dictList); return eSize; }   /* an error code must never be added into hSize */
+        hSize += eSize;
+        if (hSize + dictContentSize < maxDictSize)   /* close the gap between the header and the content */
+            memmove((char*)dictBuffer + hSize, (char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize);
+        dictSize = MIN(maxDictSize, hSize+dictContentSize);
+    }
+
+    /* clean up */
+    free(dictList);
+    return dictSize;
+}
+
+
+size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
+                           const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                           ZDICT_params_t params)
+{
+    size_t sBuffSize;   /* total size of all samples */
+    void* newBuff;      /* private copy of the samples, followed by NOISELENGTH bytes of noise */
+    size_t result;
+
+    { unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; }
+    newBuff = malloc(sBuffSize + NOISELENGTH);
+    if (!newBuff) return ERROR(memory_allocation);
+
+    memcpy(newBuff, samplesBuffer, sBuffSize);
+    ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH);   /* guard band, for end of buffer condition */
+
+    result = ZDICT_trainFromBuffer_unsafe(dictBuffer, dictBufferCapacity,
+                                        newBuff, samplesSizes, nbSamples,
+                                        params);   /* trains on the copy : the caller's samplesBuffer is only read */
+    free(newBuff);
+    return result;   /* value returned by ZDICT_trainFromBuffer_unsafe(), passed through unchanged */
+}
+
+
+/* ZDICT_trainFromBuffer() : same as ZDICT_trainFromBuffer_advanced(), with all parameters zeroed.
+*  samplesBuffer needs no noisy guard band here : the _advanced variant duplicates the buffer and appends the noise to its copy. */
+size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
+                             const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
+{
+    ZDICT_params_t params;
+    memset(&params, 0, sizeof(params));   /* every field at 0 */
+    return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity,
+                                          samplesBuffer, samplesSizes, nbSamples,
+                                          params);
+}
+
diff --git a/lib/zstd_buffered_static.h b/lib/zdict.h
similarity index 61%
rename from lib/zstd_buffered_static.h
rename to lib/zdict.h
index 7d9ee27..2ca190c 100644
--- a/lib/zstd_buffered_static.h
+++ b/lib/zdict.h
@@ -1,19 +1,20 @@
 /*
-    zstd - buffered version of compression library
-    experimental complementary API, for static linking only
-    Copyright (C) 2015, Yann Collet.
+    dictBuilder header file
+    Copyright (C) Yann Collet 2016
 
     BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
     Redistribution and use in source and binary forms, with or without
     modification, are permitted provided that the following conditions are
     met:
+
     * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.
     * Redistributions in binary form must reproduce the above
     copyright notice, this list of conditions and the following disclaimer
     in the documentation and/or other materials provided with the
     distribution.
+
     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -27,36 +28,40 @@
     OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
     You can contact the author at :
-    - zstd source repository : https://github.com/Cyan4973/zstd
-    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
+       - Zstd source repository : https://www.zstd.net
 */
-#ifndef ZSTD_BUFFERED_STATIC_H
-#define ZSTD_BUFFERED_STATIC_H
 
-/* The objects defined into this file should be considered experimental.
- * They are not labelled stable, as their prototype may change in the future.
- * You can use them for tests, provide feedback, or if you can endure risk of future changes.
- */
+#ifndef DICTBUILDER_H_001
+#define DICTBUILDER_H_001
 
 #if defined (__cplusplus)
 extern "C" {
 #endif
 
-/* *************************************
-*  Includes
+/*-*************************************
+*  Public functions
 ***************************************/
-#include "zstd_static.h"
-#include "zstd_buffered.h"
+/*! ZDICT_trainFromBuffer() :
+    Train a dictionary from a memory buffer `samplesBuffer`,
+    where `nbSamples` samples have been stored concatenated.
+    Each sample size is provided into an orderly table `samplesSizes`.
+    Resulting dictionary will be saved into `dictBuffer`.
+    @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+              or an error code, which can be tested by ZDICT_isError().
+*/
+size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
+                             const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
 
 
-/* *************************************
-*  Advanced Streaming functions
+/*-*************************************
+*  Helper functions
 ***************************************/
-ZSTDLIB_API size_t ZBUFF_compressInit_advanced(ZBUFF_CCtx* cctx, ZSTD_parameters params);
+unsigned ZDICT_isError(size_t errorCode);
+const char* ZDICT_getErrorName(size_t errorCode);
 
 
 #if defined (__cplusplus)
 }
 #endif
 
-#endif  /* ZSTD_BUFFERED_STATIC_H */
+#endif
diff --git a/lib/zdict_static.h b/lib/zdict_static.h
new file mode 100644
index 0000000..e5f909a
--- /dev/null
+++ b/lib/zdict_static.h
@@ -0,0 +1,80 @@
+/*
+    dictBuilder header file
+    for static linking only
+    Copyright (C) Yann Collet 2016
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+       - Zstd source repository : https://www.zstd.net
+*/
+
+/* This library is EXPERIMENTAL, below API is not yet stable */
+
+#ifndef DICTBUILDER_STATIC_H_002
+#define DICTBUILDER_STATIC_H_002
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include "zdict.h"
+
+
+/*-*************************************
+*  Public type
+***************************************/
+typedef struct {
+    unsigned selectivityLevel;   /* 0 means default; 1 selects the fast sampling mode; larger => bigger selection => larger dictionary */
+    unsigned compressionLevel;   /* 0 means default; target a specific zstd compression level */
+    unsigned notificationLevel;  /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
+    unsigned reserved[3];        /* space for future parameters */
+} ZDICT_params_t;
+
+
+/*-*************************************
+*  Public functions
+***************************************/
+/*! ZDICT_trainFromBuffer_advanced() :
+    Same as ZDICT_trainFromBuffer() with control over more parameters.
+    Any field of `parameters` can be set to 0 to mean "default".
+    @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+              or an error code, which can be tested by ZDICT_isError().
+    note : ZDICT_trainFromBuffer_advanced() will send notifications into stderr if instructed to, using `parameters.notificationLevel`
+*/
+size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
+                             const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                             ZDICT_params_t parameters);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* DICTBUILDER_STATIC_H_002 */
diff --git a/lib/zstd.h b/lib/zstd.h
index 695d26d..53ed697 100644
--- a/lib/zstd.h
+++ b/lib/zstd.h
@@ -1,7 +1,7 @@
 /*
     zstd - standard compression library
     Header File
-    Copyright (C) 2014-2015, Yann Collet.
+    Copyright (C) 2014-2016, Yann Collet.
 
     BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
@@ -28,7 +28,6 @@
 
     You can contact the author at :
     - zstd source repository : https://github.com/Cyan4973/zstd
-    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
 */
 #ifndef ZSTD_H
 #define ZSTD_H
@@ -37,13 +36,13 @@
 extern "C" {
 #endif
 
-/* *************************************
-*  Includes
+/*-*************************************
+*  Dependencies
 ***************************************/
 #include <stddef.h>   /* size_t */
 
 
-/* ***************************************************************
+/*-***************************************************************
 *  Export parameters
 *****************************************************************/
 /*!
@@ -61,8 +60,8 @@ extern "C" {
 *  Version
 ***************************************/
 #define ZSTD_VERSION_MAJOR    0    /* for breaking interface changes  */
-#define ZSTD_VERSION_MINOR    4    /* for new (non-breaking) interface capabilities */
-#define ZSTD_VERSION_RELEASE  7    /* for tweaks, bug-fixes, or development */
+#define ZSTD_VERSION_MINOR    5    /* for new (non-breaking) interface capabilities */
+#define ZSTD_VERSION_RELEASE  1    /* for tweaks, bug-fixes, or development */
 #define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
 ZSTDLIB_API unsigned ZSTD_versionNumber (void);
 
@@ -70,60 +69,77 @@ ZSTDLIB_API unsigned ZSTD_versionNumber (void);
 /* *************************************
 *  Simple functions
 ***************************************/
-ZSTDLIB_API size_t ZSTD_compress(   void* dst, size_t maxDstSize,
+/*! ZSTD_compress() :
+    Compresses `srcSize` bytes from buffer `src` into buffer `dst` of size `dstCapacity`.
+    Destination buffer must be already allocated.
+    Compression runs faster if `dstCapacity` >=  `ZSTD_compressBound(srcSize)`.
+    @return : the number of bytes written into `dst`,
+              or an error code if it fails (which can be tested using ZSTD_isError()) */
+ZSTDLIB_API size_t ZSTD_compress(   void* dst, size_t dstCapacity,
                               const void* src, size_t srcSize,
                                      int  compressionLevel);
 
-ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t maxOriginalSize,
+/*! ZSTD_decompress() :
+    `compressedSize` : is the _exact_ size of the compressed blob, otherwise decompression will fail.
+    `dstCapacity` must be large enough, equal or larger than originalSize.
+    @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+              or an errorCode if it fails (which can be tested using ZSTD_isError()) */
+ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity,
                               const void* src, size_t compressedSize);
 
-/**
-ZSTD_compress() :
-    Compresses 'srcSize' bytes from buffer 'src' into buffer 'dst', of maximum size 'dstSize'.
-    Destination buffer must be already allocated.
-    Compression runs faster if maxDstSize >=  ZSTD_compressBound(srcSize).
-    return : the number of bytes written into buffer 'dst'
-             or an error code if it fails (which can be tested using ZSTD_isError())
-
-ZSTD_decompress() :
-    compressedSize : is the exact source size
-    maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated.
-                      It must be equal or larger than originalSize, otherwise decompression will fail.
-    return : the number of bytes decompressed into destination buffer (<= maxOriginalSize)
-             or an errorCode if it fails (which can be tested using ZSTD_isError())
-*/
-
 
 /* *************************************
-*  Tool functions
+*  Helper functions
 ***************************************/
-ZSTDLIB_API size_t      ZSTD_compressBound(size_t srcSize);   /** maximum compressed size (worst case scenario) */
+ZSTDLIB_API size_t      ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size (worst case scenario) */
 
 /* Error Management */
-ZSTDLIB_API unsigned    ZSTD_isError(size_t code);         /** tells if a return value is an error code */
-ZSTDLIB_API const char* ZSTD_getErrorName(size_t code);    /** provides error code string */
+ZSTDLIB_API unsigned    ZSTD_isError(size_t code);          /*!< tells if a `size_t` function result is an error code */
+ZSTDLIB_API const char* ZSTD_getErrorName(size_t code);     /*!< provides readable string for an error code */
 
 
 /* *************************************
-*  Advanced functions
+*  Explicit memory management
 ***************************************/
-/** Compression context management */
-typedef struct ZSTD_CCtx_s ZSTD_CCtx;   /* incomplete type */
+/** Compression context */
+typedef struct ZSTD_CCtx_s ZSTD_CCtx;                       /*< incomplete type */
 ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void);
-ZSTDLIB_API size_t     ZSTD_freeCCtx(ZSTD_CCtx* cctx);
+ZSTDLIB_API size_t     ZSTD_freeCCtx(ZSTD_CCtx* cctx);      /*!< @return : errorCode */
 
 /** ZSTD_compressCCtx() :
     Same as ZSTD_compress(), but requires an already allocated ZSTD_CCtx (see ZSTD_createCCtx()) */
-ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel);
 
-/** Decompression context management */
+/** Decompression context */
 typedef struct ZSTD_DCtx_s ZSTD_DCtx;
 ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void);
-ZSTDLIB_API size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);      /*!< @return : errorCode */
 
-/** ZSTD_decompressDCtx
+/** ZSTD_decompressDCtx() :
 *   Same as ZSTD_decompress(), but requires an already allocated ZSTD_DCtx (see ZSTD_createDCtx()) */
-ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+/*-***********************
+*  Dictionary API
+*************************/
+/*! ZSTD_compress_usingDict() :
+*   Compression using a pre-defined Dictionary content (see dictBuilder).
+*   Note : dict can be NULL, in which case, it's equivalent to ZSTD_compressCCtx() */
+ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx,
+                                           void* dst, size_t dstCapacity,
+                                     const void* src, size_t srcSize,
+                                     const void* dict,size_t dictSize,
+                                           int compressionLevel);
+
+/*! ZSTD_decompress_usingDict() :
+*   Decompression using a pre-defined Dictionary content (see dictBuilder).
+*   Dictionary must be identical to the one used during compression, otherwise regenerated data will be corrupted.
+*   Note : dict can be NULL, in which case, it's equivalent to ZSTD_decompressDCtx() */
+ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
+                                             void* dst, size_t dstCapacity,
+                                       const void* src, size_t srcSize,
+                                       const void* dict,size_t dictSize);
 
 
 #if defined (__cplusplus)
diff --git a/lib/zstd_compress.c b/lib/zstd_compress.c
index d01807b..7bea6ab 100644
--- a/lib/zstd_compress.c
+++ b/lib/zstd_compress.c
@@ -1,6 +1,6 @@
 /*
     ZSTD HC - High Compression Mode of Zstandard
-    Copyright (C) 2015, Yann Collet.
+    Copyright (C) 2015-2016, Yann Collet.
 
     BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
@@ -40,7 +40,6 @@
 #  include <intrin.h>                    /* For Visual 2005 */
 #  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
 #else
-#  define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
 #  ifdef __GNUC__
 #    define FORCE_INLINE static inline __attribute__((always_inline))
 #  else
@@ -49,43 +48,50 @@
 #endif
 
 
-/* *************************************
-*  Includes
+/*-*************************************
+*  Dependencies
 ***************************************/
 #include <stdlib.h>   /* malloc */
 #include <string.h>   /* memset */
 #include "mem.h"
 #include "fse_static.h"
-#include "huff0.h"
-#include "zstd_static.h"
+#include "huff0_static.h"
 #include "zstd_internal.h"
 
 
-/* *************************************
+/*-*************************************
 *  Constants
 ***************************************/
-ZSTDLIB_API unsigned ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; }
 static const U32 g_searchStrength = 8;
 
 
-/* *************************************
+/*-*************************************
+*  Helper functions
+***************************************/
+size_t ZSTD_compressBound(size_t srcSize) { return FSE_compressBound(srcSize) + 12; }
+
+
+/*-*************************************
 *  Sequence storage
 ***************************************/
-typedef struct {
-    void* buffer;
-    U32*  offsetStart;
-    U32*  offset;
-    BYTE* offCodeStart;
-    BYTE* offCode;
-    BYTE* litStart;
-    BYTE* lit;
-    BYTE* litLengthStart;
-    BYTE* litLength;
-    BYTE* matchLengthStart;
-    BYTE* matchLength;
-    BYTE* dumpsStart;
-    BYTE* dumps;
-} seqStore_t;
+/** ZSTD_resetFreqs() : for opt variants */
+static void ZSTD_resetFreqs(seqStore_t* ssPtr)
+{
+    unsigned u;
+    ssPtr->matchLengthSum = 512; // (1<<MLbits);
+    ssPtr->litLengthSum = 256; // (1<<LLbits);
+    ssPtr->litSum = (1<<Litbits);
+    ssPtr->offCodeSum = (1<<Offbits);
+
+    for (u=0; u<=MaxLit; u++)
+        ssPtr->litFreq[u] = 1;
+    for (u=0; u<=MaxLL; u++)
+        ssPtr->litLengthFreq[u] = 1;
+    for (u=0; u<=MaxML; u++)
+        ssPtr->matchLengthFreq[u] = 1;
+    for (u=0; u<=MaxOff; u++)
+        ssPtr->offCodeFreq[u] = 1;
+}
 
 static void ZSTD_resetSeqStore(seqStore_t* ssPtr)
 {
@@ -97,7 +103,7 @@ static void ZSTD_resetSeqStore(seqStore_t* ssPtr)
 }
 
 
-/* *************************************
+/*-*************************************
 *  Context memory management
 ***************************************/
 struct ZSTD_CCtx_s
@@ -108,6 +114,7 @@ struct ZSTD_CCtx_s
     U32   dictLimit;        /* below that point, need extDict */
     U32   lowLimit;         /* below that point, no more data */
     U32   nextToUpdate;     /* index from which to continue dictionary update */
+    U32   loadedDictEnd;
     U32   stage;
     ZSTD_parameters params;
     void* workSpace;
@@ -116,13 +123,16 @@ struct ZSTD_CCtx_s
     size_t hbSize;
     char headerBuffer[ZSTD_frameHeaderSize_max];
 
-
     seqStore_t seqStore;    /* sequences storage ptrs */
     U32* hashTable;
     U32* contentTable;
+    HUF_CElt* hufTable;
+    U32 flagStaticTables;
+    FSE_CTable offcodeCTable   [FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)];
+    FSE_CTable matchlengthCTable [FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)];
+    FSE_CTable litlengthCTable   [FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
 };
 
-
 ZSTD_CCtx* ZSTD_createCCtx(void)
 {
     return (ZSTD_CCtx*) calloc(1, sizeof(ZSTD_CCtx));
@@ -132,67 +142,67 @@ size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx)
 {
     free(cctx->workSpace);
     free(cctx);
-    return 0;
+    return 0;   /* reserved as a potential error code in the future */
+}
+
+seqStore_t ZSTD_copySeqStore(const ZSTD_CCtx* ctx)
+{
+    return ctx->seqStore;
 }
 
 
 static unsigned ZSTD_highbit(U32 val);
 
-/** ZSTD_validateParams
-    correct params value to remain within authorized range
-    optimize for srcSize if srcSize > 0 */
+#define CLAMP(val,min,max) { if (val<min) val=min; else if (val>max) val=max; }
+
+/** ZSTD_validateParams() :
+    correct params value to remain within authorized range,
+    optimize for `srcSize` if srcSize > 0 */
 void ZSTD_validateParams(ZSTD_parameters* params)
 {
-    const U32 btPlus = (params->strategy == ZSTD_btlazy2);
+    const U32 btPlus = (params->strategy == ZSTD_btlazy2) || (params->strategy == ZSTD_btopt);
 
     /* validate params */
     if (MEM_32bits()) if (params->windowLog > 25) params->windowLog = 25;   /* 32 bits mode cannot flush > 24 bits */
-    if (params->windowLog   > ZSTD_WINDOWLOG_MAX) params->windowLog = ZSTD_WINDOWLOG_MAX;
-    if (params->windowLog   < ZSTD_WINDOWLOG_MIN) params->windowLog = ZSTD_WINDOWLOG_MIN;
+    CLAMP(params->windowLog, ZSTD_WINDOWLOG_MIN, ZSTD_WINDOWLOG_MAX);
+    CLAMP(params->contentLog, ZSTD_CONTENTLOG_MIN, ZSTD_CONTENTLOG_MAX);
+    CLAMP(params->hashLog, ZSTD_HASHLOG_MIN, ZSTD_HASHLOG_MAX);
+    CLAMP(params->searchLog, ZSTD_SEARCHLOG_MIN, ZSTD_SEARCHLOG_MAX);
+    CLAMP(params->searchLength, ZSTD_SEARCHLENGTH_MIN, ZSTD_SEARCHLENGTH_MAX);
+    CLAMP(params->targetLength, ZSTD_TARGETLENGTH_MIN, ZSTD_TARGETLENGTH_MAX);
+    if ((U32)params->strategy>(U32)ZSTD_btopt) params->strategy = ZSTD_btopt;
 
     /* correct params, to use less memory */
-    if ((params->srcSize > 0) && (params->srcSize < (1<<ZSTD_WINDOWLOG_MAX)))
-    {
+    if ((params->srcSize > 0) && (params->srcSize < (1<<ZSTD_WINDOWLOG_MAX))) {
         U32 srcLog = ZSTD_highbit((U32)(params->srcSize)-1) + 1;
         if (params->windowLog > srcLog) params->windowLog = srcLog;
     }
-
     if (params->windowLog   < ZSTD_WINDOWLOG_ABSOLUTEMIN) params->windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN;  /* required for frame header */
     if (params->contentLog  > params->windowLog+btPlus) params->contentLog = params->windowLog+btPlus;   /* <= ZSTD_CONTENTLOG_MAX */
-    if (params->contentLog  < ZSTD_CONTENTLOG_MIN) params->contentLog = ZSTD_CONTENTLOG_MIN;
-    if (params->hashLog     > ZSTD_HASHLOG_MAX) params->hashLog = ZSTD_HASHLOG_MAX;
-    if (params->hashLog     < ZSTD_HASHLOG_MIN) params->hashLog = ZSTD_HASHLOG_MIN;
-    if (params->searchLog   > ZSTD_SEARCHLOG_MAX) params->searchLog = ZSTD_SEARCHLOG_MAX;
-    if (params->searchLog   < ZSTD_SEARCHLOG_MIN) params->searchLog = ZSTD_SEARCHLOG_MIN;
-    if (params->searchLength> ZSTD_SEARCHLENGTH_MAX) params->searchLength = ZSTD_SEARCHLENGTH_MAX;
-    if (params->searchLength< ZSTD_SEARCHLENGTH_MIN) params->searchLength = ZSTD_SEARCHLENGTH_MIN;
-    if ((U32)params->strategy>(U32)ZSTD_btlazy2) params->strategy = ZSTD_btlazy2;
 }
 
 
 static size_t ZSTD_resetCCtx_advanced (ZSTD_CCtx* zc,
                                        ZSTD_parameters params)
-{
-    /* note : params considered validated here */
+{   /* note : params considered validated here */
     const size_t blockSize = MIN(BLOCKSIZE, (size_t)1 << params.windowLog);
-
     /* reserve table memory */
-    {
-        const U32 contentLog = (params.strategy == ZSTD_fast) ? 1 : params.contentLog;
-        const size_t tableSpace = ((1 << contentLog) + (1 << params.hashLog)) * sizeof(U32);
-        const size_t neededSpace = tableSpace + (3*blockSize);
-        if (zc->workSpaceSize < neededSpace)
-        {
-            free(zc->workSpace);
-            zc->workSpaceSize = neededSpace;
-            zc->workSpace = malloc(neededSpace);
-            if (zc->workSpace == NULL) return ERROR(memory_allocation);
-        }
-        memset(zc->workSpace, 0, tableSpace );
-        zc->hashTable = (U32*)(zc->workSpace);
-        zc->contentTable = zc->hashTable + ((size_t)1 << params.hashLog);
-        zc->seqStore.buffer = (void*) (zc->contentTable + ((size_t)1 << contentLog));
+    const U32 contentLog = (params.strategy == ZSTD_fast) ? 1 : params.contentLog;
+    const size_t tableSpace = ((1 << contentLog) + (1 << params.hashLog)) * sizeof(U32);
+    const size_t neededSpace = tableSpace + (256*sizeof(U32)) + (3*blockSize) + ((1<<MLbits) + (1<<LLbits) + (1<<Offbits) + (1<<Litbits))*sizeof(U32);
+    if (zc->workSpaceSize < neededSpace) {
+        free(zc->workSpace);
+        zc->workSpace = malloc(neededSpace);
+        if (zc->workSpace == NULL) return ERROR(memory_allocation);
+        zc->workSpaceSize = neededSpace;
     }
+    memset(zc->workSpace, 0, tableSpace );   /* reset only tables */
+    zc->hashTable = (U32*)(zc->workSpace);
+    zc->contentTable = zc->hashTable + ((size_t)1 << params.hashLog);
+    zc->seqStore.buffer = zc->contentTable + ((size_t)1 << contentLog);
+    zc->hufTable = (HUF_CElt*)zc->seqStore.buffer;
+    zc->flagStaticTables = 0;
+    zc->seqStore.buffer = (U32*)(zc->seqStore.buffer) + 256;
 
     zc->nextToUpdate = 1;
     zc->nextSrc = NULL;
@@ -202,20 +212,71 @@ static size_t ZSTD_resetCCtx_advanced (ZSTD_CCtx* zc,
     zc->lowLimit = 0;
     zc->params = params;
     zc->blockSize = blockSize;
-    zc->seqStore.offsetStart = (U32*) (zc->seqStore.buffer);
+
+    zc->seqStore.litFreq = (U32*) (zc->seqStore.buffer);
+    zc->seqStore.litLengthFreq = zc->seqStore.litFreq + (1<<Litbits);
+    zc->seqStore.matchLengthFreq = zc->seqStore.litLengthFreq + (1<<LLbits);
+    zc->seqStore.offCodeFreq = zc->seqStore.matchLengthFreq + (1<<MLbits);
+
+    zc->seqStore.offsetStart = zc->seqStore.offCodeFreq + (1<<Offbits);
     zc->seqStore.offCodeStart = (BYTE*) (zc->seqStore.offsetStart + (blockSize>>2));
     zc->seqStore.litStart = zc->seqStore.offCodeStart + (blockSize>>2);
     zc->seqStore.litLengthStart =  zc->seqStore.litStart + blockSize;
     zc->seqStore.matchLengthStart = zc->seqStore.litLengthStart + (blockSize>>2);
     zc->seqStore.dumpsStart = zc->seqStore.matchLengthStart + (blockSize>>2);
+    // zc->seqStore.XXX = zc->seqStore.dumpsStart + (blockSize>>4);
+
     zc->hbSize = 0;
     zc->stage = 0;
+    zc->loadedDictEnd = 0;
+
+    return 0;
+}
+
+
+/*! ZSTD_copyCCtx
+*   Duplicate an existing context @srcCCtx into another one @dstCCtx.
+*   Only works during stage 0 (i.e. before first call to ZSTD_compressContinue())
+*   @return : 0, or an error code */
+size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx)
+{
+    const U32 contentLog = (srcCCtx->params.strategy == ZSTD_fast) ? 1 : srcCCtx->params.contentLog;
+    const size_t tableSpace = ((1 << contentLog) + (1 << srcCCtx->params.hashLog)) * sizeof(U32);
+
+    if (srcCCtx->stage!=0) return ERROR(stage_wrong);
+
+    ZSTD_resetCCtx_advanced(dstCCtx, srcCCtx->params);
+
+    /* copy tables */
+    memcpy(dstCCtx->hashTable, srcCCtx->hashTable, tableSpace);
+
+    /* copy frame header */
+    dstCCtx->hbSize = srcCCtx->hbSize;
+    memcpy(dstCCtx->headerBuffer , srcCCtx->headerBuffer, srcCCtx->hbSize);
+
+    /* copy dictionary pointers */
+    dstCCtx->nextToUpdate= srcCCtx->nextToUpdate;
+    dstCCtx->nextSrc     = srcCCtx->nextSrc;
+    dstCCtx->base        = srcCCtx->base;
+    dstCCtx->dictBase    = srcCCtx->dictBase;
+    dstCCtx->dictLimit   = srcCCtx->dictLimit;
+    dstCCtx->lowLimit    = srcCCtx->lowLimit;
+    dstCCtx->loadedDictEnd = srcCCtx->loadedDictEnd;
+
+    /* copy entropy tables */
+    dstCCtx->flagStaticTables = srcCCtx->flagStaticTables;
+    if (srcCCtx->flagStaticTables) {
+        memcpy(dstCCtx->hufTable, srcCCtx->hufTable, 256*4);
+        memcpy(dstCCtx->litlengthCTable, srcCCtx->litlengthCTable, sizeof(dstCCtx->litlengthCTable));
+        memcpy(dstCCtx->matchlengthCTable, srcCCtx->matchlengthCTable, sizeof(dstCCtx->matchlengthCTable));
+        memcpy(dstCCtx->offcodeCTable, srcCCtx->offcodeCTable, sizeof(dstCCtx->offcodeCTable));
+    }
 
     return 0;
 }
 
 
-/** ZSTD_reduceIndex
+/*! ZSTD_reduceIndex
 *   rescale indexes to avoid future overflow (indexes are U32) */
 static void ZSTD_reduceIndex (ZSTD_CCtx* zc,
                         const U32 reducerValue)
@@ -225,22 +286,102 @@ static void ZSTD_reduceIndex (ZSTD_CCtx* zc,
     U32* table32 = zc->hashTable;
     U32 index;
 
-    for (index=0 ; index < tableSpaceU32 ; index++)
-    {
+    for (index=0 ; index < tableSpaceU32 ; index++) {
         if (table32[index] < reducerValue) table32[index] = 0;
         else table32[index] -= reducerValue;
     }
 }
 
 
-/* *******************************************************
+/*-*******************************************************
 *  Block entropic compression
 *********************************************************/
-size_t ZSTD_compressBound(size_t srcSize)   /* maximum compressed size */
-{
-    return FSE_compressBound(srcSize) + 12;
-}
 
+/* Block format description
+
+   Block = Literal Section - Sequences Section
+   Prerequisite : size of (compressed) block, maximum size of regenerated data
+
+   1) Literal Section
+
+   1.1) Header : 1-5 bytes
+        flags: 2 bits
+            00 compressed by Huff0
+            01 unused
+            10 is Raw (uncompressed)
+            11 is Rle
+            Note : using 01 => Huff0 with precomputed table ?
+            Note : delta map ? => compressed ?
+
+   1.1.1) Huff0-compressed literal block : 3-5 bytes
+            srcSize < 1 KB => 3 bytes (2-2-10-10) => single stream
+            srcSize < 1 KB => 3 bytes (2-2-10-10) => multiple streams (single-stream flag cleared)
+            srcSize < 16KB => 4 bytes (2-2-14-14)
+            else           => 5 bytes (2-2-18-18)
+            big endian convention
+
+   1.1.2) Raw (uncompressed) literal block header : 1-3 bytes
+        size :  5 bits: (IS_RAW<<6) + (0<<5) + size
+               12 bits: (IS_RAW<<6) + (2<<4) + (size>>8)
+                        size&255
+               20 bits: (IS_RAW<<6) + (3<<4) + (size>>16)
+                        size>>8&255
+                        size&255
+
+   1.1.3) Rle (repeated single byte) literal block header : 1-3 bytes
+        size :  5 bits: (IS_RLE<<6) + (0<<5) + size
+               12 bits: (IS_RLE<<6) + (2<<4) + (size>>8)
+                        size&255
+               20 bits: (IS_RLE<<6) + (3<<4) + (size>>16)
+                        size>>8&255
+                        size&255
+
+   1.1.4) Huff0-compressed literal block, using precomputed CTables : 3-5 bytes
+            srcSize < 1 KB => 3 bytes (2-2-10-10) => single stream
+            srcSize < 1 KB => 3 bytes (2-2-10-10)
+            srcSize < 16KB => 4 bytes (2-2-14-14)
+            else           => 5 bytes (2-2-18-18)
+            big endian convention
+
+        1- CTable available (stored into workspace ?)
+        2- Small input (fast heuristic ? Full comparison ? depend on clevel ?)
+
+
+   1.2) Literal block content
+
+   1.2.1) Huff0 block, using sizes from header
+        See Huff0 format
+
+   1.2.2) Huff0 block, using prepared table
+
+   1.2.3) Raw content
+
+   1.2.4) single byte
+
+
+   2) Sequences section
+
+      - Nb Sequences : 2 bytes, little endian
+      - Control Token : 1 byte (see below)
+      - Dumps Length : 1 or 2 bytes (depending on control token)
+      - Dumps : as stated by dumps length
+      - Literal Lengths FSE table (as needed depending on encoding method)
+      - Offset Codes FSE table (as needed depending on encoding method)
+      - Match Lengths FSE table (as needed depending on encoding method)
+
+    2.1) Control Token
+      8 bits, divided as :
+      0-1 : dumpsLength
+      2-3 : MatchLength, FSE encoding method
+      4-5 : Offset Codes, FSE encoding method
+      6-7 : Literal Lengths, FSE encoding method
+
+      FSE encoding method :
+      FSE_ENCODING_RAW : uncompressed; no header
+      FSE_ENCODING_RLE : single repeated value; header 1 byte
+      FSE_ENCODING_STATIC : use prepared table; no header
+      FSE_ENCODING_DYNAMIC : read NCount
+*/
 
 size_t ZSTD_noCompressBlock (void* dst, size_t maxDstSize, const void* src, size_t srcSize)
 {
@@ -262,69 +403,129 @@ size_t ZSTD_noCompressBlock (void* dst, size_t maxDstSize, const void* src, size
 static size_t ZSTD_noCompressLiterals (void* dst, size_t maxDstSize, const void* src, size_t srcSize)
 {
     BYTE* const ostart = (BYTE* const)dst;
+    const U32 flSize = 1 + (srcSize>31) + (srcSize>4095);
+
+    if (srcSize + flSize > maxDstSize) return ERROR(dstSize_tooSmall);
 
-    if (srcSize + 3 > maxDstSize) return ERROR(dstSize_tooSmall);
+    switch(flSize)
+    {
+        case 1: /* 2 - 1 - 5 */
+            ostart[0] = (BYTE)((IS_RAW<<6) + (0<<5) + srcSize);
+            break;
+        case 2: /* 2 - 2 - 12 */
+            ostart[0] = (BYTE)((IS_RAW<<6) + (2<<4) + (srcSize >> 8));
+            ostart[1] = (BYTE)srcSize;
+            break;
+        default:   /*note : should not be necessary : flSize is within {1,2,3} */
+        case 3: /* 2 - 2 - 20 */
+            ostart[0] = (BYTE)((IS_RAW<<6) + (3<<4) + (srcSize >> 16));
+            ostart[1] = (BYTE)(srcSize>>8);
+            ostart[2] = (BYTE)srcSize;
+            break;
+    }
 
-    MEM_writeLE32(dst, ((U32)srcSize << 2) | IS_RAW);
-    memcpy(ostart + 3, src, srcSize);
-    return srcSize + 3;
+    memcpy(ostart + flSize, src, srcSize);
+    return srcSize + flSize;
 }
 
 static size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t maxDstSize, const void* src, size_t srcSize)
 {
     BYTE* const ostart = (BYTE* const)dst;
+    U32 flSize = 1 + (srcSize>31) + (srcSize>4095);
+
+    (void)maxDstSize;  /* maxDstSize guaranteed to be >=4, hence large enough */
+
+    switch(flSize)
+    {
+        case 1: /* 2 - 1 - 5 */
+            ostart[0] = (BYTE)((IS_RLE<<6) + (0<<5) + srcSize);
+            break;
+        case 2: /* 2 - 2 - 12 */
+            ostart[0] = (BYTE)((IS_RLE<<6) + (2<<4) + (srcSize >> 8));
+            ostart[1] = (BYTE)srcSize;
+            break;
+        default:   /*note : should not be necessary : flSize is necessarily within {1,2,3} */
+        case 3: /* 2 - 2 - 20 */
+            ostart[0] = (BYTE)((IS_RLE<<6) + (3<<4) + (srcSize >> 16));
+            ostart[1] = (BYTE)(srcSize>>8);
+            ostart[2] = (BYTE)srcSize;
+            break;
+    }
 
-    (void)maxDstSize;
-    MEM_writeLE32(dst, ((U32)srcSize << 2) | IS_RLE);  /* note : maxDstSize > litHeaderSize > 4 */
-    ostart[3] = *(const BYTE*)src;
-    return 4;
+    ostart[flSize] = *(const BYTE*)src;
+    return flSize+1;
 }
 
-size_t ZSTD_minGain(size_t srcSize) { return (srcSize >> 6) + 1; }
 
-static size_t ZSTD_compressLiterals (void* dst, size_t maxDstSize,
+size_t ZSTD_minGain(size_t srcSize) { return (srcSize >> 6) + 2; }
+
+static size_t ZSTD_compressLiterals (ZSTD_CCtx* zc,
+                                     void* dst, size_t maxDstSize,
                                const void* src, size_t srcSize)
 {
     const size_t minGain = ZSTD_minGain(srcSize);
     BYTE* const ostart = (BYTE*)dst;
-    size_t hsize;
-    static const size_t litHeaderSize = 5;
-
-    if (maxDstSize < litHeaderSize+1) return ERROR(dstSize_tooSmall);   /* not enough space for compression */
-
-    hsize = HUF_compress(ostart+litHeaderSize, maxDstSize-litHeaderSize, src, srcSize);
+    const size_t lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);
+    U32 singleStream = srcSize < 256;
+    U32 hType = IS_HUF;
+    size_t clitSize;
+
+    if (maxDstSize < lhSize+1) return ERROR(dstSize_tooSmall);   /* not enough space for compression */
+
+    if (zc->flagStaticTables && (lhSize==3)) {
+        hType = IS_PCH;
+        singleStream = 1;
+        clitSize = HUF_compress1X_usingCTable(ostart+lhSize, maxDstSize-lhSize, src, srcSize, zc->hufTable);
+    } else {
+        clitSize = singleStream ? HUF_compress1X(ostart+lhSize, maxDstSize-lhSize, src, srcSize, 255, 12)
+                                : HUF_compress2 (ostart+lhSize, maxDstSize-lhSize, src, srcSize, 255, 12);
+    }
 
-    if ((hsize==0) || (hsize >= srcSize - minGain)) return ZSTD_noCompressLiterals(dst, maxDstSize, src, srcSize);
-    if (hsize==1) return ZSTD_compressRleLiteralsBlock(dst, maxDstSize, src, srcSize);
+    if ((clitSize==0) || (clitSize >= srcSize - minGain)) return ZSTD_noCompressLiterals(dst, maxDstSize, src, srcSize);
+    if (clitSize==1) return ZSTD_compressRleLiteralsBlock(dst, maxDstSize, src, srcSize);
 
     /* Build header */
+    switch(lhSize)
     {
-        ostart[0]  = (BYTE)(srcSize << 2); /* is a block, is compressed */
-        ostart[1]  = (BYTE)(srcSize >> 6);
-        ostart[2]  = (BYTE)(srcSize >>14);
-        ostart[2] += (BYTE)(hsize << 5);
-        ostart[3]  = (BYTE)(hsize >> 3);
-        ostart[4]  = (BYTE)(hsize >>11);
+    case 3: /* 2 - 2 - 10 - 10 */
+        ostart[0] = (BYTE)((srcSize>>6) + (singleStream << 4) + (hType<<6));
+        ostart[1] = (BYTE)((srcSize<<2) + (clitSize>>8));
+        ostart[2] = (BYTE)(clitSize);
+        break;
+    case 4: /* 2 - 2 - 14 - 14 */
+        ostart[0] = (BYTE)((srcSize>>10) + (2<<4) +  (hType<<6));
+        ostart[1] = (BYTE)(srcSize>> 2);
+        ostart[2] = (BYTE)((srcSize<<6) + (clitSize>>8));
+        ostart[3] = (BYTE)(clitSize);
+        break;
+    default:   /* should not be necessary, lhSize is {3,4,5} */
+    case 5: /* 2 - 2 - 18 - 18 */
+        ostart[0] = (BYTE)((srcSize>>14) + (3<<4) +  (hType<<6));
+        ostart[1] = (BYTE)(srcSize>>6);
+        ostart[2] = (BYTE)((srcSize<<2) + (clitSize>>16));
+        ostart[3] = (BYTE)(clitSize>>8);
+        ostart[4] = (BYTE)(clitSize);
+        break;
     }
 
-    return hsize+litHeaderSize;
+    return lhSize+clitSize;
 }
 
 
-#define LITERAL_NOENTROPY 63   /* cheap heuristic */
+#define LITERAL_NOENTROPY 63   /* don't even attempt to compress literals below this threshold (cheap heuristic) */
 
-size_t ZSTD_compressSequences(void* dst, size_t maxDstSize,
-                        const seqStore_t* seqStorePtr,
+size_t ZSTD_compressSequences(ZSTD_CCtx* zc,
+                              void* dst, size_t maxDstSize,
                               size_t srcSize)
 {
+    const seqStore_t* seqStorePtr = &(zc->seqStore);
     U32 count[MaxSeq+1];
     S16 norm[MaxSeq+1];
     size_t mostFrequent;
-    U32 max = 255;
-    U32 tableLog = 11;
-    U32 CTable_LitLength  [FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL )];
-    U32 CTable_OffsetBits [FSE_CTABLE_SIZE_U32(OffFSELog,MaxOff)];
-    U32 CTable_MatchLength[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML )];
+    U32 max;
+    FSE_CTable* CTable_LitLength = zc->litlengthCTable;
+    FSE_CTable* CTable_OffsetBits = zc->offcodeCTable;
+    FSE_CTable* CTable_MatchLength = zc->matchlengthCTable;
     U32 LLtype, Offtype, MLtype;   /* compressed, raw or rle */
     const BYTE* const op_lit_start = seqStorePtr->litStart;
     const BYTE* const llTable = seqStorePtr->litLengthStart;
@@ -340,37 +541,39 @@ size_t ZSTD_compressSequences(void* dst, size_t maxDstSize,
     const size_t maxCSize = srcSize - minGain;
     BYTE* seqHead;
 
-
     /* Compress literals */
     {
         size_t cSize;
         size_t litSize = seqStorePtr->lit - op_lit_start;
+        const size_t minLitSize = zc->flagStaticTables ? 6 : LITERAL_NOENTROPY;
 
-        if (litSize <= LITERAL_NOENTROPY)
+        if (litSize <= minLitSize)
             cSize = ZSTD_noCompressLiterals(op, maxDstSize, op_lit_start, litSize);
         else
-            cSize = ZSTD_compressLiterals(op, maxDstSize, op_lit_start, litSize);
+            cSize = ZSTD_compressLiterals(zc, op, maxDstSize, op_lit_start, litSize);
         if (ZSTD_isError(cSize)) return cSize;
         op += cSize;
     }
 
     /* Sequences Header */
-    if ((oend-op) < MIN_SEQUENCES_SIZE)
+    if ((oend-op) < MIN_SEQUENCES_SIZE) return ERROR(dstSize_tooSmall);
+    if (nbSeq < 128) *op++ = (BYTE)nbSeq;
+    else {
+        op[0] = (BYTE)((nbSeq>>8) + 128); op[1] = (BYTE)nbSeq; op+=2;
+    }
+    if (nbSeq==0) goto _check_compressibility;
+
+    /* dumps : contains the remainders of large lengths */
+    if ((oend-op) < 3 /* dumps */ + 1 /*seqHead*/)
         return ERROR(dstSize_tooSmall);
-    MEM_writeLE16(op, (U16)nbSeq); op+=2;
     seqHead = op;
-
-    /* dumps : contains too large lengths */
     {
         size_t dumpsLength = seqStorePtr->dumps - seqStorePtr->dumpsStart;
-        if (dumpsLength < 512)
-        {
+        if (dumpsLength < 512) {
             op[0] = (BYTE)(dumpsLength >> 8);
             op[1] = (BYTE)(dumpsLength);
             op += 2;
-        }
-        else
-        {
+        } else {
             op[0] = 2;
             op[1] = (BYTE)(dumpsLength>>8);
             op[2] = (BYTE)(dumpsLength);
@@ -381,94 +584,90 @@ size_t ZSTD_compressSequences(void* dst, size_t maxDstSize,
         op += dumpsLength;
     }
 
+#define MIN_SEQ_FOR_DYNAMIC_FSE   64
+#define MAX_SEQ_FOR_STATIC_FSE  1000
+
     /* CTable for Literal Lengths */
     max = MaxLL;
-    mostFrequent = FSE_countFast(count, &max, seqStorePtr->litLengthStart, nbSeq);
-    if ((mostFrequent == nbSeq) && (nbSeq > 2))
-    {
-        *op++ = *(seqStorePtr->litLengthStart);
+    mostFrequent = FSE_countFast(count, &max, llTable, nbSeq);
+    if ((mostFrequent == nbSeq) && (nbSeq > 2)) {
+        *op++ = llTable[0];
         FSE_buildCTable_rle(CTable_LitLength, (BYTE)max);
-        LLtype = bt_rle;
-    }
-    else if ((nbSeq < 64) || (mostFrequent < (nbSeq >> (LLbits-1))))
-    {
+        LLtype = FSE_ENCODING_RLE;
+    } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
+        LLtype = FSE_ENCODING_STATIC;
+    } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (LLbits-1)))) {
         FSE_buildCTable_raw(CTable_LitLength, LLbits);
-        LLtype = bt_raw;
-    }
-    else
-    {
+        LLtype = FSE_ENCODING_RAW;
+    } else {
         size_t NCountSize;
-        tableLog = FSE_optimalTableLog(LLFSELog, nbSeq, max);
-        FSE_normalizeCount(norm, tableLog, count, nbSeq, max);
+        size_t nbSeq_1 = nbSeq;
+        U32 tableLog = FSE_optimalTableLog(LLFSELog, nbSeq, max);
+        if (count[llTable[nbSeq-1]]>1) { count[llTable[nbSeq-1]]--; nbSeq_1--; }
+        FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max);
         NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog);   /* overflow protected */
         if (FSE_isError(NCountSize)) return ERROR(GENERIC);
         op += NCountSize;
         FSE_buildCTable(CTable_LitLength, norm, max, tableLog);
-        LLtype = bt_compressed;
+        LLtype = FSE_ENCODING_DYNAMIC;
     }
 
-    /* CTable for Offsets codes */
-    {
-        /* create Offset codes */
-        size_t i;
-        max = MaxOff;
-        for (i=0; i<nbSeq; i++)
-        {
+    /* CTable for Offset codes */
+    {   /* create Offset codes */
+        size_t i; for (i=0; i<nbSeq; i++) {
             offCodeTable[i] = (BYTE)ZSTD_highbit(offsetTable[i]) + 1;
             if (offsetTable[i]==0) offCodeTable[i]=0;
         }
-        mostFrequent = FSE_countFast(count, &max, offCodeTable, nbSeq);
     }
-    if ((mostFrequent == nbSeq) && (nbSeq > 2))
-    {
-        *op++ = *offCodeTable;
+    max = MaxOff;
+    mostFrequent = FSE_countFast(count, &max, offCodeTable, nbSeq);
+    if ((mostFrequent == nbSeq) && (nbSeq > 2)) {
+        *op++ = offCodeTable[0];
         FSE_buildCTable_rle(CTable_OffsetBits, (BYTE)max);
-        Offtype = bt_rle;
-    }
-    else if ((nbSeq < 64) || (mostFrequent < (nbSeq >> (Offbits-1))))
-    {
+        Offtype = FSE_ENCODING_RLE;
+    } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
+        Offtype = FSE_ENCODING_STATIC;
+    } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (Offbits-1)))) {
         FSE_buildCTable_raw(CTable_OffsetBits, Offbits);
-        Offtype = bt_raw;
-    }
-    else
-    {
+        Offtype = FSE_ENCODING_RAW;
+    } else {
         size_t NCountSize;
-        tableLog = FSE_optimalTableLog(OffFSELog, nbSeq, max);
-        FSE_normalizeCount(norm, tableLog, count, nbSeq, max);
+        size_t nbSeq_1 = nbSeq;
+        U32 tableLog = FSE_optimalTableLog(OffFSELog, nbSeq, max);
+        if (count[offCodeTable[nbSeq-1]]>1) { count[offCodeTable[nbSeq-1]]--; nbSeq_1--; }
+        FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max);
         NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog);   /* overflow protected */
         if (FSE_isError(NCountSize)) return ERROR(GENERIC);
         op += NCountSize;
         FSE_buildCTable(CTable_OffsetBits, norm, max, tableLog);
-        Offtype = bt_compressed;
+        Offtype = FSE_ENCODING_DYNAMIC;
     }
 
     /* CTable for MatchLengths */
     max = MaxML;
-    mostFrequent = FSE_countFast(count, &max, seqStorePtr->matchLengthStart, nbSeq);
-    if ((mostFrequent == nbSeq) && (nbSeq > 2))
-    {
-        *op++ = *seqStorePtr->matchLengthStart;
+    mostFrequent = FSE_countFast(count, &max, mlTable, nbSeq);
+    if ((mostFrequent == nbSeq) && (nbSeq > 2)) {
+        *op++ = *mlTable;
         FSE_buildCTable_rle(CTable_MatchLength, (BYTE)max);
-        MLtype = bt_rle;
-    }
-    else if ((nbSeq < 64) || (mostFrequent < (nbSeq >> (MLbits-1))))
-    {
+        MLtype = FSE_ENCODING_RLE;
+    } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
+        MLtype = FSE_ENCODING_STATIC;
+    } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (MLbits-1)))) {
         FSE_buildCTable_raw(CTable_MatchLength, MLbits);
-        MLtype = bt_raw;
-    }
-    else
-    {
+        MLtype = FSE_ENCODING_RAW;
+    } else {
         size_t NCountSize;
-        tableLog = FSE_optimalTableLog(MLFSELog, nbSeq, max);
+        U32 tableLog = FSE_optimalTableLog(MLFSELog, nbSeq, max);
         FSE_normalizeCount(norm, tableLog, count, nbSeq, max);
         NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog);   /* overflow protected */
         if (FSE_isError(NCountSize)) return ERROR(GENERIC);
         op += NCountSize;
         FSE_buildCTable(CTable_MatchLength, norm, max, tableLog);
-        MLtype = bt_compressed;
+        MLtype = FSE_ENCODING_DYNAMIC;
     }
 
     seqHead[0] += (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2));
+    zc->flagStaticTables = 0;
 
     /* Encoding Sequences */
     {
@@ -481,23 +680,26 @@ size_t ZSTD_compressSequences(void* dst, size_t maxDstSize,
 
         errorCode = BIT_initCStream(&blockStream, op, oend-op);
         if (ERR_isError(errorCode)) return ERROR(dstSize_tooSmall);   /* not enough space remaining */
-        FSE_initCState(&stateMatchLength, CTable_MatchLength);
-        FSE_initCState(&stateOffsetBits, CTable_OffsetBits);
-        FSE_initCState(&stateLitLength, CTable_LitLength);
 
-        for (i=(int)nbSeq-1; i>=0; i--)
-        {
-            BYTE matchLength = mlTable[i];
+        /* first symbols */
+        FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlTable[nbSeq-1]);
+        FSE_initCState2(&stateOffsetBits,  CTable_OffsetBits,  offCodeTable[nbSeq-1]);
+        FSE_initCState2(&stateLitLength,   CTable_LitLength,   llTable[nbSeq-1]);
+        BIT_addBits(&blockStream, offsetTable[nbSeq-1], offCodeTable[nbSeq-1] ? (offCodeTable[nbSeq-1]-1) : 0);
+        BIT_flushBits(&blockStream);
+
+        for (i=(int)nbSeq-2; i>=0; i--) {
+            BYTE mlCode = mlTable[i];
             U32  offset = offsetTable[i];
             BYTE offCode = offCodeTable[i];                                 /* 32b*/  /* 64b*/
-            U32 nbBits = (offCode-1) * (!!offCode);
+            U32 nbBits = (offCode-1) + (!offCode);
             BYTE litLength = llTable[i];                                    /* (7)*/  /* (7)*/
-            FSE_encodeSymbol(&blockStream, &stateMatchLength, matchLength); /* 17 */  /* 17 */
-            if (MEM_32bits()) BIT_flushBits(&blockStream);                  /*  7 */
-            BIT_addBits(&blockStream, offset, nbBits);                      /* 31 */  /* 42 */   /* 24 bits max in 32-bits mode */
+            FSE_encodeSymbol(&blockStream, &stateMatchLength, mlCode);      /* 17 */  /* 17 */
             if (MEM_32bits()) BIT_flushBits(&blockStream);                  /*  7 */
-            FSE_encodeSymbol(&blockStream, &stateOffsetBits, offCode);      /* 16 */  /* 51 */
             FSE_encodeSymbol(&blockStream, &stateLitLength, litLength);     /* 26 */  /* 61 */
+            FSE_encodeSymbol(&blockStream, &stateOffsetBits, offCode);      /* 16 */  /* 51 */
+            if (MEM_32bits()) BIT_flushBits(&blockStream);                  /*  7 */
+            BIT_addBits(&blockStream, offset, nbBits);                      /* 31 */  /* 42 */   /* 24 bits max in 32-bits mode */
             BIT_flushBits(&blockStream);                                    /*  7 */  /*  7 */
         }
 
@@ -511,23 +713,24 @@ size_t ZSTD_compressSequences(void* dst, size_t maxDstSize,
     }
 
     /* check compressibility */
+_check_compressibility:
     if ((size_t)(op-ostart) >= maxCSize) return 0;
 
     return op - ostart;
 }
 
 
-/** ZSTD_storeSeq
-    Store a sequence (literal length, literals, offset code and match length) into seqStore_t
+/*! ZSTD_storeSeq
+    Store a sequence (literal length, literals, offset code and match length code) into seqStore_t
     @offsetCode : distance to match, or 0 == repCode
     @matchCode : matchLength - MINMATCH
 */
 MEM_STATIC void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* literals, size_t offsetCode, size_t matchCode)
 {
-#if 0
+#if 0  /* for debug */
     static const BYTE* g_start = NULL;
     if (g_start==NULL) g_start = literals;
-    if (literals - g_start == 8695)
+    //if (literals - g_start == 8695)
     printf("pos %6u : %3u literals & match %3u bytes at distance %6u \n",
            (U32)(literals - g_start), (U32)litLength, (U32)matchCode+4, (U32)offsetCode);
 #endif
@@ -537,71 +740,51 @@ MEM_STATIC void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const B
     seqStorePtr->lit += litLength;
 
     /* literal Length */
-    if (litLength >= MaxLL)
-    {
+    if (litLength >= MaxLL) {
         *(seqStorePtr->litLength++) = MaxLL;
-        if (litLength<255 + MaxLL)
+        if (litLength<255 + MaxLL) {
             *(seqStorePtr->dumps++) = (BYTE)(litLength - MaxLL);
-        else
-        {
+        } else {
             *(seqStorePtr->dumps++) = 255;
-            MEM_writeLE32(seqStorePtr->dumps, (U32)litLength); seqStorePtr->dumps += 3;
-        }
-    }
+            if (litLength < (1<<15)) {
+                MEM_writeLE16(seqStorePtr->dumps, (U16)(litLength<<1));
+                seqStorePtr->dumps += 2;
+            } else {
+                MEM_writeLE32(seqStorePtr->dumps, (U32)((litLength<<1)+1));
+                seqStorePtr->dumps += 3;
+            }
+    }   }
     else *(seqStorePtr->litLength++) = (BYTE)litLength;
 
     /* match offset */
     *(seqStorePtr->offset++) = (U32)offsetCode;
 
     /* match Length */
-    if (matchCode >= MaxML)
-    {
+    if (matchCode >= MaxML) {
         *(seqStorePtr->matchLength++) = MaxML;
-        if (matchCode < 255+MaxML)
+        if (matchCode < 255+MaxML) {
             *(seqStorePtr->dumps++) = (BYTE)(matchCode - MaxML);
-        else
-        {
+        } else {
             *(seqStorePtr->dumps++) = 255;
-            MEM_writeLE32(seqStorePtr->dumps, (U32)matchCode); seqStorePtr->dumps += 3;
-        }
-    }
+            if (matchCode < (1<<15)) {
+                MEM_writeLE16(seqStorePtr->dumps, (U16)(matchCode<<1));
+                seqStorePtr->dumps += 2;
+            } else {
+                MEM_writeLE32(seqStorePtr->dumps, (U32)((matchCode<<1)+1));
+                seqStorePtr->dumps += 3;
+            }
+    }   }
     else *(seqStorePtr->matchLength++) = (BYTE)matchCode;
 }
 
 
-/* *************************************
+/*-*************************************
 *  Match length counter
 ***************************************/
-static size_t ZSTD_read_ARCH(const void* p) { size_t r; memcpy(&r, p, sizeof(r)); return r; }
-
-static unsigned ZSTD_highbit(U32 val)
-{
-#   if defined(_MSC_VER)   /* Visual */
-    unsigned long r=0;
-    _BitScanReverse(&r, val);
-    return (unsigned)r;
-#   elif defined(__GNUC__) && (__GNUC__ >= 3)   /* GCC Intrinsic */
-    return 31 - __builtin_clz(val);
-#   else   /* Software version */
-    static const int DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
-    U32 v = val;
-    int r;
-    v |= v >> 1;
-    v |= v >> 2;
-    v |= v >> 4;
-    v |= v >> 8;
-    v |= v >> 16;
-    r = DeBruijnClz[(U32)(v * 0x07C4ACDDU) >> 27];
-    return r;
-#   endif
-}
-
 static unsigned ZSTD_NbCommonBytes (register size_t val)
 {
-    if (MEM_isLittleEndian())
-    {
-        if (MEM_64bits())
-        {
+    if (MEM_isLittleEndian()) {
+        if (MEM_64bits()) {
 #       if defined(_MSC_VER) && defined(_WIN64)
             unsigned long r = 0;
             _BitScanForward64( &r, (U64)val );
@@ -612,9 +795,7 @@ static unsigned ZSTD_NbCommonBytes (register size_t val)
             static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
             return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
 #       endif
-        }
-        else /* 32 bits */
-        {
+        } else { /* 32 bits */
 #       if defined(_MSC_VER)
             unsigned long r=0;
             _BitScanForward( &r, (U32)val );
@@ -626,11 +807,8 @@ static unsigned ZSTD_NbCommonBytes (register size_t val)
             return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
 #       endif
         }
-    }
-    else   /* Big Endian CPU */
-    {
-        if (MEM_64bits())
-        {
+    } else {  /* Big Endian CPU */
+        if (MEM_64bits()) {
 #       if defined(_MSC_VER) && defined(_WIN64)
             unsigned long r = 0;
             _BitScanReverse64( &r, val );
@@ -645,9 +823,7 @@ static unsigned ZSTD_NbCommonBytes (register size_t val)
             r += (!val);
             return r;
 #       endif
-        }
-        else /* 32 bits */
-        {
+        } else { /* 32 bits */
 #       if defined(_MSC_VER)
             unsigned long r = 0;
             _BitScanReverse( &r, (unsigned long)val );
@@ -660,8 +836,7 @@ static unsigned ZSTD_NbCommonBytes (register size_t val)
             r += (!val);
             return r;
 #       endif
-        }
-    }
+    }   }
 }
 
 
@@ -669,22 +844,20 @@ static size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLim
 {
     const BYTE* const pStart = pIn;
 
-    while ((pIn<pInLimit-(sizeof(size_t)-1)))
-    {
-        size_t diff = ZSTD_read_ARCH(pMatch) ^ ZSTD_read_ARCH(pIn);
+    while ((pIn<pInLimit-(sizeof(size_t)-1))) {
+        size_t diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
         if (!diff) { pIn+=sizeof(size_t); pMatch+=sizeof(size_t); continue; }
         pIn += ZSTD_NbCommonBytes(diff);
         return (size_t)(pIn - pStart);
     }
-
     if (MEM_64bits()) if ((pIn<(pInLimit-3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { pIn+=4; pMatch+=4; }
     if ((pIn<(pInLimit-1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { pIn+=2; pMatch+=2; }
     if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++;
     return (size_t)(pIn - pStart);
 }
 
-/** ZSTD_count_2segments
-*   can count match length with ip & match in potentially 2 different segments.
+/** ZSTD_count_2segments() :
+*   can count match length with `ip` & `match` in 2 different segments.
 *   convention : on reaching mEnd, match count continue starting from iStart
 */
 static size_t ZSTD_count_2segments(const BYTE* ip, const BYTE* match, const BYTE* iEnd, const BYTE* mEnd, const BYTE* iStart)
@@ -699,26 +872,24 @@ static size_t ZSTD_count_2segments(const BYTE* ip, const BYTE* match, const BYTE
 }
 
 
-
-/* *************************************
+/*-*************************************
 *  Hashes
 ***************************************/
-
 static const U32 prime4bytes = 2654435761U;
-static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; }
+static U32    ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; }
 static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); }
 
 static const U64 prime5bytes = 889523592379ULL;
-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)((u * prime5bytes) << (64-40) >> (64-h)) ; }
-static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_read64(p), h); }
+static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u  << (64-40)) * prime5bytes) >> (64-h)) ; }
+static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); }
 
 static const U64 prime6bytes = 227718039650203ULL;
-static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)((u * prime6bytes) << (64-48) >> (64-h)) ; }
-static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_read64(p), h); }
+static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u  << (64-48)) * prime6bytes) >> (64-h)) ; }
+static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
 
 static const U64 prime7bytes = 58295818150454627ULL;
-static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)((u * prime7bytes) << (64-56) >> (64-h)) ; }
-static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_read64(p), h); }
+static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u  << (64-56)) * prime7bytes) >> (64-h)) ; }
+static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); }
 
 static size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
 {
@@ -732,10 +903,10 @@ static size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
     }
 }
 
-/* *************************************
+
+/*-*************************************
 *  Fast Scan
 ***************************************/
-
 #define FILLHASHSTEP 3
 static void ZSTD_fillHashTable (ZSTD_CCtx* zc, const void* end, const U32 mls)
 {
@@ -743,10 +914,9 @@ static void ZSTD_fillHashTable (ZSTD_CCtx* zc, const void* end, const U32 mls)
     const U32 hBits = zc->params.hashLog;
     const BYTE* const base = zc->base;
     const BYTE* ip = base + zc->nextToUpdate;
-    const BYTE* const iend = (const BYTE*) end;
+    const BYTE* const iend = ((const BYTE*)end) - 8;
 
-    while(ip <= iend)
-    {
+    while(ip <= iend) {
         hashTable[ZSTD_hashPtr(ip, hBits, mls)] = (U32)(ip - base);
         ip += FILLHASHSTEP;
     }
@@ -754,8 +924,7 @@ static void ZSTD_fillHashTable (ZSTD_CCtx* zc, const void* end, const U32 mls)
 
 
 FORCE_INLINE
-size_t ZSTD_compressBlock_fast_generic(ZSTD_CCtx* zc,
-                                       void* dst, size_t maxDstSize,
+void ZSTD_compressBlock_fast_generic(ZSTD_CCtx* zc,
                                  const void* src, size_t srcSize,
                                  const U32 mls)
 {
@@ -776,17 +945,10 @@ size_t ZSTD_compressBlock_fast_generic(ZSTD_CCtx* zc,
 
     /* init */
     ZSTD_resetSeqStore(seqStorePtr);
-    if (ip < lowest+4)
-    {
-        hashTable[ZSTD_hashPtr(lowest+1, hBits, mls)] = lowIndex+1;
-        hashTable[ZSTD_hashPtr(lowest+2, hBits, mls)] = lowIndex+2;
-        hashTable[ZSTD_hashPtr(lowest+3, hBits, mls)] = lowIndex+3;
-        ip = lowest+4;
-    }
+    if (ip < lowest+REPCODE_STARTVALUE) ip = lowest+REPCODE_STARTVALUE;
 
     /* Main Search Loop */
-    while (ip < ilimit)  /* < instead of <=, because repcode check at (ip+1) */
-    {
+    while (ip < ilimit) {  /* < instead of <=, because repcode check at (ip+1) */
         size_t mlCode;
         size_t offset;
         const size_t h = ZSTD_hashPtr(ip, hBits, mls);
@@ -795,17 +957,13 @@ size_t ZSTD_compressBlock_fast_generic(ZSTD_CCtx* zc,
         const U32 current = (U32)(ip-base);
         hashTable[h] = current;   /* update hash table */
 
-        if (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))   /* note : by construction, offset_1 <= current */
-        {
+        if (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)) {   /* note : by construction, offset_1 <= current */
             mlCode = ZSTD_count(ip+1+MINMATCH, ip+1+MINMATCH-offset_1, iend);
             ip++;
             offset = 0;
-        }
-        else
-        {
+        } else {
             if ( (matchIndex <= lowIndex) ||
-                 (MEM_read32(match) != MEM_read32(ip)) )
-            {
+                 (MEM_read32(match) != MEM_read32(ip)) ) {
                 ip += ((ip-anchor) >> g_searchStrength) + 1;
                 continue;
             }
@@ -821,15 +979,13 @@ size_t ZSTD_compressBlock_fast_generic(ZSTD_CCtx* zc,
         ip += mlCode + MINMATCH;
         anchor = ip;
 
-        if (ip <= ilimit)
-        {
+        if (ip <= ilimit) {
             /* Fill Table */
             hashTable[ZSTD_hashPtr(base+current+2, hBits, mls)] = current+2;  /* here because current+2 could be > iend-8 */
             hashTable[ZSTD_hashPtr(ip-2, hBits, mls)] = (U32)(ip-2-base);
             /* check immediate repcode */
             while ( (ip <= ilimit)
-                 && (MEM_read32(ip) == MEM_read32(ip - offset_2)) )
-            {
+                 && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
                 /* store sequence */
                 size_t rlCode = ZSTD_count(ip+MINMATCH, ip+MINMATCH-offset_2, iend);
                 size_t tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff;   /* swap offset_2 <=> offset_1 */
@@ -838,48 +994,38 @@ size_t ZSTD_compressBlock_fast_generic(ZSTD_CCtx* zc,
                 ip += rlCode+MINMATCH;
                 anchor = ip;
                 continue;   /* faster when present ... (?) */
-            }
-        }
-    }
+    }   }   }
 
-    /* Last Literals */
-    {
+    {   /* Last Literals */
         size_t lastLLSize = iend - anchor;
         memcpy(seqStorePtr->lit, anchor, lastLLSize);
         seqStorePtr->lit += lastLLSize;
     }
-
-    /* Finale compression stage */
-    return ZSTD_compressSequences(dst, maxDstSize,
-                                  seqStorePtr, srcSize);
 }
 
 
-size_t ZSTD_compressBlock_fast(ZSTD_CCtx* ctx,
-                               void* dst, size_t maxDstSize,
-                         const void* src, size_t srcSize)
+static void ZSTD_compressBlock_fast(ZSTD_CCtx* ctx,
+                       const void* src, size_t srcSize)
 {
     const U32 mls = ctx->params.searchLength;
     switch(mls)
     {
     default:
     case 4 :
-        return ZSTD_compressBlock_fast_generic(ctx, dst, maxDstSize, src, srcSize, 4);
+        ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 4); return;
     case 5 :
-        return ZSTD_compressBlock_fast_generic(ctx, dst, maxDstSize, src, srcSize, 5);
+        ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 5); return;
     case 6 :
-        return ZSTD_compressBlock_fast_generic(ctx, dst, maxDstSize, src, srcSize, 6);
+        ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 6); return;
     case 7 :
-        return ZSTD_compressBlock_fast_generic(ctx, dst, maxDstSize, src, srcSize, 7);
+        ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 7); return;
     }
 }
 
 
-//FORCE_INLINE
-size_t ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx* ctx,
-                                          void* dst, size_t maxDstSize,
-                                    const void* src, size_t srcSize,
-                                    const U32 mls)
+static void ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx* ctx,
+                                 const void* src, size_t srcSize,
+                                 const U32 mls)
 {
     U32* hashTable = ctx->hashTable;
     const U32 hBits = ctx->params.hashLog;
@@ -902,18 +1048,12 @@ size_t ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx* ctx,
 
     /* init */
     ZSTD_resetSeqStore(seqStorePtr);
-    {
-        /* skip first 4 positions to avoid read overflow during repcode match check */
-        hashTable[ZSTD_hashPtr(ip+0, hBits, mls)] = (U32)(ip-base+0);
-        hashTable[ZSTD_hashPtr(ip+1, hBits, mls)] = (U32)(ip-base+1);
-        hashTable[ZSTD_hashPtr(ip+2, hBits, mls)] = (U32)(ip-base+2);
-        hashTable[ZSTD_hashPtr(ip+3, hBits, mls)] = (U32)(ip-base+3);
-        ip += 4;
-    }
+    /* skip first position to avoid read overflow during repcode match check */
+    hashTable[ZSTD_hashPtr(ip+0, hBits, mls)] = (U32)(ip-base+0);
+    ip += REPCODE_STARTVALUE;
 
     /* Main Search Loop */
-    while (ip < ilimit)  /* < instead of <=, because (ip+1) */
-    {
+    while (ip < ilimit) {  /* < instead of <=, because (ip+1) */
         const size_t h = ZSTD_hashPtr(ip, hBits, mls);
         const U32 matchIndex = hashTable[h];
         const BYTE* matchBase = matchIndex < dictLimit ? dictBase : base;
@@ -927,15 +1067,12 @@ size_t ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx* ctx,
         hashTable[h] = current;   /* update hash table */
 
         if ( ((repIndex <= dictLimit-4) || (repIndex >= dictLimit))
-          && (MEM_read32(repMatch) == MEM_read32(ip+1)) )
-        {
+          && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
             const BYTE* repMatchEnd = repIndex < dictLimit ? dictEnd : iend;
             mlCode = ZSTD_count_2segments(ip+1+MINMATCH, repMatch+MINMATCH, iend, repMatchEnd, lowPrefixPtr);
             ip++;
             offset = 0;
-        }
-        else
-        {
+        } else {
             if ( (matchIndex < lowLimit) ||
                  (MEM_read32(match) != MEM_read32(ip)) )
             { ip += ((ip-anchor) >> g_searchStrength) + 1; continue; }
@@ -943,32 +1080,28 @@ size_t ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx* ctx,
                 const BYTE* matchEnd = matchIndex < dictLimit ? dictEnd : iend;
                 const BYTE* lowMatchPtr = matchIndex < dictLimit ? dictStart : lowPrefixPtr;
                 mlCode = ZSTD_count_2segments(ip+MINMATCH, match+MINMATCH, iend, matchEnd, lowPrefixPtr);
-                while ((ip>anchor) && (match>lowMatchPtr) && (ip[-1] == match[-1])) { ip--; match--; mlCode++; }  /* catch up */
+                while ((ip>anchor) && (match>lowMatchPtr) && (ip[-1] == match[-1])) { ip--; match--; mlCode++; }   /* catch up */
                 offset = current - matchIndex;
                 offset_2 = offset_1;
                 offset_1 = offset;
-            }
-        }
+        }   }
 
         /* found a match : store it */
         ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, offset, mlCode);
         ip += mlCode + MINMATCH;
         anchor = ip;
 
-        if (ip <= ilimit)
-        {
+        if (ip <= ilimit) {
             /* Fill Table */
 			hashTable[ZSTD_hashPtr(base+current+2, hBits, mls)] = current+2;
             hashTable[ZSTD_hashPtr(ip-2, hBits, mls)] = (U32)(ip-2-base);
             /* check immediate repcode */
-            while (ip <= ilimit)
-            {
+            while (ip <= ilimit) {
                 U32 current2 = (U32)(ip-base);
                 const U32 repIndex2 = current2 - offset_2;
                 const BYTE* repMatch2 = repIndex2 < dictLimit ? dictBase + repIndex2 : base + repIndex2;
                 if ( ((repIndex2 <= dictLimit-4) || (repIndex2 >= dictLimit))
-                  && (MEM_read32(repMatch2) == MEM_read32(ip)) )
-                {
+                  && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
                     const BYTE* const repEnd2 = repIndex2 < dictLimit ? dictEnd : iend;
                     size_t repLength2 = ZSTD_count_2segments(ip+MINMATCH, repMatch2+MINMATCH, iend, repEnd2, lowPrefixPtr);
                     U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
@@ -979,9 +1112,7 @@ size_t ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx* ctx,
                     continue;
                 }
                 break;
-            }
-        }
-    }
+    }   }   }
 
     /* Last Literals */
     {
@@ -989,15 +1120,10 @@ size_t ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx* ctx,
         memcpy(seqStorePtr->lit, anchor, lastLLSize);
         seqStorePtr->lit += lastLLSize;
     }
-
-    /* Finale compression stage */
-    return ZSTD_compressSequences(dst, maxDstSize,
-                                  seqStorePtr, srcSize);
 }
 
 
-size_t ZSTD_compressBlock_fast_extDict(ZSTD_CCtx* ctx,
-                               void* dst, size_t maxDstSize,
+static void ZSTD_compressBlock_fast_extDict(ZSTD_CCtx* ctx,
                          const void* src, size_t srcSize)
 {
     const U32 mls = ctx->params.searchLength;
@@ -1005,22 +1131,22 @@ size_t ZSTD_compressBlock_fast_extDict(ZSTD_CCtx* ctx,
     {
     default:
     case 4 :
-        return ZSTD_compressBlock_fast_extDict_generic(ctx, dst, maxDstSize, src, srcSize, 4);
+        ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 4); return;
     case 5 :
-        return ZSTD_compressBlock_fast_extDict_generic(ctx, dst, maxDstSize, src, srcSize, 5);
+        ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 5); return;
     case 6 :
-        return ZSTD_compressBlock_fast_extDict_generic(ctx, dst, maxDstSize, src, srcSize, 6);
+        ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 6); return;
     case 7 :
-        return ZSTD_compressBlock_fast_extDict_generic(ctx, dst, maxDstSize, src, srcSize, 7);
+        ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 7); return;
     }
 }
 
 
-/* *************************************
+/*-*************************************
 *  Binary Tree search
 ***************************************/
-/** ZSTD_insertBt1 : add one or multiple positions to tree
-*   @ip : assumed <= iend-8
+/** ZSTD_insertBt1() : add one or multiple positions to tree.
+*   ip : assumed <= iend-8 .
 *   @return : nb of positions added */
 static U32 ZSTD_insertBt1(ZSTD_CCtx* zc, const BYTE* const ip, const U32 mls, const BYTE* const iend, U32 nbCompares,
                           U32 extDict)
@@ -1046,6 +1172,7 @@ static U32 ZSTD_insertBt1(ZSTD_CCtx* zc, const BYTE* const ip, const U32 mls, co
     U32 dummy32;   /* to be nullified at the end */
     const U32 windowLow = zc->lowLimit;
     U32 matchEndIdx = current+8;
+    size_t bestLength = 8;
     U32 predictedSmall = *(bt + 2*((current-1)&btMask) + 0);
     U32 predictedLarge = *(bt + 2*((current-1)&btMask) + 1);
     predictedSmall += (predictedSmall>0);
@@ -1053,14 +1180,13 @@ static U32 ZSTD_insertBt1(ZSTD_CCtx* zc, const BYTE* const ip, const U32 mls, co
 
     hashTable[h] = current;   /* Update Hash Table */
 
-    while (nbCompares-- && (matchIndex > windowLow))
-    {
+    while (nbCompares-- && (matchIndex > windowLow)) {
         U32* nextPtr = bt + 2*(matchIndex & btMask);
-        const U32* predictPtr = bt + 2*((matchIndex-1) & btMask);   /* written this way, as bt is a roll buffer */
         size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
-
-        if (matchIndex == predictedSmall)
-        {   /* no need to check length, result known */
+#if 1   /* note : can create issues when hlog small <= 11 */
+        const U32* predictPtr = bt + 2*((matchIndex-1) & btMask);   /* written this way, as bt is a roll buffer */
+        if (matchIndex == predictedSmall) {
+            /* no need to check length, result known */
             *smallerPtr = matchIndex;
             if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
             smallerPtr = nextPtr+1;               /* new "smaller" => larger of match */
@@ -1068,9 +1194,7 @@ static U32 ZSTD_insertBt1(ZSTD_CCtx* zc, const BYTE* const ip, const U32 mls, co
             predictedSmall = predictPtr[1] + (predictPtr[1]>0);
             continue;
         }
-
-        if (matchIndex == predictedLarge)
-        {
+        if (matchIndex == predictedLarge) {
             *largerPtr = matchIndex;
             if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
             largerPtr = nextPtr;
@@ -1078,64 +1202,51 @@ static U32 ZSTD_insertBt1(ZSTD_CCtx* zc, const BYTE* const ip, const U32 mls, co
             predictedLarge = predictPtr[0] + (predictPtr[0]>0);
             continue;
         }
-
-        if ((!extDict) || (matchIndex+matchLength >= dictLimit))
-        {
+#endif
+        if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
             match = base + matchIndex;
             if (match[matchLength] == ip[matchLength])
                 matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iend) +1;
-        }
-        else
-        {
+        } else {
             match = dictBase + matchIndex;
             matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
             if (matchIndex+matchLength >= dictLimit)
 				match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
         }
 
-        if (matchLength > matchEndIdx - matchIndex)
-            matchEndIdx = matchIndex + (U32)matchLength;
+        if (matchLength > bestLength) {
+            bestLength = matchLength;
+            if (matchLength > matchEndIdx - matchIndex)
+                matchEndIdx = matchIndex + (U32)matchLength;
+        }
 
         if (ip+matchLength == iend)   /* equal : no way to know if inf or sup */
             break;   /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt the tree */
 
-        if (match[matchLength] < ip[matchLength])   /* necessarily within correct buffer */
-        {
+        if (match[matchLength] < ip[matchLength]) {  /* necessarily within correct buffer */
             /* match is smaller than current */
             *smallerPtr = matchIndex;             /* update smaller idx */
             commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
             if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
             smallerPtr = nextPtr+1;               /* new "smaller" => larger of match */
             matchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
-        }
-        else
-        {
+        } else {
             /* match is larger than current */
             *largerPtr = matchIndex;
             commonLengthLarger = matchLength;
             if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
             largerPtr = nextPtr;
             matchIndex = nextPtr[0];
-        }
-    }
+    }   }
 
     *smallerPtr = *largerPtr = 0;
-    return (matchEndIdx > current + 8) ? matchEndIdx - current - 8 : 1;
+    if (bestLength > 384) return MIN(192, (U32)(bestLength - 384));
+    if (matchEndIdx > current + 8) return matchEndIdx - current - 8;
+    return 1;
 }
 
 
-static void ZSTD_updateTree(ZSTD_CCtx* zc, const BYTE* const ip, const BYTE* const iend, const U32 nbCompares, const U32 mls)
-{
-    const BYTE* const base = zc->base;
-    const U32 target = (U32)(ip - base);
-    U32 idx = zc->nextToUpdate;
-
-    for( ; idx < target ; )
-        idx += ZSTD_insertBt1(zc, base+idx, mls, iend, nbCompares, 0);
-}
-
-FORCE_INLINE /* inlining is important to hardwire a hot branch (template emulation) */
-size_t ZSTD_insertBtAndFindBestMatch (
+static size_t ZSTD_insertBtAndFindBestMatch (
                         ZSTD_CCtx* zc,
                         const BYTE* const ip, const BYTE* const iend,
                         size_t* offsetPtr,
@@ -1166,28 +1277,23 @@ size_t ZSTD_insertBtAndFindBestMatch (
 
     hashTable[h] = current;   /* Update Hash Table */
 
-    while (nbCompares-- && (matchIndex > windowLow))
-    {
+    while (nbCompares-- && (matchIndex > windowLow)) {
         U32* nextPtr = bt + 2*(matchIndex & btMask);
         size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
         const BYTE* match;
 
-        if ((!extDict) || (matchIndex+matchLength >= dictLimit))
-        {
+        if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
             match = base + matchIndex;
             if (match[matchLength] == ip[matchLength])
                 matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iend) +1;
-        }
-        else
-        {
+        } else {
             match = dictBase + matchIndex;
             matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
             if (matchIndex+matchLength >= dictLimit)
 				match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
         }
 
-        if (matchLength > bestLength)
-        {
+        if (matchLength > bestLength) {
             if (matchLength > matchEndIdx - matchIndex)
                 matchEndIdx = matchIndex + (U32)matchLength;
             if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit(current-matchIndex+1) - ZSTD_highbit((U32)offsetPtr[0]+1)) )
@@ -1196,25 +1302,21 @@ size_t ZSTD_insertBtAndFindBestMatch (
                 break;   /* drop, to guarantee consistency (miss a little bit of compression) */
         }
 
-        if (match[matchLength] < ip[matchLength])
-        {
+        if (match[matchLength] < ip[matchLength]) {
             /* match is smaller than current */
             *smallerPtr = matchIndex;             /* update smaller idx */
             commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
             if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
             smallerPtr = nextPtr+1;               /* new "smaller" => larger of match */
             matchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
-        }
-        else
-        {
+        } else {
             /* match is larger than current */
             *largerPtr = matchIndex;
             commonLengthLarger = matchLength;
             if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
             largerPtr = nextPtr;
             matchIndex = nextPtr[0];
-        }
-    }
+    }   }
 
     *smallerPtr = *largerPtr = 0;
 
@@ -1223,9 +1325,18 @@ size_t ZSTD_insertBtAndFindBestMatch (
 }
 
 
+static void ZSTD_updateTree(ZSTD_CCtx* zc, const BYTE* const ip, const BYTE* const iend, const U32 nbCompares, const U32 mls)
+{
+    const BYTE* const base = zc->base;
+    const U32 target = (U32)(ip - base);
+    U32 idx = zc->nextToUpdate;
+
+    while(idx < target)
+        idx += ZSTD_insertBt1(zc, base+idx, mls, iend, nbCompares, 0);
+}
+
 /** Tree updater, providing best match */
-FORCE_INLINE /* inlining is important to hardwire a hot branch (template emulation) */
-size_t ZSTD_BtFindBestMatch (
+static size_t ZSTD_BtFindBestMatch (
                         ZSTD_CCtx* zc,
                         const BYTE* const ip, const BYTE* const iLimit,
                         size_t* offsetPtr,
@@ -1237,7 +1348,7 @@ size_t ZSTD_BtFindBestMatch (
 }
 
 
-FORCE_INLINE size_t ZSTD_BtFindBestMatch_selectMLS (
+static size_t ZSTD_BtFindBestMatch_selectMLS (
                         ZSTD_CCtx* zc,   /* Index table will be updated */
                         const BYTE* ip, const BYTE* const iLimit,
                         size_t* offsetPtr,
@@ -1259,14 +1370,12 @@ static void ZSTD_updateTree_extDict(ZSTD_CCtx* zc, const BYTE* const ip, const B
     const U32 target = (U32)(ip - base);
     U32 idx = zc->nextToUpdate;
 
-    for( ; idx < target ; )
-        idx += ZSTD_insertBt1(zc, base+idx, mls, iend, nbCompares, 1);
+    while (idx < target) idx += ZSTD_insertBt1(zc, base+idx, mls, iend, nbCompares, 1);
 }
 
 
 /** Tree updater, providing best match */
-FORCE_INLINE /* inlining is important to hardwire a hot branch (template emulation) */
-size_t ZSTD_BtFindBestMatch_extDict (
+static size_t ZSTD_BtFindBestMatch_extDict (
                         ZSTD_CCtx* zc,
                         const BYTE* const ip, const BYTE* const iLimit,
                         size_t* offsetPtr,
@@ -1278,7 +1387,7 @@ size_t ZSTD_BtFindBestMatch_extDict (
 }
 
 
-FORCE_INLINE size_t ZSTD_BtFindBestMatch_selectMLS_extDict (
+static size_t ZSTD_BtFindBestMatch_selectMLS_extDict (
                         ZSTD_CCtx* zc,   /* Index table will be updated */
                         const BYTE* ip, const BYTE* const iLimit,
                         size_t* offsetPtr,
@@ -1302,7 +1411,8 @@ FORCE_INLINE size_t ZSTD_BtFindBestMatch_selectMLS_extDict (
 
 /* Update chains up to ip (excluded)
    Assumption : always within prefix (ie. not within extDict) */
-static U32 ZSTD_insertAndFindFirstIndex (ZSTD_CCtx* zc, const BYTE* ip, U32 mls)
+FORCE_INLINE
+U32 ZSTD_insertAndFindFirstIndex (ZSTD_CCtx* zc, const BYTE* ip, U32 mls)
 {
     U32* const hashTable  = zc->hashTable;
     const U32 hashLog = zc->params.hashLog;
@@ -1312,8 +1422,7 @@ static U32 ZSTD_insertAndFindFirstIndex (ZSTD_CCtx* zc, const BYTE* ip, U32 mls)
     const U32 target = (U32)(ip - base);
     U32 idx = zc->nextToUpdate;
 
-    while(idx < target)
-    {
+    while(idx < target) {
         size_t h = ZSTD_hashPtr(base+idx, hashLog, mls);
         NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
         hashTable[h] = idx;
@@ -1351,18 +1460,14 @@ size_t ZSTD_HcFindBestMatch_generic (
     /* HC4 match finder */
     matchIndex = ZSTD_insertAndFindFirstIndex (zc, ip, mls);
 
-    while ((matchIndex>lowLimit) && (nbAttempts))
-    {
+    while ((matchIndex>lowLimit) && (nbAttempts)) {
         size_t currentMl=0;
         nbAttempts--;
-        if ((!extDict) || matchIndex >= dictLimit)
-        {
+        if ((!extDict) || matchIndex >= dictLimit) {
             match = base + matchIndex;
             if (match[ml] == ip[ml])   /* potentially better */
                 currentMl = ZSTD_count(ip, match, iLimit);
-        }
-        else
-        {
+        } else {
             match = dictBase + matchIndex;
             if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
                 currentMl = ZSTD_count_2segments(ip+MINMATCH, match+MINMATCH, iLimit, dictEnd, prefixStart) + MINMATCH;
@@ -1415,8 +1520,8 @@ FORCE_INLINE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
 *  Common parser - lazy strategy
 *********************************/
 FORCE_INLINE
-size_t ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
-                                     void* dst, size_t maxDstSize, const void* src, size_t srcSize,
+void ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
+                                     const void* src, size_t srcSize,
                                      const U32 searchMethod, const U32 depth)
 {
     seqStore_t* seqStorePtr = &(ctx->seqStore);
@@ -1441,15 +1546,13 @@ size_t ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
     if ((ip-base) < REPCODE_STARTVALUE) ip = base + REPCODE_STARTVALUE;
 
     /* Match Loop */
-    while (ip < ilimit)
-    {
+    while (ip < ilimit) {
         size_t matchLength=0;
         size_t offset=0;
         const BYTE* start=ip+1;
 
         /* check repCode */
-        if (MEM_read32(ip+1) == MEM_read32(ip+1 - offset_1))
-        {
+        if (MEM_read32(ip+1) == MEM_read32(ip+1 - offset_1)) {
             /* repcode : we take it */
             matchLength = ZSTD_count(ip+1+MINMATCH, ip+1+MINMATCH-offset_1, iend) + MINMATCH;
             if (depth==0) goto _storeSequence;
@@ -1463,19 +1566,16 @@ size_t ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
                 matchLength = ml2, start = ip, offset=offsetFound;
         }
 
-        if (matchLength < MINMATCH)
-        {
+        if (matchLength < MINMATCH) {
             ip += ((ip-anchor) >> g_searchStrength) + 1;   /* jump faster over incompressible sections */
             continue;
         }
 
         /* let's try to find a better solution */
         if (depth>=1)
-        while (ip<ilimit)
-        {
+        while (ip<ilimit) {
             ip ++;
-            if ((offset) && (MEM_read32(ip) == MEM_read32(ip - offset_1)))
-            {
+            if ((offset) && (MEM_read32(ip) == MEM_read32(ip - offset_1))) {
                 size_t mlRep = ZSTD_count(ip+MINMATCH, ip+MINMATCH-offset_1, iend) + MINMATCH;
                 int gain2 = (int)(mlRep * 3);
                 int gain1 = (int)(matchLength*3 - ZSTD_highbit((U32)offset+1) + 1);
@@ -1487,19 +1587,15 @@ size_t ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
                 size_t ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
                 int gain2 = (int)(ml2*4 - ZSTD_highbit((U32)offset2+1));   /* raw approx */
                 int gain1 = (int)(matchLength*4 - ZSTD_highbit((U32)offset+1) + 4);
-                if ((ml2 >= MINMATCH) && (gain2 > gain1))
-                {
+                if ((ml2 >= MINMATCH) && (gain2 > gain1)) {
                     matchLength = ml2, offset = offset2, start = ip;
                     continue;   /* search a better one */
-                }
-            }
+            }   }
 
             /* let's find an even better one */
-            if ((depth==2) && (ip<ilimit))
-            {
+            if ((depth==2) && (ip<ilimit)) {
                 ip ++;
-                if ((offset) && (MEM_read32(ip) == MEM_read32(ip - offset_1)))
-                {
+                if ((offset) && (MEM_read32(ip) == MEM_read32(ip - offset_1))) {
                     size_t ml2 = ZSTD_count(ip+MINMATCH, ip+MINMATCH-offset_1, iend) + MINMATCH;
                     int gain2 = (int)(ml2 * 4);
                     int gain1 = (int)(matchLength*4 - ZSTD_highbit((U32)offset+1) + 1);
@@ -1511,19 +1607,15 @@ size_t ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
                     size_t ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
                     int gain2 = (int)(ml2*4 - ZSTD_highbit((U32)offset2+1));   /* raw approx */
                     int gain1 = (int)(matchLength*4 - ZSTD_highbit((U32)offset+1) + 7);
-                    if ((ml2 >= MINMATCH) && (gain2 > gain1))
-                    {
+                    if ((ml2 >= MINMATCH) && (gain2 > gain1)) {
                         matchLength = ml2, offset = offset2, start = ip;
                         continue;
-                    }
-                }
-            }
+            }   }   }
             break;  /* nothing found : store previous solution */
         }
 
         /* catch up */
-        if (offset)
-        {
+        if (offset) {
             while ((start>anchor) && (start>base+offset) && (start[-1] == start[-1-offset]))   /* only search for offset within prefix */
                 { start--; matchLength++; }
             offset_2 = offset_1; offset_1 = offset;
@@ -1539,8 +1631,7 @@ _storeSequence:
 
         /* check immediate repcode */
         while ( (ip <= ilimit)
-             && (MEM_read32(ip) == MEM_read32(ip - offset_2)) )
-        {
+             && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
             /* store sequence */
             matchLength = ZSTD_count(ip+MINMATCH, ip+MINMATCH-offset_2, iend);
             offset = offset_2;
@@ -1550,8 +1641,7 @@ _storeSequence:
             ip += matchLength+MINMATCH;
             anchor = ip;
             continue;   /* faster when present ... (?) */
-        }
-    }
+    }   }
 
     /* Last Literals */
     {
@@ -1559,36 +1649,44 @@ _storeSequence:
         memcpy(seqStorePtr->lit, anchor, lastLLSize);
         seqStorePtr->lit += lastLLSize;
     }
+}
+
+#include "zstd_opt.h"
+
+static void ZSTD_compressBlock_opt_bt(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+{   /* block compressor listed last in the first (extDict==0) row of the table in ZSTD_selectBlockCompressor */
+    ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 1, 2);   /* generic is not defined in this file, presumably in zstd_opt.h; (1, 2) presumably means searchMethod=1, depth=2 as for the lazy variants -- TODO confirm */
+}
 
-    /* Final compression stage */
-    return ZSTD_compressSequences(dst, maxDstSize,
-                                  seqStorePtr, srcSize);
+static void ZSTD_compressBlock_opt(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+{   /* block compressor listed 6th in the first (extDict==0) row of the table in ZSTD_selectBlockCompressor */
+    ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 0, 2);   /* generic is not defined in this file, presumably in zstd_opt.h; (0, 2) presumably means searchMethod=0, depth=2 -- TODO confirm */
 }
 
-size_t ZSTD_compressBlock_btlazy2(ZSTD_CCtx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+static void ZSTD_compressBlock_btlazy2(ZSTD_CCtx* ctx, const void* src, size_t srcSize)   /* returns nothing : the entropy stage is run by ZSTD_compressBlock_internal */
 {
-    return ZSTD_compressBlock_lazy_generic(ctx, dst, maxDstSize, src, srcSize, 1, 2);
+    ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 1, 2);   /* searchMethod=1 (presumably the binary-tree finder, per the name), depth=2 : the next two positions are also evaluated before a match is stored */
 }
 
-size_t ZSTD_compressBlock_lazy2(ZSTD_CCtx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+static void ZSTD_compressBlock_lazy2(ZSTD_CCtx* ctx, const void* src, size_t srcSize)   /* returns nothing : the entropy stage is run by ZSTD_compressBlock_internal */
 {
-    return ZSTD_compressBlock_lazy_generic(ctx, dst, maxDstSize, src, srcSize, 0, 2);
+    ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 0, 2);   /* searchMethod=0, depth=2 : the next two positions are also evaluated before a match is stored */
 }
 
-size_t ZSTD_compressBlock_lazy(ZSTD_CCtx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+static void ZSTD_compressBlock_lazy(ZSTD_CCtx* ctx, const void* src, size_t srcSize)   /* returns nothing : the entropy stage is run by ZSTD_compressBlock_internal */
 {
-    return ZSTD_compressBlock_lazy_generic(ctx, dst, maxDstSize, src, srcSize, 0, 1);
+    ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 0, 1);   /* searchMethod=0, depth=1 : the next position is also evaluated before a match is stored */
 }
 
-size_t ZSTD_compressBlock_greedy(ZSTD_CCtx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+static void ZSTD_compressBlock_greedy(ZSTD_CCtx* ctx, const void* src, size_t srcSize)   /* returns nothing : the entropy stage is run by ZSTD_compressBlock_internal */
 {
-    return ZSTD_compressBlock_lazy_generic(ctx, dst, maxDstSize, src, srcSize, 0, 0);
+    ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 0, 0);   /* searchMethod=0, depth=0 : the look-ahead loop (depth>=1) is skipped, the first acceptable match is stored */
 }
 
 
 FORCE_INLINE
-size_t ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
-                                     void* dst, size_t maxDstSize, const void* src, size_t srcSize,
+void ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
+                                     const void* src, size_t srcSize,
                                      const U32 searchMethod, const U32 depth)
 {
     seqStore_t* seqStorePtr = &(ctx->seqStore);
@@ -1618,8 +1716,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
     if ((ip - prefixStart) < REPCODE_STARTVALUE) ip += REPCODE_STARTVALUE;
 
     /* Match Loop */
-    while (ip < ilimit)
-    {
+    while (ip < ilimit) {
         size_t matchLength=0;
         size_t offset=0;
         const BYTE* start=ip+1;
@@ -1631,14 +1728,12 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
             const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
             const BYTE* const repMatch = repBase + repIndex;
             if ((U32)((dictLimit-1) - repIndex) >= 3)   /* intentional overflow */
-            if (MEM_read32(ip+1) == MEM_read32(repMatch))
-            {
+            if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
                 /* repcode detected we should take it */
                 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                 matchLength = ZSTD_count_2segments(ip+1+MINMATCH, repMatch+MINMATCH, iend, repEnd, prefixStart) + MINMATCH;
                 if (depth==0) goto _storeSequence;
-            }
-        }
+        }   }
 
         {
             /* first search (depth 0) */
@@ -1648,27 +1743,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
                 matchLength = ml2, start = ip, offset=offsetFound;
         }
 
-         if (matchLength < MINMATCH)
-        {
+         if (matchLength < MINMATCH) {
             ip += ((ip-anchor) >> g_searchStrength) + 1;   /* jump faster over incompressible sections */
             continue;
         }
 
         /* let's try to find a better solution */
         if (depth>=1)
-        while (ip<ilimit)
-        {
+        while (ip<ilimit) {
             ip ++;
             current++;
             /* check repCode */
-            if (offset)
-            {
+            if (offset) {
                 const U32 repIndex = (U32)(current - offset_1);
                 const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
                 const BYTE* const repMatch = repBase + repIndex;
                 if ((U32)((dictLimit-1) - repIndex) >= 3)   /* intentional overflow */
-                if (MEM_read32(ip) == MEM_read32(repMatch))
-                {
+                if (MEM_read32(ip) == MEM_read32(repMatch)) {
                     /* repcode detected */
                     const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                     size_t repLength = ZSTD_count_2segments(ip+MINMATCH, repMatch+MINMATCH, iend, repEnd, prefixStart) + MINMATCH;
@@ -1676,8 +1767,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
                     int gain1 = (int)(matchLength*3 - ZSTD_highbit((U32)offset+1) + 1);
                     if ((repLength >= MINMATCH) && (gain2 > gain1))
                         matchLength = repLength, offset = 0, start = ip;
-                }
-            }
+            }   }
 
             /* search match, depth 1 */
             {
@@ -1685,27 +1775,22 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
                 size_t ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
                 int gain2 = (int)(ml2*4 - ZSTD_highbit((U32)offset2+1));   /* raw approx */
                 int gain1 = (int)(matchLength*4 - ZSTD_highbit((U32)offset+1) + 4);
-                if ((ml2 >= MINMATCH) && (gain2 > gain1))
-                {
+                if ((ml2 >= MINMATCH) && (gain2 > gain1)) {
                     matchLength = ml2, offset = offset2, start = ip;
                     continue;   /* search a better one */
-                }
-            }
+            }   }
 
             /* let's find an even better one */
-            if ((depth==2) && (ip<ilimit))
-            {
+            if ((depth==2) && (ip<ilimit)) {
                 ip ++;
                 current++;
                 /* check repCode */
-                if (offset)
-                {
+                if (offset) {
                     const U32 repIndex = (U32)(current - offset_1);
                     const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
                     const BYTE* const repMatch = repBase + repIndex;
                     if ((U32)((dictLimit-1) - repIndex) >= 3)   /* intentional overflow */
-                    if (MEM_read32(ip) == MEM_read32(repMatch))
-                    {
+                    if (MEM_read32(ip) == MEM_read32(repMatch)) {
                         /* repcode detected */
                         const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                         size_t repLength = ZSTD_count_2segments(ip+MINMATCH, repMatch+MINMATCH, iend, repEnd, prefixStart) + MINMATCH;
@@ -1713,8 +1798,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
                         int gain1 = (int)(matchLength*4 - ZSTD_highbit((U32)offset+1) + 1);
                         if ((repLength >= MINMATCH) && (gain2 > gain1))
                             matchLength = repLength, offset = 0, start = ip;
-                    }
-                }
+                }   }
 
                 /* search match, depth 2 */
                 {
@@ -1722,19 +1806,15 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
                     size_t ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
                     int gain2 = (int)(ml2*4 - ZSTD_highbit((U32)offset2+1));   /* raw approx */
                     int gain1 = (int)(matchLength*4 - ZSTD_highbit((U32)offset+1) + 7);
-                    if ((ml2 >= MINMATCH) && (gain2 > gain1))
-                    {
+                    if ((ml2 >= MINMATCH) && (gain2 > gain1)) {
                         matchLength = ml2, offset = offset2, start = ip;
                         continue;
-                    }
-                }
-            }
+            }   }   }
             break;  /* nothing found : store previous solution */
         }
 
         /* catch up */
-        if (offset)
-        {
+        if (offset) {
             U32 matchIndex = (U32)((start-base) - offset);
             const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
             const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
@@ -1751,14 +1831,12 @@ _storeSequence:
         }
 
         /* check immediate repcode */
-        while (ip <= ilimit)
-        {
+        while (ip <= ilimit) {
             const U32 repIndex = (U32)((ip-base) - offset_2);
             const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
             const BYTE* const repMatch = repBase + repIndex;
             if ((U32)((dictLimit-1) - repIndex) >= 3)   /* intentional overflow */
-            if (MEM_read32(ip) == MEM_read32(repMatch))
-            {
+            if (MEM_read32(ip) == MEM_read32(repMatch)) {
                 /* repcode detected we should take it */
                 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                 matchLength = ZSTD_count_2segments(ip+MINMATCH, repMatch+MINMATCH, iend, repEnd, prefixStart) + MINMATCH;
@@ -1769,8 +1847,7 @@ _storeSequence:
                 continue;   /* faster when present ... (?) */
             }
             break;
-        }
-    }
+    }   }
 
     /* Last Literals */
     {
@@ -1778,40 +1855,46 @@ _storeSequence:
         memcpy(seqStorePtr->lit, anchor, lastLLSize);
         seqStorePtr->lit += lastLLSize;
     }
+}
+
+void ZSTD_compressBlock_greedy_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)   /* NOTE(review): only wrapper of this family without `static` -- confirm whether external linkage is intended */
+{   /* greedy parser for the extDict case (second row of the table in ZSTD_selectBlockCompressor) */
+    ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 0, 0);   /* searchMethod=0, depth=0 : the look-ahead loop (depth>=1) is skipped */
+}
 
-    /* Final compression stage */
-    return ZSTD_compressSequences(dst, maxDstSize,
-                                  seqStorePtr, srcSize);
+static void ZSTD_compressBlock_lazy_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+{   /* lazy parser for the extDict case (second row of the table in ZSTD_selectBlockCompressor) */
+    ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 0, 1);   /* searchMethod=0, depth=1 : the next position is also evaluated before a match is stored */
 }
 
-size_t ZSTD_compressBlock_greedy_extDict(ZSTD_CCtx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+static void ZSTD_compressBlock_lazy2_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)   /* lazy2 parser for the extDict case */
 {
-    return ZSTD_compressBlock_lazy_extDict_generic(ctx, dst, maxDstSize, src, srcSize, 0, 0);
+    ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 0, 2);   /* searchMethod=0, depth=2 : the next two positions are also evaluated before a match is stored */
 }
 
-size_t ZSTD_compressBlock_lazy_extDict(ZSTD_CCtx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+static void ZSTD_compressBlock_btlazy2_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)   /* btlazy2 parser for the extDict case */
 {
-    return ZSTD_compressBlock_lazy_extDict_generic(ctx, dst, maxDstSize, src, srcSize, 0, 1);
+    ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 1, 2);   /* searchMethod=1 (presumably the binary-tree finder, per the name), depth=2 */
 }
 
-size_t ZSTD_compressBlock_lazy2_extDict(ZSTD_CCtx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+static void ZSTD_compressBlock_opt_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)   /* listed 6th in the second (extDict) row of the table in ZSTD_selectBlockCompressor */
 {
-    return ZSTD_compressBlock_lazy_extDict_generic(ctx, dst, maxDstSize, src, srcSize, 0, 2);
+    ZSTD_compressBlock_opt_extDict_generic(ctx, src, srcSize, 0, 2);   /* generic is not defined in this file, presumably in zstd_opt.h; (0, 2) presumably means searchMethod=0, depth=2 -- TODO confirm */
 }
 
-static size_t ZSTD_compressBlock_btlazy2_extDict(ZSTD_CCtx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+static void ZSTD_compressBlock_opt_bt_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)   /* listed last in the second (extDict) row of the table in ZSTD_selectBlockCompressor */
 {
-    return ZSTD_compressBlock_lazy_extDict_generic(ctx, dst, maxDstSize, src, srcSize, 1, 2);
+    ZSTD_compressBlock_opt_extDict_generic(ctx, src, srcSize, 1, 2);   /* generic is not defined in this file, presumably in zstd_opt.h; (1, 2) presumably means searchMethod=1, depth=2 -- TODO confirm */
 }
 
 
-typedef size_t (*ZSTD_blockCompressor) (ZSTD_CCtx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+typedef void (*ZSTD_blockCompressor) (ZSTD_CCtx* ctx, const void* src, size_t srcSize);
 
 static ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, int extDict)
 {
-    static const ZSTD_blockCompressor blockCompressor[2][5] = {
-        { ZSTD_compressBlock_fast, ZSTD_compressBlock_greedy, ZSTD_compressBlock_lazy,ZSTD_compressBlock_lazy2, ZSTD_compressBlock_btlazy2 },
-        { ZSTD_compressBlock_fast_extDict, ZSTD_compressBlock_greedy_extDict, ZSTD_compressBlock_lazy_extDict,ZSTD_compressBlock_lazy2_extDict, ZSTD_compressBlock_btlazy2_extDict }
+    static const ZSTD_blockCompressor blockCompressor[2][7] = {
+        { ZSTD_compressBlock_fast, ZSTD_compressBlock_greedy, ZSTD_compressBlock_lazy,ZSTD_compressBlock_lazy2, ZSTD_compressBlock_btlazy2, ZSTD_compressBlock_opt, ZSTD_compressBlock_opt_bt },
+        { ZSTD_compressBlock_fast_extDict, ZSTD_compressBlock_greedy_extDict, ZSTD_compressBlock_lazy_extDict,ZSTD_compressBlock_lazy2_extDict, ZSTD_compressBlock_btlazy2_extDict, ZSTD_compressBlock_opt_extDict, ZSTD_compressBlock_opt_bt_extDict }
     };
 
     return blockCompressor[extDict][(U32)strat];
@@ -1821,46 +1904,42 @@ static ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, int
 static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
 {
     ZSTD_blockCompressor blockCompressor = ZSTD_selectBlockCompressor(zc->params.strategy, zc->lowLimit < zc->dictLimit);
-    if (srcSize < MIN_CBLOCK_SIZE+3) return 0;   /* don't even attempt compression below a certain srcSize */
-    return blockCompressor(zc, dst, maxDstSize, src, srcSize);
+    if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) return 0;   /* don't even attempt compression below a certain srcSize ; the caller treats 0 as "not compressible" */
+    blockCompressor(zc, src, srcSize);   /* parsing stage only : block compressors no longer receive dst nor return a size (the lazy variants fill zc->seqStore) */
+    return ZSTD_compressSequences(zc, dst, maxDstSize, srcSize);   /* final compression stage, now shared by all strategies ; its result is the value returned for the block */
 }
 
 
-static size_t ZSTD_compress_generic (ZSTD_CCtx* ctxPtr,
+static size_t ZSTD_compress_generic (ZSTD_CCtx* zc,
                                         void* dst, size_t maxDstSize,
                                   const void* src, size_t srcSize)
 {
-    size_t blockSize = ctxPtr->blockSize;
+    size_t blockSize = zc->blockSize;
     size_t remaining = srcSize;
     const BYTE* ip = (const BYTE*)src;
     BYTE* const ostart = (BYTE*)dst;
     BYTE* op = ostart;
-    const U32 maxDist = 1 << ctxPtr->params.windowLog;
+    const U32 maxDist = 1 << zc->params.windowLog;
 
-    while (remaining)
-    {
+    while (remaining) {
         size_t cSize;
 
-        if (maxDstSize < 3 + MIN_CBLOCK_SIZE) return ERROR(dstSize_tooSmall);   /* not enough space to store compressed block */
+        if (maxDstSize < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE) return ERROR(dstSize_tooSmall);   /* not enough space to store compressed block */
         if (remaining < blockSize) blockSize = remaining;
 
-        if ((U32)(ip+blockSize - (ctxPtr->base + ctxPtr->lowLimit)) > maxDist)
-        {
-            /* respect windowLog contract */
-            ctxPtr->lowLimit = (U32)(ip+blockSize - ctxPtr->base) - maxDist;
-            if (ctxPtr->dictLimit < ctxPtr->lowLimit) ctxPtr->dictLimit = ctxPtr->lowLimit;
+        if ((U32)(ip+blockSize - zc->base) > zc->loadedDictEnd + maxDist) { /* enforce maxDist */
+            U32 newLowLimit = (U32)(ip+blockSize - zc->base) - maxDist;
+            if (zc->lowLimit < newLowLimit) zc->lowLimit = newLowLimit;
+            if (zc->dictLimit < zc->lowLimit) zc->dictLimit = zc->lowLimit;
         }
 
-        cSize = ZSTD_compressBlock_internal(ctxPtr, op+3, maxDstSize-3, ip, blockSize);
+        cSize = ZSTD_compressBlock_internal(zc, op+ZSTD_blockHeaderSize, maxDstSize-ZSTD_blockHeaderSize, ip, blockSize);
         if (ZSTD_isError(cSize)) return cSize;
 
-        if (cSize == 0)
-        {
-            cSize = ZSTD_noCompressBlock(op, maxDstSize, ip, blockSize);   /* block is not compressible */
+        if (cSize == 0) {  /* block is not compressible */
+            cSize = ZSTD_noCompressBlock(op, maxDstSize, ip, blockSize);
             if (ZSTD_isError(cSize)) return cSize;
-        }
-        else
-        {
+        } else {
             op[0] = (BYTE)(cSize>>16);
             op[1] = (BYTE)(cSize>>8);
             op[2] = (BYTE)cSize;
@@ -1886,8 +1965,7 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* zc,
     const BYTE* const ip = (const BYTE*) src;
     size_t hbSize = 0;
 
-    if (frame && (zc->stage==0))
-    {
+    if (frame && (zc->stage==0)) {
         hbSize = zc->hbSize;
         if (dstSize <= hbSize) return ERROR(dstSize_tooSmall);
         zc->stage = 1;
@@ -1897,8 +1975,7 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* zc,
     }
 
     /* Check if blocks follow each other */
-    if (src != zc->nextSrc)
-    {
+    if (src != zc->nextSrc) {
         /* not contiguous */
         size_t delta = zc->nextSrc - ip;
         zc->lowLimit = zc->dictLimit;
@@ -1910,9 +1987,8 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* zc,
     }
 
     /* preemptive overflow correction */
-    if (zc->lowLimit > (1<<30))
-    {
-        U32 btplus = (zc->params.strategy == ZSTD_btlazy2);
+    if (zc->lowLimit > (1<<30)) {
+        U32 btplus = (zc->params.strategy == ZSTD_btlazy2) || (zc->params.strategy == ZSTD_btopt);
         U32 contentMask = (1 << (zc->params.contentLog - btplus)) - 1;
         U32 newLowLimit = zc->lowLimit & contentMask;   /* preserve position % contentSize */
         U32 correction = zc->lowLimit - newLowLimit;
@@ -1926,8 +2002,7 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* zc,
     }
 
     /* if input and dictionary overlap : reduce dictionary (presumed modified by input) */
-    if ((ip+srcSize > zc->dictBase + zc->lowLimit) && (ip < zc->dictBase + zc->dictLimit))
-    {
+    if ((ip+srcSize > zc->dictBase + zc->lowLimit) && (ip < zc->dictBase + zc->dictLimit)) {
         zc->lowLimit = (U32)(ip + srcSize - zc->dictBase);
         if (zc->lowLimit > zc->dictLimit) zc->lowLimit = zc->dictLimit;
     }
@@ -1958,7 +2033,7 @@ size_t ZSTD_compressBlock(ZSTD_CCtx* zc, void* dst, size_t maxDstSize, const voi
 }
 
 
-size_t ZSTD_compress_insertDictionary(ZSTD_CCtx* zc, const void* src, size_t srcSize)
+static size_t ZSTD_loadDictionaryContent(ZSTD_CCtx* zc, const void* src, size_t srcSize)
 {
     const BYTE* const ip = (const BYTE*) src;
     const BYTE* const iend = ip + srcSize;
@@ -1969,6 +2044,7 @@ size_t ZSTD_compress_insertDictionary(ZSTD_CCtx* zc, const void* src, size_t src
     zc->dictBase = zc->base;
     zc->base += ip - zc->nextSrc;
     zc->nextToUpdate = zc->dictLimit;
+    zc->loadedDictEnd = (U32)(iend - zc->base);
 
     zc->nextSrc = iend;
     if (srcSize <= 8) return 0;
@@ -1976,99 +2052,123 @@ size_t ZSTD_compress_insertDictionary(ZSTD_CCtx* zc, const void* src, size_t src
     switch(zc->params.strategy)
     {
     case ZSTD_fast:
-        ZSTD_fillHashTable (zc, iend-8, zc->params.searchLength);
+        ZSTD_fillHashTable (zc, iend, zc->params.searchLength);
         break;
 
     case ZSTD_greedy:
     case ZSTD_lazy:
     case ZSTD_lazy2:
+    case ZSTD_opt:
         ZSTD_insertAndFindFirstIndex (zc, iend-8, zc->params.searchLength);
         break;
 
     case ZSTD_btlazy2:
+    case ZSTD_btopt:
         ZSTD_updateTree(zc, iend-8, iend, 1 << zc->params.searchLog, zc->params.searchLength);
-        zc->nextToUpdate = (U32)(iend - zc->base);
         break;
 
     default:
         return ERROR(GENERIC);   /* strategy doesn't exist; impossible */
     }
 
+    zc->nextToUpdate = zc->loadedDictEnd;
     return 0;
 }
 
 
-/*! ZSTD_duplicateCCtx
-*   Duplicate an existing context @srcCCtx into another one @dstCCtx.
-*   Only works during stage 0 (i.e. before first call to ZSTD_compressContinue())
-*   @return : 0, or an error code */
-size_t ZSTD_duplicateCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx)
+/* Dictionary format :
+     Magic == ZSTD_DICT_MAGIC (4 bytes)
+     Entropy tables : Huff0 CTable (read with HUF_readCTable), then FSE NCount headers for offset codes, match lengths and literal lengths
+     Dictionary content
+*/
+/*! ZSTD_loadDictEntropyStats
+    @return : size read from dictionary */
+static size_t ZSTD_loadDictEntropyStats(ZSTD_CCtx* zc, const void* dict, size_t dictSize)
 {
-    const U32 contentLog = (srcCCtx->params.strategy == ZSTD_fast) ? 1 : srcCCtx->params.contentLog;
-    const size_t tableSpace = ((1 << contentLog) + (1 << srcCCtx->params.hashLog)) * sizeof(U32);
-
-    if (srcCCtx->stage!=0) return ERROR(stage_wrong);
-
-    ZSTD_resetCCtx_advanced(dstCCtx, srcCCtx->params);
-
-    /* copy tables */
-    memcpy(dstCCtx->hashTable, srcCCtx->hashTable, tableSpace);
-
-    /* copy frame header */
-    dstCCtx->hbSize = srcCCtx->hbSize;
-    memcpy(dstCCtx->headerBuffer , srcCCtx->headerBuffer, srcCCtx->hbSize);
+    /* note : magic number already checked ; dict must point just after it (the caller passes dict+4, dictSize-4) */
+    size_t offcodeHeaderSize, matchlengthHeaderSize, litlengthHeaderSize, errorCode;
+    short offcodeNCount[MaxOff+1];
+    unsigned offcodeMaxValue = MaxOff, offcodeLog = OffFSELog;   /* in/out arguments of FSE_readNCount, initialised to the largest accepted values */
+    short matchlengthNCount[MaxML+1];
+    unsigned matchlengthMaxValue = MaxML, matchlengthLog = MLFSELog;
+    short litlengthNCount[MaxLL+1];
+    unsigned litlengthMaxValue = MaxLL, litlengthLog = LLFSELog;
+
+    const size_t hufHeaderSize = HUF_readCTable(zc->hufTable, 255, dict, dictSize);   /* Huff0 table comes first ; result is the number of bytes consumed */
+    if (HUF_isError(hufHeaderSize)) return ERROR(dictionary_corrupted);   /* every parsing failure in this function is reported as dictionary_corrupted */
+    zc->flagStaticTables = 1;   /* records that entropy tables were loaded from the dictionary */
+    dict = (const char*)dict + hufHeaderSize;   /* step over the header just parsed */
+    dictSize -= hufHeaderSize;
+
+    offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dict, dictSize);   /* same read / build / advance sequence for each of the three FSE tables */
+    if (FSE_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted);
+    errorCode = FSE_buildCTable(zc->offcodeCTable, offcodeNCount, offcodeMaxValue, offcodeLog);
+    if (FSE_isError(errorCode)) return ERROR(dictionary_corrupted);
+    dict = (const char*)dict + offcodeHeaderSize;
+    dictSize -= offcodeHeaderSize;
+
+    matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dict, dictSize);
+    if (FSE_isError(matchlengthHeaderSize)) return ERROR(dictionary_corrupted);
+    errorCode = FSE_buildCTable(zc->matchlengthCTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog);
+    if (FSE_isError(errorCode)) return ERROR(dictionary_corrupted);
+    dict = (const char*)dict + matchlengthHeaderSize;
+    dictSize -= matchlengthHeaderSize;
+
+    litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dict, dictSize);
+    if (FSE_isError(litlengthHeaderSize)) return ERROR(dictionary_corrupted);
+    errorCode = FSE_buildCTable(zc->litlengthCTable, litlengthNCount, litlengthMaxValue, litlengthLog);
+    if (FSE_isError(errorCode)) return ERROR(dictionary_corrupted);
+
+    return hufHeaderSize + offcodeHeaderSize + matchlengthHeaderSize + litlengthHeaderSize;   /* total size of the four entropy headers, magic number not included */
+}
 
-    /* copy dictionary pointers */
-    dstCCtx->nextToUpdate= srcCCtx->nextToUpdate;
-    dstCCtx->nextSrc     = srcCCtx->nextSrc;
-    dstCCtx->base        = srcCCtx->base;
-    dstCCtx->dictBase    = srcCCtx->dictBase;
-    dstCCtx->dictLimit   = srcCCtx->dictLimit;
-    dstCCtx->lowLimit    = srcCCtx->lowLimit;
 
+/* Loads dict into zc : raw content when it does not start with ZSTD_DICT_MAGIC, otherwise entropy tables followed by content. NULL or dictSize<=4 : nothing loaded, returns 0. @return : 0, or an error code */
+static size_t ZSTD_compress_insertDictionary(ZSTD_CCtx* zc, const void* dict, size_t dictSize)
+{
+    if (dict && (dictSize>4)) {
+        U32 magic = MEM_readLE32(dict);
+        size_t eSize;
+        if (magic != ZSTD_DICT_MAGIC) return ZSTD_loadDictionaryContent(zc, dict, dictSize);   /* no magic : whole buffer is content */
+        eSize = ZSTD_loadDictEntropyStats(zc, (const char*)dict+4, dictSize-4);   /* +4 / -4 : skip the magic number */
+        if (ZSTD_isError(eSize)) return eSize;   /* must be tested BEFORE adding the magic size, otherwise the error code is shifted by 4 */
+        eSize += 4;   /* size of the entropy section, magic number included */
+        return ZSTD_loadDictionaryContent(zc, (const char*)dict+eSize, dictSize-eSize);
+    }
     return 0;
 }
 
 
 /*! ZSTD_compressBegin_advanced
 *   @return : 0, or an error code */
-size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* ctx,
+size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* zc,   /* context to reset and prepare for a new frame */
+                             const void* dict, size_t dictSize,   /* optional dictionary ; NULL or dictSize<=4 loads nothing */
                                    ZSTD_parameters params)
 {
     size_t errorCode;
 
     ZSTD_validateParams(&params);
 
-    errorCode = ZSTD_resetCCtx_advanced(ctx, params);
+    errorCode = ZSTD_resetCCtx_advanced(zc, params);   /* reset zc for the validated parameters */
     if (ZSTD_isError(errorCode)) return errorCode;
 
-    MEM_writeLE32(ctx->headerBuffer, ZSTD_MAGICNUMBER);   /* Write Header */
-    ((BYTE*)ctx->headerBuffer)[4] = (BYTE)(params.windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN);
-    ctx->hbSize = ZSTD_frameHeaderSize_min;
-    ctx->stage = 0;
+    MEM_writeLE32(zc->headerBuffer, ZSTD_MAGICNUMBER);   /* Write Header : only buffered in zc here, nothing is written to an output buffer */
+    ((BYTE*)zc->headerBuffer)[4] = (BYTE)(params.windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN);   /* 5th header byte : windowLog relative to its minimum */
+    zc->hbSize = ZSTD_frameHeaderSize_min;
+    zc->stage = 0;   /* stage 0 : ZSTD_compressContinue_internal / ZSTD_compressEnd check it before accounting for the header */
 
-    return 0;
+    return ZSTD_compress_insertDictionary(zc, dict, dictSize);   /* dictionary loading is now part of Begin ; 0, or its error code */
 }
 
 
-/** ZSTD_getParams
-*   return ZSTD_parameters structure for a selected compression level and srcSize.
-*   srcSizeHint value is optional, select 0 if not known */
-ZSTD_parameters ZSTD_getParams(int compressionLevel, U64 srcSizeHint)
+size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* zc, const void* dict, size_t dictSize, int compressionLevel)   /* Begin variant taking a dictionary and a compression level ; @return : result of ZSTD_compressBegin_advanced */
 {
-    ZSTD_parameters result;
-    int tableID = ((srcSizeHint-1) <= 256 KB) + ((srcSizeHint-1) <= 128 KB) + ((srcSizeHint-1) <= 16 KB);   /* intentional underflow for srcSizeHint == 0 */
-    if (compressionLevel<=0) compressionLevel = 1;
-    if (compressionLevel > ZSTD_MAX_CLEVEL) compressionLevel = ZSTD_MAX_CLEVEL;
-    result = ZSTD_defaultParameters[tableID][compressionLevel];
-    result.srcSize = srcSizeHint;
-    return result;
+    return ZSTD_compressBegin_advanced(zc, dict, dictSize, ZSTD_getParams(compressionLevel, MAX(128 KB, dictSize)));   /* parameters are selected with a size hint of max(128 KB, dictSize) */
 }
 
-
-size_t ZSTD_compressBegin(ZSTD_CCtx* ctx, int compressionLevel)
+size_t ZSTD_compressBegin(ZSTD_CCtx* zc, int compressionLevel)   /* Begin variant without dictionary ; @return : result of ZSTD_compressBegin_advanced */
 {
-    return ZSTD_compressBegin_advanced(ctx, ZSTD_getParams(compressionLevel, 0));
+    return ZSTD_compressBegin_advanced(zc, NULL, 0, ZSTD_getParams(compressionLevel, 0));   /* NULL dictionary : nothing is loaded ; size hint 0 is passed to ZSTD_getParams */
 }
 
 
@@ -2081,8 +2181,7 @@ size_t ZSTD_compressEnd(ZSTD_CCtx* zc, void* dst, size_t maxDstSize)
     size_t hbSize = 0;
 
     /* empty frame */
-    if (zc->stage==0)
-    {
+    if (zc->stage==0) {
         hbSize = zc->hbSize;
         if (maxDstSize <= hbSize) return ERROR(dstSize_tooSmall);
         zc->stage = 1;
@@ -2100,6 +2199,24 @@ size_t ZSTD_compressEnd(ZSTD_CCtx* zc, void* dst, size_t maxDstSize)
     return 3+hbSize;
 }
 
+
+size_t ZSTD_compress_usingPreparedCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx,
+                                       void* dst, size_t maxDstSize,
+                                 const void* src, size_t srcSize)
+{
+    size_t outSize;
+    size_t errorCode = ZSTD_copyCCtx(cctx, preparedCCtx);
+    if (ZSTD_isError(errorCode)) return errorCode;
+    errorCode = ZSTD_compressContinue(cctx, dst, maxDstSize, src, srcSize);
+    if (ZSTD_isError(errorCode)) return errorCode;
+    outSize = errorCode;
+    errorCode = ZSTD_compressEnd(cctx, (char*)dst+outSize, maxDstSize-outSize);
+    if (ZSTD_isError(errorCode)) return errorCode;
+    outSize += errorCode;
+    return outSize;
+}
+
+
 size_t ZSTD_compress_advanced (ZSTD_CCtx* ctx,
                                void* dst, size_t maxDstSize,
                          const void* src, size_t srcSize,
@@ -2110,17 +2227,10 @@ size_t ZSTD_compress_advanced (ZSTD_CCtx* ctx,
     BYTE* op = ostart;
     size_t oSize;
 
-    /* Header */
-    oSize = ZSTD_compressBegin_advanced(ctx, params);
+    /* Init */
+    oSize = ZSTD_compressBegin_advanced(ctx, dict, dictSize, params);
     if(ZSTD_isError(oSize)) return oSize;
 
-    /* dictionary */
-    if (dict)
-    {
-        oSize = ZSTD_compress_insertDictionary(ctx, dict, dictSize);
-        if (ZSTD_isError(oSize)) return oSize;
-    }
-
     /* body (compression) */
     oSize = ZSTD_compressContinue (ctx, op,  maxDstSize, src, srcSize);
     if(ZSTD_isError(oSize)) return oSize;
@@ -2137,7 +2247,7 @@ size_t ZSTD_compress_advanced (ZSTD_CCtx* ctx,
 
 size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize, const void* dict, size_t dictSize, int compressionLevel)
 {
-    return ZSTD_compress_advanced(ctx, dst, maxDstSize, src, srcSize, dict, dictSize, ZSTD_getParams(compressionLevel, srcSize+dictSize));
+    return ZSTD_compress_advanced(ctx, dst, maxDstSize, src, srcSize, dict, dictSize, ZSTD_getParams(compressionLevel, srcSize));
 }
 
 size_t ZSTD_compressCCtx (ZSTD_CCtx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize, int compressionLevel)
@@ -2155,3 +2265,130 @@ size_t ZSTD_compress(void* dst, size_t maxDstSize, const void* src, size_t srcSi
     return result;
 }
 
+
+/*-=====  Pre-defined compression levels  =====-*/
+
+#define ZSTD_MAX_CLEVEL 21
+unsigned ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; }
+
+static const ZSTD_parameters ZSTD_defaultParameters[4][ZSTD_MAX_CLEVEL+1] = {
+{   /* "default" */
+    /* l,  W,  C,  H,  S,  L, SL, strat */
+    {  0,  0,  0,  0,  0,  0,  0, ZSTD_fast    },  /* level  0 - never used */
+    {  0, 19, 13, 14,  1,  7,  4, ZSTD_fast    },  /* level  1 */
+    {  0, 19, 15, 16,  1,  6,  4, ZSTD_fast    },  /* level  2 */
+    {  0, 20, 18, 20,  1,  6,  4, ZSTD_fast    },  /* level  3 */
+    {  0, 21, 19, 21,  1,  6,  4, ZSTD_fast    },  /* level  4 */
+    {  0, 20, 14, 18,  3,  5,  4, ZSTD_greedy  },  /* level  5 */
+    {  0, 20, 18, 19,  3,  5,  4, ZSTD_greedy  },  /* level  6 */
+    {  0, 21, 17, 20,  3,  5,  4, ZSTD_lazy    },  /* level  7 */
+    {  0, 21, 19, 20,  3,  5,  4, ZSTD_lazy    },  /* level  8 */
+    {  0, 21, 20, 20,  3,  5,  4, ZSTD_lazy2   },  /* level  9 */
+    {  0, 21, 19, 21,  4,  5,  4, ZSTD_lazy2   },  /* level 10 */
+    {  0, 22, 20, 22,  4,  5,  4, ZSTD_lazy2   },  /* level 11 */
+    {  0, 22, 20, 22,  5,  5,  4, ZSTD_lazy2   },  /* level 12 */
+    {  0, 22, 21, 22,  5,  5,  4, ZSTD_lazy2   },  /* level 13 */
+    {  0, 22, 22, 23,  5,  5,  4, ZSTD_lazy2   },  /* level 14 */
+    {  0, 23, 23, 23,  5,  5,  4, ZSTD_lazy2   },  /* level 15 */
+    {  0, 23, 22, 22,  5,  5,  4, ZSTD_btlazy2 },  /* level 16 */
+    {  0, 24, 24, 23,  4,  5,  4, ZSTD_btlazy2 },  /* level 17 */
+    {  0, 24, 24, 23,  5,  5, 30, ZSTD_btopt   },  /* level 18 */
+    {  0, 25, 25, 24,  5,  4, 40, ZSTD_btopt   },  /* level 19 */
+    {  0, 26, 26, 25,  8,  4,256, ZSTD_btopt   },  /* level 20 */
+    {  0, 26, 27, 25, 10,  4,256, ZSTD_btopt   },  /* level 21 */
+},
+{   /* for srcSize <= 256 KB */
+    /* l,  W,  C,  H,  S,  L,  T, strat */
+    {  0,  0,  0,  0,  0,  0,  0, ZSTD_fast    },  /* level  0 */
+    {  0, 18, 14, 15,  1,  6,  4, ZSTD_fast    },  /* level  1 */
+    {  0, 18, 14, 16,  1,  5,  4, ZSTD_fast    },  /* level  2 */
+    {  0, 18, 14, 17,  1,  5,  4, ZSTD_fast    },  /* level  3.*/
+    {  0, 18, 14, 15,  4,  4,  4, ZSTD_greedy  },  /* level  4 */
+    {  0, 18, 16, 17,  4,  4,  4, ZSTD_greedy  },  /* level  5 */
+    {  0, 18, 17, 17,  3,  4,  4, ZSTD_lazy    },  /* level  6 */
+    {  0, 18, 17, 17,  4,  4,  4, ZSTD_lazy    },  /* level  7 */
+    {  0, 18, 17, 17,  4,  4,  4, ZSTD_lazy2   },  /* level  8 */
+    {  0, 18, 17, 17,  5,  4,  4, ZSTD_lazy2   },  /* level  9 */
+    {  0, 18, 17, 17,  6,  4,  4, ZSTD_lazy2   },  /* level 10 */
+    {  0, 18, 17, 17,  7,  4,  4, ZSTD_lazy2   },  /* level 11 */
+    {  0, 18, 18, 17,  4,  4,  4, ZSTD_btlazy2 },  /* level 12 */
+    {  0, 18, 19, 17,  7,  4,  4, ZSTD_btlazy2 },  /* level 13.*/
+    {  0, 18, 17, 19,  8,  4, 24, ZSTD_btopt   },  /* level 14.*/
+    {  0, 18, 19, 19,  8,  4, 48, ZSTD_btopt   },  /* level 15.*/
+    {  0, 18, 19, 18,  9,  4,128, ZSTD_btopt   },  /* level 16.*/
+    {  0, 18, 19, 18,  9,  4,192, ZSTD_btopt   },  /* level 17.*/
+    {  0, 18, 19, 18,  9,  4,256, ZSTD_btopt   },  /* level 18.*/
+    {  0, 18, 19, 18, 10,  4,256, ZSTD_btopt   },  /* level 19.*/
+    {  0, 18, 19, 18, 11,  4,256, ZSTD_btopt   },  /* level 20.*/
+    {  0, 18, 19, 18, 12,  4,256, ZSTD_btopt   },  /* level 21.*/
+},
+{   /* for srcSize <= 128 KB */
+    /* l,  W,  C,  H,  S,  L,  T, strat */
+    {  0,  0,  0,  0,  0,  0,  0, ZSTD_fast    },  /* level  0 - never used */
+    {  0, 17, 12, 13,  1,  6,  4, ZSTD_fast    },  /* level  1 */
+    {  0, 17, 13, 16,  1,  5,  4, ZSTD_fast    },  /* level  2 */
+    {  0, 17, 13, 14,  2,  5,  4, ZSTD_greedy  },  /* level  3 */
+    {  0, 17, 13, 15,  3,  4,  4, ZSTD_greedy  },  /* level  4 */
+    {  0, 17, 15, 17,  4,  4,  4, ZSTD_greedy  },  /* level  5 */
+    {  0, 17, 16, 17,  3,  4,  4, ZSTD_lazy    },  /* level  6 */
+    {  0, 17, 16, 17,  4,  4,  4, ZSTD_lazy    },  /* level  7 */
+    {  0, 17, 17, 16,  4,  4,  4, ZSTD_lazy2   },  /* level  8 */
+    {  0, 17, 17, 16,  5,  4,  4, ZSTD_lazy2   },  /* level  9 */
+    {  0, 17, 17, 16,  6,  4,  4, ZSTD_lazy2   },  /* level 10 */
+    {  0, 17, 17, 17,  7,  4,  4, ZSTD_lazy2   },  /* level 11 */
+    {  0, 17, 17, 17,  8,  4,  4, ZSTD_lazy2   },  /* level 12 */
+    {  0, 17, 17, 17,  9,  4,  4, ZSTD_lazy2   },  /* level 13 */
+    {  0, 17, 18, 16,  5,  4, 20, ZSTD_btopt   },  /* level 14 */
+    {  0, 17, 18, 16,  9,  4, 48, ZSTD_btopt   },  /* level 15 */
+    {  0, 17, 18, 17,  7,  4,128, ZSTD_btopt   },  /* level 16 */
+    {  0, 17, 18, 17,  8,  4,128, ZSTD_btopt   },  /* level 17 */
+    {  0, 17, 18, 17,  8,  4,256, ZSTD_btopt   },  /* level 18 */
+    {  0, 17, 18, 17,  9,  4,256, ZSTD_btopt   },  /* level 19 */
+    {  0, 17, 18, 17, 10,  4,512, ZSTD_btopt   },  /* level 20 */
+    {  0, 17, 18, 17, 11,  4,512, ZSTD_btopt   },  /* level 21 */
+
+},
+{   /* for srcSize <= 16 KB */
+    /* l,  W,  C,  H,  S,  L,  T, strat */
+    {  0,  0,  0,  0,  0,  0,  0, ZSTD_fast    },  /* level  0 -- never used */
+    {  0, 14, 14, 14,  1,  4,  4, ZSTD_fast    },  /* level  1 */
+    {  0, 14, 14, 15,  1,  4,  4, ZSTD_fast    },  /* level  2 */
+    {  0, 14, 13, 15,  4,  4,  4, ZSTD_greedy  },  /* level  3 */
+    {  0, 14, 14, 15,  3,  4,  4, ZSTD_lazy    },  /* level  4 */
+    {  0, 14, 14, 14,  6,  4,  4, ZSTD_lazy    },  /* level  5 */
+    {  0, 14, 14, 14,  5,  4,  4, ZSTD_lazy2   },  /* level  6 */
+    {  0, 14, 14, 14,  7,  4,  4, ZSTD_lazy2   },  /* level  7 */
+    {  0, 14, 14, 14,  8,  4,  4, ZSTD_lazy2   },  /* level  8 */
+    {  0, 14, 14, 14,  9,  4,  4, ZSTD_lazy2   },  /* level  9 */
+    {  0, 14, 14, 14, 10,  4,  4, ZSTD_lazy2   },  /* level 10 */
+    {  0, 14, 14, 14, 11,  4,  4, ZSTD_lazy2   },  /* level 11 */
+    {  0, 14, 15, 15, 12,  4, 32, ZSTD_btopt   },  /* level 12 */
+    {  0, 14, 15, 15, 12,  4, 64, ZSTD_btopt   },  /* level 13 */
+    {  0, 14, 15, 15, 12,  4, 96, ZSTD_btopt   },  /* level 14 */
+    {  0, 14, 15, 15, 12,  4,128, ZSTD_btopt   },  /* level 15 */
+    {  0, 14, 15, 15, 12,  4,256, ZSTD_btopt   },  /* level 16 */
+    {  0, 14, 15, 15, 13,  4,256, ZSTD_btopt   },  /* level 17 */
+    {  0, 14, 15, 15, 14,  4,256, ZSTD_btopt   },  /* level 18 */
+    {  0, 14, 15, 15, 15,  4,256, ZSTD_btopt   },  /* level 19 */
+    {  0, 14, 15, 15, 16,  4,256, ZSTD_btopt   },  /* level 20 */
+    {  0, 14, 15, 15, 17,  4,256, ZSTD_btopt   },  /* level 21 */
+},
+};
+
+/*! ZSTD_getParams
+*   @return ZSTD_parameters structure for a selected compression level and srcSize.
+*   @srcSizeHint value is optional, select 0 if not known */
+ZSTD_parameters ZSTD_getParams(int compressionLevel, U64 srcSizeHint)
+{
+    ZSTD_parameters result;
+    int tableID = ((srcSizeHint-1) <= 256 KB) + ((srcSizeHint-1) <= 128 KB) + ((srcSizeHint-1) <= 16 KB);   /* intentional underflow for srcSizeHint == 0 */
+    if (compressionLevel<=0) compressionLevel = 1;
+    if (compressionLevel > ZSTD_MAX_CLEVEL) compressionLevel = ZSTD_MAX_CLEVEL;
+#if ZSTD_OPT_DEBUG >= 1
+    tableID=0;
+#endif
+    result = ZSTD_defaultParameters[tableID][compressionLevel];
+    result.srcSize = srcSizeHint;
+    return result;
+}
+
diff --git a/lib/zstd_decompress.c b/lib/zstd_decompress.c
index 4a026df..bfa0ea3 100644
--- a/lib/zstd_decompress.c
+++ b/lib/zstd_decompress.c
@@ -1,6 +1,6 @@
 /*
     zstd - standard compression library
-    Copyright (C) 2014-2015, Yann Collet.
+    Copyright (C) 2014-2016, Yann Collet.
 
     BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
@@ -27,7 +27,6 @@
 
     You can contact the author at :
     - zstd source repository : https://github.com/Cyan4973/zstd
-    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
 */
 
 /* ***************************************************************
@@ -44,30 +43,30 @@
 
 /*!
 *  LEGACY_SUPPORT :
-*  ZSTD_decompress() can decode older formats (v0.1+) if set to 1
+*  if set to 1, ZSTD_decompress() can decode older formats (v0.1+)
 */
 #ifndef ZSTD_LEGACY_SUPPORT
 #  define ZSTD_LEGACY_SUPPORT 0
 #endif
 
 
-/* *******************************************************
-*  Includes
+/*-*******************************************************
+*  Dependencies
 *********************************************************/
 #include <stdlib.h>      /* calloc */
 #include <string.h>      /* memcpy, memmove */
-#include <stdio.h>       /* debug : printf */
+#include <stdio.h>       /* debug only : printf */
 #include "mem.h"         /* low level memory routines */
-#include "zstd_static.h"
 #include "zstd_internal.h"
 #include "fse_static.h"
-#include "huff0.h"
+#include "huff0_static.h"
 
 #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT==1)
 #  include "zstd_legacy.h"
 #endif
 
-/* *******************************************************
+
+/*-*******************************************************
 *  Compiler specifics
 *********************************************************/
 #ifdef _MSC_VER    /* Visual Studio */
@@ -85,7 +84,7 @@
 #endif
 
 
-/* *************************************
+/*-*************************************
 *  Local types
 ***************************************/
 typedef struct
@@ -106,11 +105,15 @@ static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
 ***************************************/
 unsigned ZSTD_versionNumber (void) { return ZSTD_VERSION_NUMBER; }
 
-/*! ZSTD_isError
+/*! ZSTD_isError() :
 *   tells if a return value is an error code */
 unsigned ZSTD_isError(size_t code) { return ERR_isError(code); }
 
-/*! ZSTD_getErrorName
+/*! ZSTD_getError() :
+*   convert a `size_t` function result into a proper ZSTD_ErrorCode enum */
+ZSTD_ErrorCode ZSTD_getError(size_t code) { return ERR_getError(code); }
+
+/*! ZSTD_getErrorName() :
 *   provides error code string (useful for debugging) */
 const char* ZSTD_getErrorName(size_t code) { return ERR_getErrorName(code); }
 
@@ -123,9 +126,10 @@ typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
 
 struct ZSTD_DCtx_s
 {
-    U32 LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)];
-    U32 OffTable[FSE_DTABLE_SIZE_U32(OffFSELog)];
-    U32 MLTable[FSE_DTABLE_SIZE_U32(MLFSELog)];
+    FSE_DTable LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)];
+    FSE_DTable OffTable[FSE_DTABLE_SIZE_U32(OffFSELog)];
+    FSE_DTable MLTable[FSE_DTABLE_SIZE_U32(MLFSELog)];
+    unsigned   hufTableX4[HUF_DTABLE_SIZE(HufLog)];
     const void* previousDstEnd;
     const void* base;
     const void* vBase;
@@ -133,16 +137,19 @@ struct ZSTD_DCtx_s
     size_t expected;
     size_t headerSize;
     ZSTD_parameters params;
-    blockType_t bType;
+    blockType_t bType;   /* used in ZSTD_decompressContinue(), to transfer blockType between header decoding and block decoding stages */
     ZSTD_dStage stage;
+    U32 flagStaticTables;
     const BYTE* litPtr;
     size_t litBufSize;
     size_t litSize;
-    BYTE litBuffer[BLOCKSIZE + 8 /* margin for wildcopy */];
+    BYTE litBuffer[BLOCKSIZE + WILDCOPY_OVERLENGTH];
     BYTE headerBuffer[ZSTD_frameHeaderSize_max];
 };  /* typedef'd to ZSTD_DCtx within "zstd_static.h" */
 
-size_t ZSTD_resetDCtx(ZSTD_DCtx* dctx)
+size_t sizeofDCtx (void) { return sizeof(ZSTD_DCtx); }
+
+size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx)
 {
     dctx->expected = ZSTD_frameHeaderSize_min;
     dctx->stage = ZSTDds_getFrameHeaderSize;
@@ -150,6 +157,8 @@ size_t ZSTD_resetDCtx(ZSTD_DCtx* dctx)
     dctx->base = NULL;
     dctx->vBase = NULL;
     dctx->dictEnd = NULL;
+    dctx->hufTableX4[0] = HufLog;
+    dctx->flagStaticTables = 0;
     return 0;
 }
 
@@ -157,28 +166,117 @@ ZSTD_DCtx* ZSTD_createDCtx(void)
 {
     ZSTD_DCtx* dctx = (ZSTD_DCtx*)malloc(sizeof(ZSTD_DCtx));
     if (dctx==NULL) return NULL;
-    ZSTD_resetDCtx(dctx);
+    ZSTD_decompressBegin(dctx);
     return dctx;
 }
 
 size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx)
 {
     free(dctx);
-    return 0;
+    return 0;   /* reserved as a potential error code in the future */
+}
+
+void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx)
+{
+    memcpy(dstDCtx, srcDCtx,
+           sizeof(ZSTD_DCtx) - (BLOCKSIZE+WILDCOPY_OVERLENGTH + ZSTD_frameHeaderSize_max));  /* no need to copy workspace */
 }
 
 
 /* *************************************************************
 *   Decompression section
 ***************************************************************/
-/** ZSTD_decodeFrameHeader_Part1
+
+/* Frame format description
+   Frame Header -  [ Block Header - Block ] - Frame End
+   1) Frame Header
+      - 4 bytes - Magic Number : ZSTD_MAGICNUMBER (defined within zstd_internal.h)
+      - 1 byte  - Window Descriptor
+   2) Block Header
+      - 3 bytes, starting with a 2-bits descriptor
+                 Uncompressed, Compressed, Frame End, unused
+   3) Block
+      See Block Format Description
+   4) Frame End
+      - 3 bytes, compatible with Block Header
+*/
+
+/* Block format description
+
+   Block = Literal Section - Sequences Section
+   Prerequisite : size of (compressed) block, maximum size of regenerated data
+
+   1) Literal Section
+
+   1.1) Header : 1-5 bytes
+        flags: 2 bits
+            00 compressed by Huff0
+            01 unused
+            10 is Raw (uncompressed)
+            11 is Rle
+            Note : using 01 => Huff0 with precomputed table ?
+            Note : delta map ? => compressed ?
+
+   1.1.1) Huff0-compressed literal block : 3-5 bytes
+            srcSize < 1 KB => 3 bytes (2-2-10-10) => single stream
+            srcSize < 1 KB => 3 bytes (2-2-10-10) => not single stream (decoded by HUF_decompress())
+            srcSize < 16KB => 4 bytes (2-2-14-14)
+            else           => 5 bytes (2-2-18-18)
+            big endian convention
+
+   1.1.2) Raw (uncompressed) literal block header : 1-3 bytes
+        size :  5 bits: (IS_RAW<<6) + (0<<4) + size
+               12 bits: (IS_RAW<<6) + (2<<4) + (size>>8)
+                        size&255
+               20 bits: (IS_RAW<<6) + (3<<4) + (size>>16)
+                        size>>8&255
+                        size&255
+
+   1.1.3) Rle (repeated single byte) literal block header : 1-3 bytes
+        size :  5 bits: (IS_RLE<<6) + (0<<4) + size
+               12 bits: (IS_RLE<<6) + (2<<4) + (size>>8)
+                        size&255
+               20 bits: (IS_RLE<<6) + (3<<4) + (size>>16)
+                        size>>8&255
+                        size&255
+
+   1.1.4) Huff0-compressed literal block, using precomputed CTables : 3-5 bytes
+            srcSize < 1 KB => 3 bytes (2-2-10-10) => single stream
+            srcSize < 1 KB => 3 bytes (2-2-10-10)
+            srcSize < 16KB => 4 bytes (2-2-14-14)
+            else           => 5 bytes (2-2-18-18)
+            big endian convention
+
+        1- CTable available (stored into workspace ?)
+        2- Small input (fast heuristic ? Full comparison ? depend on clevel ?)
+
+
+   1.2) Literal block content
+
+   1.2.1) Huff0 block, using sizes from header
+        See Huff0 format
+
+   1.2.2) Huff0 block, using prepared table
+
+   1.2.3) Raw content
+
+   1.2.4) single byte
+
+
+   2) Sequences section
+      TO DO
+*/
+
+
+/** ZSTD_decodeFrameHeader_Part1() :
 *   decode the 1st part of the Frame Header, which tells Frame Header size.
-*   srcSize must be == ZSTD_frameHeaderSize_min
+*   srcSize must be == ZSTD_frameHeaderSize_min.
 *   @return : the full size of the Frame Header */
 static size_t ZSTD_decodeFrameHeader_Part1(ZSTD_DCtx* zc, const void* src, size_t srcSize)
 {
     U32 magicNumber;
-    if (srcSize != ZSTD_frameHeaderSize_min) return ERROR(srcSize_wrong);
+    if (srcSize != ZSTD_frameHeaderSize_min)
+        return ERROR(srcSize_wrong);
     magicNumber = MEM_readLE32(src);
     if (magicNumber != ZSTD_MAGICNUMBER) return ERROR(prefix_unknown);
     zc->headerSize = ZSTD_frameHeaderSize_min;
@@ -198,16 +296,17 @@ size_t ZSTD_getFrameParams(ZSTD_parameters* params, const void* src, size_t srcS
     return 0;
 }
 
-/** ZSTD_decodeFrameHeader_Part2
-*   decode the full Frame Header
-*   srcSize must be the size provided by ZSTD_decodeFrameHeader_Part1
+/** ZSTD_decodeFrameHeader_Part2() :
+*   decode the full Frame Header.
+*   srcSize must be the size provided by ZSTD_decodeFrameHeader_Part1().
 *   @return : 0, or an error code, which can be tested using ZSTD_isError() */
 static size_t ZSTD_decodeFrameHeader_Part2(ZSTD_DCtx* zc, const void* src, size_t srcSize)
 {
     size_t result;
-    if (srcSize != zc->headerSize) return ERROR(srcSize_wrong);
+    if (srcSize != zc->headerSize)
+        return ERROR(srcSize_wrong);
     result = ZSTD_getFrameParams(&(zc->params), src, srcSize);
-    if ((MEM_32bits()) && (zc->params.windowLog > 25)) return ERROR(frameParameter_unsupportedBy32bitsImplementation);
+    if ((MEM_32bits()) && (zc->params.windowLog > 25)) return ERROR(frameParameter_unsupportedBy32bits);
     return result;
 }
 
@@ -218,7 +317,8 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bp
     BYTE headerFlags;
     U32 cSize;
 
-    if (srcSize < 3) return ERROR(srcSize_wrong);
+    if (srcSize < 3)
+        return ERROR(srcSize_wrong);
 
     headerFlags = *in;
     cSize = in[2] + (in[1]<<8) + ((in[0] & 7)<<16);
@@ -231,6 +331,7 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bp
     return cSize;
 }
 
+
 static size_t ZSTD_copyRawBlock(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
 {
     if (srcSize > maxDstSize) return ERROR(dstSize_tooSmall);
@@ -239,27 +340,7 @@ static size_t ZSTD_copyRawBlock(void* dst, size_t maxDstSize, const void* src, s
 }
 
 
-/** ZSTD_decompressLiterals
-    @return : nb of bytes read from src, or an error code*/
-static size_t ZSTD_decompressLiterals(void* dst, size_t* maxDstSizePtr,
-                                const void* src, size_t srcSize)
-{
-    const BYTE* ip = (const BYTE*)src;
-
-    const size_t litSize = (MEM_readLE32(src) & 0x1FFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
-    const size_t litCSize = (MEM_readLE32(ip+2) & 0xFFFFFF) >> 5;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
-
-    if (litSize > *maxDstSizePtr) return ERROR(corruption_detected);
-    if (litCSize + 5 > srcSize) return ERROR(corruption_detected);
-
-    if (HUF_isError(HUF_decompress(dst, litSize, ip+5, litCSize))) return ERROR(corruption_detected);
-
-    *maxDstSizePtr = litSize;
-    return litCSize + 5;
-}
-
-
-/** ZSTD_decodeLiteralsBlock
+/*! ZSTD_decodeLiteralsBlock() :
     @return : nb of bytes read from src (< srcSize ) */
 size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                           const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
@@ -269,47 +350,127 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
     /* any compressed block with literals segment must be at least this size */
     if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected);
 
-    switch(*istart & 3)
+    switch(istart[0]>> 6)
     {
-    /* compressed */
-    case 0:
+    case IS_HUF:
         {
-            size_t litSize = BLOCKSIZE;
-            const size_t readSize = ZSTD_decompressLiterals(dctx->litBuffer, &litSize, src, srcSize);
+            size_t litSize, litCSize, singleStream=0;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            switch(lhSize)
+            {
+            case 0: case 1: default:   /* note : default is impossible, since lhSize is within [0..3] */
+                /* 2 - 2 - 10 - 10 */
+                lhSize=3;
+                singleStream = istart[0] & 16;
+                litSize  = ((istart[0] & 15) << 6) + (istart[1] >> 2);
+                litCSize = ((istart[1] &  3) << 8) + istart[2];
+                break;
+            case 2:
+                /* 2 - 2 - 14 - 14 */
+                lhSize=4;
+                litSize  = ((istart[0] & 15) << 10) + (istart[1] << 2) + (istart[2] >> 6);
+                litCSize = ((istart[2] & 63) <<  8) + istart[3];
+                break;
+            case 3:
+                /* 2 - 2 - 18 - 18 */
+                lhSize=5;
+                litSize  = ((istart[0] & 15) << 14) + (istart[1] << 6) + (istart[2] >> 2);
+                litCSize = ((istart[2] &  3) << 16) + (istart[3] << 8) + istart[4];
+                break;
+            }
+            if (litSize > BLOCKSIZE) return ERROR(corruption_detected);
+
+            if (HUF_isError(singleStream ?
+                            HUF_decompress1X2(dctx->litBuffer, litSize, istart+lhSize, litCSize) :
+                            HUF_decompress   (dctx->litBuffer, litSize, istart+lhSize, litCSize) ))
+                return ERROR(corruption_detected);
+
             dctx->litPtr = dctx->litBuffer;
             dctx->litBufSize = BLOCKSIZE+8;
             dctx->litSize = litSize;
-            return readSize;   /* works if it's an error too */
+            return litCSize + lhSize;
+        }
+    case IS_PCH:
+        {
+            size_t errorCode;
+            size_t litSize, litCSize;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            if (lhSize != 1)  /* only case supported for now : small litSize, single stream */
+                return ERROR(corruption_detected);
+            if (!dctx->flagStaticTables)
+                return ERROR(dictionary_corrupted);
+
+            /* 2 - 2 - 10 - 10 */
+            lhSize=3;
+            litSize  = ((istart[0] & 15) << 6) + (istart[1] >> 2);
+            litCSize = ((istart[1] &  3) << 8) + istart[2];
+
+            errorCode = HUF_decompress1X4_usingDTable(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->hufTableX4);
+            if (HUF_isError(errorCode)) return ERROR(corruption_detected);
+
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litBufSize = BLOCKSIZE+WILDCOPY_OVERLENGTH;
+            dctx->litSize = litSize;
+            return litCSize + lhSize;
         }
     case IS_RAW:
         {
-            const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
-            if (litSize > srcSize-11)   /* risk of reading too far with wildcopy */
+            size_t litSize;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            switch(lhSize)
             {
-                if (litSize > srcSize-3) return ERROR(corruption_detected);
-                memcpy(dctx->litBuffer, istart, litSize);
+            case 0: case 1: default:   /* note : default is impossible, since lhSize is within [0..3] */
+                lhSize=1;
+                litSize = istart[0] & 31;
+                break;
+            case 2:
+                litSize = ((istart[0] & 15) << 8) + istart[1];
+                break;
+            case 3:
+                litSize = ((istart[0] & 15) << 16) + (istart[1] << 8) + istart[2];
+                break;
+            }
+
+            if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
+                if (litSize+lhSize > srcSize) return ERROR(corruption_detected);
+                memcpy(dctx->litBuffer, istart+lhSize, litSize);
                 dctx->litPtr = dctx->litBuffer;
                 dctx->litBufSize = BLOCKSIZE+8;
                 dctx->litSize = litSize;
-                return litSize+3;
+                return lhSize+litSize;
             }
             /* direct reference into compressed stream */
-            dctx->litPtr = istart+3;
-            dctx->litBufSize = srcSize-3;
+            dctx->litPtr = istart+lhSize;
+            dctx->litBufSize = srcSize-lhSize;
             dctx->litSize = litSize;
-            return litSize+3;        }
+            return lhSize+litSize;
+        }
     case IS_RLE:
         {
-            const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
+            size_t litSize;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            switch(lhSize)
+            {
+            case 0: case 1: default:   /* note : default is impossible, since lhSize is within [0..3] */
+                lhSize = 1;
+                litSize = istart[0] & 31;
+                break;
+            case 2:
+                litSize = ((istart[0] & 15) << 8) + istart[1];
+                break;
+            case 3:
+                litSize = ((istart[0] & 15) << 16) + (istart[1] << 8) + istart[2];
+                break;
+            }
             if (litSize > BLOCKSIZE) return ERROR(corruption_detected);
-            memset(dctx->litBuffer, istart[3], litSize);
+            memset(dctx->litBuffer, istart[lhSize], litSize);
             dctx->litPtr = dctx->litBuffer;
-            dctx->litBufSize = BLOCKSIZE+8;
+            dctx->litBufSize = BLOCKSIZE+WILDCOPY_OVERLENGTH;
             dctx->litSize = litSize;
-            return 4;
+            return lhSize+1;
         }
     default:
-        return ERROR(corruption_detected);   /* forbidden nominal case */
+        return ERROR(corruption_detected);   /* impossible */
     }
 }
 
@@ -326,21 +487,23 @@ size_t ZSTD_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, size_t* dumpsLen
     size_t dumpsLength;
 
     /* check */
-    if (srcSize < 5) return ERROR(srcSize_wrong);
+    if (srcSize < MIN_SEQUENCES_SIZE)
+        return ERROR(srcSize_wrong);
 
     /* SeqHead */
-    *nbSeq = MEM_readLE16(ip); ip+=2;
+    *nbSeq = *ip++;
+    if (*nbSeq==0) return 1;
+    if (*nbSeq >= 128)
+        *nbSeq = ((nbSeq[0]-128)<<8) + *ip++;
+
     LLtype  = *ip >> 6;
     Offtype = (*ip >> 4) & 3;
     MLtype  = (*ip >> 2) & 3;
-    if (*ip & 2)
-    {
+    if (*ip & 2) {
         dumpsLength  = ip[2];
         dumpsLength += ip[1] << 8;
         ip += 3;
-    }
-    else
-    {
+    } else {
         dumpsLength  = ip[1];
         dumpsLength += (ip[0] & 1) << 8;
         ip += 2;
@@ -361,13 +524,18 @@ size_t ZSTD_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, size_t* dumpsLen
         switch(LLtype)
         {
         U32 max;
-        case bt_rle :
+        case FSE_ENCODING_RLE :
             LLlog = 0;
-            FSE_buildDTable_rle(DTableLL, *ip++); break;
-        case bt_raw :
+            FSE_buildDTable_rle(DTableLL, *ip++);
+            break;
+        case FSE_ENCODING_RAW :
             LLlog = LLbits;
-            FSE_buildDTable_raw(DTableLL, LLbits); break;
-        default :
+            FSE_buildDTable_raw(DTableLL, LLbits);
+            break;
+        case FSE_ENCODING_STATIC:
+            break;
+        case FSE_ENCODING_DYNAMIC :
+        default :   /* impossible */
             max = MaxLL;
             headerSize = FSE_readNCount(norm, &max, &LLlog, ip, iend-ip);
             if (FSE_isError(headerSize)) return ERROR(GENERIC);
@@ -379,15 +547,19 @@ size_t ZSTD_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, size_t* dumpsLen
         switch(Offtype)
         {
         U32 max;
-        case bt_rle :
+        case FSE_ENCODING_RLE :
             Offlog = 0;
             if (ip > iend-2) return ERROR(srcSize_wrong);   /* min : "raw", hence no header, but at least xxLog bits */
             FSE_buildDTable_rle(DTableOffb, *ip++ & MaxOff); /* if *ip > MaxOff, data is corrupted */
             break;
-        case bt_raw :
+        case FSE_ENCODING_RAW :
             Offlog = Offbits;
-            FSE_buildDTable_raw(DTableOffb, Offbits); break;
-        default :
+            FSE_buildDTable_raw(DTableOffb, Offbits);
+            break;
+        case FSE_ENCODING_STATIC:
+            break;
+        case FSE_ENCODING_DYNAMIC :
+        default :   /* impossible */
             max = MaxOff;
             headerSize = FSE_readNCount(norm, &max, &Offlog, ip, iend-ip);
             if (FSE_isError(headerSize)) return ERROR(GENERIC);
@@ -399,22 +571,26 @@ size_t ZSTD_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, size_t* dumpsLen
         switch(MLtype)
         {
         U32 max;
-        case bt_rle :
+        case FSE_ENCODING_RLE :
             MLlog = 0;
             if (ip > iend-2) return ERROR(srcSize_wrong); /* min : "raw", hence no header, but at least xxLog bits */
-            FSE_buildDTable_rle(DTableML, *ip++); break;
-        case bt_raw :
+            FSE_buildDTable_rle(DTableML, *ip++);
+            break;
+        case FSE_ENCODING_RAW :
             MLlog = MLbits;
-            FSE_buildDTable_raw(DTableML, MLbits); break;
-        default :
+            FSE_buildDTable_raw(DTableML, MLbits);
+            break;
+        case FSE_ENCODING_STATIC:
+            break;
+        case FSE_ENCODING_DYNAMIC :
+        default :   /* impossible */
             max = MaxML;
             headerSize = FSE_readNCount(norm, &max, &MLlog, ip, iend-ip);
             if (FSE_isError(headerSize)) return ERROR(GENERIC);
             if (MLlog > MLFSELog) return ERROR(corruption_detected);
             ip += headerSize;
             FSE_buildDTable(DTableML, norm, max, MLlog);
-        }
-    }
+    }   }
 
     return ip-istart;
 }
@@ -422,8 +598,8 @@ size_t ZSTD_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, size_t* dumpsLen
 
 typedef struct {
     size_t litLength;
-    size_t offset;
     size_t matchLength;
+    size_t offset;
 } seq_t;
 
 typedef struct {
@@ -437,6 +613,7 @@ typedef struct {
 } seqState_t;
 
 
+
 static void ZSTD_decodeSequence(seq_t* seq, seqState_t* seqState)
 {
     size_t litLength;
@@ -447,16 +624,15 @@ static void ZSTD_decodeSequence(seq_t* seq, seqState_t* seqState)
     const BYTE* const de = seqState->dumpsEnd;
 
     /* Literal length */
-    litLength = FSE_decodeSymbol(&(seqState->stateLL), &(seqState->DStream));
+    litLength = FSE_peakSymbol(&(seqState->stateLL));
     prevOffset = litLength ? seq->offset : seqState->prevOffset;
-    if (litLength == MaxLL)
-    {
+    if (litLength == MaxLL) {
         U32 add = *dumps++;
         if (add < 255) litLength += add;
-        else
-        {
-            litLength = MEM_readLE32(dumps) & 0xFFFFFF;  /* no pb : dumps is always followed by seq tables > 1 byte */
-            dumps += 3;
+        else {
+            litLength = MEM_readLE32(dumps) & 0xFFFFFF;  /* no risk : dumps is always followed by seq tables > 1 byte */
+            if (litLength&1) litLength>>=1, dumps += 3;
+            else litLength = (U16)(litLength)>>1, dumps += 2;
         }
         if (dumps >= de) dumps = de-1;   /* late correction, to avoid read overflow (data is now corrupted anyway) */
     }
@@ -467,27 +643,29 @@ static void ZSTD_decodeSequence(seq_t* seq, seqState_t* seqState)
                 1 /*fake*/, 1, 2, 4, 8, 16, 32, 64, 128, 256,
                 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144,
                 524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, /*fake*/ 1, 1, 1, 1, 1 };
-        U32 offsetCode, nbBits;
-        offsetCode = FSE_decodeSymbol(&(seqState->stateOffb), &(seqState->DStream));   /* <= maxOff, by table construction */
-        if (MEM_32bits()) BIT_reloadDStream(&(seqState->DStream));
-        nbBits = offsetCode - 1;
+        U32 offsetCode = FSE_peakSymbol(&(seqState->stateOffb));   /* <= maxOff, by table construction */
+        U32 nbBits = offsetCode - 1;
         if (offsetCode==0) nbBits = 0;   /* cmove */
         offset = offsetPrefix[offsetCode] + BIT_readBits(&(seqState->DStream), nbBits);
         if (MEM_32bits()) BIT_reloadDStream(&(seqState->DStream));
-        if (offsetCode==0) offset = prevOffset;   /* cmove */
+        if (offsetCode==0) offset = prevOffset;   /* repcode, cmove */
         if (offsetCode | !litLength) seqState->prevOffset = seq->offset;   /* cmove */
+        FSE_decodeSymbol(&(seqState->stateOffb), &(seqState->DStream));    /* update */
     }
 
+    /* Literal length update */
+    FSE_decodeSymbol(&(seqState->stateLL), &(seqState->DStream));   /* update */
+    if (MEM_32bits()) BIT_reloadDStream(&(seqState->DStream));
+
     /* MatchLength */
     matchLength = FSE_decodeSymbol(&(seqState->stateML), &(seqState->DStream));
-    if (matchLength == MaxML)
-    {
+    if (matchLength == MaxML) {
         U32 add = *dumps++;
         if (add < 255) matchLength += add;
-        else
-        {
+        else {
             matchLength = MEM_readLE32(dumps) & 0xFFFFFF;  /* no pb : dumps is always followed by seq tables > 1 byte */
-            dumps += 3;
+            if (matchLength&1) matchLength>>=1, dumps += 3;
+            else matchLength = (U16)(matchLength)>>1, dumps += 2;
         }
         if (dumps >= de) dumps = de-1;   /* late correction, to avoid read overflow (data is now corrupted anyway) */
     }
@@ -498,6 +676,15 @@ static void ZSTD_decodeSequence(seq_t* seq, seqState_t* seqState)
     seq->offset = offset;
     seq->matchLength = matchLength;
     seqState->dumps = dumps;
+
+#if 0   /* debug */
+    {
+        static U64 totalDecoded = 0;
+        printf("pos %6u : %3u literals & match %3u bytes at distance %6u \n",
+           (U32)(totalDecoded), (U32)litLength, (U32)matchLength, (U32)offset);
+        totalDecoded += litLength + matchLength;
+    }
+#endif
 }
 
 
@@ -526,14 +713,12 @@ FORCE_INLINE size_t ZSTD_execSequence(BYTE* op,
     *litPtr = litEnd;   /* update for next sequence */
 
     /* copy Match */
-    if (sequence.offset > (size_t)(oLitEnd - base))
-    {
+    if (sequence.offset > (size_t)(oLitEnd - base)) {
         /* offset beyond prefix */
         if (sequence.offset > (size_t)(oLitEnd - vBase))
             return ERROR(corruption_detected);
         match = dictEnd - (base-match);
-        if (match + sequence.matchLength <= dictEnd)
-        {
+        if (match + sequence.matchLength <= dictEnd) {
             memmove(oLitEnd, match, sequence.matchLength);
             return sequenceLength;
         }
@@ -544,12 +729,10 @@ FORCE_INLINE size_t ZSTD_execSequence(BYTE* op,
             op = oLitEnd + length1;
             sequence.matchLength -= length1;
             match = base;
-        }
-    }
+    }   }
 
     /* match within prefix */
-    if (sequence.offset < 8)
-    {
+    if (sequence.offset < 8) {
         /* close range match, overlap */
         const int sub2 = dec64table[sequence.offset];
         op[0] = match[0];
@@ -559,25 +742,20 @@ FORCE_INLINE size_t ZSTD_execSequence(BYTE* op,
         match += dec32table[sequence.offset];
         ZSTD_copy4(op+4, match);
         match -= sub2;
-    }
-    else
-    {
+    } else {
         ZSTD_copy8(op, match);
     }
     op += 8; match += 8;
 
-    if (oMatchEnd > oend-12)
-    {
-        if (op < oend_8)
-        {
+    if (oMatchEnd > oend-12) {
+        if (op < oend_8) {
             ZSTD_wildcopy(op, match, oend_8 - op);
             match += oend_8 - op;
             op = oend_8;
         }
-        while (op < oMatchEnd) *op++ = *match++;
-    }
-    else
-    {
+        while (op < oMatchEnd)
+            *op++ = *match++;
+    } else {
         ZSTD_wildcopy(op, match, sequence.matchLength-8);   /* works even if matchLength < 8 */
     }
     return sequenceLength;
@@ -610,28 +788,27 @@ static size_t ZSTD_decompressSequences(
     /* Build Decoding Tables */
     errorCode = ZSTD_decodeSeqHeaders(&nbSeq, &dumps, &dumpsLength,
                                       DTableLL, DTableML, DTableOffb,
-                                      ip, iend-ip);
+                                      ip, seqSize);
     if (ZSTD_isError(errorCode)) return errorCode;
     ip += errorCode;
 
     /* Regen sequences */
-    {
+    if (nbSeq) {
         seq_t sequence;
         seqState_t seqState;
 
         memset(&sequence, 0, sizeof(sequence));
-        sequence.offset = 4;
+        sequence.offset = REPCODE_STARTVALUE;
         seqState.dumps = dumps;
         seqState.dumpsEnd = dumps + dumpsLength;
-        seqState.prevOffset = 4;
+        seqState.prevOffset = REPCODE_STARTVALUE;
         errorCode = BIT_initDStream(&(seqState.DStream), ip, iend-ip);
         if (ERR_isError(errorCode)) return ERROR(corruption_detected);
         FSE_initDState(&(seqState.stateLL), &(seqState.DStream), DTableLL);
         FSE_initDState(&(seqState.stateOffb), &(seqState.DStream), DTableOffb);
         FSE_initDState(&(seqState.stateML), &(seqState.DStream), DTableML);
 
-        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq ; )
-        {
+        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq ; ) {
             size_t oneSeqSize;
             nbSeq--;
             ZSTD_decodeSequence(&sequence, &seqState);
@@ -641,16 +818,16 @@ static size_t ZSTD_decompressSequences(
         }
 
         /* check if reached exact end */
-        if ( !BIT_endOfDStream(&(seqState.DStream)) ) return ERROR(corruption_detected);   /* DStream should be entirely and exactly consumed; otherwise data is corrupted */
+        if (nbSeq) return ERROR(corruption_detected);
+    }
 
-        /* last literal segment */
-        {
-            size_t lastLLSize = litEnd - litPtr;
-            if (litPtr > litEnd) return ERROR(corruption_detected);
-            if (op+lastLLSize > oend) return ERROR(dstSize_tooSmall);
-            if (op != litPtr) memcpy(op, litPtr, lastLLSize);
-            op += lastLLSize;
-        }
+    /* last literal segment */
+    {
+        size_t lastLLSize = litEnd - litPtr;
+        if (litPtr > litEnd) return ERROR(corruption_detected);   /* too many literals already used */
+        if (op+lastLLSize > oend) return ERROR(dstSize_tooSmall);
+        memcpy(op, litPtr, lastLLSize);
+        op += lastLLSize;
     }
 
     return op-ostart;
@@ -659,8 +836,7 @@ static size_t ZSTD_decompressSequences(
 
 static void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst)
 {
-    if (dst != dctx->previousDstEnd)   /* not contiguous */
-    {
+    if (dst != dctx->previousDstEnd) {   /* not contiguous */
         dctx->dictEnd = dctx->previousDstEnd;
         dctx->vBase = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->base));
         dctx->base = dst;
@@ -670,35 +846,38 @@ static void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst)
 
 
 static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
-                            void* dst, size_t maxDstSize,
+                            void* dst, size_t dstCapacity,
                       const void* src, size_t srcSize)
-{
-    /* blockType == blockCompressed */
+{   /* blockType == blockCompressed */
     const BYTE* ip = (const BYTE*)src;
+    size_t litCSize;
+
+    if (srcSize >= BLOCKSIZE) return ERROR(srcSize_wrong);
 
     /* Decode literals sub-block */
-    size_t litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
+    litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
     if (ZSTD_isError(litCSize)) return litCSize;
     ip += litCSize;
     srcSize -= litCSize;
 
-    return ZSTD_decompressSequences(dctx, dst, maxDstSize, ip, srcSize);
+    return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize);
 }
 
 
 size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
-                            void* dst, size_t maxDstSize,
+                            void* dst, size_t dstCapacity,
                       const void* src, size_t srcSize)
 {
     ZSTD_checkContinuity(dctx, dst);
-    return ZSTD_decompressBlock_internal(dctx, dst, maxDstSize, src, srcSize);
+    return ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize);
 }
 
 
-size_t ZSTD_decompress_usingDict(ZSTD_DCtx* ctx,
+/*! ZSTD_decompress_continueDCtx
+*   dctx must have been properly initialized */
+static size_t ZSTD_decompress_continueDCtx(ZSTD_DCtx* dctx,
                                  void* dst, size_t maxDstSize,
-                                 const void* src, size_t srcSize,
-                                 const void* dict, size_t dictSize)
+                                 const void* src, size_t srcSize)
 {
     const BYTE* ip = (const BYTE*)src;
     const BYTE* iend = ip + srcSize;
@@ -708,20 +887,6 @@ size_t ZSTD_decompress_usingDict(ZSTD_DCtx* ctx,
     size_t remainingSize = srcSize;
     blockProperties_t blockProperties;
 
-    /* init */
-    ZSTD_resetDCtx(ctx);
-    if (dict)
-    {
-        ZSTD_decompress_insertDictionary(ctx, dict, dictSize);
-        ctx->dictEnd = ctx->previousDstEnd;
-        ctx->vBase = (const char*)dst - ((const char*)(ctx->previousDstEnd) - (const char*)(ctx->base));
-        ctx->base = dst;
-    }
-    else
-    {
-        ctx->vBase = ctx->base = ctx->dictEnd = dst;
-    }
-
     /* Frame Header */
     {
         size_t frameHeaderSize;
@@ -733,11 +898,11 @@ size_t ZSTD_decompress_usingDict(ZSTD_DCtx* ctx,
                 return ZSTD_decompressLegacy(dst, maxDstSize, src, srcSize, magicNumber);
         }
 #endif
-        frameHeaderSize = ZSTD_decodeFrameHeader_Part1(ctx, src, ZSTD_frameHeaderSize_min);
+        frameHeaderSize = ZSTD_decodeFrameHeader_Part1(dctx, src, ZSTD_frameHeaderSize_min);
         if (ZSTD_isError(frameHeaderSize)) return frameHeaderSize;
         if (srcSize < frameHeaderSize+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
         ip += frameHeaderSize; remainingSize -= frameHeaderSize;
-        frameHeaderSize = ZSTD_decodeFrameHeader_Part2(ctx, src, frameHeaderSize);
+        frameHeaderSize = ZSTD_decodeFrameHeader_Part2(dctx, src, frameHeaderSize);
         if (ZSTD_isError(frameHeaderSize)) return frameHeaderSize;
     }
 
@@ -755,7 +920,7 @@ size_t ZSTD_decompress_usingDict(ZSTD_DCtx* ctx,
         switch(blockProperties.blockType)
         {
         case bt_compressed:
-            decodedSize = ZSTD_decompressBlock_internal(ctx, op, oend-op, ip, cBlockSize);
+            decodedSize = ZSTD_decompressBlock_internal(dctx, op, oend-op, ip, cBlockSize);
             break;
         case bt_raw :
             decodedSize = ZSTD_copyRawBlock(op, oend-op, ip, cBlockSize);
@@ -782,6 +947,27 @@ size_t ZSTD_decompress_usingDict(ZSTD_DCtx* ctx,
 }
 
 
+size_t ZSTD_decompress_usingPreparedDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* refDCtx,
+                                         void* dst, size_t maxDstSize,
+                                   const void* src, size_t srcSize)
+{   /* one-shot decompression of src into dst, starting from the state held by refDCtx */
+    ZSTD_copyDCtx(dctx, refDCtx);   /* NOTE(review): presumably clones the prepared state (dictionary, tables) into dctx; refDCtx is only read (const) - confirm in ZSTD_copyDCtx */
+    ZSTD_checkContinuity(dctx, dst);   /* re-bases the window when dst does not directly follow dctx->previousDstEnd */
+    return ZSTD_decompress_continueDCtx(dctx, dst, maxDstSize, src, srcSize);   /* result (or error code) is forwarded unchanged */
+}
+
+
+size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
+                                 void* dst, size_t maxDstSize,
+                                 const void* src, size_t srcSize, const void* dict, size_t dictSize)
+{   /* one-shot decompression of src into dst; dict==NULL or dictSize==0 means "no dictionary" */
+    size_t const initResult = ZSTD_decompressBegin_usingDict(dctx, dict, dictSize);
+    if (ZSTD_isError(initResult)) return initResult;   /* e.g. dictionary_corrupted : never decode with a rejected dictionary */
+    ZSTD_checkContinuity(dctx, dst);   /* re-bases the window when dst does not directly follow dctx->previousDstEnd */
+    return ZSTD_decompress_continueDCtx(dctx, dst, maxDstSize, src, srcSize);
+}
+
+
 size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
 {
     return ZSTD_decompress_usingDict(dctx, dst, maxDstSize, src, srcSize, NULL, 0);
@@ -811,39 +997,38 @@ size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx)
     return dctx->expected;
 }
 
-size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
 {
     /* Sanity check */
-    if (srcSize != ctx->expected) return ERROR(srcSize_wrong);
-    ZSTD_checkContinuity(ctx, dst);
+    if (srcSize != dctx->expected) return ERROR(srcSize_wrong);
+    ZSTD_checkContinuity(dctx, dst);
 
     /* Decompress : frame header; part 1 */
-    switch (ctx->stage)
+    switch (dctx->stage)
     {
     case ZSTDds_getFrameHeaderSize :
         {
             /* get frame header size */
             if (srcSize != ZSTD_frameHeaderSize_min) return ERROR(srcSize_wrong);   /* impossible */
-            ctx->headerSize = ZSTD_decodeFrameHeader_Part1(ctx, src, ZSTD_frameHeaderSize_min);
-            if (ZSTD_isError(ctx->headerSize)) return ctx->headerSize;
-            memcpy(ctx->headerBuffer, src, ZSTD_frameHeaderSize_min);
-            if (ctx->headerSize > ZSTD_frameHeaderSize_min)
-            {
-                ctx->expected = ctx->headerSize - ZSTD_frameHeaderSize_min;
-                ctx->stage = ZSTDds_decodeFrameHeader;
+            dctx->headerSize = ZSTD_decodeFrameHeader_Part1(dctx, src, ZSTD_frameHeaderSize_min);
+            if (ZSTD_isError(dctx->headerSize)) return dctx->headerSize;
+            memcpy(dctx->headerBuffer, src, ZSTD_frameHeaderSize_min);
+            if (dctx->headerSize > ZSTD_frameHeaderSize_min) {
+                dctx->expected = dctx->headerSize - ZSTD_frameHeaderSize_min;
+                dctx->stage = ZSTDds_decodeFrameHeader;
                 return 0;
             }
-            ctx->expected = 0;   /* not necessary to copy more */
+            dctx->expected = 0;   /* not necessary to copy more */
         }
     case ZSTDds_decodeFrameHeader:
         {
             /* get frame header */
             size_t result;
-            memcpy(ctx->headerBuffer + ZSTD_frameHeaderSize_min, src, ctx->expected);
-            result = ZSTD_decodeFrameHeader_Part2(ctx, ctx->headerBuffer, ctx->headerSize);
+            memcpy(dctx->headerBuffer + ZSTD_frameHeaderSize_min, src, dctx->expected);
+            result = ZSTD_decodeFrameHeader_Part2(dctx, dctx->headerBuffer, dctx->headerSize);
             if (ZSTD_isError(result)) return result;
-            ctx->expected = ZSTD_blockHeaderSize;
-            ctx->stage = ZSTDds_decodeBlockHeader;
+            dctx->expected = ZSTD_blockHeaderSize;
+            dctx->stage = ZSTDds_decodeBlockHeader;
             return 0;
         }
     case ZSTDds_decodeBlockHeader:
@@ -852,16 +1037,14 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSize, con
             blockProperties_t bp;
             size_t blockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp);
             if (ZSTD_isError(blockSize)) return blockSize;
-            if (bp.blockType == bt_end)
-            {
-                ctx->expected = 0;
-                ctx->stage = ZSTDds_getFrameHeaderSize;
+            if (bp.blockType == bt_end) {
+                dctx->expected = 0;
+                dctx->stage = ZSTDds_getFrameHeaderSize;
             }
-            else
-            {
-                ctx->expected = blockSize;
-                ctx->bType = bp.blockType;
-                ctx->stage = ZSTDds_decompressBlock;
+            else {
+                dctx->expected = blockSize;
+                dctx->bType = bp.blockType;
+                dctx->stage = ZSTDds_decompressBlock;
             }
             return 0;
         }
@@ -869,10 +1052,10 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSize, con
         {
             /* Decompress : block content */
             size_t rSize;
-            switch(ctx->bType)
+            switch(dctx->bType)
             {
             case bt_compressed:
-                rSize = ZSTD_decompressBlock_internal(ctx, dst, maxDstSize, src, srcSize);
+                rSize = ZSTD_decompressBlock_internal(dctx, dst, maxDstSize, src, srcSize);
                 break;
             case bt_raw :
                 rSize = ZSTD_copyRawBlock(dst, maxDstSize, src, srcSize);
@@ -884,11 +1067,11 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSize, con
                 rSize = 0;
                 break;
             default:
-                return ERROR(GENERIC);
+                return ERROR(GENERIC);   /* impossible */
             }
-            ctx->stage = ZSTDds_decodeBlockHeader;
-            ctx->expected = ZSTD_blockHeaderSize;
-            ctx->previousDstEnd = (char*)dst + rSize;
+            dctx->stage = ZSTDds_decodeBlockHeader;
+            dctx->expected = ZSTD_blockHeaderSize;
+            dctx->previousDstEnd = (char*)dst + rSize;
             return rSize;
         }
     default:
@@ -897,10 +1080,87 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSize, con
 }
 
 
-void ZSTD_decompress_insertDictionary(ZSTD_DCtx* ctx, const void* dict, size_t dictSize)
+static void ZSTD_refDictContent(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
 {
-    ctx->dictEnd = ctx->previousDstEnd;
-    ctx->vBase = (const char*)dict - ((const char*)(ctx->previousDstEnd) - (const char*)(ctx->base));
-    ctx->base = dict;
-    ctx->previousDstEnd = (const char*)dict + dictSize;
+    dctx->dictEnd = dctx->previousDstEnd;
+    dctx->vBase = (const char*)dict - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->base));
+    dctx->base = dict;
+    dctx->previousDstEnd = (const char*)dict + dictSize;
 }
+
+static size_t ZSTD_loadEntropy(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{   /* reads, in this order : Huffman table, then offset-code / matchLength / litLength FSE tables, into dctx */
+    size_t hSize, offcodeHeaderSize, matchlengthHeaderSize, errorCode, litlengthHeaderSize;
+    short offcodeNCount[MaxOff+1];
+    U32 offcodeMaxValue=MaxOff, offcodeLog=OffFSELog;
+    short matchlengthNCount[MaxML+1];
+    unsigned matchlengthMaxValue = MaxML, matchlengthLog = MLFSELog;
+    short litlengthNCount[MaxLL+1];
+    unsigned litlengthMaxValue = MaxLL, litlengthLog = LLFSELog;
+    /* @return : nb of bytes consumed from dict, or dictionary_corrupted on any failure */
+    hSize = HUF_readDTableX4(dctx->hufTableX4, dict, dictSize);
+    if (HUF_isError(hSize)) return ERROR(dictionary_corrupted);
+    dict = (const char*)dict + hSize;
+    dictSize -= hSize;
+    /* each tableLog is bounded by its xxFSELog before building : same limit ZSTD_decodeSeqHeaders() enforces on block headers */
+    offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dict, dictSize);
+    if (FSE_isError(offcodeHeaderSize) || (offcodeLog > OffFSELog)) return ERROR(dictionary_corrupted);
+    errorCode = FSE_buildDTable(dctx->OffTable, offcodeNCount, offcodeMaxValue, offcodeLog);
+    if (FSE_isError(errorCode)) return ERROR(dictionary_corrupted);
+    dict = (const char*)dict + offcodeHeaderSize;
+    dictSize -= offcodeHeaderSize;
+
+    matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dict, dictSize);
+    if (FSE_isError(matchlengthHeaderSize) || (matchlengthLog > MLFSELog)) return ERROR(dictionary_corrupted);
+    errorCode = FSE_buildDTable(dctx->MLTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog);
+    if (FSE_isError(errorCode)) return ERROR(dictionary_corrupted);
+    dict = (const char*)dict + matchlengthHeaderSize;
+    dictSize -= matchlengthHeaderSize;
+
+    litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dict, dictSize);
+    if (FSE_isError(litlengthHeaderSize) || (litlengthLog > LLFSELog)) return ERROR(dictionary_corrupted);
+    errorCode = FSE_buildDTable(dctx->LLTable, litlengthNCount, litlengthMaxValue, litlengthLog);
+    if (FSE_isError(errorCode)) return ERROR(dictionary_corrupted);
+
+    dctx->flagStaticTables = 1;   /* set only once all four tables have been loaded successfully */
+    return hSize + offcodeHeaderSize + matchlengthHeaderSize + litlengthHeaderSize;
+}
+
+static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{   /* @return : 0, or dictionary_corrupted when the entropy tables cannot be loaded */
+    size_t eSize;
+    U32 const magic = (dictSize >= 4) ? MEM_readLE32(dict) : 0;   /* a dictionary shorter than the magic number is never read past its end; 0 != ZSTD_DICT_MAGIC */
+    if (magic != ZSTD_DICT_MAGIC) {
+        /* pure content mode */
+        ZSTD_refDictContent(dctx, dict, dictSize);
+        return 0;
+    }
+    /* load entropy tables */
+    dict = (const char*)dict + 4;
+    dictSize -= 4;
+    eSize = ZSTD_loadEntropy(dctx, dict, dictSize);
+    if (ZSTD_isError(eSize)) return ERROR(dictionary_corrupted);
+
+    /* reference dictionary content */
+    dict = (const char*)dict + eSize;
+    dictSize -= eSize;
+    ZSTD_refDictContent(dctx, dict, dictSize);
+
+    return 0;
+}
+
+
+size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{   /* calls ZSTD_decompressBegin() on dctx, then inserts the optional dictionary */
+    size_t errorCode;
+    errorCode = ZSTD_decompressBegin(dctx);
+    if (ZSTD_isError(errorCode)) return errorCode;
+    /* dict==NULL or dictSize==0 : nothing to insert */
+    if (dict && dictSize) {
+        errorCode = ZSTD_decompress_insertDictionary(dctx, dict, dictSize);
+        if (ZSTD_isError(errorCode)) return ERROR(dictionary_corrupted);   /* any insertion failure is reported as dictionary_corrupted */
+    }
+    /* @return : 0 on success, otherwise an error code testable with ZSTD_isError() */
+    return 0;
+}
+
diff --git a/lib/zstd_internal.h b/lib/zstd_internal.h
index cae2cb8..26fc857 100644
--- a/lib/zstd_internal.h
+++ b/lib/zstd_internal.h
@@ -1,7 +1,7 @@
 /*
     zstd_internal - common functions to include
     Header File for include
-    Copyright (C) 2014-2015, Yann Collet.
+    Copyright (C) 2014-2016, Yann Collet.
 
     BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
@@ -28,33 +28,29 @@
 
     You can contact the author at :
     - zstd source repository : https://github.com/Cyan4973/zstd
-    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
 */
 #ifndef ZSTD_CCOMMON_H_MODULE
 #define ZSTD_CCOMMON_H_MODULE
 
-#if defined (__cplusplus)
-extern "C" {
-#endif
-
-/* *************************************
-*  Includes
+/*-*************************************
+*  Dependencies
 ***************************************/
 #include "mem.h"
 #include "error_private.h"
+#include "zstd_static.h"
 
 
-/* *************************************
+/*-*************************************
 *  Common macros
 ***************************************/
 #define MIN(a,b) ((a)<(b) ? (a) : (b))
 #define MAX(a,b) ((a)>(b) ? (a) : (b))
 
 
-/* *************************************
+/*-*************************************
 *  Common constants
 ***************************************/
-#define ZSTD_MAGICNUMBER 0xFD2FB524   /* v0.4 */
+#define ZSTD_DICT_MAGIC  0xEC30A435
 
 #define KB *(1 <<10)
 #define MB *(1 <<20)
@@ -73,15 +69,19 @@ static const size_t ZSTD_frameHeaderSize_min = 5;
 #define BIT1   2
 #define BIT0   1
 
-#define IS_RAW BIT0
-#define IS_RLE BIT1
+#define IS_HUF 0
+#define IS_PCH 1
+#define IS_RAW 2
+#define IS_RLE 3
 
 #define MINMATCH 4
-#define REPCODE_STARTVALUE 4
+#define REPCODE_STARTVALUE 1
 
+#define Litbits  8
 #define MLbits   7
 #define LLbits   6
 #define Offbits  5
+#define MaxLit ((1<<Litbits) - 1)
 #define MaxML  ((1<<MLbits) - 1)
 #define MaxLL  ((1<<LLbits) - 1)
 #define MaxOff ((1<<Offbits)- 1)
@@ -90,21 +90,32 @@ static const size_t ZSTD_frameHeaderSize_min = 5;
 #define OffFSELog   9
 #define MaxSeq MAX(MaxLL, MaxML)
 
-#define MIN_SEQUENCES_SIZE (2 /*seqNb*/ + 2 /*dumps*/ + 3 /*seqTables*/ + 1 /*bitStream*/)
-#define MIN_CBLOCK_SIZE (3 /*litCSize*/ + MIN_SEQUENCES_SIZE)
+#define FSE_ENCODING_RAW     0
+#define FSE_ENCODING_RLE     1
+#define FSE_ENCODING_STATIC  2
+#define FSE_ENCODING_DYNAMIC 3
+
+
+#define HufLog 12
+
+#define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */)   /* for a non-null block */
+
+#define WILDCOPY_OVERLENGTH 8
 
 typedef enum { bt_compressed, bt_raw, bt_rle, bt_end } blockType_t;
 
 
-/* ******************************************
+/*-*******************************************
 *  Shared functions to include for inlining
-********************************************/
+*********************************************/
 static void ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
 
 #define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
 
-/*! ZSTD_wildcopy : custom version of memcpy(), can copy up to 7-8 bytes too many */
-static void ZSTD_wildcopy(void* dst, const void* src, size_t length)
+/*! ZSTD_wildcopy() :
+*   custom version of memcpy(), can copy up to 7 bytes too many (8 bytes if length==0) */
+MEM_STATIC void ZSTD_wildcopy(void* dst, const void* src, size_t length)
 {
     const BYTE* ip = (const BYTE*)src;
     BYTE* op = (BYTE*)dst;
@@ -114,9 +125,58 @@ static void ZSTD_wildcopy(void* dst, const void* src, size_t length)
     while (op < oend);
 }
 
-
-#if defined (__cplusplus)
+MEM_STATIC unsigned ZSTD_highbit(U32 val)
+{   /* @return : index (0-31) of the highest set bit of val. val must be != 0 : the three variants below disagree on 0 */
+#   if defined(_MSC_VER)   /* Visual */
+    unsigned long r=0;
+    _BitScanReverse(&r, val);
+    return (unsigned)r;
+#   elif defined(__GNUC__) && (__GNUC__ >= 3)   /* GCC Intrinsic */
+    return 31 - __builtin_clz(val);   /* __builtin_clz(0) is undefined */
+#   else   /* Software version */
+    static const int DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+    U32 v = val;
+    int r;
+    v |= v >> 1;   /* the five OR-shifts set every bit below the highest set bit */
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    r = DeBruijnClz[(U32)(v * 0x07C4ACDDU) >> 27];   /* top 5 bits of the product index the lookup table */
+    return r;
+#   endif
 }
-#endif
+
+
+/*-*******************************************
+*  Private interfaces
+*********************************************/
+typedef struct {   /* NOTE(review): pointer members come as <name>Start / <name> pairs - presumably start and current position of each stream; confirm in zstd_compress.c */
+    void* buffer;
+    U32*  offsetStart;
+    U32*  offset;
+    BYTE* offCodeStart;
+    BYTE* offCode;
+    BYTE* litStart;
+    BYTE* lit;
+    BYTE* litLengthStart;
+    BYTE* litLength;
+    BYTE* matchLengthStart;
+    BYTE* matchLength;
+    BYTE* dumpsStart;
+    BYTE* dumps;
+    /* opt : four frequency tables, each followed below by a matching xxSum (NOTE(review): presumably statistics for the optimal parser - confirm in zstd_opt.h) */
+    U32* matchLengthFreq;
+    U32* litLengthFreq;
+    U32* litFreq;
+    U32* offCodeFreq;
+    U32  matchLengthSum;
+    U32  litLengthSum;
+    U32  litSum;
+    U32  offCodeSum;
+} seqStore_t;
+
+seqStore_t ZSTD_copySeqStore(const ZSTD_CCtx* ctx);
+
 
 #endif   /* ZSTD_CCOMMON_H_MODULE */
diff --git a/lib/zstd_opt.h b/lib/zstd_opt.h
new file mode 100644
index 0000000..ec9a2a1
--- /dev/null
+++ b/lib/zstd_opt.h
@@ -0,0 +1,1125 @@
+/*
+    ZSTD Optimal mode
+    Copyright (C) 2016, Przemyslaw Skibinski, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+       - Zstd source repository : https://www.zstd.net
+*/
+
+/* Note : this file is intended to be included within zstd_compress.c */
+
+/*-  Dependencies  -*/
+#include <stdio.h>  /* for debug */
+
+
+/*-  Local types  -*/
+typedef struct {   /* one candidate produced by the match finders of this file */
+    U32 off;    /* distance back from the searched position to the match (current - matchIndex) */
+    U32 len;    /* match length in bytes; includes `back` when the finder extended the match backwards */
+    U32 back;   /* nb of bytes the match was extended before the searched position (the binary-tree finder always stores 0) */
+} ZSTD_match_t;
+
+typedef struct {   /* parser state for one position of the opt[] table */
+    U32 price;    /* lowest cost found so far to reach this position (SET_PRICE marks untouched slots with 1<<30) */
+    U32 off;      /* offset of the step ending here; 0 for a literal step or a repcode match */
+    U32 mlen;     /* length of the step ending here; 1 denotes a literal step */
+    U32 litlen;   /* literal count stored with the step (running literal count for literal steps) */
+    U32 rep;      /* most recent repeat offset in effect at this position */
+    U32 rep2;     /* second repeat offset in effect at this position */
+} ZSTD_optimal_t;
+
+
+/*-  Constants  -*/
+#define ZSTD_OPT_NUM   (1<<12)   /* sizes the parser's opt[] and matches[] arrays; match lengths are clamped to it when prices are set */
+#define ZSTD_FREQ_THRESHOLD (256)   /* litSum must exceed this before the statistics-based price functions are used */
+
+/*-  Debug  -*/
+#define ZSTD_OPT_DEBUG 0     /* 1 = tableID=0;    5 = check encoded sequences */
+
+#if defined(ZSTD_OPT_DEBUG) && ZSTD_OPT_DEBUG>=1
+    #define ZSTD_LOG_PARSER(...) printf(__VA_ARGS__)   /* debug build : all three log macros forward to printf() */
+    #define ZSTD_LOG_ENCODE(...) printf(__VA_ARGS__)
+    #define ZSTD_LOG_TRY_PRICE(...) printf(__VA_ARGS__)
+#else
+    #define ZSTD_LOG_PARSER(...)   /* normal build : the log macros expand to nothing, so their arguments are never evaluated */
+    #define ZSTD_LOG_ENCODE(...)
+    #define ZSTD_LOG_TRY_PRICE(...)
+#endif
+
+
+FORCE_INLINE U32 ZSTD_getLiteralPriceReal(seqStore_t* seqStorePtr, U32 litLength, const BYTE* literals)
+{   /* statistics-based cost of `litLength` literals read from `literals`, plus the cost of their length symbol */
+    U32 cost, idx, llSymbol;
+
+    if (litLength == 0) return 1;   /* no literals : flat cost, the tables are not consulted */
+
+    /* literal bytes : start from litLength * highbit(litSum), then discount each byte by highbit(its frequency) */
+    cost = litLength * ZSTD_highbit(seqStorePtr->litSum);
+    for (idx = litLength; idx > 0; idx--)
+        cost -= ZSTD_highbit(seqStorePtr->litFreq[literals[idx-1]]);
+
+    /* literal length : surcharges for long runs, then the symbol cost of the length capped at MaxLL */
+    if (litLength >= MaxLL) cost += 8;
+    if (litLength >= 255+MaxLL) cost += 16;
+    if (litLength >= (1<<15)) cost += 8;
+    llSymbol = (litLength >= MaxLL) ? MaxLL : litLength;
+    cost += ZSTD_highbit(seqStorePtr->litLengthSum) - ZSTD_highbit(seqStorePtr->litLengthFreq[llSymbol]);
+    return cost;
+}
+
+
+FORCE_INLINE U32 ZSTD_getLiteralPrice(seqStore_t* seqStorePtr, U32 litLength, const BYTE* literals)
+{   /* cost of a literal run : flat 1 + 8 per literal until litSum exceeds ZSTD_FREQ_THRESHOLD, statistics-based afterwards */
+    int const statsReady = (seqStorePtr->litSum > ZSTD_FREQ_THRESHOLD);
+    if (!statsReady)
+        return (litLength * 8) + 1;   /* backup eval */
+    return ZSTD_getLiteralPriceReal(seqStorePtr, litLength, literals);
+}
+
+
+FORCE_INLINE U32 ZSTD_getMatchPriceReal(seqStore_t* seqStorePtr, U32 offset, U32 matchLength)
+{   /* statistics-based cost of one match : offset code symbol + offCode + match length */
+    U32 const mlSymbol = (matchLength >= MaxML) ? MaxML : matchLength;   /* the length symbol is capped at MaxML */
+    BYTE offCode = 0;   /* offset 0 keeps code 0 */
+    U32 cost;
+
+    /* offset : symbol cost from the offCode statistics, plus offCode itself */
+    if (offset != 0) offCode = (BYTE)(ZSTD_highbit(offset) + 1);
+    cost = ZSTD_highbit(seqStorePtr->offCodeSum) - ZSTD_highbit(seqStorePtr->offCodeFreq[offCode]) + offCode;
+    /* match length : surcharges for long matches (uncapped value), then the symbol cost */
+    cost += ((matchLength >= MaxML) ? 8 : 0) + ((matchLength >= 255+MaxML) ? 16 : 0) + ((matchLength >= (1<<15)) ? 8 : 0);
+    cost += ZSTD_highbit(seqStorePtr->matchLengthSum) - ZSTD_highbit(seqStorePtr->matchLengthFreq[mlSymbol]);
+    return cost;
+}
+
+
+FORCE_INLINE U32 ZSTD_getPrice(seqStore_t* seqStorePtr, U32 litLength, const BYTE* literals, U32 offset, U32 matchLength)
+{   /* cost of one full sequence (literals + match); flat estimate until litSum exceeds ZSTD_FREQ_THRESHOLD */
+    if (seqStorePtr->litSum <= ZSTD_FREQ_THRESHOLD)   /* backup eval */
+        return (litLength * 8) + ZSTD_highbit(matchLength + 1) + Offbits + ZSTD_highbit(offset + 1);
+    return ZSTD_getLiteralPriceReal(seqStorePtr, litLength, literals)
+         + ZSTD_getMatchPriceReal(seqStorePtr, offset, matchLength);
+}
+
+
+MEM_STATIC void ZSTD_updatePrice(seqStore_t* seqStorePtr, U32 litLength, const BYTE* literals, U32 offset, U32 matchLength)
+{   /* Records one sequence (litLength literals read from `literals`, then a match) in the tables read by the price functions. */
+    /* offset 0 maps to code 0 and must never reach ZSTD_highbit() : its __builtin_clz() branch is undefined for 0 */
+    BYTE const offCode = offset ? (BYTE)(ZSTD_highbit(offset) + 1) : 0;
+    U32 u;
+
+    /* literals : one count per literal byte value */
+    seqStorePtr->litSum += litLength;
+    for (u=0; u < litLength; u++)
+        seqStorePtr->litFreq[literals[u]]++;
+
+    /* literal Length : lengths >= MaxLL share the MaxLL slot */
+    seqStorePtr->litLengthSum++;
+    if (litLength >= MaxLL)
+        seqStorePtr->litLengthFreq[MaxLL]++;
+    else
+        seqStorePtr->litLengthFreq[litLength]++;
+
+    /* match offset : offCode was computed once, at the top of the function */
+    seqStorePtr->offCodeSum++;
+    seqStorePtr->offCodeFreq[offCode]++;
+
+    /* match Length : matchLength is used as passed; values >= MaxML share the MaxML slot */
+    seqStorePtr->matchLengthSum++;
+    if (matchLength >= MaxML)
+        seqStorePtr->matchLengthFreq[MaxML]++;
+    else
+        seqStorePtr->matchLengthFreq[matchLength]++;
+}
+
+
+#define SET_PRICE(pos, mlen_, offset_, litlen_, price_)   /* store a step in opt[pos]; uses the caller's locals opt and last_pos; `pos` is expanded several times, unparenthesized */ \
+    {                                                 \
+        while (last_pos < pos)  { opt[last_pos+1].price = 1<<30; last_pos++; } /* raise last_pos up to pos, giving each new slot the placeholder price 1<<30 */ \
+        opt[pos].mlen = mlen_;                         \
+        opt[pos].off = offset_;                        \
+        opt[pos].litlen = litlen_;                     \
+        opt[pos].price = price_;                       \
+        ZSTD_LOG_PARSER("%d: SET price[%d/%d]=%d litlen=%d len=%d off=%d\n", (int)(inr-base), (int)pos, (int)last_pos, opt[pos].price, opt[pos].litlen, opt[pos].mlen, opt[pos].off); /* inr and base are referenced only through this log call */ \
+    }
+
+
+
+/*-*************************************
+*  Binary Tree search
+***************************************/
+static U32 ZSTD_insertBtAndGetAllMatches (
+                        ZSTD_CCtx* zc,   /* hashTable, contentTable and nextToUpdate are modified */
+                        const BYTE* const ip, const BYTE* const iend,   /* position to insert and search / end bound passed to the count helpers */
+                        U32 nbCompares, const U32 mls,   /* max nb of candidates visited / forwarded to ZSTD_hashPtr() */
+                        U32 extDict, ZSTD_match_t* matches, size_t bestLength)   /* extDict!=0 : low candidates are read through dictBase; matches : output array */
+{   /* Inserts `ip` into the binary tree while recording every candidate longer than the previous one; returns the nb of matches stored. */
+    const BYTE* const base = zc->base;
+    const U32 current = (U32)(ip-base);   /* index of ip relative to base */
+    const U32 hashLog = zc->params.hashLog;
+    const size_t h  = ZSTD_hashPtr(ip, hashLog, mls);
+    U32* const hashTable = zc->hashTable;
+    U32 matchIndex  = hashTable[h];   /* first candidate : previous occupant of the hash slot */
+    U32* const bt   = zc->contentTable;   /* tree storage : two U32 links per position, at bt[2*(idx&btMask)] and the slot after it */
+    const U32 btLog = zc->params.contentLog - 1;
+    const U32 btMask= (1U << btLog) - 1;
+    size_t commonLengthSmaller=0, commonLengthLarger=0;
+    const BYTE* const dictBase = zc->dictBase;
+    const U32 dictLimit = zc->dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const U32 btLow = btMask >= current ? 0 : current - btMask;   /* candidates at or below this index end the search */
+    const U32 windowLow = zc->lowLimit;
+    U32* smallerPtr = bt + 2*(current&btMask);   /* link slots of the node being inserted */
+    U32* largerPtr  = bt + 2*(current&btMask) + 1;
+    U32 matchEndIdx = current+8;
+    U32 dummy32;   /* to be nullified at the end */
+    U32 mnum = 0;   /* nb of entries written to matches[] */
+
+    bestLength = MINMATCH-1;   /* NOTE : overwrites the caller-supplied bestLength, so the argument value is never used */
+    hashTable[h] = current;   /* Update Hash Table */
+
+    while (nbCompares-- && (matchIndex > windowLow)) {   /* stop when the budget is spent or the candidate reaches windowLow */
+        U32* nextPtr = bt + 2*(matchIndex & btMask);
+        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+        const BYTE* match;
+
+        if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
+            match = base + matchIndex;
+            if (match[matchLength] == ip[matchLength])
+                matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iend) +1;
+        } else {
+            match = dictBase + matchIndex;
+            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+            if (matchIndex+matchLength >= dictLimit)
+                match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
+        }
+
+        if (matchLength > bestLength) {   /* only strictly longer candidates are recorded, so matches[] is ordered by increasing length */
+            if (matchLength > matchEndIdx - matchIndex) matchEndIdx = matchIndex + (U32)matchLength;
+            bestLength = matchLength;
+            matches[mnum].off = current - matchIndex;   /* distance back from ip */
+            matches[mnum].len = (U32)matchLength;
+            matches[mnum].back = 0;   /* this finder performs no backward extension */
+            mnum++;
+            if (matchLength > ZSTD_OPT_NUM) break;   /* longer than ZSTD_OPT_NUM : stop searching */
+            if (ip+matchLength == iend)   /* equal : no way to know if inf or sup */
+                break;   /* drop, to guarantee consistency (miss a little bit of compression) */
+        }
+
+        if (match[matchLength] < ip[matchLength]) {
+            /* match is smaller than current */
+            *smallerPtr = matchIndex;             /* update smaller idx */
+            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            smallerPtr = nextPtr+1;               /* new "smaller" => larger of match */
+            matchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
+        } else {
+            /* match is larger than current */
+            *largerPtr = matchIndex;
+            commonLengthLarger = matchLength;
+            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            largerPtr = nextPtr;
+            matchIndex = nextPtr[0];
+    }   }
+
+    *smallerPtr = *largerPtr = 0;   /* zero the two pending links (either may point at dummy32) */
+
+    zc->nextToUpdate = (matchEndIdx > current + 8) ? matchEndIdx - 8 : current+1;   /* a long match moves nextToUpdate to 8 bytes before its end, otherwise to the next position */
+    return mnum;
+}
+
+
+/** Calls ZSTD_updateTree(), then ZSTD_insertBtAndGetAllMatches() with extDict=0; returns the nb of matches stored, or 0 without searching when ip precedes base+nextToUpdate */
+static U32 ZSTD_BtGetAllMatches (ZSTD_CCtx* zc,
+                        const BYTE* const ip, const BYTE* const iLimit,
+                        const U32 maxNbAttempts, const U32 mls, ZSTD_match_t* matches, U32 minml)
+{
+    const BYTE* const updateLimit = zc->base + zc->nextToUpdate;
+    if (ip < updateLimit) return 0;   /* skipped area */
+    ZSTD_updateTree(zc, ip, iLimit, maxNbAttempts, mls);
+    return ZSTD_insertBtAndGetAllMatches(zc, ip, iLimit, maxNbAttempts, mls, 0, matches, minml);
+}
+
+
+static U32 ZSTD_BtGetAllMatches_selectMLS (
+                        ZSTD_CCtx* zc,   /* Index table will be updated */
+                        const BYTE* ip, const BYTE* const iLowLimit, const BYTE* const iHighLimit,
+                        const U32 maxNbAttempts, const U32 matchLengthSearch, ZSTD_match_t* matches, U32 minml)
+{
+    /* Maps the run-time matchLengthSearch onto a call that passes mls as a literal constant. */
+    /* 5 and 6 are handled explicitly; every other value is searched with mls = 4. */
+    (void)iLowLimit;  /* unused */
+    if (matchLengthSearch == 5)
+        return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 5, matches, minml);
+    if (matchLengthSearch == 6)
+        return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 6, matches, minml);
+    return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 4, matches, minml);
+}
+
+/** Calls ZSTD_updateTree_extDict(), then ZSTD_insertBtAndGetAllMatches() with extDict=1; returns the nb of matches stored, or 0 without searching when ip precedes base+nextToUpdate */
+static U32 ZSTD_BtGetAllMatches_extDict (ZSTD_CCtx* zc,
+                        const BYTE* const ip, const BYTE* const iLimit,
+                        const U32 maxNbAttempts, const U32 mls, ZSTD_match_t* matches, U32 minml)
+{
+    const BYTE* const updateLimit = zc->base + zc->nextToUpdate;
+    if (ip < updateLimit) return 0;   /* skipped area */
+    ZSTD_updateTree_extDict(zc, ip, iLimit, maxNbAttempts, mls);
+    return ZSTD_insertBtAndGetAllMatches(zc, ip, iLimit, maxNbAttempts, mls, 1, matches, minml);
+}
+
+
+static U32 ZSTD_BtGetAllMatches_selectMLS_extDict (
+                        ZSTD_CCtx* zc,   /* Index table will be updated */
+                        const BYTE* ip, const BYTE* const iLowLimit, const BYTE* const iHighLimit,
+                        const U32 maxNbAttempts, const U32 matchLengthSearch, ZSTD_match_t* matches, U32 minml)
+{
+    /* Maps the run-time matchLengthSearch onto a call that passes mls as a literal constant. */
+    /* 5 and 6 are handled explicitly; every other value is searched with mls = 4. */
+    (void)iLowLimit;  /* unused */
+    if (matchLengthSearch == 5)
+        return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 5, matches, minml);
+    if (matchLengthSearch == 6)
+        return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 6, matches, minml);
+    return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 4, matches, minml);
+}
+
+
+/* ***********************
+*  Hash Chain
+*************************/
+FORCE_INLINE /* inlining is important to hardwire a hot branch (template emulation) */
+U32 ZSTD_HcGetAllMatches_generic (
+                        ZSTD_CCtx* zc,   /* Index table will be updated */
+                        const BYTE* const ip, const BYTE* const iLowLimit, const BYTE* const iHighLimit,   /* searched position / lower bound for backward extension / end bound for counting */
+                        const U32 maxNbAttempts, const U32 mls, const U32 extDict, ZSTD_match_t* matches, size_t minml)   /* matches : output array; minml is overwritten below */
+{   /* Walks the hash chain of `ip`, storing every candidate longer than the previous best; returns the nb of matches stored. */
+    U32* const chainTable = zc->contentTable;
+    const U32 chainSize = (1U << zc->params.contentLog);
+    const U32 chainMask = chainSize-1;
+    const BYTE* const base = zc->base;
+    const BYTE* const dictBase = zc->dictBase;
+    const U32 dictLimit = zc->dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const BYTE* const dictStart  = dictBase + zc->lowLimit;
+    const U32 lowLimit = zc->lowLimit;
+    const U32 current = (U32)(ip-base);   /* index of ip relative to base */
+    const U32 minChain = current > chainSize ? current - chainSize : 0;   /* candidates at or below this index end the walk */
+    U32 matchIndex;
+    U32 mnum = 0;   /* nb of entries written to matches[] */
+    const BYTE* match;
+    U32 nbAttempts=maxNbAttempts;
+    minml=MINMATCH-1;   /* NOTE : overwrites the caller-supplied minml, so the argument value is never used */
+
+    /* HC4 match finder */
+    matchIndex = ZSTD_insertAndFindFirstIndex (zc, ip, mls);
+
+    while ((matchIndex>lowLimit) && (nbAttempts)) {
+        size_t currentMl=0;
+        int back = 0;   /* nb of bytes matched before ip */
+        nbAttempts--;
+        if ((!extDict) || matchIndex >= dictLimit) {
+            match = base + matchIndex;
+            if (match[minml] == ip[minml]) currentMl = ZSTD_count(ip, match, iHighLimit); if (currentMl>0) {   /* faster ; NOTE : the second `if` is a separate statement, not nested in the first */
+            /* stronger alternative, disabled : if (MEM_read32(match) == MEM_read32(ip)) { currentMl = ZSTD_count(ip+MINMATCH, match+MINMATCH, iHighLimit)+MINMATCH; */
+                while ((match-back > prefixStart) && (ip-back > iLowLimit) && (ip[-back-1] == match[-back-1])) back++;   /* backward match extension */
+                currentMl += back;   /* the stored length includes the backward part */
+            }
+        } else {
+            match = dictBase + matchIndex;
+            if (MEM_read32(match) == MEM_read32(ip)) {   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+                currentMl = ZSTD_count_2segments(ip+MINMATCH, match+MINMATCH, iHighLimit, dictEnd, prefixStart) + MINMATCH;
+                while ((match-back > dictStart) && (ip-back > iLowLimit) && (ip[-back-1] == match[-back-1])) back++;   /* backward match extension */
+                currentMl += back;   /* the stored length includes the backward part */
+        }   }
+
+        /* save best solution */
+        if (currentMl > minml) {   /* only strictly longer candidates are stored, so matches[] is ordered by increasing length */
+            minml = currentMl;
+            matches[mnum].off = current - matchIndex;   /* distance back from ip */
+            matches[mnum].len = (U32)currentMl;
+            matches[mnum].back = back;
+            mnum++;
+            if (currentMl > ZSTD_OPT_NUM) break;   /* longer than ZSTD_OPT_NUM : stop searching */
+            if (ip+currentMl == iHighLimit) break; /* best possible, and avoid read overflow*/
+        }
+
+        if (matchIndex <= minChain) break;
+        matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
+    }
+
+    return mnum;
+}
+
+
+static U32 ZSTD_HcGetAllMatches_selectMLS (
+                        ZSTD_CCtx* zc,
+                        const BYTE* ip, const BYTE* const iLowLimit, const BYTE* const iHighLimit,
+                        const U32 maxNbAttempts, const U32 matchLengthSearch, ZSTD_match_t* matches, U32 minml)
+{
+    /* Hash-chain search without external dictionary (extDict=0) : mls is passed as a literal constant to the generic finder. */
+    if (matchLengthSearch == 5)
+        return ZSTD_HcGetAllMatches_generic(zc, ip, iLowLimit, iHighLimit, maxNbAttempts, 5, 0, matches, minml);
+    if (matchLengthSearch == 6)
+        return ZSTD_HcGetAllMatches_generic(zc, ip, iLowLimit, iHighLimit, maxNbAttempts, 6, 0, matches, minml);
+    /* 4, and every other value */
+    return ZSTD_HcGetAllMatches_generic(zc, ip, iLowLimit, iHighLimit, maxNbAttempts, 4, 0, matches, minml);
+}
+
+static U32 ZSTD_HcGetAllMatches_selectMLS_extDict (
+                        ZSTD_CCtx* zc,
+                        const BYTE* ip, const BYTE* const iLowLimit, const BYTE* const iHighLimit,
+                        const U32 maxNbAttempts, const U32 matchLengthSearch, ZSTD_match_t* matches, U32 minml)
+{
+    /* Hash-chain search with external dictionary (extDict=1) : mls is passed as a literal constant to the generic finder. */
+    if (matchLengthSearch == 5)
+        return ZSTD_HcGetAllMatches_generic(zc, ip, iLowLimit, iHighLimit, maxNbAttempts, 5, 1, matches, minml);
+    if (matchLengthSearch == 6)
+        return ZSTD_HcGetAllMatches_generic(zc, ip, iLowLimit, iHighLimit, maxNbAttempts, 6, 1, matches, minml);
+    /* 4, and every other value */
+    return ZSTD_HcGetAllMatches_generic(zc, ip, iLowLimit, iHighLimit, maxNbAttempts, 4, 1, matches, minml);
+}
+
+
+/*-*******************************
+*  Optimal parser
+*********************************/
+FORCE_INLINE
+void ZSTD_compressBlock_opt_generic(ZSTD_CCtx* ctx,
+                                    const void* src, size_t srcSize,
+                                    const U32 searchMethod, const U32 depth)
+{
+    seqStore_t* seqStorePtr = &(ctx->seqStore);
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    const BYTE* const base = ctx->base + ctx->dictLimit;
+
+    U32 rep_2=REPCODE_STARTVALUE, rep_1=REPCODE_STARTVALUE;
+    const U32 maxSearches = 1U << ctx->params.searchLog;
+    const U32 mls = ctx->params.searchLength;
+
+    typedef U32 (*getAllMatches_f)(ZSTD_CCtx* zc, const BYTE* ip, const BYTE* iLowLimit, const BYTE* iHighLimit,
+                        U32 maxNbAttempts, U32 matchLengthSearch, ZSTD_match_t* matches, U32 minml);
+    getAllMatches_f getAllMatches = searchMethod ? ZSTD_BtGetAllMatches_selectMLS : ZSTD_HcGetAllMatches_selectMLS;
+
+    ZSTD_optimal_t opt[ZSTD_OPT_NUM+4];
+    ZSTD_match_t matches[ZSTD_OPT_NUM+1];
+    const BYTE* inr;
+    U32 skip_num, cur, cur2, match_num, last_pos, litlen, price;
+
+    const U32 sufficient_len = ctx->params.targetLength;
+    const U32 faster_get_matches = (ctx->params.strategy == ZSTD_opt);
+
+
+    /* init */
+    ZSTD_resetSeqStore(seqStorePtr);
+    ZSTD_resetFreqs(seqStorePtr);
+    if ((ip-base) < REPCODE_STARTVALUE) ip = base + REPCODE_STARTVALUE;
+
+    /* Match Loop */
+    while (ip < ilimit) {
+        U32 u;
+        U32 mlen=0;
+        U32 best_mlen=0;
+        U32 best_off=0;
+        memset(opt, 0, sizeof(ZSTD_optimal_t));
+        last_pos = 0;
+        inr = ip;
+        opt[0].litlen = (U32)(ip - anchor);
+
+        /* check repCode */
+        if (MEM_read32(ip+1) == MEM_read32(ip+1 - rep_1)) {
+            /* repcode : we take it */
+            mlen = (U32)ZSTD_count(ip+1+MINMATCH, ip+1+MINMATCH-rep_1, iend) + MINMATCH;
+
+            ZSTD_LOG_PARSER("%d: start try REP rep=%d mlen=%d\n", (int)(ip-base), (int)rep_1, (int)mlen);
+            if (depth==0 || mlen > sufficient_len || mlen >= ZSTD_OPT_NUM) {
+                ip+=1; best_mlen = mlen; best_off = 0; cur = 0; last_pos = 1;
+                goto _storeSequence;
+            }
+
+            litlen = opt[0].litlen + 1;
+            do {
+                price = ZSTD_getPrice(seqStorePtr, litlen, anchor, 0, mlen - MINMATCH);
+                if (mlen + 1 > last_pos || price < opt[mlen + 1].price)
+                    SET_PRICE(mlen + 1, mlen, 0, litlen, price);   /* note : macro modifies last_pos */
+                mlen--;
+            } while (mlen >= MINMATCH);
+        }
+
+        best_mlen = (last_pos) ? last_pos : MINMATCH;
+
+        if (faster_get_matches && last_pos)
+           match_num = 0;
+        else
+           match_num = getAllMatches(ctx, ip, ip, iend, maxSearches, mls, matches, best_mlen); /* first search (depth 0) */
+
+        ZSTD_LOG_PARSER("%d: match_num=%d last_pos=%d\n", (int)(ip-base), match_num, last_pos);
+        if (!last_pos && !match_num) { ip++; continue; }
+
+        opt[0].rep = rep_1;
+        opt[0].rep2 = rep_2;
+        opt[0].mlen = 1;
+
+        if (match_num && matches[match_num-1].len > sufficient_len) {
+            best_mlen = matches[match_num-1].len;
+            best_off = matches[match_num-1].off;
+            cur = 0;
+            last_pos = 1;
+            goto _storeSequence;
+        }
+
+       // set prices using matches at position = 0
+       for (u = 0; u < match_num; u++) {
+           mlen = (u>0) ? matches[u-1].len+1 : best_mlen;
+           best_mlen = (matches[u].len < ZSTD_OPT_NUM) ? matches[u].len : ZSTD_OPT_NUM;
+           ZSTD_LOG_PARSER("%d: start Found mlen=%d off=%d best_mlen=%d last_pos=%d\n", (int)(ip-base), matches[u].len, matches[u].off, (int)best_mlen, (int)last_pos);
+           litlen = opt[0].litlen;
+           while (mlen <= best_mlen) {
+                price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off, mlen - MINMATCH);
+                if (mlen > last_pos || price < opt[mlen].price)
+                    SET_PRICE(mlen, mlen, matches[u].off, litlen, price);
+                mlen++;
+        }  }
+
+        if (last_pos < MINMATCH) { ip++; continue; }
+
+         /* check further positions */
+        for (skip_num = 0, cur = 1; cur <= last_pos; cur++) {
+           size_t cur_rep;
+           inr = ip + cur;
+
+           if (opt[cur-1].mlen == 1) {
+                litlen = opt[cur-1].litlen + 1;
+                if (cur > litlen) {
+                    price = opt[cur - litlen].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr-litlen);
+                    ZSTD_LOG_TRY_PRICE("%d: TRY1 opt[%d].price=%d price=%d cur=%d litlen=%d\n", (int)(inr-base), cur - litlen, opt[cur - litlen].price, price, cur, litlen);
+                } else
+                    price = ZSTD_getLiteralPrice(seqStorePtr, litlen, anchor);
+           } else {
+                litlen = 1;
+                price = opt[cur - 1].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr-1);
+                ZSTD_LOG_TRY_PRICE("%d: TRY3 price=%d cur=%d litlen=%d litonly=%d\n", (int)(inr-base), price, cur, litlen, (int)ZSTD_getLiteralPrice(seqStorePtr, litlen, inr-1));
+           }
+
+           ZSTD_LOG_TRY_PRICE("%d: TRY4 price=%d opt[%d].price=%d\n", (int)(inr-base), price, cur, opt[cur].price);
+
+           if (cur > last_pos || price <= opt[cur].price) // || ((price == opt[cur].price) && (opt[cur-1].mlen == 1) && (cur != litlen)))
+                SET_PRICE(cur, 1, 0, litlen, price);
+
+           if (cur == last_pos) break;
+
+           if (inr > ilimit)  /* last match must start at a minimum distance of 8 from oend */
+               continue;
+
+            mlen = opt[cur].mlen;
+
+            if (opt[cur-mlen].off) {
+                opt[cur].rep2 = opt[cur-mlen].rep;
+                opt[cur].rep = opt[cur-mlen].off;
+                ZSTD_LOG_PARSER("%d: COPYREP1 cur=%d mlen=%d rep=%d rep2=%d\n", (int)(inr-base), cur, mlen, opt[cur].rep, opt[cur].rep2);
+            } else {
+                if (cur!=mlen && opt[cur-mlen].litlen == 0) {
+                    opt[cur].rep2 = opt[cur-mlen].rep;
+                    opt[cur].rep = opt[cur-mlen].rep2;
+                    ZSTD_LOG_PARSER("%d: COPYREP2 cur=%d mlen=%d rep=%d rep2=%d\n", (int)(inr-base), cur, mlen, opt[cur].rep, opt[cur].rep2);
+                } else {
+                    opt[cur].rep2 = opt[cur-mlen].rep2;
+                    opt[cur].rep = opt[cur-mlen].rep;
+                    ZSTD_LOG_PARSER("%d: COPYREP3 cur=%d mlen=%d rep=%d rep2=%d\n", (int)(inr-base), cur, mlen, opt[cur].rep, opt[cur].rep2);
+            }   }
+
+           ZSTD_LOG_PARSER("%d: CURRENT price[%d/%d]=%d off=%d mlen=%d litlen=%d rep=%d rep2=%d\n", (int)(inr-base), cur, last_pos, opt[cur].price, opt[cur].off, opt[cur].mlen, opt[cur].litlen, opt[cur].rep, opt[cur].rep2);
+
+           best_mlen = 0;
+
+           if (!opt[cur].off && opt[cur].mlen != 1) {
+               cur_rep = opt[cur].rep2;
+               ZSTD_LOG_PARSER("%d: try REP2 rep2=%u mlen=%u\n", (int)(inr-base), (U32)cur_rep, mlen);
+           } else {
+               cur_rep = opt[cur].rep;
+               ZSTD_LOG_PARSER("%d: try REP1 rep=%u mlen=%u\n", (int)(inr-base), (U32)cur_rep, mlen);
+           }
+
+           if (MEM_read32(inr) == MEM_read32(inr - cur_rep)) {  // check rep
+               mlen = (U32)ZSTD_count(inr+MINMATCH, inr+MINMATCH - cur_rep, iend) + MINMATCH;
+               ZSTD_LOG_PARSER("%d: Found REP mlen=%d off=%d rep=%d opt[%d].off=%d\n", (int)(inr-base), mlen, 0, opt[cur].rep, cur, opt[cur].off);
+
+               if (mlen > sufficient_len || cur + mlen >= ZSTD_OPT_NUM) {
+                    best_mlen = mlen;
+                    best_off = 0;
+                    ZSTD_LOG_PARSER("%d: REP sufficient_len=%d best_mlen=%d best_off=%d last_pos=%d\n", (int)(inr-base), sufficient_len, best_mlen, best_off, last_pos);
+                    last_pos = cur + 1;
+                    goto _storeSequence;
+               }
+
+               if (opt[cur].mlen == 1) {
+                    litlen = opt[cur].litlen;
+                    if (cur > litlen) {
+                        price = opt[cur - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, inr-litlen, 0, mlen - MINMATCH);
+                        ZSTD_LOG_TRY_PRICE("%d: TRY5 opt[%d].price=%d price=%d cur=%d litlen=%d\n", (int)(inr-base), cur - litlen, opt[cur - litlen].price, price, cur, litlen);
+                    } else
+                        price = ZSTD_getPrice(seqStorePtr, litlen, anchor, 0, mlen - MINMATCH);
+                } else {
+                    litlen = 0;
+                    price = opt[cur].price + ZSTD_getPrice(seqStorePtr, 0, NULL, 0, mlen - MINMATCH);
+                    ZSTD_LOG_TRY_PRICE("%d: TRY7 price=%d cur=%d litlen=0 getprice=%d\n", (int)(inr-base), price, cur, (int)ZSTD_getPrice(seqStorePtr, 0, NULL, 0, mlen - MINMATCH));
+                }
+
+                best_mlen = mlen;
+                if (faster_get_matches) skip_num = best_mlen;
+                ZSTD_LOG_PARSER("%d: Found REP mlen=%d off=%d price=%d litlen=%d price[%d]=%d\n", (int)(inr-base), mlen, 0, price, litlen, cur - litlen, opt[cur - litlen].price);
+
+                do {
+                    if (cur + mlen > last_pos || price <= opt[cur + mlen].price)
+                        SET_PRICE(cur + mlen, mlen, 0, litlen, price);
+                    mlen--;
+                } while (mlen >= MINMATCH);
+            }
+
+            if (faster_get_matches && skip_num > 0) { skip_num--; continue; }
+
+            best_mlen = (best_mlen > MINMATCH) ? best_mlen : MINMATCH;
+
+            match_num = getAllMatches(ctx, inr, ip, iend, maxSearches, mls, matches, best_mlen);
+            ZSTD_LOG_PARSER("%d: ZSTD_GetAllMatches match_num=%d\n", (int)(inr-base), match_num);
+
+            if (match_num > 0 && matches[match_num-1].len > sufficient_len) {
+                cur -= matches[match_num-1].back;
+                best_mlen = matches[match_num-1].len;
+                best_off = matches[match_num-1].off;
+                last_pos = cur + 1;
+                goto _storeSequence;
+            }
+
+            /* set prices using matches at position = cur */
+            for (u = 0; u < match_num; u++) {
+                mlen = (u>0) ? matches[u-1].len+1 : best_mlen;
+                cur2 = cur - matches[u].back;
+                best_mlen = (cur2 + matches[u].len < ZSTD_OPT_NUM) ? matches[u].len : ZSTD_OPT_NUM - cur2;
+
+                ZSTD_LOG_PARSER("%d: Found1 cur=%d cur2=%d mlen=%d off=%d best_mlen=%d last_pos=%d\n", (int)(inr-base), cur, cur2, matches[u].len, matches[u].off, best_mlen, last_pos);
+                if (mlen < matches[u].back + 1)
+                    mlen = matches[u].back + 1;
+
+                while (mlen <= best_mlen) {
+                    if (opt[cur2].mlen == 1) {
+                        litlen = opt[cur2].litlen;
+                        if (cur2 > litlen)
+                            price = opt[cur2 - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, ip+cur2-litlen, matches[u].off, mlen - MINMATCH);
+                        else
+                            price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off, mlen - MINMATCH);
+                    } else {
+                        litlen = 0;
+                        price = opt[cur2].price + ZSTD_getPrice(seqStorePtr, 0, NULL, matches[u].off, mlen - MINMATCH);
+                    }
+
+                    ZSTD_LOG_PARSER("%d: Found2 pred=%d mlen=%d best_mlen=%d off=%d price=%d litlen=%d price[%d]=%d\n", (int)(inr-base), matches[u].back, mlen, best_mlen, matches[u].off, price, litlen, cur - litlen, opt[cur - litlen].price);
+                    ZSTD_LOG_TRY_PRICE("%d: TRY8 price=%d opt[%d].price=%d\n", (int)(inr-base), price, cur2 + mlen, opt[cur2 + mlen].price);
+
+                    if (cur2 + mlen > last_pos || (price < opt[cur2 + mlen].price))
+                        SET_PRICE(cur2 + mlen, mlen, matches[u].off, litlen, price);
+
+                    mlen++;
+        }   }   }   //  for (skip_num = 0, cur = 1; cur <= last_pos; cur++)
+
+        best_mlen = opt[last_pos].mlen;
+        best_off = opt[last_pos].off;
+        cur = last_pos - best_mlen;
+        // printf("%d: start=%d best_mlen=%d best_off=%d cur=%d\n", (int)(ip - base), (int)(start - ip), (int)best_mlen, (int)best_off, cur);
+
+        /* store sequence */
+_storeSequence:   /* cur, last_pos, best_mlen, best_off have to be set */
+        for (u = 1; u <= last_pos; u++)
+            ZSTD_LOG_PARSER("%d: price[%d/%d]=%d off=%d mlen=%d litlen=%d rep=%d rep2=%d\n", (int)(ip-base+u), u, last_pos, opt[u].price, opt[u].off, opt[u].mlen, opt[u].litlen, opt[u].rep, opt[u].rep2);
+        ZSTD_LOG_PARSER("%d: cur=%d/%d best_mlen=%d best_off=%d rep=%d\n", (int)(ip-base+cur), (int)cur, (int)last_pos, (int)best_mlen, (int)best_off, opt[cur].rep);
+
+        opt[0].mlen = 1;
+        U32 offset;
+
+        while (1) {
+            mlen = opt[cur].mlen;
+            ZSTD_LOG_PARSER("%d: cur=%d mlen=%d\n", (int)(ip-base), cur, mlen);
+            offset = opt[cur].off;
+            opt[cur].mlen = best_mlen;
+            opt[cur].off = best_off;
+            best_mlen = mlen;
+            best_off = offset;
+            if (mlen > cur) break;
+            cur -= mlen;
+        }
+
+        for (u = 0; u <= last_pos;) {
+            ZSTD_LOG_PARSER("%d: price2[%d/%d]=%d off=%d mlen=%d litlen=%d rep=%d rep2=%d\n", (int)(ip-base+u), u, last_pos, opt[u].price, opt[u].off, opt[u].mlen, opt[u].litlen, opt[u].rep, opt[u].rep2);
+            u += opt[u].mlen;
+        }
+
+        for (cur=0; cur < last_pos; ) {
+            ZSTD_LOG_PARSER("%d: price3[%d/%d]=%d off=%d mlen=%d litlen=%d rep=%d rep2=%d\n", (int)(ip-base+cur), cur, last_pos, opt[cur].price, opt[cur].off, opt[cur].mlen, opt[cur].litlen, opt[cur].rep, opt[cur].rep2);
+            mlen = opt[cur].mlen;
+            if (mlen == 1) { ip++; cur++; continue; }
+            offset = opt[cur].off;
+            cur += mlen;
+
+            U32 litLength = (U32)(ip - anchor);
+            ZSTD_LOG_ENCODE("%d/%d: ENCODE1 literals=%d mlen=%d off=%d rep1=%d rep2=%d\n", (int)(ip-base), (int)(iend-base), (int)(litLength), (int)mlen, (int)(offset), (int)rep_1, (int)rep_2);
+
+            if (offset) {
+                rep_2 = rep_1;
+                rep_1 = offset;
+            } else {
+                if (litLength == 0) {
+                    best_off = rep_2;
+                    rep_2 = rep_1;
+                    rep_1 = best_off;
+            }   }
+
+            ZSTD_LOG_ENCODE("%d/%d: ENCODE2 literals=%d mlen=%d off=%d rep1=%d rep2=%d\n", (int)(ip-base), (int)(iend-base), (int)(litLength), (int)mlen, (int)(offset), (int)rep_1, (int)rep_2);
+
+#if ZSTD_OPT_DEBUG >= 5
+            int ml2;
+            if (offset)
+                ml2 = ZSTD_count(ip, ip-offset, iend);
+            else
+                ml2 = ZSTD_count(ip, ip-rep_1, iend);
+            if (ml2 < mlen && ml2 < MINMATCH) {
+                printf("%d: ERROR iend=%d mlen=%d offset=%d ml2=%d\n", (int)(ip - base), (int)(iend - ip), (int)mlen, (int)offset, (int)ml2); exit(0); }
+            if (ip < anchor) {
+                printf("%d: ERROR ip < anchor iend=%d mlen=%d offset=%d\n", (int)(ip - base), (int)(iend - ip), (int)mlen, (int)offset); exit(0); }
+            if (ip - offset < ctx->base) {
+                printf("%d: ERROR ip - offset < base iend=%d mlen=%d offset=%d\n", (int)(ip - base), (int)(iend - ip), (int)mlen, (int)offset); exit(0); }
+            if ((int)offset >= (1 << ctx->params.windowLog)) {
+                printf("%d: offset >= (1 << params.windowLog) iend=%d mlen=%d offset=%d\n", (int)(ip - base), (int)(iend - ip), (int)mlen, (int)offset); exit(0); }
+            if (mlen < MINMATCH) {
+                printf("%d: ERROR mlen < MINMATCH iend=%d mlen=%d offset=%d\n", (int)(ip - base), (int)(iend - ip), (int)mlen, (int)offset); exit(0); }
+            if (ip + mlen > iend) {
+                printf("%d: ERROR ip + mlen >= iend iend=%d mlen=%d offset=%d\n", (int)(ip - base), (int)(iend - ip), (int)mlen, (int)offset); exit(0); }
+#endif
+
+            ZSTD_updatePrice(seqStorePtr, litLength, anchor, offset, mlen-MINMATCH);
+            ZSTD_storeSeq(seqStorePtr, litLength, anchor, offset, mlen-MINMATCH);
+            anchor = ip = ip + mlen;
+        }   /* for (cur=0; cur < last_pos; ) */
+
+        /* check immediate repcode */
+        while ( (anchor <= ilimit)
+             && (MEM_read32(anchor) == MEM_read32(anchor - rep_2)) ) {
+            /* store sequence */
+            best_mlen = (U32)ZSTD_count(anchor+MINMATCH, anchor+MINMATCH-rep_2, iend);
+            best_off = rep_2;
+            rep_2 = rep_1;
+            rep_1 = best_off;
+            ZSTD_LOG_ENCODE("%d/%d: ENCODE REP literals=%d mlen=%d off=%d rep1=%d rep2=%d\n", (int)(anchor-base), (int)(iend-base), (int)(0), (int)best_mlen, (int)(0), (int)rep_1, (int)rep_2);
+            ZSTD_updatePrice(seqStorePtr, 0, anchor, 0, best_mlen);
+            ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, best_mlen);
+            anchor += best_mlen+MINMATCH;
+            continue;   /* faster when present ... (?) */
+        }
+        if (anchor > ip) ip = anchor;
+    }
+
+    {   /* Last Literals */
+        size_t lastLLSize = iend - anchor;
+        ZSTD_LOG_ENCODE("%d: lastLLSize literals=%u\n", (int)(ip-base), (U32)lastLLSize);
+        memcpy(seqStorePtr->lit, anchor, lastLLSize);
+        seqStorePtr->lit += lastLLSize;
+    }
+}
+
+
+FORCE_INLINE   /* price-based ("optimal") parsing of one block; this variant can also match against the segment addressed through ctx->dictBase */
+void ZSTD_compressBlock_opt_extDict_generic(ZSTD_CCtx* ctx,   /* ctx : supplies seqStore, base, dictBase, dictLimit and params */
+                                     const void* src, size_t srcSize,   /* src, srcSize : the block to parse */
+                                     const U32 searchMethod, const U32 depth)   /* searchMethod : 0 => Hc match finder, else Bt ; depth : 0 => take the first repcode found without pricing it */
+{
+    seqStore_t* seqStorePtr = &(ctx->seqStore);
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    const BYTE* const base = ctx->base;
+    const U32 dictLimit = ctx->dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* const dictBase = ctx->dictBase;
+    const BYTE* const dictEnd  = dictBase + dictLimit;
+    /* repcode history : rep_1 holds the most recent offset, rep_2 the one before it */
+    U32 rep_2=REPCODE_STARTVALUE, rep_1=REPCODE_STARTVALUE;
+    const U32 maxSearches = 1U << ctx->params.searchLog;
+    const U32 mls = ctx->params.searchLength;
+    /* match finder is selected once, from searchMethod */
+    typedef U32 (*getAllMatches_f)(ZSTD_CCtx* zc, const BYTE* ip, const BYTE* iLowLimit, const BYTE* iHighLimit,
+                        U32 maxNbAttempts, U32 matchLengthSearch, ZSTD_match_t* matches, U32 minml);
+    getAllMatches_f getAllMatches = searchMethod ? ZSTD_BtGetAllMatches_selectMLS_extDict : ZSTD_HcGetAllMatches_selectMLS_extDict;
+    /* opt[] is indexed by distance from ip ; matches[] receives the candidates found at one position */
+    ZSTD_optimal_t opt[ZSTD_OPT_NUM+4];
+    ZSTD_match_t matches[ZSTD_OPT_NUM+1];
+    const BYTE* inr;
+    U32 skip_num, cur, cur2, match_num, last_pos, litlen, price;
+
+    const U32 sufficient_len = ctx->params.targetLength;
+    const U32 faster_get_matches = (ctx->params.strategy == ZSTD_opt);
+
+    /* init */
+    ZSTD_resetSeqStore(seqStorePtr);
+    ZSTD_resetFreqs(seqStorePtr);
+    if ((ip - prefixStart) < REPCODE_STARTVALUE) ip += REPCODE_STARTVALUE;
+
+    /* Match Loop */
+    while (ip < ilimit) {
+        U32 u, offset, best_off=0;
+        U32 mlen=0, best_mlen=0;
+        U32 current = (U32)(ip-base);
+        memset(opt, 0, sizeof(ZSTD_optimal_t));   /* clears opt[0] only */
+        last_pos = 0;
+        inr = ip;
+        opt[0].litlen = (U32)(ip - anchor);
+
+        /* check repCode */
+        {
+            const U32 repIndex = (U32)(current+1 - rep_1);
+            const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+            const BYTE* const repMatch = repBase + repIndex;
+            if ( ((U32)((dictLimit-1) - repIndex) >= 3)   /* intentional overflow */
+               && (MEM_read32(ip+1) == MEM_read32(repMatch)) ) {
+                /* repcode detected we should take it */
+                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                mlen = (U32)ZSTD_count_2segments(ip+1+MINMATCH, repMatch+MINMATCH, iend, repEnd, prefixStart) + MINMATCH;
+                /* the match starts at ip+1 : hence litlen+1 and index mlen+1 when pricing below */
+                ZSTD_LOG_PARSER("%d: start try REP rep=%d mlen=%d\n", (int)(ip-base), (int)rep_1, (int)mlen);
+                if (depth==0 || mlen > sufficient_len || mlen >= ZSTD_OPT_NUM) {
+                    ip+=1; best_mlen = mlen; best_off = 0; cur = 0; last_pos = 1;
+                    goto _storeSequence;
+                }
+                /* price every length from mlen down to MINMATCH ; SET_PRICE is defined outside this function (presumably it also raises last_pos - TODO confirm) */
+                litlen = opt[0].litlen + 1;
+                do {
+                    price = ZSTD_getPrice(seqStorePtr, litlen, anchor, 0, mlen - MINMATCH);
+                    if (mlen + 1 > last_pos || price < opt[mlen + 1].price)
+                        SET_PRICE(mlen + 1, mlen, 0, litlen, price);
+                    mlen--;
+                } while (mlen >= MINMATCH);
+        }   }
+
+       best_mlen = (last_pos) ? last_pos : MINMATCH;
+
+       if (faster_get_matches && last_pos)
+           match_num = 0;
+       else
+           match_num = getAllMatches(ctx, ip, ip, iend, maxSearches, mls, matches, best_mlen);  /* first search (depth 0) */
+
+       ZSTD_LOG_PARSER("%d: match_num=%d last_pos=%d\n", (int)(ip-base), match_num, last_pos);
+       if (!last_pos && !match_num) { ip++; continue; }
+
+       opt[0].rep = rep_1;
+       opt[0].rep2 = rep_2;
+       opt[0].mlen = 1;
+       /* last candidate (presumably the longest - TODO confirm ordering) exceeds targetLength : emit it without pricing */
+       if (match_num && matches[match_num-1].len > sufficient_len) {
+            best_mlen = matches[match_num-1].len;
+            best_off = matches[match_num-1].off;
+            cur = 0;
+            last_pos = 1;
+            goto _storeSequence;
+       }
+
+        // set prices using matches at position = 0
+        for (u = 0; u < match_num; u++) {
+            mlen = (u>0) ? matches[u-1].len+1 : best_mlen;
+            best_mlen = (matches[u].len < ZSTD_OPT_NUM) ? matches[u].len : ZSTD_OPT_NUM;
+            ZSTD_LOG_PARSER("%d: start Found mlen=%d off=%d best_mlen=%d last_pos=%d\n", (int)(ip-base), matches[u].len, matches[u].off, (int)best_mlen, (int)last_pos);
+            litlen = opt[0].litlen;
+            while (mlen <= best_mlen) {
+                price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off, mlen - MINMATCH);
+                if (mlen > last_pos || price < opt[mlen].price)
+                    SET_PRICE(mlen, mlen, matches[u].off, litlen, price);
+                mlen++;
+        }   }
+
+        if (last_pos < MINMATCH) {
+            // ip += ((ip-anchor) >> g_searchStrength) + 1;   /* jump faster over incompressible sections */
+            ip++; continue;
+        }
+
+        /* check further positions */
+        for (skip_num = 0, cur = 1; cur <= last_pos; cur++) {
+           size_t cur_rep;
+           inr = ip + cur;
+           /* price of reaching cur with one more literal */
+           if (opt[cur-1].mlen == 1) {
+                litlen = opt[cur-1].litlen + 1;
+                if (cur > litlen) {
+                    price = opt[cur - litlen].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr-litlen);
+                    ZSTD_LOG_TRY_PRICE("%d: TRY1 opt[%d].price=%d price=%d cur=%d litlen=%d\n", (int)(inr-base), cur - litlen, opt[cur - litlen].price, price, cur, litlen);
+                } else
+                    price = ZSTD_getLiteralPrice(seqStorePtr, litlen, anchor);
+           } else {
+                litlen = 1;
+                price = opt[cur - 1].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr-1);
+                ZSTD_LOG_TRY_PRICE("%d: TRY3 price=%d cur=%d litlen=%d litonly=%d\n", (int)(inr-base), price, cur, litlen, (int)ZSTD_getLiteralPrice(seqStorePtr, litlen, inr-1));
+           }
+
+           ZSTD_LOG_TRY_PRICE("%d: TRY4 price=%d opt[%d].price=%d\n", (int)(inr-base), price, cur, opt[cur].price);
+
+           if (cur > last_pos || price <= opt[cur].price) // || ((price == opt[cur].price) && (opt[cur-1].mlen == 1) && (cur != litlen)))
+                SET_PRICE(cur, 1, 0, litlen, price);
+
+           if (cur == last_pos) break;
+
+           if (inr > ilimit) // last match must start at a minimum distance of 8 from oend
+               continue;
+
+            mlen = opt[cur].mlen;
+            /* copy the repcode pair to cur from position cur-mlen, mlen being the step currently selected for cur */
+            if (opt[cur-mlen].off) {
+                opt[cur].rep2 = opt[cur-mlen].rep;
+                opt[cur].rep = opt[cur-mlen].off;
+                ZSTD_LOG_PARSER("%d: COPYREP1 cur=%d mlen=%d rep=%d rep2=%d\n", (int)(inr-base), cur, mlen, opt[cur].rep, opt[cur].rep2);
+            } else {
+                if (cur!=mlen && opt[cur-mlen].litlen == 0) {
+                    opt[cur].rep2 = opt[cur-mlen].rep;
+                    opt[cur].rep = opt[cur-mlen].rep2;
+                    ZSTD_LOG_PARSER("%d: COPYREP2 cur=%d mlen=%d rep=%d rep2=%d\n", (int)(inr-base), cur, mlen, opt[cur].rep, opt[cur].rep2);
+                } else {
+                    opt[cur].rep2 = opt[cur-mlen].rep2;
+                    opt[cur].rep = opt[cur-mlen].rep;
+                    ZSTD_LOG_PARSER("%d: COPYREP3 cur=%d mlen=%d rep=%d rep2=%d\n", (int)(inr-base), cur, mlen, opt[cur].rep, opt[cur].rep2);
+            }   }
+
+           ZSTD_LOG_PARSER("%d: CURRENT price[%d/%d]=%d off=%d mlen=%d litlen=%d rep=%d rep2=%d\n", (int)(inr-base), cur, last_pos, opt[cur].price, opt[cur].off, opt[cur].mlen, opt[cur].litlen, opt[cur].rep, opt[cur].rep2);
+
+           best_mlen = 0;
+           /* choose which stored offset is tested as repcode at cur */
+           if (!opt[cur].off && opt[cur].mlen != 1) {
+               cur_rep = opt[cur].rep2;
+               ZSTD_LOG_PARSER("%d: try REP2 rep2=%u mlen=%u\n", (int)(inr-base), (U32)cur_rep, mlen);
+           } else {
+               cur_rep = opt[cur].rep;
+               ZSTD_LOG_PARSER("%d: try REP1 rep=%u mlen=%u\n", (int)(inr-base), (U32)cur_rep, mlen);
+           }
+
+           const U32 repIndex = (U32)(current+cur - cur_rep);
+           const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+           const BYTE* const repMatch = repBase + repIndex;
+           if ( ((U32)((dictLimit-1) - repIndex) >= 3)   /* intentional overflow */
+              &&(MEM_read32(inr) == MEM_read32(repMatch)) ) {
+                /* repcode detected */
+                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                mlen = (U32)ZSTD_count_2segments(inr+MINMATCH, repMatch+MINMATCH, iend, repEnd, prefixStart) + MINMATCH;
+                ZSTD_LOG_PARSER("%d: Found REP mlen=%d off=%d rep=%d opt[%d].off=%d\n", (int)(inr-base), mlen, 0, opt[cur].rep, cur, opt[cur].off);
+                /* long enough, or the price table limit would be reached : stop pricing and emit now */
+                if (mlen > sufficient_len || cur + mlen >= ZSTD_OPT_NUM) {
+                    best_mlen = mlen;
+                    best_off = 0;
+                    ZSTD_LOG_PARSER("%d: REP sufficient_len=%d best_mlen=%d best_off=%d last_pos=%d\n", (int)(inr-base), sufficient_len, best_mlen, best_off, last_pos);
+                    last_pos = cur + 1;
+                    goto _storeSequence;
+                }
+
+                if (opt[cur].mlen == 1) {
+                    litlen = opt[cur].litlen;
+                    if (cur > litlen) {
+                        price = opt[cur - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, inr-litlen, 0, mlen - MINMATCH);
+                        ZSTD_LOG_TRY_PRICE("%d: TRY5 opt[%d].price=%d price=%d cur=%d litlen=%d\n", (int)(inr-base), cur - litlen, opt[cur - litlen].price, price, cur, litlen);
+                    } else
+                        price = ZSTD_getPrice(seqStorePtr, litlen, anchor, 0, mlen - MINMATCH);
+                } else {
+                    litlen = 0;
+                    price = opt[cur].price + ZSTD_getPrice(seqStorePtr, 0, NULL, 0, mlen - MINMATCH);
+                    ZSTD_LOG_TRY_PRICE("%d: TRY7 price=%d cur=%d litlen=0 getprice=%d\n", (int)(inr-base), price, cur, (int)ZSTD_getPrice(seqStorePtr, 0, NULL, 0, mlen - MINMATCH));
+                }
+
+                best_mlen = mlen;
+                if (faster_get_matches) skip_num = best_mlen;
+
+                ZSTD_LOG_PARSER("%d: Found REP mlen=%d off=%d price=%d litlen=%d price[%d]=%d\n", (int)(inr-base), mlen, 0, price, litlen, cur - litlen, opt[cur - litlen].price);
+
+                do {
+                    if (cur + mlen > last_pos || price <= opt[cur + mlen].price) // || ((price == opt[cur + mlen].price) && (opt[cur].mlen == 1) && (cur != litlen))) // at equal price prefer REP instead of MATCH
+                        SET_PRICE(cur + mlen, mlen, 0, litlen, price);
+                    mlen--;
+                } while (mlen >= MINMATCH);
+            }
+            /* ZSTD_opt strategy : while skip_num (set to the repcode length above) is non-zero, no match search is run */
+            if (faster_get_matches && skip_num > 0) { skip_num--; continue; }
+
+            best_mlen = (best_mlen > MINMATCH) ? best_mlen : MINMATCH;
+
+            match_num = getAllMatches(ctx, inr, ip, iend, maxSearches, mls, matches, best_mlen);
+            ZSTD_LOG_PARSER("%d: ZSTD_GetAllMatches match_num=%d\n", (int)(inr-base), match_num);
+            /* a candidate longer than targetLength ends the pricing pass */
+            if (match_num > 0 && matches[match_num-1].len > sufficient_len) {
+                cur -= matches[match_num-1].back;
+                best_mlen = matches[match_num-1].len;
+                best_off = matches[match_num-1].off;
+                last_pos = cur + 1;
+                goto _storeSequence;
+            }
+
+            // set prices using matches at position = cur
+            for (u = 0; u < match_num; u++) {
+                mlen = (u>0) ? matches[u-1].len+1 : best_mlen;
+                cur2 = cur - matches[u].back;
+                best_mlen = (cur2 + matches[u].len < ZSTD_OPT_NUM) ? matches[u].len : ZSTD_OPT_NUM - cur2;
+
+                ZSTD_LOG_PARSER("%d: Found1 cur=%d cur2=%d mlen=%d off=%d best_mlen=%d last_pos=%d\n", (int)(inr-base), cur, cur2, matches[u].len, matches[u].off, best_mlen, last_pos);
+                if (mlen < matches[u].back + 1)
+                    mlen = matches[u].back + 1;
+
+                while (mlen <= best_mlen) {
+                    if (opt[cur2].mlen == 1) {
+                        litlen = opt[cur2].litlen;
+                        if (cur2 > litlen)
+                            price = opt[cur2 - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, ip+cur2-litlen, matches[u].off, mlen - MINMATCH);
+                        else
+                            price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off, mlen - MINMATCH);
+                    } else {
+                        litlen = 0;
+                        price = opt[cur2].price + ZSTD_getPrice(seqStorePtr, 0, NULL, matches[u].off, mlen - MINMATCH);
+                    }
+
+                    ZSTD_LOG_PARSER("%d: Found2 pred=%d mlen=%d best_mlen=%d off=%d price=%d litlen=%d price[%d]=%d\n", (int)(inr-base), matches[u].back, mlen, best_mlen, matches[u].off, price, litlen, cur - litlen, opt[cur - litlen].price);
+                    ZSTD_LOG_TRY_PRICE("%d: TRY8 price=%d opt[%d].price=%d\n", (int)(inr-base), price, cur2 + mlen, opt[cur2 + mlen].price);
+
+                    if (cur2 + mlen > last_pos || (price < opt[cur2 + mlen].price))
+                        SET_PRICE(cur2 + mlen, mlen, matches[u].off, litlen, price);
+
+                    mlen++;
+        }   }   }   //  for (skip_num = 0, cur = 1; cur <= last_pos; cur++)
+        /* pricing pass completed : start the backward walk from the step that reaches last_pos */
+        best_mlen = opt[last_pos].mlen;
+        best_off = opt[last_pos].off;
+        cur = last_pos - best_mlen;
+        // printf("%d: start=%d best_mlen=%d best_off=%d cur=%d\n", (int)(ip - base), (int)(start - ip), (int)best_mlen, (int)best_off, cur);
+
+        /* store sequence */
+_storeSequence: // cur, last_pos, best_mlen, best_off have to be set
+        for (u = 1; u <= last_pos; u++)
+            ZSTD_LOG_PARSER("%d: price[%u/%d]=%d off=%d mlen=%d litlen=%d rep=%d rep2=%d\n", (int)(ip-base+u), u, last_pos, opt[u].price, opt[u].off, opt[u].mlen, opt[u].litlen, opt[u].rep, opt[u].rep2);
+        ZSTD_LOG_PARSER("%d: cur=%d/%d best_mlen=%d best_off=%d rep=%d\n", (int)(ip-base+cur), (int)cur, (int)last_pos, (int)best_mlen, (int)best_off, opt[cur].rep);
+
+        opt[0].mlen = 1;
+        /* walk back from cur to 0, reversing the chain : afterwards opt[pos] holds the (mlen, off) of the step that starts at pos */
+        while (1) {
+            mlen = opt[cur].mlen;
+            ZSTD_LOG_PARSER("%d: cur=%d mlen=%d\n", (int)(ip-base), cur, mlen);
+            offset = opt[cur].off;
+            opt[cur].mlen = best_mlen;
+            opt[cur].off = best_off;
+            best_mlen = mlen;
+            best_off = offset;
+            if (mlen > cur) break;
+            cur -= mlen;
+        }
+
+        for (u = 0; u <= last_pos; ) {
+            ZSTD_LOG_PARSER("%d: price2[%d/%d]=%d off=%d mlen=%d litlen=%d rep=%d rep2=%d\n", (int)(ip-base+u), u, last_pos, opt[u].price, opt[u].off, opt[u].mlen, opt[u].litlen, opt[u].rep, opt[u].rep2);
+            u += opt[u].mlen;
+        }
+        /* replay the selected path : mlen==1 advances over a literal, any other step is stored as a sequence */
+        for (cur=0; cur < last_pos; ) {
+            U32 litLength;
+            ZSTD_LOG_PARSER("%d: price3[%d/%d]=%d off=%d mlen=%d litlen=%d rep=%d rep2=%d\n", (int)(ip-base+cur), cur, last_pos, opt[cur].price, opt[cur].off, opt[cur].mlen, opt[cur].litlen, opt[cur].rep, opt[cur].rep2);
+            mlen = opt[cur].mlen;
+            if (mlen == 1) { ip++; cur++; continue; }
+            offset = opt[cur].off;
+            cur += mlen;
+
+            litLength = (U32)(ip - anchor);
+            ZSTD_LOG_ENCODE("%d/%d: ENCODE1 literals=%d mlen=%d off=%d rep1=%d rep2=%d\n", (int)(ip-base), (int)(iend-base), (int)(litLength), (int)mlen, (int)(offset), (int)rep_1, (int)rep_2);
+            /* update offset history ; offset==0 is a repcode, which swaps rep_1/rep_2 only when it follows no literal */
+            if (offset) {
+                rep_2 = rep_1;
+                rep_1 = offset;
+            } else {
+                if (litLength == 0) {
+                    best_off = rep_2;
+                    rep_2 = rep_1;
+                    rep_1 = best_off;
+            }   }
+
+            ZSTD_LOG_ENCODE("%d/%d: ENCODE2 literals=%d mlen=%d off=%d rep1=%d rep2=%d\n", (int)(ip-base), (int)(iend-base), (int)(litLength), (int)mlen, (int)(offset), (int)rep_1, (int)rep_2);
+
+#if ZSTD_OPT_DEBUG >= 5
+            int ml2;
+            if (offset)
+                ml2 = ZSTD_count(ip, ip-offset, iend);
+            else
+                ml2 = ZSTD_count(ip, ip-rep_1, iend);
+            if (ml2 < mlen && ml2 < MINMATCH) {
+                printf("%d: ERROR iend=%d mlen=%d offset=%d ml2=%d\n", (int)(ip - base), (int)(iend - ip), (int)mlen, (int)offset, (int)ml2); exit(0); }
+            if (ip < anchor) {
+                printf("%d: ERROR ip < anchor iend=%d mlen=%d offset=%d\n", (int)(ip - base), (int)(iend - ip), (int)mlen, (int)offset); exit(0); }
+            if (ip - offset < ctx->base) {
+                printf("%d: ERROR ip - offset < base iend=%d mlen=%d offset=%d\n", (int)(ip - base), (int)(iend - ip), (int)mlen, (int)offset); exit(0); }
+            if ((int)offset >= (1 << ctx->params.windowLog)) {
+                printf("%d: offset >= (1 << params.windowLog) iend=%d mlen=%d offset=%d\n", (int)(ip - base), (int)(iend - ip), (int)mlen, (int)offset); exit(0); }
+            if (mlen < MINMATCH) {
+                printf("%d: ERROR mlen < MINMATCH iend=%d mlen=%d offset=%d\n", (int)(ip - base), (int)(iend - ip), (int)mlen, (int)offset); exit(0); }
+            if (ip + mlen > iend) {
+                printf("%d: ERROR ip + mlen >= iend iend=%d mlen=%d offset=%d\n", (int)(ip - base), (int)(iend - ip), (int)mlen, (int)offset); exit(0); }
+#endif
+
+            ZSTD_updatePrice(seqStorePtr, litLength, anchor, offset, mlen-MINMATCH);
+            ZSTD_storeSeq(seqStorePtr, litLength, anchor, offset, mlen-MINMATCH);
+            anchor = ip = ip + mlen;
+        }
+
+        /* check immediate repcode */
+        while (anchor <= ilimit) {
+            const U32 repIndex = (U32)((anchor-base) - rep_2);
+            const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+            const BYTE* const repMatch = repBase + repIndex;
+            if ( ((U32)((dictLimit-1) - repIndex) >= 3)   /* intentional overflow */
+               && (MEM_read32(anchor) == MEM_read32(repMatch)) ) {
+                /* repcode detected, let's take it */
+                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                mlen = (U32)ZSTD_count_2segments(anchor+MINMATCH, repMatch+MINMATCH, iend, repEnd, prefixStart) + MINMATCH;
+                offset = rep_2; rep_2 = rep_1; rep_1 = offset;   /* swap offset history */
+                ZSTD_LOG_ENCODE("%d/%d: ENCODE REP literals=%d mlen=%d off=%d rep1=%d rep2=%d\n", (int)(anchor-base), (int)(iend-base), (int)(0), (int)mlen, (int)(0), (int)rep_1, (int)rep_2);
+                ZSTD_updatePrice(seqStorePtr, 0, anchor, 0, mlen-MINMATCH);
+                ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, mlen-MINMATCH);
+                anchor += mlen;
+                continue;   /* faster when present ... (?) */
+            }
+            break;
+        }
+        if (anchor > ip) ip = anchor;
+    }
+
+    {   /* Last Literals */
+        size_t lastLLSize = iend - anchor;
+        ZSTD_LOG_ENCODE("%d: lastLLSize literals=%u\n", (int)(ip-base), (U32)(lastLLSize));
+        memcpy(seqStorePtr->lit, anchor, lastLLSize);
+        seqStorePtr->lit += lastLLSize;
+    }
+}
diff --git a/lib/zstd_static.h b/lib/zstd_static.h
index c60fa65..6121653 100644
--- a/lib/zstd_static.h
+++ b/lib/zstd_static.h
@@ -1,7 +1,7 @@
 /*
     zstd - standard compression library
     Header File for static linking only
-    Copyright (C) 2014-2015, Yann Collet.
+    Copyright (C) 2014-2016, Yann Collet.
 
     BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
@@ -27,29 +27,34 @@
     OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
     You can contact the author at :
-    - zstd source repository : https://github.com/Cyan4973/zstd
-    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
+    - zstd homepage : http://www.zstd.net
 */
 #ifndef ZSTD_STATIC_H
 #define ZSTD_STATIC_H
 
-/* The objects defined into this file shall be considered experimental.
- * They are not considered stable, as their prototype may change in the future.
- * You can use them for tests, provide feedback, or if you can endure risks of future changes.
+/* The prototypes defined within this file are considered experimental.
+ * They should not be used in the context of a DLL, as they may change in the future.
+ * Prefer static linking if you need them, to stay in control of breaking changes between versions.
  */
 
 #if defined (__cplusplus)
 extern "C" {
 #endif
 
-/* *************************************
-*  Includes
+/*-*************************************
+*  Dependencies
 ***************************************/
 #include "zstd.h"
 #include "mem.h"
 
 
-/* *************************************
+/*-*************************************
+*  Constants
+***************************************/
+#define ZSTD_MAGICNUMBER 0xFD2FB525   /* v0.5 */
+
+
+/*-*************************************
 *  Types
 ***************************************/
 #define ZSTD_WINDOWLOG_MAX 26
@@ -58,96 +63,97 @@ extern "C" {
 #define ZSTD_CONTENTLOG_MAX (ZSTD_WINDOWLOG_MAX+1)
 #define ZSTD_CONTENTLOG_MIN 4
 #define ZSTD_HASHLOG_MAX 28
-#define ZSTD_HASHLOG_MIN 4
+#define ZSTD_HASHLOG_MIN 12
 #define ZSTD_SEARCHLOG_MAX (ZSTD_CONTENTLOG_MAX-1)
 #define ZSTD_SEARCHLOG_MIN 1
 #define ZSTD_SEARCHLENGTH_MAX 7
 #define ZSTD_SEARCHLENGTH_MIN 4
+#define ZSTD_TARGETLENGTH_MIN 4
+#define ZSTD_TARGETLENGTH_MAX 999
 
-/** from faster to stronger */
-typedef enum { ZSTD_fast, ZSTD_greedy, ZSTD_lazy, ZSTD_lazy2, ZSTD_btlazy2 } ZSTD_strategy;
+/* from faster to stronger */
+typedef enum { ZSTD_fast, ZSTD_greedy, ZSTD_lazy, ZSTD_lazy2, ZSTD_btlazy2, ZSTD_opt, ZSTD_btopt } ZSTD_strategy;
 
 typedef struct
 {
     U64 srcSize;       /* optional : tells how much bytes are present in the frame. Use 0 if not known. */
     U32 windowLog;     /* largest match distance : larger == more compression, more memory needed during decompression */
     U32 contentLog;    /* full search segment : larger == more compression, slower, more memory (useless for fast) */
-    U32 hashLog;       /* dispatch table : larger == more memory, faster */
+    U32 hashLog;       /* dispatch table : larger == faster, more memory */
     U32 searchLog;     /* nb of searches : larger == more compression, slower */
-    U32 searchLength;  /* size of matches : larger == faster decompression, sometimes less compression */
+    U32 searchLength;  /* match length searched : larger == faster decompression, sometimes less compression */
+    U32 targetLength;  /* acceptable match size for optimal parser (only) : larger == more compression, slower */
     ZSTD_strategy strategy;
 } ZSTD_parameters;
 
 
-/* *************************************
+/*-*************************************
 *  Advanced functions
 ***************************************/
-/** ZSTD_getParams
-*   return ZSTD_parameters structure for a selected compression level and srcSize.
-*   srcSizeHint value is optional, select 0 if not known */
+ZSTDLIB_API unsigned ZSTD_maxCLevel (void);
+
+/*! ZSTD_getParams() :
+*   @return ZSTD_parameters structure for a selected compression level and srcSize.
+*   `srcSizeHint` value is optional, select 0 if not known */
 ZSTDLIB_API ZSTD_parameters ZSTD_getParams(int compressionLevel, U64 srcSizeHint);
 
-/** ZSTD_validateParams
+/*! ZSTD_validateParams() :
 *   correct params value to remain within authorized range */
 ZSTDLIB_API void ZSTD_validateParams(ZSTD_parameters* params);
 
-/** ZSTD_compress_usingDict
-*   Same as ZSTD_compressCCtx(), using a Dictionary content as prefix
-*   Note : dict can be NULL, in which case, it's equivalent to ZSTD_compressCCtx() */
-ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx,
-                                           void* dst, size_t maxDstSize,
-                                     const void* src, size_t srcSize,
-                                     const void* dict,size_t dictSize,
-                                           int compressionLevel);
-
-/** ZSTD_compress_advanced
+/*! ZSTD_compress_advanced() :
 *   Same as ZSTD_compress_usingDict(), with fine-tune control of each compression parameter */
 ZSTDLIB_API size_t ZSTD_compress_advanced (ZSTD_CCtx* ctx,
-                                           void* dst, size_t maxDstSize,
+                                           void* dst, size_t dstCapacity,
                                      const void* src, size_t srcSize,
                                      const void* dict,size_t dictSize,
                                            ZSTD_parameters params);
 
-/** ZSTD_decompress_usingDict
-*   Same as ZSTD_decompressDCtx, using a Dictionary content as prefix
-*   Note : dict can be NULL, in which case, it's equivalent to ZSTD_decompressDCtx() */
-ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* ctx,
-                                             void* dst, size_t maxDstSize,
-                                       const void* src, size_t srcSize,
-                                       const void* dict,size_t dictSize);
+/*! ZSTD_compress_usingPreparedCCtx() :
+*   Same as ZSTD_compress_usingDict, but using a reference context `preparedCCtx`, where dictionary has been loaded.
+*   It avoids reloading the dictionary each time.
+*   `preparedCCtx` must have been properly initialized using ZSTD_compressBegin_usingDict() or ZSTD_compressBegin_advanced().
+*   Requires 2 contexts : 1 for reference, which will not be modified, and 1 to run the compression operation */
+ZSTDLIB_API size_t ZSTD_compress_usingPreparedCCtx(
+                                           ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx,
+                                           void* dst, size_t dstCapacity,
+                                     const void* src, size_t srcSize);
+
+/*- Advanced Decompression functions -*/
+
+/*! ZSTD_decompress_usingPreparedDCtx() :
+*   Same as ZSTD_decompress_usingDict, but using a reference context `preparedDCtx`, where dictionary has been loaded.
+*   It avoids reloading the dictionary each time.
+*   `preparedDCtx` must have been properly initialized using ZSTD_decompressBegin_usingDict().
+*   Requires 2 contexts : 1 for reference, which will not be modified, and 1 to run the decompression operation */
+ZSTDLIB_API size_t ZSTD_decompress_usingPreparedDCtx(
+                                             ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx,
+                                             void* dst, size_t dstCapacity,
+                                       const void* src, size_t srcSize);
 
 
 /* **************************************
 *  Streaming functions (direct mode)
 ****************************************/
 ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel);
-ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* ctx, ZSTD_parameters params);
-
-ZSTDLIB_API size_t ZSTD_compress_insertDictionary(ZSTD_CCtx* ctx, const void* src, size_t srcSize);
-ZSTDLIB_API size_t ZSTD_duplicateCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx);
+ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict,size_t dictSize, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict,size_t dictSize, ZSTD_parameters params);
+ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx);
 
-ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
-ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t maxDstSize);
+ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity);
 
-/**
+/*
   Streaming compression, synchronous mode (bufferless)
 
   A ZSTD_CCtx object is required to track streaming operations.
   Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage it.
   ZSTD_CCtx object can be re-used multiple times within successive compression operations.
 
-  First operation is to start a new frame.
-  Use ZSTD_compressBegin().
-  You may also prefer the advanced derivative ZSTD_compressBegin_advanced(), for finer parameter control.
-
-  It's then possible to add a dictionary with ZSTD_compress_insertDictionary()
-  Note that dictionary presence is a "hidden" information,
-  the decoder needs to be aware that it is required for proper decoding, or decoding will fail.
-
-  If you want to compress a lot of messages using same dictionary,
-  it can be beneficial to duplicate compression context rather than reloading dictionary each time.
-  In such case, use ZSTD_duplicateCCtx(), which will need an already created ZSTD_CCtx,
-  in order to duplicate compression context into it.
+  Start by initializing a context.
+  Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression,
+  or ZSTD_compressBegin_advanced(), for finer parameter control.
+  It's also possible to duplicate a reference context which has been initialized, using ZSTD_copyCCtx()
 
   Then, consume your input using ZSTD_compressContinue().
   The interface is synchronous, so all input will be consumed and produce a compressed output.
@@ -155,38 +161,39 @@ ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t maxDstSiz
   Worst case evaluation is provided by ZSTD_compressBound().
 
   Finish a frame with ZSTD_compressEnd(), which will write the epilogue.
-  Without it, the frame will be considered incomplete by decoders.
+  Without the epilogue, frames will be considered incomplete by the decoder.
 
   You can then reuse ZSTD_CCtx to compress some new frame.
 */
 
 
-ZSTDLIB_API size_t ZSTD_resetDCtx(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIB_API void   ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx);
+
 ZSTDLIB_API size_t ZSTD_getFrameParams(ZSTD_parameters* params, const void* src, size_t srcSize);
-ZSTDLIB_API void   ZSTD_decompress_insertDictionary(ZSTD_DCtx* ctx, const void* src, size_t srcSize);
 
 ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx);
-ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 
-/**
-  Streaming decompression, bufferless mode
+/*
+  Streaming decompression, direct mode (bufferless)
 
   A ZSTD_DCtx object is required to track streaming operations.
   Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
-  A ZSTD_DCtx object can be re-used multiple times. Use ZSTD_resetDCtx() to return to fresh status.
+  A ZSTD_DCtx object can be re-used multiple times.
 
-  First operation is to retrieve frame parameters, using ZSTD_getFrameParams().
-  This function doesn't consume its input. It needs enough input data to properly decode the frame header.
+  First typical operation is to retrieve frame parameters, using ZSTD_getFrameParams().
+  This operation is independent, and just needs enough input data to properly decode the frame header.
   Objective is to retrieve *params.windowlog, to know minimum amount of memory required during decoding.
   Result : 0 when successful, it means the ZSTD_parameters structure has been filled.
            >0 : means there is not enough data into src. Provides the expected size to successfully decode header.
-           errorCode, which can be tested using ZSTD_isError() (For example, if it's not a ZSTD header)
+           errorCode, which can be tested using ZSTD_isError()
 
-  Then, you can optionally insert a dictionary.
-  This operation must mimic the compressor behavior, otherwise decompression will fail or be corrupted.
+  Start decompression, with ZSTD_decompressBegin() or ZSTD_decompressBegin_usingDict()
+  Alternatively, you can copy a prepared context, using ZSTD_copyDCtx()
 
-  Then it's possible to start decompression.
-  Use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternatively.
+  Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternately.
   ZSTD_nextSrcSizeToDecompress() tells how much bytes to provide as 'srcSize' to ZSTD_decompressContinue().
   ZSTD_decompressContinue() requires this exact amount of bytes, or it will fail.
   ZSTD_decompressContinue() needs previous data blocks during decompression, up to (1 << windowlog).
@@ -203,138 +210,36 @@ ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t ma
 /* **************************************
 *  Block functions
 ****************************************/
-
-/*!Block functions produce and decode raw zstd blocks, without frame metadata.
-   It saves associated header sizes.
-   But user will have to save and regenerate fields required to regenerate data, such as block sizes.
-
-   A few rules to respect :
-   - Uncompressed block size must be <= 128 KB
-   - Compressing or decompressing require a context structure
-     + Use ZSTD_createXCtx() to create them
-   - It is necessary to init context before starting
-     + compression : ZSTD_compressBegin(), which allows selection of compression level or parameters
-     + decompression : ZSTD_resetDCtx()
-     + If you compress multiple blocks without resetting, next blocks will create references to previous ones
-   - Dictionary can optionally be inserted, using ZSTD_de/compress_insertDictionary()
-   - When a block is considered not compressible enough, ZSTD_compressBlock() result will be zero.
-     + User must test for such outcome and be able to deal with uncompressed data
-     + ZSTD_decompressBlock() doesn't accept uncompressed data as input
+/*! Block functions produce and decode raw zstd blocks, without frame metadata.
+    The user is responsible for keeping the information required to regenerate data, such as block sizes.
+
+    A few rules to respect :
+    - Uncompressed block size must be <= 128 KB
+    - Compressing or decompressing requires a context structure
+      + Use ZSTD_createCCtx() and ZSTD_createDCtx()
+    - It is necessary to init context before starting
+      + compression : ZSTD_compressBegin()
+      + decompression : ZSTD_decompressBegin()
+      + variants _usingDict() are also allowed
+      + copyCCtx() and copyDCtx() work too
+    - When a block is considered not compressible enough, ZSTD_compressBlock() result will be zero.
+      In which case, nothing is produced into `dst`.
+      + User must test for such outcome and deal directly with uncompressed data
+      + ZSTD_decompressBlock() doesn't accept uncompressed data as input !!
 */
 
-size_t ZSTD_compressBlock  (ZSTD_CCtx* cctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
-size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+size_t ZSTD_compressBlock  (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 
 
-/* *************************************
-*  Pre-defined compression levels
-***************************************/
-#define ZSTD_MAX_CLEVEL 20
-ZSTDLIB_API unsigned ZSTD_maxCLevel (void);
-static const ZSTD_parameters ZSTD_defaultParameters[4][ZSTD_MAX_CLEVEL+1] = {
-{   /* "default" */
-    /*    W,  C,  H,  S,  L, strat */
-    { 0, 18, 12, 12,  1,  4, ZSTD_fast    },  /* level  0 - never used */
-    { 0, 19, 13, 14,  1,  7, ZSTD_fast    },  /* level  1 */
-    { 0, 19, 15, 16,  1,  6, ZSTD_fast    },  /* level  2 */
-    { 0, 20, 18, 20,  1,  6, ZSTD_fast    },  /* level  3 */
-    { 0, 21, 19, 21,  1,  6, ZSTD_fast    },  /* level  4 */
-    { 0, 20, 14, 18,  3,  5, ZSTD_greedy  },  /* level  5 */
-    { 0, 20, 18, 19,  3,  5, ZSTD_greedy  },  /* level  6 */
-    { 0, 21, 17, 20,  3,  5, ZSTD_lazy    },  /* level  7 */
-    { 0, 21, 19, 20,  3,  5, ZSTD_lazy    },  /* level  8 */
-    { 0, 21, 20, 20,  3,  5, ZSTD_lazy2   },  /* level  9 */
-    { 0, 21, 19, 21,  4,  5, ZSTD_lazy2   },  /* level 10 */
-    { 0, 22, 20, 22,  4,  5, ZSTD_lazy2   },  /* level 11 */
-    { 0, 22, 20, 22,  5,  5, ZSTD_lazy2   },  /* level 12 */
-    { 0, 22, 21, 22,  5,  5, ZSTD_lazy2   },  /* level 13 */
-    { 0, 22, 22, 23,  5,  5, ZSTD_lazy2   },  /* level 14 */
-    { 0, 23, 23, 23,  5,  5, ZSTD_lazy2   },  /* level 15 */
-    { 0, 23, 21, 22,  5,  5, ZSTD_btlazy2 },  /* level 16 */
-    { 0, 23, 24, 23,  4,  5, ZSTD_btlazy2 },  /* level 17 */
-    { 0, 25, 24, 23,  5,  5, ZSTD_btlazy2 },  /* level 18 */
-    { 0, 25, 26, 23,  5,  5, ZSTD_btlazy2 },  /* level 19 */
-    { 0, 26, 27, 25,  9,  5, ZSTD_btlazy2 },  /* level 20 */
-},
-{   /* for srcSize <= 256 KB */
-    /*     W,  C,  H,  S,  L, strat */
-    {  0, 18, 13, 14,  1,  7, ZSTD_fast    },  /* level  0 - never used */
-    {  0, 18, 14, 15,  1,  6, ZSTD_fast    },  /* level  1 */
-    {  0, 18, 14, 15,  1,  5, ZSTD_fast    },  /* level  2 */
-    {  0, 18, 12, 15,  3,  4, ZSTD_greedy  },  /* level  3 */
-    {  0, 18, 13, 15,  4,  4, ZSTD_greedy  },  /* level  4 */
-    {  0, 18, 14, 15,  5,  4, ZSTD_greedy  },  /* level  5 */
-    {  0, 18, 13, 15,  4,  4, ZSTD_lazy    },  /* level  6 */
-    {  0, 18, 14, 16,  5,  4, ZSTD_lazy    },  /* level  7 */
-    {  0, 18, 15, 16,  6,  4, ZSTD_lazy    },  /* level  8 */
-    {  0, 18, 15, 15,  7,  4, ZSTD_lazy    },  /* level  9 */
-    {  0, 18, 16, 16,  7,  4, ZSTD_lazy    },  /* level 10 */
-    {  0, 18, 16, 16,  8,  4, ZSTD_lazy    },  /* level 11 */
-    {  0, 18, 17, 16,  8,  4, ZSTD_lazy    },  /* level 12 */
-    {  0, 18, 17, 16,  9,  4, ZSTD_lazy    },  /* level 13 */
-    {  0, 18, 18, 16,  9,  4, ZSTD_lazy    },  /* level 14 */
-    {  0, 18, 17, 17,  9,  4, ZSTD_lazy2   },  /* level 15 */
-    {  0, 18, 18, 18,  9,  4, ZSTD_lazy2   },  /* level 16 */
-    {  0, 18, 18, 18, 10,  4, ZSTD_lazy2   },  /* level 17 */
-    {  0, 18, 18, 18, 11,  4, ZSTD_lazy2   },  /* level 18 */
-    {  0, 18, 18, 18, 12,  4, ZSTD_lazy2   },  /* level 19 */
-    {  0, 18, 18, 18, 13,  4, ZSTD_lazy2   },  /* level 20 */
-},
-{   /* for srcSize <= 128 KB */
-    /*    W,  C,  H,  S,  L, strat */
-    { 0, 17, 12, 12,  1,  4, ZSTD_fast    },  /* level  0 - never used */
-    { 0, 17, 12, 13,  1,  6, ZSTD_fast    },  /* level  1 */
-    { 0, 17, 14, 16,  1,  5, ZSTD_fast    },  /* level  2 */
-    { 0, 17, 15, 17,  1,  5, ZSTD_fast    },  /* level  3 */
-    { 0, 17, 13, 15,  2,  4, ZSTD_greedy  },  /* level  4 */
-    { 0, 17, 15, 17,  3,  4, ZSTD_greedy  },  /* level  5 */
-    { 0, 17, 14, 17,  3,  4, ZSTD_lazy    },  /* level  6 */
-    { 0, 17, 16, 17,  4,  4, ZSTD_lazy    },  /* level  7 */
-    { 0, 17, 16, 17,  4,  4, ZSTD_lazy2   },  /* level  8 */
-    { 0, 17, 17, 16,  5,  4, ZSTD_lazy2   },  /* level  9 */
-    { 0, 17, 17, 16,  6,  4, ZSTD_lazy2   },  /* level 10 */
-    { 0, 17, 17, 16,  7,  4, ZSTD_lazy2   },  /* level 11 */
-    { 0, 17, 17, 16,  8,  4, ZSTD_lazy2   },  /* level 12 */
-    { 0, 17, 18, 16,  4,  4, ZSTD_btlazy2 },  /* level 13 */
-    { 0, 17, 18, 16,  5,  4, ZSTD_btlazy2 },  /* level 14 */
-    { 0, 17, 18, 16,  6,  4, ZSTD_btlazy2 },  /* level 15 */
-    { 0, 17, 18, 16,  7,  4, ZSTD_btlazy2 },  /* level 16 */
-    { 0, 17, 18, 16,  8,  4, ZSTD_btlazy2 },  /* level 17 */
-    { 0, 17, 18, 16,  9,  4, ZSTD_btlazy2 },  /* level 18 */
-    { 0, 17, 18, 16, 10,  4, ZSTD_btlazy2 },  /* level 19 */
-    { 0, 17, 18, 18, 12,  4, ZSTD_btlazy2 },  /* level 20 */
-},
-{   /* for srcSize <= 16 KB */
-    /*     W,  C,  H,  S,  L, strat */
-    {  0,  0,  0,  0,  0,  0, ZSTD_fast    },  /* level  0 - never used */
-    {  0, 14, 14, 14,  1,  4, ZSTD_fast    },  /* level  1 */
-    {  0, 14, 14, 16,  1,  4, ZSTD_fast    },  /* level  2 */
-    {  0, 14, 14, 14,  5,  4, ZSTD_greedy  },  /* level  3 */
-    {  0, 14, 14, 14,  8,  4, ZSTD_greedy  },  /* level  4 */
-    {  0, 14, 11, 14,  6,  4, ZSTD_lazy    },  /* level  5 */
-    {  0, 14, 14, 13,  6,  5, ZSTD_lazy    },  /* level  6 */
-    {  0, 14, 14, 14,  7,  6, ZSTD_lazy    },  /* level  7 */
-    {  0, 14, 14, 14,  8,  4, ZSTD_lazy    },  /* level  8 */
-    {  0, 14, 14, 15,  9,  4, ZSTD_lazy    },  /* level  9 */
-    {  0, 14, 14, 15, 10,  4, ZSTD_lazy    },  /* level 10 */
-    {  0, 14, 15, 15,  6,  4, ZSTD_btlazy2 },  /* level 11 */
-    {  0, 14, 15, 15,  7,  4, ZSTD_btlazy2 },  /* level 12 */
-    {  0, 14, 15, 15,  8,  4, ZSTD_btlazy2 },  /* level 13 */
-    {  0, 14, 15, 15,  9,  4, ZSTD_btlazy2 },  /* level 14 */
-    {  0, 14, 15, 15, 10,  4, ZSTD_btlazy2 },  /* level 15 */
-    {  0, 14, 15, 15, 11,  4, ZSTD_btlazy2 },  /* level 16 */
-    {  0, 14, 15, 15, 12,  4, ZSTD_btlazy2 },  /* level 17 */
-    {  0, 14, 15, 15, 13,  4, ZSTD_btlazy2 },  /* level 18 */
-    {  0, 14, 15, 15, 14,  4, ZSTD_btlazy2 },  /* level 19 */
-    {  0, 14, 15, 15, 15,  4, ZSTD_btlazy2 },  /* level 20 */
-},
-};
-
-
-/* *************************************
+/*-*************************************
 *  Error management
 ***************************************/
 #include "error_public.h"
+/*! ZSTD_getError() :
+    convert a `size_t` function result into a `ZSTD_ErrorCode` enum type,
+    which can be used to compare directly with enum list published in "error_public.h" */
+ZSTD_ErrorCode ZSTD_getError(size_t code);
 
 
 #if defined (__cplusplus)
diff --git a/programs/Makefile b/programs/Makefile
index c64cbe6..4a650c4 100644
--- a/programs/Makefile
+++ b/programs/Makefile
@@ -1,6 +1,6 @@
 # ##########################################################################
 # ZSTD programs - Makefile
-# Copyright (C) Yann Collet 2015
+# Copyright (C) Yann Collet 2015-2016
 #
 # GPL v2 License
 #
@@ -19,13 +19,14 @@
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 #
 # You can contact the author at :
-#  - ZSTD source repository : http://code.google.com/p/zstd/
-#  - Public forum : https://groups.google.com/forum/#!forum/lz4c
+#  - zstd homepage : http://www.zstd.net/
 # ##########################################################################
 # zstd : Command Line Utility, supporting gzip-like arguments
 # datagen : Synthetic and parametrable data generator, for tests
 # fuzzer  : Test tool, to check zstd integrity on target platform
 # fuzzer32: Same as fuzzer, but forced to compile in 32-bits mode
+# zbufftest  : Test tool, to check ZBUFF integrity on target platform
+# zbufftest32: Same as zbufftest, but forced to compile in 32-bits mode
 # fullbench  : Precisely measure speed for each zstd inner function
 # fullbench32: Same as fullbench, but forced to compile in 32-bits mode
 # ##########################################################################
@@ -52,15 +53,15 @@ BINDIR  = $(PREFIX)/bin
 MANDIR  = $(PREFIX)/share/man/man1
 ZSTDDIR = ../lib
 
-ZSTD_FILES := $(ZSTDDIR)/zstd_compress.c $(ZSTDDIR)/zstd_decompress.c $(ZSTDDIR)/fse.c $(ZSTDDIR)/huff0.c
-ZSTD_LEGACY:= $(ZSTDDIR)/legacy/zstd_v01.c $(ZSTDDIR)/legacy/zstd_v02.c $(ZSTDDIR)/legacy/zstd_v03.c
+ZSTD_FILES := $(ZSTDDIR)/huff0.c $(ZSTDDIR)/fse.c $(ZSTDDIR)/zstd_compress.c $(ZSTDDIR)/zstd_decompress.c
 
 ifeq ($(ZSTD_LEGACY_SUPPORT), 0)
 CPPFLAGS  += -DZSTD_LEGACY_SUPPORT=0
+ZSTD_FILES_LEGACY:=
 else
-ZSTD_FILES+= $(ZSTD_LEGACY)
-CPPFLAGS  += -I../lib/legacy -I./legacy -DZSTD_LEGACY_SUPPORT=1
-ZSTD_FILEIO_LEGACY = legacy/fileio_legacy.c
+ZSTD_LEGACY_SUPPORT:=1
+CPPFLAGS  += -I../lib/legacy -I./legacy
+ZSTD_FILES_LEGACY:= $(ZSTDDIR)/legacy/zstd_v01.c $(ZSTDDIR)/legacy/zstd_v02.c $(ZSTDDIR)/legacy/zstd_v03.c $(ZSTDDIR)/legacy/zstd_v04.c legacy/fileio_legacy.c
 endif
 
 
@@ -75,6 +76,7 @@ endif
 
 ZBUFFTEST = -T2mn
 FUZZERTEST= -T5mn
+ZSTDRTTEST= --test-large-data
 
 .PHONY: default all clean install uninstall test test32 test-all
 
@@ -82,29 +84,33 @@ default: zstd
 
 all: zstd zstd32 fullbench fullbench32 fuzzer fuzzer32 zbufftest zbufftest32 paramgrill datagen
 
-zstd  : $(ZSTD_FILES) $(ZSTDDIR)/zstd_buffered.c \
-        zstdcli.c fileio.c $(ZSTD_FILEIO_LEGACY) bench.c xxhash.c datagen.c 
-	$(CC)      $(FLAGS) $^ -o $@$(EXT)
+zstd  : $(ZSTD_FILES) $(ZSTD_FILES_LEGACY) $(ZSTDDIR)/zbuff.c $(ZSTDDIR)/zdict.c $(ZSTDDIR)/divsufsort.c \
+        zstdcli.c fileio.c bench.c xxhash.c datagen.c dibio.c
+	$(CC)      $(FLAGS) -DZSTD_LEGACY_SUPPORT=$(ZSTD_LEGACY_SUPPORT) $^ -o $@$(EXT)
 
-zstd32: $(ZSTD_FILES) $(ZSTDDIR)/zstd_buffered.c \
-        zstdcli.c fileio.c $(ZSTD_FILEIO_LEGACY) bench.c xxhash.c datagen.c 
-	$(CC) -m32 $(FLAGS) $^ -o $@$(EXT)
+zstd32: $(ZSTD_FILES) $(ZSTD_FILES_LEGACY) $(ZSTDDIR)/zbuff.c $(ZSTDDIR)/zdict.c $(ZSTDDIR)/divsufsort.c \
+        zstdcli.c fileio.c bench.c xxhash.c datagen.c dibio.c 
+	$(CC) -m32 $(FLAGS) -DZSTD_LEGACY_SUPPORT=$(ZSTD_LEGACY_SUPPORT) $^ -o $@$(EXT)
 
 zstd_nolegacy :
 	$(MAKE) zstd ZSTD_LEGACY_SUPPORT=0
 
 zstd-pgo : MOREFLAGS = -fprofile-generate
 zstd-pgo : clean zstd
+	./zstd -b19i1 $(PROFILE_WITH)
+	./zstd -b16i1 $(PROFILE_WITH)
+	./zstd -b9i2 $(PROFILE_WITH)
 	./zstd -b $(PROFILE_WITH)
+	./zstd -b7i2 $(PROFILE_WITH)
+	./zstd -b5 $(PROFILE_WITH)
 	rm zstd
 	$(MAKE) zstd MOREFLAGS=-fprofile-use
 
-zstd-noBench: $(ZSTD_FILES) $(ZSTDDIR)/zstd_buffered.c \
-        zstdcli.c fileio.c $(ZSTD_FILEIO_LEGACY)
-	$(CC)      $(FLAGS) -DZSTD_NOBENCH $^ -o zstd$(EXT)
+zstd-frugal: $(ZSTD_FILES) $(ZSTDDIR)/zbuff.c zstdcli.c fileio.c
+	$(CC)      $(FLAGS) -DZSTD_NOBENCH -DZSTD_NODICT -DZSTD_LEGACY_SUPPORT=0 $^ -o zstd$(EXT)
 
-zstd-frugal: clean 
-	$(MAKE) zstd-noBench ZSTD_LEGACY_SUPPORT=0 
+zstd-small: clean 
+	CFLAGS="-Os -s" $(MAKE) zstd-frugal 
 
 fullbench  : $(ZSTD_FILES) \
         datagen.c fullbench.c
@@ -122,11 +128,11 @@ fuzzer32: $(ZSTD_FILES) \
       datagen.c xxhash.c fuzzer.c
 	$(CC) -m32 $(FLAGS) $^ -o $@$(EXT)
 
-zbufftest  : $(ZSTD_FILES) $(ZSTDDIR)/zstd_buffered.c \
+zbufftest  : $(ZSTD_FILES) $(ZSTDDIR)/zbuff.c \
       datagen.c xxhash.c zbufftest.c
 	$(CC)      $(FLAGS) $^ -o $@$(EXT)
 
-zbufftest32: $(ZSTD_FILES) $(ZSTDDIR)/zstd_buffered.c \
+zbufftest32: $(ZSTD_FILES) $(ZSTDDIR)/zbuff.c \
       datagen.c xxhash.c zbufftest.c
 	$(CC) -m32 $(FLAGS) $^ -o $@$(EXT)
 
@@ -138,7 +144,7 @@ datagen : datagen.c datagencli.c
 	$(CC)      $(FLAGS) $^ -o $@$(EXT)
 
 clean:
-	@rm -f core *.o tmp* result* *.gcda \
+	@rm -f core *.o tmp* result* *.gcda dictionary *.zst \
         zstd$(EXT) zstd32$(EXT) \
         fullbench$(EXT) fullbench32$(EXT) \
         fuzzer$(EXT) fuzzer32$(EXT) zbufftest$(EXT) zbufftest32$(EXT) \
@@ -178,7 +184,7 @@ test32: test-zstd32 test-fullbench32 test-fuzzer32 test-zbuff32
 test-all: test test32 valgrindTest
 
 zstd-playTests: datagen
-	ZSTD=$(ZSTD) ./playTests.sh --test-large-data
+	ZSTD=$(ZSTD) ./playTests.sh $(ZSTDRTTEST)
 
 test-zstd: ZSTD = ./zstd
 test-zstd: zstd zstd-playTests
@@ -213,12 +219,12 @@ valgrindTest: zstd datagen fuzzer fullbench zbufftest
 	@echo "\n ---- valgrind tests : memory analyzer ----"
 	valgrind --leak-check=yes --error-exitcode=1 ./datagen -g50M > $(VOID)
 	./datagen -g16KB > tmp
-	valgrind --leak-check=yes --error-exitcode=1 ./zstd -vf tmp $(VOID)
+	valgrind --leak-check=yes --error-exitcode=1 ./zstd -vf tmp -o $(VOID)
 	./datagen -g2930KB > tmp
-	valgrind --leak-check=yes --error-exitcode=1 ./zstd -5 -vf tmp tmp2
-	valgrind --leak-check=yes --error-exitcode=1 ./zstd -vdf tmp2 $(VOID)
+	valgrind --leak-check=yes --error-exitcode=1 ./zstd -5 -vf tmp -o tmp2
+	valgrind --leak-check=yes --error-exitcode=1 ./zstd -vdf tmp2 -o $(VOID)
 	./datagen -g64MB > tmp
-	valgrind --leak-check=yes --error-exitcode=1 ./zstd -vf tmp $(VOID)
+	valgrind --leak-check=yes --error-exitcode=1 ./zstd -vf tmp -o $(VOID)
 	@rm tmp
 	valgrind --leak-check=yes --error-exitcode=1 ./fuzzer -T1mn -t1
 	valgrind --leak-check=yes --error-exitcode=1 ./fullbench -i1
diff --git a/programs/bench.c b/programs/bench.c
index fcd674a..1c776c4 100644
--- a/programs/bench.c
+++ b/programs/bench.c
@@ -212,6 +212,7 @@ typedef struct
 } blockParam_t;
 
 #define MIN(a,b) ((a)<(b) ? (a) : (b))
+#define MAX(a,b) ((a)>(b) ? (a) : (b))
 
 static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
                         const char* displayName, int cLevel,
@@ -227,6 +228,7 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
     void* const resultBuffer = malloc(srcSize);
     ZSTD_CCtx* refCtx = ZSTD_createCCtx();
     ZSTD_CCtx* ctx = ZSTD_createCCtx();
+    ZSTD_DCtx* refDCtx = ZSTD_createDCtx();
     ZSTD_DCtx* dctx = ZSTD_createDCtx();
     U64 crcOrig = XXH64(srcBuffer, srcSize, 0);
     U32 nbBlocks = 0;
@@ -235,7 +237,7 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
     if (strlen(displayName)>17) displayName += strlen(displayName)-17;   /* can only display 17 characters */
 
     /* Memory allocation & restrictions */
-    if (!compressedBuffer || !resultBuffer || !blockTable || !refCtx || !ctx || !dctx)
+    if (!compressedBuffer || !resultBuffer || !blockTable || !refCtx || !ctx || !refDCtx || !dctx)
         EXM_THROW(31, "not enough memory");
 
     /* Init blockTable data */
@@ -244,13 +246,11 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
         const char* srcPtr = (const char*)srcBuffer;
         char* cPtr = (char*)compressedBuffer;
         char* resPtr = (char*)resultBuffer;
-        for (fileNb=0; fileNb<nbFiles; fileNb++)
-        {
+        for (fileNb=0; fileNb<nbFiles; fileNb++) {
             size_t remaining = fileSizes[fileNb];
             U32 nbBlocksforThisFile = (U32)((remaining + (blockSize-1)) / blockSize);
             U32 blockEnd = nbBlocks + nbBlocksforThisFile;
-            for ( ; nbBlocks<blockEnd; nbBlocks++)
-            {
+            for ( ; nbBlocks<blockEnd; nbBlocks++) {
                 size_t thisBlockSize = MIN(remaining, blockSize);
                 blockTable[nbBlocks].srcPtr = srcPtr;
                 blockTable[nbBlocks].cPtr = cPtr;
@@ -262,9 +262,7 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
                 resPtr += thisBlockSize;
                 remaining -= thisBlockSize;
                 if (thisBlockSize > largestBlockSize) largestBlockSize = thisBlockSize;
-            }
-        }
-    }
+    }   }   }
 
     /* warmimg up memory */
     RDG_genBuffer(compressedBuffer, maxCompressedSize, 0.10, 0.50, 1);
@@ -278,8 +276,7 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
         U64 crcCheck = 0;
 
         DISPLAY("\r%79s\r", "");
-        for (loopNb = 1; loopNb <= nbIterations; loopNb++)
-        {
+        for (loopNb = 1; loopNb <= nbIterations; loopNb++) {
             int nbLoops;
             int milliTime;
             U32 blockNb;
@@ -292,29 +289,15 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
             milliTime = BMK_GetMilliStart();
             while (BMK_GetMilliStart() == milliTime);
             milliTime = BMK_GetMilliStart();
-            while (BMK_GetMilliSpan(milliTime) < TIMELOOP)
-            {
-                ZSTD_compressBegin_advanced(refCtx, ZSTD_getParams(cLevel, dictBufferSize+largestBlockSize));
-                ZSTD_compress_insertDictionary(refCtx, dictBuffer, dictBufferSize);
-                for (blockNb=0; blockNb<nbBlocks; blockNb++)
-                {
-                    ZSTD_duplicateCCtx(ctx, refCtx);
-                    size_t rSize = ZSTD_compressContinue(ctx,
-                                          blockTable[blockNb].cPtr,  blockTable[blockNb].cRoom,
-                                          blockTable[blockNb].srcPtr,blockTable[blockNb].srcSize);
-                    if (ZSTD_isError(rSize)) EXM_THROW(1, "ZSTD_compressContinue() failed : %s", ZSTD_getErrorName(rSize));
+            while (BMK_GetMilliSpan(milliTime) < TIMELOOP) {
+                ZSTD_compressBegin_advanced(refCtx, dictBuffer, dictBufferSize, ZSTD_getParams(cLevel, MAX(dictBufferSize, largestBlockSize)));
+                for (blockNb=0; blockNb<nbBlocks; blockNb++) {
+                    size_t rSize = ZSTD_compress_usingPreparedCCtx(ctx, refCtx,
+                                        blockTable[blockNb].cPtr,  blockTable[blockNb].cRoom,
+                                        blockTable[blockNb].srcPtr,blockTable[blockNb].srcSize);
+                    if (ZSTD_isError(rSize)) EXM_THROW(1, "ZSTD_compress_usingPreparedCCtx() failed : %s", ZSTD_getErrorName(rSize));
                     blockTable[blockNb].cSize = rSize;
-                    rSize = ZSTD_compressEnd(ctx,
-                                          blockTable[blockNb].cPtr  + rSize,
-                                          blockTable[blockNb].cRoom - rSize);
-                    if (ZSTD_isError(rSize)) EXM_THROW(2, "ZSTD_compressEnd() failed : %s", ZSTD_getErrorName(rSize));
-                    blockTable[blockNb].cSize += rSize;
                 }
-                    /*blockTable[blockNb].cSize = ZSTD_compress_usingDict(ctx,
-                                                              blockTable[blockNb].cPtr,  blockTable[blockNb].cRoom,
-                                                              blockTable[blockNb].srcPtr,blockTable[blockNb].srcSize,
-                                                              dictBuffer, dictBufferSize,
-                                                              cLevel);*/
                 nbLoops++;
             }
             milliTime = BMK_GetMilliSpan(milliTime);
@@ -328,47 +311,54 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
 
 #if 1
             /* Decompression */
-            memset(resultBuffer, 0xD6, srcSize);
+            memset(resultBuffer, 0xD6, srcSize);  /* warm result buffer */
 
             nbLoops = 0;
             milliTime = BMK_GetMilliStart();
             while (BMK_GetMilliStart() == milliTime);
             milliTime = BMK_GetMilliStart();
-            for ( ; BMK_GetMilliSpan(milliTime) < TIMELOOP; nbLoops++)
-            {
-                for (blockNb=0; blockNb<nbBlocks; blockNb++)
-                    blockTable[blockNb].resSize = ZSTD_decompress_usingDict(dctx,
-                                                                blockTable[blockNb].resPtr, blockTable[blockNb].srcSize,
-                                                                blockTable[blockNb].cPtr, blockTable[blockNb].cSize,
-                                                                dictBuffer, dictBufferSize);
-            }
-            milliTime = BMK_GetMilliSpan(milliTime);
 
+            for ( ; BMK_GetMilliSpan(milliTime) < TIMELOOP; nbLoops++) {
+                ZSTD_decompressBegin_usingDict(refDCtx, dictBuffer, dictBufferSize);
+                for (blockNb=0; blockNb<nbBlocks; blockNb++) {
+                    size_t regenSize = ZSTD_decompress_usingPreparedDCtx(dctx, refDCtx,
+                        blockTable[blockNb].resPtr, blockTable[blockNb].srcSize,
+                        blockTable[blockNb].cPtr, blockTable[blockNb].cSize);
+                    if (ZSTD_isError(regenSize)) {
+                        DISPLAY("ZSTD_decompress_usingPreparedDCtx() failed on block %u : %s",
+                                  blockNb, ZSTD_getErrorName(regenSize));
+                        goto _findError;
+                    }
+                    blockTable[blockNb].resSize = regenSize;
+            }   }
+
+            milliTime = BMK_GetMilliSpan(milliTime);
             if ((double)milliTime < fastestD*nbLoops) fastestD = (double)milliTime / nbLoops;
             DISPLAY("%2i-%-17.17s :%10i ->%10i (%5.3f),%6.1f MB/s ,%6.1f MB/s\r", loopNb, displayName, (int)srcSize, (int)cSize, ratio, (double)srcSize / fastestC / 1000., (double)srcSize / fastestD / 1000.);
 
             /* CRC Checking */
+_findError:
             crcCheck = XXH64(resultBuffer, srcSize, 0);
-            if (crcOrig!=crcCheck)
-            {
+            if (crcOrig!=crcCheck) {
                 size_t u;
                 DISPLAY("\n!!! WARNING !!! %14s : Invalid Checksum : %x != %x\n", displayName, (unsigned)crcOrig, (unsigned)crcCheck);
-                for (u=0; u<srcSize; u++)
-                {
-                    if (((const BYTE*)srcBuffer)[u] != ((const BYTE*)resultBuffer)[u])
-                    {
-                        U32 bn;
+                for (u=0; u<srcSize; u++) {
+                    if (((const BYTE*)srcBuffer)[u] != ((const BYTE*)resultBuffer)[u]) {
+                        U32 segNb, bNb, pos;
                         size_t bacc = 0;
                         printf("Decoding error at pos %u ", (U32)u);
-                        for (bn = 0; bn < nbBlocks; bn++)
-                        {
-                            if (bacc + blockTable[bn].srcSize > u) break;
-                            bacc += blockTable[bn].srcSize;
+                        for (segNb = 0; segNb < nbBlocks; segNb++) {
+                            if (bacc + blockTable[segNb].srcSize > u) break;
+                            bacc += blockTable[segNb].srcSize;
                         }
-                        printf("(block %u, pos %u) \n", bn, (U32)(u - bacc));
+                        pos = (U32)(u - bacc);
+                        bNb = pos / (128 KB);
+                        printf("(block %u, sub %u, pos %u) \n", segNb, bNb, pos);
                         break;
                     }
-                }
+                    if (u==srcSize-1) {  /* should never happen */
+                        printf("no difference detected\n");
+                }   }
                 break;
             }
 #endif
@@ -377,7 +367,7 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
         if (crcOrig == crcCheck)
             DISPLAY("%2i-%-17.17s :%10i ->%10i (%5.3f),%6.1f MB/s ,%6.1f MB/s \n", cLevel, displayName, (int)srcSize, (int)cSize, ratio, (double)srcSize / fastestC / 1000., (double)srcSize / fastestD / 1000.);
         else
-            DISPLAY("X \n");
+            DISPLAY("%2i-\n", cLevel);
     }
 
     /* clean up */
@@ -385,6 +375,7 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
     free(resultBuffer);
     ZSTD_freeCCtx(refCtx);
     ZSTD_freeCCtx(ctx);
+    ZSTD_freeDCtx(refDCtx);
     ZSTD_freeDCtx(dctx);
     return 0;
 }
@@ -399,12 +390,10 @@ static size_t BMK_findMaxMem(U64 requiredMem)
     requiredMem += 2 * step;
     if (requiredMem > maxMemory) requiredMem = maxMemory;
 
-    while (!testmem)
-    {
+    while (!testmem) {
         requiredMem -= step;
         testmem = (BYTE*)malloc((size_t)requiredMem);
     }
-
     free(testmem);
     return (size_t)(requiredMem - step);
 }
@@ -414,8 +403,7 @@ static void BMK_benchCLevel(void* srcBuffer, size_t benchedSize,
                             const size_t* fileSizes, unsigned nbFiles,
                             const void* dictBuffer, size_t dictBufferSize)
 {
-    if (cLevel < 0)
-    {
+    if (cLevel < 0) {
         int l;
         for (l=1; l <= -cLevel; l++)
             BMK_benchMem(srcBuffer, benchedSize,
@@ -447,8 +435,7 @@ static void BMK_loadFiles(void* buffer, size_t bufferSize,
     size_t pos = 0;
     unsigned n;
 
-    for (n=0; n<nbFiles; n++)
-    {
+    for (n=0; n<nbFiles; n++) {
         size_t readSize;
         U64 fileSize = BMK_getFileSize(fileNamesTable[n]);
         FILE* f = fopen(fileNamesTable[n], "rb");
@@ -478,8 +465,7 @@ static void BMK_benchFileTable(const char** fileNamesTable, unsigned nbFiles,
     if (!fileSizes) EXM_THROW(12, "not enough memory for fileSizes");
 
     /* Load dictionary */
-    if (dictFileName != NULL)
-    {
+    if (dictFileName != NULL) {
         U64 dictFileSize = BMK_getFileSize(dictFileName);
         if (dictFileSize > 64 MB) EXM_THROW(10, "dictionary file %s too large", dictFileName);
         dictBufferSize = (size_t)dictFileSize;
diff --git a/programs/datagen.c b/programs/datagen.c
index 2bb3426..ff3a8cd 100644
--- a/programs/datagen.c
+++ b/programs/datagen.c
@@ -23,7 +23,7 @@
    - Public forum : https://groups.google.com/forum/#!forum/lz4c
 */
 
-/**************************************
+/*-************************************
 *  Includes
 **************************************/
 #include <stdlib.h>    /* malloc */
@@ -31,7 +31,7 @@
 #include <string.h>    /* memcpy */
 
 
-/**************************************
+/*-************************************
 *  Basic Types
 **************************************/
 #if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)   /* C99 */
@@ -50,7 +50,7 @@
 #endif
 
 
-/**************************************
+/*-************************************
 *  OS-specific Includes
 **************************************/
 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__)
@@ -62,7 +62,7 @@
 #endif
 
 
-/**************************************
+/*-************************************
 *  Constants
 **************************************/
 #define KB *(1 <<10)
@@ -71,7 +71,7 @@
 #define PRIME2   2246822519U
 
 
-/**************************************
+/*-************************************
 *  Local types
 **************************************/
 #define LTLOG 13
@@ -81,7 +81,7 @@ typedef BYTE litDistribTable[LTSIZE];
 
 
 
-/*********************************************************
+/*-*******************************************************
 *  Local Functions
 *********************************************************/
 #define RDG_rotl32(x,r) ((x << r) | (x >> (32 - r)))
@@ -103,14 +103,12 @@ static void RDG_fillLiteralDistrib(litDistribTable lt, double ld)
     BYTE firstChar = '(';
     BYTE lastChar = '}';
 
-    if (ld==0.0)
-    {
+    if (ld==0.0) {
         character = 0;
         firstChar = 0;
         lastChar =255;
     }
-    while (i<LTSIZE)
-    {
+    while (i<LTSIZE) {
         U32 weight = (U32)((double)(LTSIZE - i) * ld) + 1;
         U32 end;
         if (weight + i > LTSIZE) weight = LTSIZE-i;
@@ -140,13 +138,11 @@ void RDG_genBlock(void* buffer, size_t buffSize, size_t prefixSize, double match
     U32 prevOffset = 1;
 
     /* special case : sparse content */
-    while (matchProba >= 1.0)
-    {
+    while (matchProba >= 1.0) {
         size_t size0 = RDG_rand(seed) & 3;
         size0  = (size_t)1 << (16 + size0 * 2);
         size0 += RDG_rand(seed) & (size0-1);   /* because size0 is power of 2*/
-        if (buffSize < pos + size0)
-        {
+        if (buffSize < pos + size0) {
             memset(buffPtr+pos, 0, buffSize-pos);
             return;
         }
@@ -160,11 +156,9 @@ void RDG_genBlock(void* buffer, size_t buffSize, size_t prefixSize, double match
     if (pos==0) buffPtr[0] = RDG_genChar(seed, lt), pos=1;
 
     /* Generate compressible data */
-    while (pos < buffSize)
-    {
+    while (pos < buffSize) {
         /* Select : Literal (char) or Match (within 32K) */
-        if (RDG_RAND15BITS < matchProba32)
-        {
+        if (RDG_RAND15BITS < matchProba32) {
             /* Copy (within 32K) */
             size_t match;
             size_t d;
@@ -178,17 +172,14 @@ void RDG_genBlock(void* buffer, size_t buffSize, size_t prefixSize, double match
             d = pos + length;
             if (d > buffSize) d = buffSize;
             while (pos < d) buffPtr[pos++] = buffPtr[match++];   /* correctly manages overlaps */
-        }
-        else
-        {
+        } else {
             /* Literal (noise) */
             size_t d;
             size_t length = RDG_RANDLENGTH;
             d = pos + length;
             if (d > buffSize) d = buffSize;
             while (pos < d) buffPtr[pos++] = RDG_genChar(seed, lt);
-        }
-    }
+    }   }
 }
 
 
@@ -220,8 +211,7 @@ void RDG_genStdout(unsigned long long size, double matchProba, double litProba,
     RDG_genBlock(buff, RDG_DICTSIZE, 0, matchProba, lt, &seed);
 
     /* Generate compressible data */
-    while (total < size)
-    {
+    while (total < size) {
         RDG_genBlock(buff, RDG_DICTSIZE+RDG_BLOCKSIZE, RDG_DICTSIZE, matchProba, lt, &seed);
         if (size-total < RDG_BLOCKSIZE) genBlockSize = (size_t)(size-total);
         total += genBlockSize;
@@ -230,6 +220,6 @@ void RDG_genStdout(unsigned long long size, double matchProba, double litProba,
         memcpy(buff, buff + RDG_BLOCKSIZE, RDG_DICTSIZE);
     }
 
-    // cleanup
+    /* cleanup */
     free(buff);
 }
diff --git a/programs/dibio.c b/programs/dibio.c
new file mode 100644
index 0000000..646fe2c
--- /dev/null
+++ b/programs/dibio.c
@@ -0,0 +1,277 @@
+/*
+    dibio - I/O API for dictionary builder
+    Copyright (C) Yann Collet 2016
+
+    GPL v2 License
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    You can contact the author at :
+    - zstd homepage : http://www.zstd.net/
+*/
+
+/*-**************************************
+*  Compiler Options
+****************************************/
+/* Disable some Visual warning messages */
+#ifdef _MSC_VER
+#  define _CRT_SECURE_NO_WARNINGS                /* fopen */
+#  pragma warning(disable : 4127)                /* disable: C4127: conditional expression is constant */
+#endif
+
+/* Unix Large Files support (>4GB) */
+#define _FILE_OFFSET_BITS 64
+#if (defined(__sun__) && (!defined(__LP64__)))   /* Sun Solaris 32-bits requires specific definitions */
+#  define _LARGEFILE_SOURCE
+#elif ! defined(__LP64__)                        /* No point defining Large file for 64 bit */
+#  define _LARGEFILE64_SOURCE
+#endif
+
+
+/*-*************************************
+*  Includes
+***************************************/
+#include <stdlib.h>         /* malloc, free */
+#include <string.h>         /* memset */
+#include <stdio.h>          /* fprintf, fopen, ftello64 */
+#include <sys/types.h>      /* stat64 */
+#include <sys/stat.h>       /* stat64 */
+#include <time.h>           /* clock */
+
+#include "mem.h"            /* read */
+#include "error_private.h"
+#include "zdict_static.h"
+
+
+/*-*************************************
+*  Compiler specifics
+***************************************/
+#if !defined(S_ISREG)
+#  define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
+#endif
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define DICTLISTSIZE 10000
+#define MEMMULT 11
+static const size_t maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
+
+#define NOISELENGTH 32
+#define PRIME1   2654435761U
+#define PRIME2   2246822519U
+
+
+/*-*************************************
+*  Console display
+***************************************/
+#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
+static unsigned g_displayLevel = 0;   /* 0 : no display;   1: errors;   2: default;  4: full information */
+
+
+/*-*************************************
+*  Exceptions
+***************************************/
+#ifndef DEBUG
+#  define DEBUG 0
+#endif
+#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
+#define EXM_THROW(error, ...)                                             \
+{                                                                         \
+    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
+    DISPLAYLEVEL(1, "Error %i : ", error);                                \
+    DISPLAYLEVEL(1, __VA_ARGS__);                                         \
+    DISPLAYLEVEL(1, "\n");                                                \
+    exit(error);                                                          \
+}
+
+
+/* ********************************************************
+*  Helper functions
+**********************************************************/
+unsigned DiB_isError(size_t errorCode) { return ERR_isError(errorCode); }   /* thin wrapper : forwards `errorCode` to ERR_isError() */
+/* DiB_getErrorName() : thin wrapper, forwards `errorCode` to ERR_getErrorName() */
+const char* DiB_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
+
+
+/* ********************************************************
+*  File related operations
+**********************************************************/
+static unsigned long long DiB_getFileSize(const char* infilename)   /* @return : st_size of `infilename`, or 0 if it cannot be stat'ed or is not a regular file */
+{
+    int r;
+#if defined(_MSC_VER)
+    struct _stat64 statbuf;
+    r = _stat64(infilename, &statbuf);
+#else
+    struct stat statbuf;
+    r = stat(infilename, &statbuf);
+#endif
+    if (r || !S_ISREG(statbuf.st_mode)) return 0;   /* stat call failed, or path is not a regular file : reported as size 0 */
+    return (unsigned long long)statbuf.st_size;
+}
+
+/* DiB_getTotalFileSize() : sum of DiB_getFileSize() over the `nbFiles` entries of `fileNamesTable` */
+static unsigned long long DiB_getTotalFileSize(const char** fileNamesTable, unsigned nbFiles)
+{
+    unsigned long long sum = 0;
+    const char** const tableEnd = fileNamesTable + nbFiles;
+    while (fileNamesTable < tableEnd)
+        sum += DiB_getFileSize(*fileNamesTable++);
+    return sum;
+}
+
+/* DiB_loadFiles() : reads the files back-to-back into `buffer`; `fileSizes[n]` receives the nb of bytes taken from file n (0 when it no longer fits) */
+static void DiB_loadFiles(void* buffer, size_t bufferSize,
+                          size_t* fileSizes,
+                          const char** fileNamesTable, unsigned nbFiles)
+{
+    char* const dst = (char*)buffer;
+    size_t filled = 0;
+    unsigned fileNb = 0;
+    while (fileNb < nbFiles) {
+        const char* const fileName = fileNamesTable[fileNb];
+        unsigned long long toLoad = DiB_getFileSize(fileName);
+        size_t loaded;
+        FILE* const f = fopen(fileName, "rb");
+        if (f==NULL) EXM_THROW(10, "impossible to open file %s", fileName);
+        DISPLAYLEVEL(2, "Loading %s...       \r", fileName);
+        if (toLoad > bufferSize-filled) toLoad = 0;  /* does not fit in remaining space : nothing is read from this file */
+        loaded = fread(dst+filled, 1, (size_t)toLoad, f);
+        if (loaded != (size_t)toLoad) EXM_THROW(11, "could not read %s", fileName);
+        filled += loaded;
+        fileSizes[fileNb++] = (size_t)toLoad;
+        fclose(f);
+    }
+}
+
+
+/*-********************************************************
+*  Dictionary training functions
+**********************************************************/
+/* DiB_findMaxMem() : probes malloc() in decreasing 8 MB steps, starting from `requiredMem` rounded up plus one step (capped by maxMemory).
+   @return : first size that could be allocated, minus one step kept as headroom; 0 when nothing is left once that headroom is removed */
+static size_t DiB_findMaxMem(unsigned long long requiredMem)
+{
+    const size_t step = 8 MB;
+    void* testmem = NULL;
+    requiredMem = (((requiredMem >> 23) + 1) << 23);   /* move up to the next 8 MB boundary */
+    requiredMem += 2 * step;                           /* offsets the 2 steps subtracted below (probe + headroom) */
+    if (requiredMem > maxMemory) requiredMem = maxMemory;
+    while (!testmem) {
+        if (requiredMem <= 2 * step) return 0;   /* result would be <= 0 : stop instead of wrapping the unsigned counter */
+        requiredMem -= step;
+        testmem = malloc((size_t)requiredMem);
+    }
+    free(testmem);
+    return (size_t)(requiredMem - step);
+}
+
+/* DiB_fillNoise() : overwrites `length` bytes at `buffer` with a fixed pseudo-random sequence (same bytes on every call) */
+static void DiB_fillNoise(void* buffer, size_t length)
+{
+    unsigned char* out = (unsigned char*)buffer;
+    unsigned char* const end = out + length;
+    unsigned state = PRIME1;   /* multiplied by PRIME2 before each byte is emitted */
+    while (out < end) {
+        state *= PRIME2;
+        *out++ = (unsigned char)(state >> 21);
+    }
+}
+
+/* DiB_saveDict() : creates file `dictFileName` and stores the `buffSize` bytes of `buff` into it.
+   Open, write or close failure terminates the process through EXM_THROW() (codes 3, 4, 5). */
+static void DiB_saveDict(const char* dictFileName,
+                         const void* buff, size_t buffSize)
+{
+    FILE* const f = fopen(dictFileName, "wb");
+    if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
+
+    {   size_t const written = fwrite(buff, 1, buffSize, f);
+        if (written!=buffSize) EXM_THROW(4, "%s : write error", dictFileName);
+    }
+
+    {   int const closeStatus = fclose(f);   /* fclose() also flushes pending data */
+        if (closeStatus!=0) EXM_THROW(5, "%s : flush error", dictFileName);
+    }
+}
+
+
+/*! ZDICT_trainFromBuffer_unsafe() :
+    Strictly Internal use only !!
+    Same as ZDICT_trainFromBuffer_advanced(), but does not control `samplesBuffer`.
+    `samplesBuffer` must be followed by noisy guard band to avoid out-of-buffer reads.
+    @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+              or an error code.
+*/
+size_t ZDICT_trainFromBuffer_unsafe(void* dictBuffer, size_t dictBufferCapacity,
+                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                              ZDICT_params_t parameters);
+
+/*! DiB_trainFromFiles() : loads the sample files, trains a dictionary of at most `maxDictSize` bytes, writes it to `dictFileName`.
+    @return : 0 on success, 1 when training fails; open/read/write/allocation failures exit() through EXM_THROW(). */
+int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
+                       const char** fileNamesTable, unsigned nbFiles,
+                       ZDICT_params_t params)
+{
+    void* srcBuffer;
+    size_t benchedSize;
+    size_t loadedSize = 0;   /* bytes really stored by DiB_loadFiles() : files that no longer fit contribute 0 */
+    size_t* fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
+    unsigned long long totalSizeToLoad = DiB_getTotalFileSize(fileNamesTable, nbFiles);
+    void* dictBuffer = malloc(maxDictSize);
+    size_t dictSize;
+    int result = 0;
+
+    /* init */
+    g_displayLevel = params.notificationLevel;
+    benchedSize = DiB_findMaxMem(totalSizeToLoad * MEMMULT) / MEMMULT;
+    if ((unsigned long long)benchedSize > totalSizeToLoad) benchedSize = (size_t)totalSizeToLoad;
+    if (benchedSize < totalSizeToLoad)
+        DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20));
+
+    /* Memory allocation & restrictions */
+    srcBuffer = malloc(benchedSize+NOISELENGTH);     /* + noise */
+    if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles");  /* should not happen */
+
+    /* Load input buffer */
+    DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles);
+    {   unsigned n; for (n=0; n<nbFiles; n++) loadedSize += fileSizes[n]; }   /* samples end here, which can be before benchedSize */
+    DiB_fillNoise((char*)srcBuffer + loadedSize, (benchedSize - loadedSize) + NOISELENGTH);   /* guard band, contiguous with the last sample */
+
+    /* call buffer version */
+    dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize, srcBuffer, fileSizes, nbFiles, params);
+    if (ZDICT_isError(dictSize)) {
+        DISPLAYLEVEL(1, "dictionary training failed : %s\n", ZDICT_getErrorName(dictSize));   /* should not happen */
+        result = 1;
+        goto _cleanup;
+    }
+
+    /* save dict */
+    DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
+    DiB_saveDict(dictFileName, dictBuffer, dictSize);
+    /* clean up */
+_cleanup:
+    free(srcBuffer);
+    free(dictBuffer);
+    free(fileSizes);
+    return result;
+}
diff --git a/programs/dibio.h b/programs/dibio.h
new file mode 100644
index 0000000..0ccec41
--- /dev/null
+++ b/programs/dibio.h
@@ -0,0 +1,52 @@
+/*
+    dibio.h - I/O API for dictionary builder
+    Copyright (C) Yann Collet 2016
+
+    GPL v2 License
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    You can contact the author at :
+    - zstd homepage : http://www.zstd.net/
+*/
+
+/* This library is designed for a single-threaded console application.
+*  It reports to stderr and calls exit() when it encounters an error condition. */
+
+#ifndef DIBIO_H_003
+#define DIBIO_H_003
+
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include "zdict_static.h"   /* ZDICT_params_t */
+
+
+/*-*************************************
+*  Public functions
+***************************************/
+/*! DiB_trainFromFiles() :
+    Train a dictionary from a set of files provided by `fileNamesTable`.
+    Resulting dictionary is written into file `dictFileName`.
+    `parameters` is optional and can be provided with values set to 0, meaning "default".
+    @return : 0 == ok. Any other : error.
+*/
+int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
+                       const char** fileNamesTable, unsigned nbFiles,
+                       ZDICT_params_t parameters);
+
+
+#endif
diff --git a/programs/fileio.c b/programs/fileio.c
index 0d49af2..028c7db 100644
--- a/programs/fileio.c
+++ b/programs/fileio.c
@@ -1,6 +1,6 @@
 /*
-  fileio.c - File i/o handler
-  Copyright (C) Yann Collet 2013-2015
+  fileio.c - File i/o handler for zstd
+  Copyright (C) Yann Collet 2013-2016
 
   GPL v2 License
 
@@ -19,8 +19,7 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
   You can contact the author at :
-  - zstd source repository : https://github.com/Cyan4973/zstd
-  - Public forum : https://groups.google.com/forum/#!forum/lz4c
+  - zstd homepage : http://www.zstd.net
 */
 /*
   Note : this is stand-alone program.
@@ -33,7 +32,7 @@
 *  Tuning options
 ***************************************/
 #ifndef ZSTD_LEGACY_SUPPORT
-/**LEGACY_SUPPORT :
+/* LEGACY_SUPPORT :
 *  decompressor can decode older formats (starting from Zstd 0.1+) */
 #  define ZSTD_LEGACY_SUPPORT 1
 #endif
@@ -53,7 +52,7 @@
 #define _POSIX_SOURCE 1        /* enable fileno() within <stdio.h> on unix */
 
 
-/* *************************************
+/*-*************************************
 *  Includes
 ***************************************/
 #include <stdio.h>      /* fprintf, fopen, fread, _fileno, stdin, stdout */
@@ -66,23 +65,20 @@
 #include "mem.h"
 #include "fileio.h"
 #include "zstd_static.h"   /* ZSTD_magicNumber */
-#include "zstd_buffered_static.h"
+#include "zbuff_static.h"
 
 #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT==1)
-#  include "zstd_legacy.h"    /* legacy */
-#  include "fileio_legacy.h"  /* legacy */
+#  include "zstd_legacy.h"    /* ZSTD_isLegacy */
+#  include "fileio_legacy.h"  /* FIO_decompressLegacyFrame */
 #endif
 
 
-/* *************************************
+/*-*************************************
 *  OS-specific Includes
 ***************************************/
 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__)
 #  include <fcntl.h>    /* _O_BINARY */
 #  include <io.h>       /* _setmode, _isatty */
-#  ifdef __MINGW32__
-   // int _fileno(FILE *stream);   /* seems no longer useful /* MINGW somehow forgets to include this windows declaration into <stdio.h> */
-#  endif
 #  define SET_BINARY_MODE(file) { int unused = _setmode(_fileno(file), _O_BINARY); (void)unused; }
 #  define IS_CONSOLE(stdStream) _isatty(_fileno(stdStream))
 #else
@@ -96,7 +92,7 @@
 #endif
 
 
-/* *************************************
+/*-*************************************
 *  Constants
 ***************************************/
 #define KB *(1U<<10)
@@ -116,14 +112,15 @@
 #define BLOCKSIZE      (128 KB)
 #define ROLLBUFFERSIZE (BLOCKSIZE*8*64)
 
-#define FIO_FRAMEHEADERSIZE 5        /* as a define, because needed to allocated table on stack */
-#define FSE_CHECKSUM_SEED        0
+#define FIO_FRAMEHEADERSIZE  5        /* as a define, because needed to allocated table on stack */
+#define FSE_CHECKSUM_SEED    0
 
 #define CACHELINE 64
 
-#define MAX_DICT_SIZE (512 KB)
+#define MAX_DICT_SIZE (1 MB)   /* protection against large input (attack scenario) ; can be changed */
 
-/* *************************************
+
+/*-*************************************
 *  Macros
 ***************************************/
 #define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
@@ -137,17 +134,18 @@ static U32 g_displayLevel = 2;   /* 0 : no display;   1: errors;   2 : + result
 static const unsigned refreshRate = 150;
 static clock_t g_time = 0;
 
+#define MAX(a,b)   ((a)>(b)?(a):(b))
 
-/* *************************************
+
+/*-*************************************
 *  Local Parameters
 ***************************************/
 static U32 g_overwrite = 0;
-
 void FIO_overwriteMode(void) { g_overwrite=1; }
 void FIO_setNotificationLevel(unsigned level) { g_displayLevel=level; }
 
 
-/* *************************************
+/*-*************************************
 *  Exceptions
 ***************************************/
 #ifndef DEBUG
@@ -164,7 +162,7 @@ void FIO_setNotificationLevel(unsigned level) { g_displayLevel=level; }
 }
 
 
-/* *************************************
+/*-*************************************
 *  Functions
 ***************************************/
 static unsigned FIO_GetMilliSpan(clock_t nPrevious)
@@ -190,69 +188,57 @@ static U64 FIO_getFileSize(const char* infilename)
 }
 
 
-static int FIO_getFiles(FILE** fileOutPtr, FILE** fileInPtr,
-                        const char* dstFileName, const char* srcFileName)
+static FILE* FIO_openSrcFile(const char* srcFileName)
 {
-    if (!strcmp (srcFileName, stdinmark))
-    {
+    FILE* f;
+
+    if (!strcmp (srcFileName, stdinmark)) {
         DISPLAYLEVEL(4,"Using stdin for input\n");
-        *fileInPtr = stdin;
+        f = stdin;
         SET_BINARY_MODE(stdin);
-    }
-    else
-    {
-        *fileInPtr = fopen(srcFileName, "rb");
+    } else {
+        f = fopen(srcFileName, "rb");
     }
 
-    if ( *fileInPtr==0 )
-    {
-        DISPLAYLEVEL(1, "Unable to access file for processing: %s\n", srcFileName);
-        return 1;
-    }
+    if ( f==NULL ) DISPLAYLEVEL(1, "zstd: %s: No such file\n", srcFileName);
 
-    if (!strcmp (dstFileName, stdoutmark))
-    {
+    return f;
+}
+
+
+static FILE* FIO_openDstFile(const char* dstFileName)
+{
+    FILE* f;
+
+    if (!strcmp (dstFileName, stdoutmark)) {
         DISPLAYLEVEL(4,"Using stdout for output\n");
-        *fileOutPtr = stdout;
+        f = stdout;
         SET_BINARY_MODE(stdout);
-    }
-    else
-    {
-        /* Check if destination file already exists */
-        if (!g_overwrite)
-        {
-            *fileOutPtr = fopen( dstFileName, "rb" );
-            if (*fileOutPtr != 0)
-            {
-                /* prompt for overwrite authorization */
-                fclose(*fileOutPtr);
-                DISPLAY("Warning : %s already exists \n", dstFileName);
-                if ((g_displayLevel <= 1) || (*fileInPtr == stdin))
-                {
+    } else {
+        if (!g_overwrite) {  /* Check if destination file already exists */
+            f = fopen( dstFileName, "rb" );
+            if (f != 0) {  /* dest file exists, prompt for overwrite authorization */
+                fclose(f);
+                if (g_displayLevel <= 1) {
                     /* No interaction possible */
-                    DISPLAY("Operation aborted : %s already exists \n", dstFileName);
-                    return 1;
+                    DISPLAY("zstd: %s already exists; not overwritten  \n", dstFileName);
+                    return 0;
                 }
-                DISPLAY("Overwrite ? (y/N) : ");
+                DISPLAY("zstd: %s already exists; do you wish to overwrite (y/N) ? ", dstFileName);
                 {
                     int ch = getchar();
-                    if ((ch!='Y') && (ch!='y'))
-                    {
-                        DISPLAY("No. Operation aborted : %s already exists \n", dstFileName);
-                        return 1;
+                    if ((ch!='Y') && (ch!='y')) {
+                        DISPLAY("    not overwritten  \n");
+                        return 0;
                     }
                     while ((ch!=EOF) && (ch!='\n')) ch = getchar();  /* flush rest of input line */
-                }
-            }
-        }
-        *fileOutPtr = fopen( dstFileName, "wb" );
+        }   }   }
+        f = fopen( dstFileName, "wb" );
     }
-
-    if (*fileOutPtr==0) EXM_THROW(13, "Pb opening %s", dstFileName);
-
-    return 0;
+    return f;
 }
 
+
 /*!FIO_loadFile
 *  creates a buffer, pointed by *bufferPtr,
 *  loads "filename" content into it
@@ -265,15 +251,13 @@ static size_t FIO_loadFile(void** bufferPtr, const char* fileName)
     U64 fileSize;
 
     *bufferPtr = NULL;
-    if (fileName == NULL)
-        return 0;
+    if (fileName == NULL) return 0;
 
     DISPLAYLEVEL(4,"Loading %s as dictionary \n", fileName);
     fileHandle = fopen(fileName, "rb");
     if (fileHandle==0) EXM_THROW(31, "Error opening file %s", fileName);
     fileSize = FIO_getFileSize(fileName);
-    if (fileSize > MAX_DICT_SIZE)
-    {
+    if (fileSize > MAX_DICT_SIZE) {
         int seekResult;
         if (fileSize > 1 GB) EXM_THROW(32, "Dictionary file %s is too large", fileName);   /* avoid extreme cases */
         DISPLAYLEVEL(2,"Dictionary %s is too large : using last %u bytes only \n", fileName, MAX_DICT_SIZE);
@@ -301,6 +285,8 @@ typedef struct {
     void*  dictBuffer;
     size_t dictBufferSize;
     ZBUFF_CCtx* ctx;
+    FILE* dstFile;
+    FILE* srcFile;
 } cRess_t;
 
 static cRess_t FIO_createCResources(const char* dictFileName)
@@ -334,46 +320,37 @@ static void FIO_freeCResources(cRess_t ress)
 }
 
 
-/*
- * FIO_compressFilename_extRess()
- * result : 0 : compression completed correctly
- *          1 : missing or pb opening srcFileName
+/*! FIO_compressFilename_internal() :
+ *  same as FIO_compressFilename_extRess(), with ress.srcFile and ress.dstFile already opened
+ *  @return : 0 : compression completed correctly,
+ *            1 : missing or pb opening srcFileName
  */
-static int FIO_compressFilename_extRess(cRess_t ress,
-                                        const char* dstFileName, const char* srcFileName,
-                                        int cLevel)
+static int FIO_compressFilename_internal(cRess_t ress,
+                                         const char* dstFileName, const char* srcFileName,
+                                         int cLevel)
 {
-    FILE* srcFile;
-    FILE* dstFile;
+    FILE* srcFile = ress.srcFile;
+    FILE* dstFile = ress.dstFile;
     U64 filesize = 0;
     U64 compressedfilesize = 0;
     size_t dictSize = ress.dictBufferSize;
     size_t sizeCheck, errorCode;
 
-    /* File check */
-    if (FIO_getFiles(&dstFile, &srcFile, dstFileName, srcFileName)) return 1;
-
     /* init */
-    filesize = FIO_getFileSize(srcFileName) + dictSize;
-    errorCode = ZBUFF_compressInit_advanced(ress.ctx, ZSTD_getParams(cLevel, filesize));
-    if (ZBUFF_isError(errorCode)) EXM_THROW(21, "Error initializing compression");
-    errorCode = ZBUFF_compressWithDictionary(ress.ctx, ress.dictBuffer, ress.dictBufferSize);
-    if (ZBUFF_isError(errorCode)) EXM_THROW(22, "Error initializing dictionary");
+    filesize = MAX(FIO_getFileSize(srcFileName),dictSize);
+    errorCode = ZBUFF_compressInit_advanced(ress.ctx, ress.dictBuffer, ress.dictBufferSize, ZSTD_getParams(cLevel, filesize));
+    if (ZBUFF_isError(errorCode)) EXM_THROW(21, "Error initializing compression : %s", ZBUFF_getErrorName(errorCode));
 
     /* Main compression loop */
     filesize = 0;
-    while (1)
-    {
-        size_t inSize;
-
+    while (1) {
         /* Fill input Buffer */
-        inSize = fread(ress.srcBuffer, (size_t)1, ress.srcBufferSize, srcFile);
+        size_t inSize = fread(ress.srcBuffer, (size_t)1, ress.srcBufferSize, srcFile);
         if (inSize==0) break;
         filesize += inSize;
         DISPLAYUPDATE(2, "\rRead : %u MB  ", (U32)(filesize>>20));
 
-        {
-            /* Compress (buffered streaming ensures appropriate formatting) */
+        {   /* Compress using buffered streaming */
             size_t usedInSize = inSize;
             size_t cSize = ress.dstBufferSize;
             size_t result = ZBUFF_compressContinue(ress.ctx, ress.dstBuffer, &cSize, ress.srcBuffer, &usedInSize);
@@ -388,7 +365,6 @@ static int FIO_compressFilename_extRess(cRess_t ress,
             if (sizeCheck!=cSize) EXM_THROW(25, "Write error : cannot write compressed block into %s", dstFileName);
             compressedfilesize += cSize;
         }
-
         DISPLAYUPDATE(2, "\rRead : %u MB  ==> %.2f%%   ", (U32)(filesize>>20), (double)compressedfilesize/filesize*100);
     }
 
@@ -408,11 +384,53 @@ static int FIO_compressFilename_extRess(cRess_t ress,
     DISPLAYLEVEL(2,"Compressed %llu bytes into %llu bytes ==> %.2f%%\n",
         (unsigned long long) filesize, (unsigned long long) compressedfilesize, (double)compressedfilesize/filesize*100);
 
+    return 0;
+}
+
+
+/*! FIO_compressFilename_srcFile() :
+ *  same as FIO_compressFilename_extRess(), with ress.dstFile already opened
+ *  @return : 0 : compression completed correctly,
+ *            1 : missing or pb opening srcFileName
+ */
+static int FIO_compressFilename_srcFile(cRess_t ress,
+                                        const char* dstFileName, const char* srcFileName,
+                                        int cLevel)
+{
+    int result;
+
+    /* File check */
+    ress.srcFile = FIO_openSrcFile(srcFileName);
+    if (!ress.srcFile) return 1;   /* srcFile could not be opened */
+
+    result = FIO_compressFilename_internal(ress, dstFileName, srcFileName, cLevel);
+
     /* clean */
-    fclose(srcFile);
-    if (fclose(dstFile)) EXM_THROW(28, "Write error : cannot properly close %s", dstFileName);
+    fclose(ress.srcFile);
+    return result;
+}
 
-    return 0;
+
+/*! FIO_compressFilename_extRess() : opens both files, compresses, then closes them.
+ *  @return : result of FIO_compressFilename_internal() (0 : compression completed correctly),
+ *            or 1 when srcFileName or dstFileName could not be opened
+ */
+static int FIO_compressFilename_extRess(cRess_t ress,
+                                        const char* dstFileName, const char* srcFileName,
+                                        int cLevel)
+{
+    int result;
+
+    ress.srcFile = FIO_openSrcFile(srcFileName);
+    if (ress.srcFile==0) return 1;
+    ress.dstFile = FIO_openDstFile(dstFileName);
+    if (ress.dstFile==0) { fclose(ress.srcFile); return 1; }   /* release the source handle opened just above */
+
+    result = FIO_compressFilename_internal(ress, dstFileName, srcFileName, cLevel);
+
+    fclose(ress.srcFile);   /* no pb to expect : only reading */
+    if (fclose(ress.dstFile)) EXM_THROW(28, "Write error : cannot properly close %s", dstFileName);
+    return result;
 }
 
 
@@ -453,22 +471,28 @@ int FIO_compressMultipleFilenames(const char** inFileNamesTable, unsigned nbFile
     int missed_files = 0;
     char* dstFileName = (char*)malloc(FNSPACE);
     size_t dfnSize = FNSPACE;
-    const size_t suffixSize = strlen(suffix);
+    const size_t suffixSize = suffix ? strlen(suffix) : 0;
     cRess_t ress;
 
     /* init */
     ress = FIO_createCResources(dictFileName);
 
     /* loop on each file */
-    for (u=0; u<nbFiles; u++)
-    {
-        size_t ifnSize = strlen(inFileNamesTable[u]);
-        if (dfnSize <= ifnSize+suffixSize+1) { free(dstFileName); dfnSize = ifnSize + 20; dstFileName = (char*)malloc(dfnSize); }
-        strcpy(dstFileName, inFileNamesTable[u]);
-        strcat(dstFileName, suffix);
-
-        missed_files += FIO_compressFilename_extRess(ress, dstFileName, inFileNamesTable[u], compressionLevel);
-    }
+    if (!strcmp(suffix, stdoutmark)) {
+        ress.dstFile = stdout;
+        for (u=0; u<nbFiles; u++)
+            missed_files += FIO_compressFilename_srcFile(ress, stdoutmark,
+                                                          inFileNamesTable[u], compressionLevel);
+        if (fclose(ress.dstFile)) EXM_THROW(29, "Write error : cannot properly close %s", stdoutmark);
+    } else {
+        for (u=0; u<nbFiles; u++) {
+            size_t ifnSize = strlen(inFileNamesTable[u]);
+            if (dfnSize <= ifnSize+suffixSize+1) { free(dstFileName); dfnSize = ifnSize + 20; dstFileName = (char*)malloc(dfnSize); }
+            strcpy(dstFileName, inFileNamesTable[u]);
+            strcat(dstFileName, suffix);
+            missed_files += FIO_compressFilename_extRess(ress, dstFileName,
+                                                         inFileNamesTable[u], compressionLevel);
+    }   }
 
     /* Close & Free */
     FIO_freeCResources(ress);
@@ -489,6 +513,7 @@ typedef struct {
     void*  dictBuffer;
     size_t dictBufferSize;
     ZBUFF_DCtx* dctx;
+    FILE*  dstFile;
 } dRess_t;
 
 static dRess_t FIO_createDResources(const char* dictFileName)
@@ -529,10 +554,8 @@ unsigned long long FIO_decompressFrame(dRess_t ress,
     size_t readSize=alreadyLoaded;
 
     /* Main decompression Loop */
-    ZBUFF_decompressInit(ress.dctx);
-    ZBUFF_decompressWithDictionary(ress.dctx, ress.dictBuffer, ress.dictBufferSize);
-    while (1)
-    {
+    ZBUFF_decompressInitDictionary(ress.dctx, ress.dictBuffer, ress.dictBufferSize);
+    while (1) {
         /* Decode */
         size_t sizeCheck;
         size_t inSize=readSize, decodedSize=ress.dstBufferSize;
@@ -559,33 +582,36 @@ unsigned long long FIO_decompressFrame(dRess_t ress,
 }
 
 
-static int FIO_decompressFile_extRess(dRess_t ress,
-                                      const char* dstFileName, const char* srcFileName)
+/** FIO_decompressSrcFile() :
+    Decompress `srcFileName` into `ress.dstFile`
+    @return : 0 : OK
+              1 : operation not started
+*/
+static int FIO_decompressSrcFile(dRess_t ress, const char* srcFileName)
 {
     unsigned long long filesize = 0;
-    FILE* srcFile;
-    FILE* dstFile;
-
-    /* Init */
-    if (FIO_getFiles(&dstFile, &srcFile, dstFileName, srcFileName)) return 1;
+    FILE* dstFile = ress.dstFile;
+    FILE* srcFile = FIO_openSrcFile(srcFileName);
+    if (srcFile==0) return 1;
 
     /* for each frame */
-    for ( ; ; )
-    {
+    for ( ; ; ) {
         size_t sizeCheck;
         /* check magic number -> version */
         size_t toRead = 4;
         sizeCheck = fread(ress.srcBuffer, (size_t)1, toRead, srcFile);
         if (sizeCheck==0) break;   /* no more input */
-        if (sizeCheck != toRead) EXM_THROW(31, "Read error : cannot read header");
+        if (sizeCheck != toRead) EXM_THROW(31, "zstd: %s read error : cannot read header", srcFileName);
 #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT==1)
-        if (ZSTD_isLegacy(MEM_readLE32(ress.srcBuffer)))
-        {
+        if (ZSTD_isLegacy(MEM_readLE32(ress.srcBuffer))) {
             filesize += FIO_decompressLegacyFrame(dstFile, srcFile, MEM_readLE32(ress.srcBuffer));
             continue;
         }
 #endif   /* ZSTD_LEGACY_SUPPORT */
-
+        if (MEM_readLE32(ress.srcBuffer) !=  ZSTD_MAGICNUMBER) {
+            DISPLAYLEVEL(1, "zstd: %s: not in zstd format \n", srcFileName);
+            return 1;
+        }
         filesize += FIO_decompressFrame(ress, dstFile, srcFile, toRead);
     }
 
@@ -595,8 +621,24 @@ static int FIO_decompressFile_extRess(dRess_t ress,
 
     /* Close */
     fclose(srcFile);
-    if (fclose(dstFile)) EXM_THROW(38, "Write error : cannot properly close %s", dstFileName);
+    return 0;
+}
+
 
+/** FIO_decompressFile_extRess() :
+    decompress `srcFileName` into `dstFileName`; the status of the source-side step is passed through
+    @return : 0 : OK
+              1 : operation aborted (src not available, dst already taken, etc.)
+*/
+static int FIO_decompressFile_extRess(dRess_t ress,
+                                      const char* dstFileName, const char* srcFileName)
+{
+    int result;   /* status reported by FIO_decompressSrcFile(), handed back to the caller */
+    ress.dstFile = FIO_openDstFile(dstFileName);
+    if (ress.dstFile==0) return 1;
+
+    result = FIO_decompressSrcFile(ress, srcFileName);   /* non-zero when src cannot be opened or is not in zstd format */
+    if (fclose(ress.dstFile)) EXM_THROW(38, "Write error : cannot properly close %s", dstFileName);   /* dst is closed whatever the result */
-    return 0;
+    return result;
 }
 
@@ -624,31 +666,42 @@ int FIO_decompressMultipleFilenames(const char** srcNamesTable, unsigned nbFiles
     int missingFiles = 0;
     char* dstFileName = (char*)malloc(FNSPACE);
     size_t dfnSize = FNSPACE;
-    const size_t suffixSize = strlen(suffix);
+    const size_t suffixSize = suffix ? strlen(suffix) : 0;
     dRess_t ress;
 
 	if (dstFileName==NULL) EXM_THROW(70, "not enough memory for dstFileName");
     ress = FIO_createDResources(dictFileName);
 
-    for (u=0; u<nbFiles; u++)
-    {
-        const char* srcFileName = srcNamesTable[u];
-        size_t sfnSize = strlen(srcFileName);
-        const char* suffixPtr = srcFileName + sfnSize - suffixSize;
-        if (dfnSize <= sfnSize-suffixSize+1) { free(dstFileName); dfnSize = sfnSize + 20; dstFileName = (char*)malloc(dfnSize); if (dstFileName==NULL) EXM_THROW(71, "not enough memory for dstFileName"); }
-        if (sfnSize <= suffixSize  ||  strcmp(suffixPtr, suffix) != 0)
-        {
-            DISPLAYLEVEL(1, "File extension doesn't match expected extension (%4s); will not process file: %s\n", suffix, srcFileName);
-            skippedFiles++;
-            continue;
-        }
-        memcpy(dstFileName, srcFileName, sfnSize - suffixSize);
-        dstFileName[sfnSize-suffixSize] = '\0';
+    if (!strcmp(suffix, stdoutmark) || !strcmp(suffix, nulmark)) {
+        ress.dstFile = FIO_openDstFile(suffix);
+        if (ress.dstFile == 0) EXM_THROW(71, "cannot open %s", suffix);
+        for (u=0; u<nbFiles; u++)
+            missingFiles += FIO_decompressSrcFile(ress, srcNamesTable[u]);
+        if (fclose(ress.dstFile)) EXM_THROW(39, "Write error : cannot properly close %s", stdoutmark);
+    } else {
+        for (u=0; u<nbFiles; u++) {   /* create dstFileName */
+            const char* srcFileName = srcNamesTable[u];
+            size_t sfnSize = strlen(srcFileName);
+            const char* suffixPtr = srcFileName + sfnSize - suffixSize;
+            if (dfnSize+suffixSize <= sfnSize+1) {
+                free(dstFileName);
+                dfnSize = sfnSize + 20;
+                dstFileName = (char*)malloc(dfnSize);
+                if (dstFileName==NULL) EXM_THROW(71, "not enough memory for dstFileName");
+            }
+            if (sfnSize <= suffixSize || strcmp(suffixPtr, suffix) != 0) {
+                DISPLAYLEVEL(1, "zstd: %s: unknown suffix (%4s expected) -- ignored \n", srcFileName, suffix);
+                skippedFiles++;
+                continue;
+            }
+            memcpy(dstFileName, srcFileName, sfnSize - suffixSize);
+            dstFileName[sfnSize-suffixSize] = '\0';
 
-        missingFiles += FIO_decompressFile_extRess(ress, dstFileName, srcFileName);
-    }
+            missingFiles += FIO_decompressFile_extRess(ress, dstFileName, srcFileName);
+    }   }
 
     FIO_freeDResources(ress);
     free(dstFileName);
     return missingFiles + skippedFiles;
 }
+
diff --git a/programs/fileio.h b/programs/fileio.h
index 0e25d84..ee3cf22 100644
--- a/programs/fileio.h
+++ b/programs/fileio.h
@@ -1,6 +1,6 @@
 /*
   fileio.h - file i/o handler
-  Copyright (C) Yann Collet 2013-2015
+  Copyright (C) Yann Collet 2013-2016
 
   GPL v2 License
 
@@ -19,8 +19,7 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
   You can contact the author at :
-  - ZSTD source repository : https://github.com/Cyan4973/zstd
-  - Public forum : https://groups.google.com/forum/#!forum/lz4c
+  - ZSTD homepage : http://www.zstd.net/
 */
 #pragma once
 
@@ -33,8 +32,8 @@ extern "C" {
 *  Special i/o constants
 **************************************/
 #define nullString "null"
-#define stdinmark "-"
-#define stdoutmark "-"
+#define stdinmark "stdin"
+#define stdoutmark "stdout"
 #ifdef _WIN32
 #  define nulmark "nul"
 #else
@@ -52,32 +51,29 @@ void FIO_setNotificationLevel(unsigned level);
 /* *************************************
 *  Single File functions
 ***************************************/
+/** FIO_compressFilename() :
+    @return : 0 == ok;  1 == problem with src file. */
 int FIO_compressFilename (const char* outfilename, const char* infilename, const char* dictFileName, int compressionLevel);
-int FIO_decompressFilename (const char* outfilename, const char* infilename, const char* dictFileName);
-/**
-FIO_compressFilename :
-    @result : 0 == ok;  1 == pb with src file.
 
-FIO_decompressFilename :
-    @result : 0 == ok;  1 == pb with src file.
-*/
+/** FIO_decompressFilename() :
+    @return : 0 == ok;  1 == problem with src file. */
+int FIO_decompressFilename (const char* outfilename, const char* infilename, const char* dictFileName);
 
 
 /* *************************************
 *  Multiple File functions
 ***************************************/
+/** FIO_compressMultipleFilenames() :
+    @return : nb of missing files */
 int FIO_compressMultipleFilenames(const char** srcNamesTable, unsigned nbFiles,
                                   const char* suffix,
                                   const char* dictFileName, int compressionLevel);
+
+/** FIO_decompressMultipleFilenames() :
+    @return : nb of missing or skipped files */
 int FIO_decompressMultipleFilenames(const char** srcNamesTable, unsigned nbFiles,
                                     const char* suffix,
                                     const char* dictFileName);
-/**
-FIO_compressMultipleFilenames :
-    @result : nb of missing files
-FIO_decompressMultipleFilenames :
-    @result : nb of missing or skipped files
-*/
 
 
 #if defined (__cplusplus)
diff --git a/programs/fuzzer.c b/programs/fuzzer.c
index 4058ef2..f09cf06 100644
--- a/programs/fuzzer.c
+++ b/programs/fuzzer.c
@@ -91,6 +91,7 @@ static U32 g_testTime = 0;
 /*********************************************************
 *  Fuzzer functions
 *********************************************************/
+#define MIN(a,b) ((a)<(b)?(a):(b))
 #define MAX(a,b) ((a)>(b)?(a):(b))
 
 static U32 FUZ_GetMilliStart(void)
@@ -203,11 +204,9 @@ static int basicUnitTests(U32 seed, double compressibility)
         size_t cSizeOrig;
 
         DISPLAYLEVEL(4, "test%3i : load dictionary into context : ", testNb++);
-        result = ZSTD_compressBegin(ctxOrig, 2);
+        result = ZSTD_compressBegin_usingDict(ctxOrig, CNBuffer, dictSize, 2);
         if (ZSTD_isError(result)) goto _output_error;
-        result = ZSTD_compress_insertDictionary(ctxOrig, CNBuffer, dictSize);
-        if (ZSTD_isError(result)) goto _output_error;
-        result = ZSTD_duplicateCCtx(ctxDuplicated, ctxOrig);
+        result = ZSTD_copyCCtx(ctxDuplicated, ctxOrig);
         if (ZSTD_isError(result)) goto _output_error;
         DISPLAYLEVEL(4, "OK \n");
 
@@ -284,7 +283,7 @@ static int basicUnitTests(U32 seed, double compressibility)
         DISPLAYLEVEL(4, "OK \n");
 
         DISPLAYLEVEL(4, "test%3i : Block decompression test : ", testNb++);
-        result = ZSTD_resetDCtx(dctx);
+        result = ZSTD_decompressBegin(dctx);
         if (ZSTD_isError(result)) goto _output_error;
         result = ZSTD_decompressBlock(dctx, decodedBuffer, COMPRESSIBLE_NOISE_LENGTH, compressedBuffer, cSize);
         if (ZSTD_isError(result)) goto _output_error;
@@ -293,18 +292,15 @@ static int basicUnitTests(U32 seed, double compressibility)
 
         /* dictionary block compression */
         DISPLAYLEVEL(4, "test%3i : Dictionary Block compression test : ", testNb++);
-        result = ZSTD_compressBegin(cctx, 5);
-        if (ZSTD_isError(result)) goto _output_error;
-        result = ZSTD_compress_insertDictionary(cctx, CNBuffer, dictSize);
+        result = ZSTD_compressBegin_usingDict(cctx, CNBuffer, dictSize, 5);
         if (ZSTD_isError(result)) goto _output_error;
         cSize = ZSTD_compressBlock(cctx, compressedBuffer, ZSTD_compressBound(blockSize), (char*)CNBuffer+dictSize, blockSize);
         if (ZSTD_isError(cSize)) goto _output_error;
         DISPLAYLEVEL(4, "OK \n");
 
         DISPLAYLEVEL(4, "test%3i : Dictionary Block decompression test : ", testNb++);
-        result = ZSTD_resetDCtx(dctx);
+        result = ZSTD_decompressBegin_usingDict(dctx, CNBuffer, dictSize);
         if (ZSTD_isError(result)) goto _output_error;
-        ZSTD_decompress_insertDictionary(dctx, CNBuffer, dictSize);
         result = ZSTD_decompressBlock(dctx, decodedBuffer, COMPRESSIBLE_NOISE_LENGTH, compressedBuffer, cSize);
         if (ZSTD_isError(result)) goto _output_error;
         if (result != blockSize) goto _output_error;
@@ -457,7 +453,8 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit
         crcOrig = XXH64(sampleBuffer, sampleSize, 0);
 
         /* compression test */
-        cLevelMod = MAX(1, 38 - (int)(MAX(9, sampleSizeLog) * 2));   /* use high compression levels with small samples, for speed */
+        //cLevelMod = MAX(1, 38 - (int)(MAX(9, sampleSizeLog) * 2));   /* high levels only for small samples, for manageable speed */
+        cLevelMod = MIN( ZSTD_maxCLevel(), (U32)MAX(1,  55 - 3*(int)sampleSizeLog) );   /* high levels only for small samples, for manageable speed */
         cLevel = (FUZ_rand(&lseed) % cLevelMod) +1;
         cSize = ZSTD_compressCCtx(ctx, cBuffer, cBufferSize, sampleBuffer, sampleSize, cLevel);
         CHECK(ZSTD_isError(cSize), "ZSTD_compressCCtx failed");
@@ -570,12 +567,10 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit
         dict = srcBuffer + sampleStart;
         dictSize = sampleSize;
 
-        errorCode = ZSTD_compressBegin(refCtx, (FUZ_rand(&lseed) % (20 - (sampleSizeLog/3))) + 1);
-        CHECK (ZSTD_isError(errorCode), "start streaming error : %s", ZSTD_getErrorName(errorCode));
-        errorCode = ZSTD_compress_insertDictionary(refCtx, dict, dictSize);
-        CHECK (ZSTD_isError(errorCode), "dictionary insertion error : %s", ZSTD_getErrorName(errorCode));
-        errorCode = ZSTD_duplicateCCtx(ctx, refCtx);
-        CHECK (ZSTD_isError(errorCode), "context duplication error : %s", ZSTD_getErrorName(errorCode));
+        errorCode = ZSTD_compressBegin_usingDict(refCtx, dict, dictSize, (FUZ_rand(&lseed) % (20 - (sampleSizeLog/3))) + 1);
+        CHECK (ZSTD_isError(errorCode), "ZSTD_compressBegin_usingDict error : %s", ZSTD_getErrorName(errorCode));
+        errorCode = ZSTD_copyCCtx(ctx, refCtx);
+        CHECK (ZSTD_isError(errorCode), "ZSTD_copyCCtx error : %s", ZSTD_getErrorName(errorCode));
         totalTestSize = 0; cSize = 0;
         for (n=0; n<nbChunks; n++)
         {
@@ -603,9 +598,8 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit
         crcOrig = XXH64_digest(xxh64);
 
         /* streaming decompression test */
-        errorCode = ZSTD_resetDCtx(dctx);
+        errorCode = ZSTD_decompressBegin_usingDict(dctx, dict, dictSize);
         CHECK (ZSTD_isError(errorCode), "cannot init DCtx : %s", ZSTD_getErrorName(errorCode));
-        ZSTD_decompress_insertDictionary(dctx, dict, dictSize);
         totalCSize = 0;
         totalGenSize = 0;
         while (totalCSize < cSize)
diff --git a/programs/legacy/fileio_legacy.c b/programs/legacy/fileio_legacy.c
index f5c4241..419e69d 100644
--- a/programs/legacy/fileio_legacy.c
+++ b/programs/legacy/fileio_legacy.c
@@ -329,6 +329,86 @@ unsigned long long FIOv03_decompressFrame(FILE* foutput, FILE* finput)
 }
 
 
+/*- v0.4.x -*/
+
+typedef struct {             /* working set used to decode one v0.4 legacy frame */
+    void*  srcBuffer;        /* staging area for compressed input read from the source file */
+    size_t srcBufferSize;    /* capacity of srcBuffer, in bytes */
+    void*  dstBuffer;        /* staging area for decoded output before it is written out */
+    size_t dstBufferSize;    /* capacity of dstBuffer, in bytes */
+    void*  dictBuffer;       /* dictionary content; FIOv04_createDResources() leaves it NULL */
+    size_t dictBufferSize;   /* size of dictBuffer; FIOv04_createDResources() leaves it 0 */
+    ZBUFFv04_DCtx* dctx;     /* v0.4 buffered decompression context */
+} dRessv04_t;
+
+static dRessv04_t FIOv04_createDResources(void)   /* builds the v0.4 decoding context and its I/O buffers; EXM_THROWs if any allocation fails */
+{
+    dRessv04_t ress;
+
+    /* init */
+    ress.dctx = ZBUFFv04_createDCtx();
+    if (ress.dctx==NULL) EXM_THROW(60, "Can't create ZBUFF decompression context");
+    ress.dictBuffer = NULL; ress.dictBufferSize=0;   /* no dictionary is attached here */
+
+    /* Allocate Memory */
+    ress.srcBufferSize = ZBUFFv04_recommendedDInSize();   /* both buffer sizes are whatever ZBUFFv04 recommends */
+    ress.srcBuffer = malloc(ress.srcBufferSize);
+    ress.dstBufferSize = ZBUFFv04_recommendedDOutSize();
+    ress.dstBuffer = malloc(ress.dstBufferSize);
+    if (!ress.srcBuffer || !ress.dstBuffer) EXM_THROW(61, "Allocation error : not enough memory");
+
+    return ress;   /* returned by value; the pointed-to resources are released by FIOv04_freeDResources() */
+}
+
+static void FIOv04_freeDResources(dRessv04_t ress)   /* releases the context and the three buffers referenced by ress */
+{
+    size_t errorCode = ZBUFFv04_freeDCtx(ress.dctx);
+    if (ZBUFFv04_isError(errorCode)) EXM_THROW(69, "Error : can't free ZBUFF context resource : %s", ZBUFFv04_getErrorName(errorCode));
+    free(ress.srcBuffer);
+    free(ress.dstBuffer);
+    free(ress.dictBuffer);   /* NULL when ress came from FIOv04_createDResources(); free(NULL) is a no-op */
+}
+
+
+unsigned long long FIOv04_decompressFrame(dRessv04_t ress,   /* decodes one v0.4 frame from finput into foutput; returns the number of decoded bytes */
+                                          FILE* foutput, FILE* finput)
+{
+    U64    frameSize = 0;
+    size_t readSize = 4;   /* srcBuffer starts out holding only the 4-byte magic number written below */
+
+    MEM_writeLE32(ress.srcBuffer, ZSTDv04_magicNumber);   /* rebuilt here rather than read from finput — presumably the caller consumed it to select this decoder; confirm */
+    ZBUFFv04_decompressInit(ress.dctx);
+    ZBUFFv04_decompressWithDictionary(ress.dctx, ress.dictBuffer, ress.dictBufferSize);
+
+    while (1)
+    {
+        /* Decode */
+        size_t sizeCheck;
+        size_t inSize=readSize, decodedSize=ress.dstBufferSize;   /* presumably in/out: bytes offered / room available, then bytes consumed / produced — TODO confirm against ZBUFFv04 */
+        size_t toRead = ZBUFFv04_decompressContinue(ress.dctx, ress.dstBuffer, &decodedSize, ress.srcBuffer, &inSize);
+        if (ZBUFFv04_isError(toRead)) EXM_THROW(36, "Decoding error : %s", ZBUFFv04_getErrorName(toRead));
+        readSize -= inSize;   /* whatever remains is input the decoder did not take */
+
+        /* Write block */
+        sizeCheck = fwrite(ress.dstBuffer, 1, decodedSize, foutput);
+        if (sizeCheck != decodedSize) EXM_THROW(37, "Write error : unable to write data block to destination file");
+        frameSize += decodedSize;
+        DISPLAYUPDATE(2, "\rDecoded : %u MB...     ", (U32)(frameSize>>20) );
+
+        if (toRead == 0) break;   /* the decoder requests no further input: stop */
+        if (readSize) EXM_THROW(38, "Decoding error : should consume entire input");
+
+        /* Fill input buffer */
+        if (toRead > ress.srcBufferSize) EXM_THROW(34, "too large block");
+        readSize = fread(ress.srcBuffer, 1, toRead, finput);
+        if (readSize != toRead) EXM_THROW(35, "Read error");
+    }
+
+    FIOv04_freeDResources(ress);   /* resources are released here, so the caller must not reuse or free them */
+    return frameSize;
+}
+
+
 unsigned long long FIO_decompressLegacyFrame(FILE* foutput, FILE* finput, U32 magicNumberLE)
 {
 	switch(magicNumberLE)
@@ -339,6 +419,8 @@ unsigned long long FIO_decompressLegacyFrame(FILE* foutput, FILE* finput, U32 ma
 			return FIOv02_decompressFrame(foutput, finput);
 		case ZSTDv03_magicNumber :
 			return FIOv03_decompressFrame(foutput, finput);
+		case ZSTDv04_magicNumber :
+			return FIOv04_decompressFrame(FIOv04_createDResources(), foutput, finput);
 		default :
 		    return ERROR(prefix_unknown);
 	}
diff --git a/programs/paramgrill.c b/programs/paramgrill.c
index a34da88..23a54d4 100644
--- a/programs/paramgrill.c
+++ b/programs/paramgrill.c
@@ -1,6 +1,6 @@
 /*
-    paramgrill.c - parameter tester for zstd_hc
-    Copyright (C) Yann Collet 2015
+    paramgrill.c - parameter tester for zstd
+    Copyright (C) Yann Collet 2015-2016
 
     GPL v2 License
 
@@ -19,11 +19,10 @@
     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
     You can contact the author at :
-    - zstd source repository : https://github.com/Cyan4973/zstd
-    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
+    - zstd homepage : http://www.zstd.net/
 */
 
-/**************************************
+/*-************************************
 *  Compiler Options
 **************************************/
 /* Disable some Visual warning messages */
@@ -48,8 +47,8 @@
 #endif
 
 
-/**************************************
-*  Includes
+/*-************************************
+*  Dependencies
 **************************************/
 #include <stdlib.h>       /* malloc */
 #include <stdio.h>        /* fprintf, fopen, ftello64 */
@@ -71,7 +70,7 @@
 #include "xxhash.h"
 
 
-/**************************************
+/*-************************************
 *  Compiler Options
 **************************************/
 /* S_ISREG & gettimeofday() are not supported by MSVC */
@@ -80,7 +79,7 @@
 #endif
 
 
-/**************************************
+/*-************************************
 *  Constants
 **************************************/
 #define PROGRAM_DESCRIPTION "ZSTD_HC parameters tester"
@@ -98,6 +97,8 @@
 #define NBLOOPS    2
 #define TIMELOOP   2000
 
+#define NB_LEVELS_TRACKED 30
+
 static const size_t maxMemory = (sizeof(size_t)==4)  ?  (2 GB - 64 MB) : (size_t)(1ULL << ((sizeof(size_t)*8)-31));
 #define DEFAULT_CHUNKSIZE   (4<<20)
 
@@ -110,13 +111,13 @@ static const int g_maxVariationTime = 60000;   /* 60 sec */
 static const int g_maxNbVariations = 64;
 
 
-/**************************************
+/*-************************************
 *  Macros
 **************************************/
 #define DISPLAY(...)  fprintf(stderr, __VA_ARGS__)
 
 
-/**************************************
+/*-************************************
 *  Benchmark Parameters
 **************************************/
 static U32 g_nbIterations = NBLOOPS;
@@ -126,7 +127,7 @@ static U32 g_rand = 1;
 static U32 g_singleRun = 0;
 static U32 g_target = 0;
 static U32 g_noSeed = 0;
-static ZSTD_parameters g_params = { 0, 0, 0, 0, 0, 0, ZSTD_greedy };
+static ZSTD_parameters g_params = { 0, 0, 0, 0, 0, 0, 0, ZSTD_greedy };
 
 void BMK_SetNbIterations(int nbLoops)
 {
@@ -135,7 +136,7 @@ void BMK_SetNbIterations(int nbLoops)
 }
 
 
-/*********************************************************
+/*-*******************************************************
 *  Private functions
 *********************************************************/
 
@@ -187,8 +188,7 @@ static size_t BMK_findMaxMem(U64 requiredMem)
     if (requiredMem > maxMemory) requiredMem = maxMemory;
 
     requiredMem += 2*step;
-    while (!testmem)
-    {
+    while (!testmem) {
         requiredMem -= step;
         testmem = (BYTE*) malloc ((size_t)requiredMem);
     }
@@ -226,7 +226,7 @@ U32 FUZ_rand(U32* src)
 }
 
 
-/*********************************************************
+/*-*******************************************************
 *  Bench functions
 *********************************************************/
 typedef struct {
@@ -265,14 +265,14 @@ static size_t BMK_benchParam(BMK_result_t* resultPtr,
     U32 Hlog = params.hashLog;
     U32 Slog = params.searchLog;
     U32 Slength = params.searchLength;
+    U32 Tlength = params.targetLength;
     ZSTD_strategy strat = params.strategy;
     char name[30] = { 0 };
     U64 crcOrig;
 
     /* Memory allocation & restrictions */
-    snprintf(name, 30, "Sw%02uc%02uh%02us%02ul%1ut%1u", Wlog, Clog, Hlog, Slog, Slength, strat);
-    if (!compressedBuffer || !resultBuffer || !blockTable)
-    {
+    snprintf(name, 30, "Sw%02uc%02uh%02us%02ul%1ut%03uS%1u", Wlog, Clog, Hlog, Slog, Slength, Tlength, strat);
+    if (!compressedBuffer || !resultBuffer || !blockTable) {
         DISPLAY("\nError: not enough memory!\n");
         free(compressedBuffer);
         free(resultBuffer);
@@ -290,8 +290,7 @@ static size_t BMK_benchParam(BMK_result_t* resultPtr,
         const char* srcPtr = (const char*)srcBuffer;
         char* cPtr = (char*)compressedBuffer;
         char* resPtr = (char*)resultBuffer;
-        for (i=0; i<nbBlocks; i++)
-        {
+        for (i=0; i<nbBlocks; i++) {
             size_t thisBlockSize = MIN(remaining, blockSize);
             blockTable[i].srcPtr = srcPtr;
             blockTable[i].cPtr = cPtr;
@@ -302,8 +301,7 @@ static size_t BMK_benchParam(BMK_result_t* resultPtr,
             cPtr += blockTable[i].cRoom;
             resPtr += thisBlockSize;
             remaining -= thisBlockSize;
-        }
-    }
+    }   }
 
     /* warmimg up memory */
     RDG_genBuffer(compressedBuffer, maxCompressedSize, 0.10, 0.10, 1);
@@ -318,8 +316,7 @@ static size_t BMK_benchParam(BMK_result_t* resultPtr,
         const int startTime =BMK_GetMilliStart();
 
         DISPLAY("\r%79s\r", "");
-        for (loopNb = 1; loopNb <= g_nbIterations; loopNb++)
-        {
+        for (loopNb = 1; loopNb <= g_nbIterations; loopNb++) {
             int nbLoops;
             int milliTime;
             U32 blockNb;
@@ -336,8 +333,7 @@ static size_t BMK_benchParam(BMK_result_t* resultPtr,
             milliTime = BMK_GetMilliStart();
             while (BMK_GetMilliStart() == milliTime);
             milliTime = BMK_GetMilliStart();
-            while (BMK_GetMilliSpan(milliTime) < TIMELOOP)
-            {
+            while (BMK_GetMilliSpan(milliTime) < TIMELOOP) {
                 for (blockNb=0; blockNb<nbBlocks; blockNb++)
                     blockTable[blockNb].cSize = ZSTD_compress_advanced(ctx,
                                                     blockTable[blockNb].cPtr,  blockTable[blockNb].cRoom,
@@ -367,8 +363,7 @@ static size_t BMK_benchParam(BMK_result_t* resultPtr,
             milliTime = BMK_GetMilliStart();
             while (BMK_GetMilliStart() == milliTime);
             milliTime = BMK_GetMilliStart();
-            for ( ; BMK_GetMilliSpan(milliTime) < TIMELOOP; nbLoops++)
-            {
+            for ( ; BMK_GetMilliSpan(milliTime) < TIMELOOP; nbLoops++) {
                 for (blockNb=0; blockNb<nbBlocks; blockNb++)
                     blockTable[blockNb].resSize = ZSTD_decompress(blockTable[blockNb].resPtr, blockTable[blockNb].srcSize,
                                                                   blockTable[blockNb].cPtr, blockTable[blockNb].cSize);
@@ -384,24 +379,19 @@ static size_t BMK_benchParam(BMK_result_t* resultPtr,
 
             /* CRC Checking */
             crcCheck = XXH64(resultBuffer, srcSize, 0);
-            if (crcOrig!=crcCheck)
-            {
+            if (crcOrig!=crcCheck) {
                 unsigned u;
                 unsigned eBlockSize = (unsigned)(MIN(65536*2, blockSize));
                 DISPLAY("\n!!! WARNING !!! Invalid Checksum : %x != %x\n", (unsigned)crcOrig, (unsigned)crcCheck);
-                for (u=0; u<srcSize; u++)
-                {
-                    if (((const BYTE*)srcBuffer)[u] != ((BYTE*)resultBuffer)[u])
-                    {
+                for (u=0; u<srcSize; u++) {
+                    if (((const BYTE*)srcBuffer)[u] != ((BYTE*)resultBuffer)[u]) {
                         printf("Decoding error at pos %u (block %u, pos %u) \n", u, u / eBlockSize, u % eBlockSize);
                         break;
-                    }
-                }
+                }   }
                 break;
             }
 #endif
-        }
-    }
+    }   }
 
     /* End cleaning */
     DISPLAY("\r");
@@ -415,21 +405,23 @@ const char* g_stratName[] = { "ZSTD_fast   ",
                               "ZSTD_greedy ",
                               "ZSTD_lazy   ",
                               "ZSTD_lazy2  ",
-                              "ZSTD_btlazy2" };
+                              "ZSTD_btlazy2",
+                              "ZSTD_opt    ",
+                              "ZSTD_btopt  " };
 
 static void BMK_printWinner(FILE* f, U32 cLevel, BMK_result_t result, ZSTD_parameters params, size_t srcSize)
 {
     DISPLAY("\r%79s\r", "");
-    fprintf(f,"    {%3u,%3u,%3u,%3u,%3u,%3u, %s },  ",
+    fprintf(f,"    {%3u,%3u,%3u,%3u,%3u,%3u,%3u, %s },  ",   /* one parameter-table row: seven numeric fields, then the strategy name */
             0, params.windowLog, params.contentLog, params.hashLog, params.searchLog, params.searchLength,
-            g_stratName[(U32)(params.strategy)]);
+            params.targetLength, g_stratName[(U32)(params.strategy)]);   /* targetLength fills the seventh numeric field */
     fprintf(f,
             "/* level %2u */   /* R:%5.3f at %5.1f MB/s - %5.1f MB/s */\n",
             cLevel, (double)srcSize / result.cSize, (double)result.cSpeed / 1000., (double)result.dSpeed / 1000.);
 }
 
 
-static U32 g_cSpeedTarget[ZSTD_MAX_CLEVEL+1] = { 0 };
+static U32 g_cSpeedTarget[NB_LEVELS_TRACKED] = { 0 };   /* NB_LEVELS_TRACKED : checked at main() */
 
 typedef struct {
     BMK_result_t result;
@@ -438,14 +430,12 @@ typedef struct {
 
 static void BMK_printWinners2(FILE* f, const winnerInfo_t* winners, size_t srcSize)
 {
-    int cLevel;
+    unsigned cLevel;   /* compared against ZSTD_maxCLevel() below; presumably that returns an unsigned value — confirm */
 
     fprintf(f, "\n /* Proposed configurations : */ \n");
-    fprintf(f, "#define ZSTD_MAX_CLEVEL %2u \n", ZSTD_MAX_CLEVEL);
-    fprintf(f, "static const ZSTD_parameters ZSTD_defaultParameters[ZSTD_MAX_CLEVEL+1] = {\n");
-    fprintf(f, "    /* l,  W,  C,  H,  S,  L, strat */ \n");
+    fprintf(f, "    /* l,  W,  C,  H,  S,  L,  T, strat */ \n");   /* column legend for the rows printed by BMK_printWinner() */
 
-    for (cLevel=0; cLevel <= ZSTD_MAX_CLEVEL; cLevel++)
+    for (cLevel=0; cLevel <= ZSTD_maxCLevel(); cLevel++)   /* one row per level, from 0 up to and including ZSTD_maxCLevel() */
         BMK_printWinner(f, cLevel, winners[cLevel].result, winners[cLevel].params, srcSize);
 }
 
@@ -465,16 +455,14 @@ static int BMK_seed(winnerInfo_t* winners, const ZSTD_parameters params,
 {
     BMK_result_t testResult;
     int better = 0;
-    int cLevel;
+    unsigned cLevel;
 
     BMK_benchParam(&testResult, srcBuffer, srcSize, ctx, params);
 
-    for (cLevel = 1; cLevel <= ZSTD_MAX_CLEVEL; cLevel++)
-    {
+    for (cLevel = 1; cLevel <= ZSTD_maxCLevel(); cLevel++) {
         if (testResult.cSpeed < g_cSpeedTarget[cLevel])
             continue;   /* not fast enough for this level */
-        if (winners[cLevel].result.cSize==0)
-        {
+        if (winners[cLevel].result.cSize==0) {
             /* first solution for this cLevel */
             winners[cLevel].result = testResult;
             winners[cLevel].params = params;
@@ -483,8 +471,7 @@ static int BMK_seed(winnerInfo_t* winners, const ZSTD_parameters params,
             continue;
         }
 
-        if ((double)testResult.cSize <= ((double)winners[cLevel].result.cSize * (1. + (0.02 / cLevel))) )
-        {
+        if ((double)testResult.cSize <= ((double)winners[cLevel].result.cSize * (1. + (0.02 / cLevel))) ) {
             /* Validate solution is "good enough" */
             double W_ratio = (double)srcSize / testResult.cSize;
             double O_ratio = (double)srcSize / winners[cLevel].result.cSize;
@@ -509,8 +496,7 @@ static int BMK_seed(winnerInfo_t* winners, const ZSTD_parameters params,
             double O_DSpeed_note = O_ratioNote * ( 20 + 2*cLevel) + log((double)winners[cLevel].result.dSpeed);
 
 
-            if (W_DMemUsed_note < O_DMemUsed_note)
-            {
+            if (W_DMemUsed_note < O_DMemUsed_note) {
                 /* uses too much Decompression memory for too little benefit */
                 if (W_ratio > O_ratio)
                 DISPLAY ("Decompression Memory : %5.3f @ %4.1f MB  vs  %5.3f @ %4.1f MB   : not enough for level %i\n",
@@ -518,8 +504,7 @@ static int BMK_seed(winnerInfo_t* winners, const ZSTD_parameters params,
                          O_ratio, (double)(O_DMemUsed) / 1024 / 1024,   cLevel);
                 continue;
             }
-            if (W_CMemUsed_note < O_CMemUsed_note)
-            {
+            if (W_CMemUsed_note < O_CMemUsed_note) {
                 /* uses too much memory for compression for too little benefit */
                 if (W_ratio > O_ratio)
                 DISPLAY ("Compression Memory : %5.3f @ %4.1f MB  vs  %5.3f @ %4.1f MB   : not enough for level %i\n",
@@ -527,8 +512,7 @@ static int BMK_seed(winnerInfo_t* winners, const ZSTD_parameters params,
                          O_ratio, (double)(O_CMemUsed) / 1024 / 1024,   cLevel);
                 continue;
             }
-            if (W_CSpeed_note   < O_CSpeed_note  )
-            {
+            if (W_CSpeed_note   < O_CSpeed_note  ) {
                 /* too large compression speed difference for the compression benefit */
                 if (W_ratio > O_ratio)
                 DISPLAY ("Compression Speed : %5.3f @ %4.1f MB/s  vs  %5.3f @ %4.1f MB/s   : not enough for level %i\n",
@@ -536,8 +520,7 @@ static int BMK_seed(winnerInfo_t* winners, const ZSTD_parameters params,
                          O_ratio, (double)(winners[cLevel].result.cSpeed) / 1000.,   cLevel);
                 continue;
             }
-            if (W_DSpeed_note   < O_DSpeed_note  )
-            {
+            if (W_DSpeed_note   < O_DSpeed_note  ) {
                 /* too large decompression speed difference for the compression benefit */
                 if (W_ratio > O_ratio)
                 DISPLAY ("Decompression Speed : %5.3f @ %4.1f MB/s  vs  %5.3f @ %4.1f MB/s   : not enough for level %i\n",
@@ -554,9 +537,7 @@ static int BMK_seed(winnerInfo_t* winners, const ZSTD_parameters params,
             BMK_printWinner(stdout, cLevel, testResult, params, srcSize);
 
             better = 1;
-        }
-
-    }
+    }   }
 
     return better;
 }
@@ -567,10 +548,9 @@ static ZSTD_parameters* sanitizeParams(ZSTD_parameters params)
 {
     g_params = params;
     if (params.strategy == ZSTD_fast)
-    {
-        g_params.contentLog = 0;
-        g_params.searchLog = 0;
-    }
+        g_params.contentLog = 0, g_params.searchLog = 0;
+    if ((params.strategy != ZSTD_opt) && (params.strategy != ZSTD_btopt ))
+        g_params.targetLength = 0;
     return &g_params;
 }
 
@@ -578,9 +558,8 @@ static ZSTD_parameters* sanitizeParams(ZSTD_parameters params)
 static void paramVariation(ZSTD_parameters* p)
 {
     U32 nbChanges = (FUZ_rand(&g_rand) & 3) + 1;
-    for (; nbChanges; nbChanges--)
-    {
-        const U32 changeID = FUZ_rand(&g_rand) % 12;
+    for (; nbChanges; nbChanges--) {
+        const U32 changeID = FUZ_rand(&g_rand) % 14;
         switch(changeID)
         {
         case 0:
@@ -607,6 +586,10 @@ static void paramVariation(ZSTD_parameters* p)
             p->strategy = (ZSTD_strategy)(((U32)p->strategy)+1); break;
         case 11:
             p->strategy = (ZSTD_strategy)(((U32)p->strategy)-1); break;
+        case 12:
+            p->targetLength *= 1 + ((double)(FUZ_rand(&g_rand)&255)) / 256.; break;
+        case 13:
+            p->targetLength /= 1 + ((double)(FUZ_rand(&g_rand)&255)) / 256.; break;
         }
     }
     ZSTD_validateParams(p);
@@ -632,8 +615,7 @@ static void playAround(FILE* f, winnerInfo_t* winners,
     int nbVariations = 0;
     const int startTime = BMK_GetMilliStart();
 
-    while (BMK_GetMilliSpan(startTime) < g_maxVariationTime)
-    {
+    while (BMK_GetMilliSpan(startTime) < g_maxVariationTime) {
         ZSTD_parameters p = params;
 
         if (nbVariations++ > g_maxNbVariations) break;
@@ -658,15 +640,15 @@ static void playAround(FILE* f, winnerInfo_t* winners,
 static void potentialRandomParams(ZSTD_parameters* p, U32 inverseChance)
 {
     U32 chance = (FUZ_rand(&g_rand) % (inverseChance+1));
-    if (!chance)
-    {
+    if (!chance) {
         /* totally random entry */
         p->contentLog = FUZ_rand(&g_rand) % (ZSTD_CONTENTLOG_MAX+1 - ZSTD_CONTENTLOG_MIN) + ZSTD_CONTENTLOG_MIN;
         p->hashLog    = FUZ_rand(&g_rand) % (ZSTD_HASHLOG_MAX+1 - ZSTD_HASHLOG_MIN) + ZSTD_HASHLOG_MIN;
         p->searchLog  = FUZ_rand(&g_rand) % (ZSTD_SEARCHLOG_MAX+1 - ZSTD_SEARCHLOG_MIN) + ZSTD_SEARCHLOG_MIN;
         p->windowLog  = FUZ_rand(&g_rand) % (ZSTD_WINDOWLOG_MAX+1 - ZSTD_WINDOWLOG_MIN) + ZSTD_WINDOWLOG_MIN;
         p->searchLength=FUZ_rand(&g_rand) % (ZSTD_SEARCHLENGTH_MAX+1 - ZSTD_SEARCHLENGTH_MIN) + ZSTD_SEARCHLENGTH_MIN;
-        p->strategy   = (ZSTD_strategy) (FUZ_rand(&g_rand) % (ZSTD_btlazy2+1));
+        p->targetLength=FUZ_rand(&g_rand) % (ZSTD_TARGETLENGTH_MAX+1 - ZSTD_TARGETLENGTH_MIN) + ZSTD_TARGETLENGTH_MIN;
+        p->strategy   = (ZSTD_strategy) (FUZ_rand(&g_rand) % (ZSTD_btopt +1));
         ZSTD_validateParams(p);
     }
 }
@@ -676,9 +658,8 @@ static void BMK_selectRandomStart(
                        const void* srcBuffer, size_t srcSize,
                        ZSTD_CCtx* ctx)
 {
-    U32 id = (FUZ_rand(&g_rand) % (ZSTD_MAX_CLEVEL+1));
-    if ((id==0) || (winners[id].params.windowLog==0))
-    {
+    U32 id = (FUZ_rand(&g_rand) % (ZSTD_maxCLevel()+1));
+    if ((id==0) || (winners[id].params.windowLog==0)) {
         /* totally random entry */
         ZSTD_parameters p;
         potentialRandomParams(&p, 1);
@@ -695,14 +676,14 @@ static void BMK_benchMem(void* srcBuffer, size_t srcSize)
 {
     ZSTD_CCtx* ctx = ZSTD_createCCtx();
     ZSTD_parameters params;
-    winnerInfo_t winners[ZSTD_MAX_CLEVEL+1];
+    winnerInfo_t winners[NB_LEVELS_TRACKED];
     int i;
+    unsigned u;
     const char* rfName = "grillResults.txt";
     FILE* f;
     const size_t blockSize = g_blockSize ? g_blockSize : srcSize;
 
-    if (g_singleRun)
-    {
+    if (g_singleRun) {
         BMK_result_t testResult;
         g_params.srcSize = blockSize;
         ZSTD_validateParams(&g_params);
@@ -718,8 +699,7 @@ static void BMK_benchMem(void* srcBuffer, size_t srcSize)
 
     if (g_target)
         g_cSpeedTarget[1] = g_target * 1000;
-    else
-    {
+    else {
         /* baseline config for level 1 */
         BMK_result_t testResult;
         params = ZSTD_getParams(1, blockSize);
@@ -728,14 +708,13 @@ static void BMK_benchMem(void* srcBuffer, size_t srcSize)
     }
 
     /* establish speed objectives (relative to level 1) */
-    for (i=2; i<=ZSTD_MAX_CLEVEL; i++)
-        g_cSpeedTarget[i] = (g_cSpeedTarget[i-1] * 25) >> 5;
+    for (u=2; u<=ZSTD_maxCLevel(); u++)
+        g_cSpeedTarget[u] = (g_cSpeedTarget[u-1] * 25) >> 5;
 
     /* populate initial solution */
     {
-        const int maxSeeds = g_noSeed ? 1 : ZSTD_MAX_CLEVEL;
-        for (i=1; i<=maxSeeds; i++)
-        {
+        const int maxSeeds = g_noSeed ? 1 : ZSTD_maxCLevel();
+        for (i=1; i<=maxSeeds; i++) {
             params = ZSTD_getParams(i, blockSize);
             ZSTD_validateParams(&params);
             BMK_seed(winners, params, srcBuffer, srcSize, ctx);
@@ -746,8 +725,7 @@ static void BMK_benchMem(void* srcBuffer, size_t srcSize)
     /* start tests */
     {
         const int milliStart = BMK_GetMilliStart();
-        do
-        {
+        do {
             BMK_selectRandomStart(f, winners, srcBuffer, srcSize, ctx);
         } while (BMK_GetMilliSpan(milliStart) < g_grillDuration);
     }
@@ -764,17 +742,13 @@ static void BMK_benchMem(void* srcBuffer, size_t srcSize)
 
 static int benchSample(void)
 {
-    char* origBuff;
+    void* origBuff;
     size_t benchedSize = sampleSize;
     const char* name = "Sample 10MiB";
 
     /* Allocation */
-    origBuff = (char*) malloc((size_t)benchedSize);
-    if(!origBuff)
-    {
-        DISPLAY("\nError: not enough memory!\n");
-        return 12;
-    }
+    origBuff = malloc(benchedSize);
+    if (!origBuff) { DISPLAY("\nError: not enough memory!\n"); return 12; }
 
     /* Fill buffer */
     RDG_genBuffer(origBuff, benchedSize, g_compressibility, 0.0, 0);
@@ -794,8 +768,7 @@ int benchFiles(char** fileNamesTable, int nbFiles)
     int fileIdx=0;
 
     /* Loop for each file */
-    while (fileIdx<nbFiles)
-    {
+    while (fileIdx<nbFiles) {
         FILE* inFile;
         char* inFileName;
         U64   inFileSize;
@@ -806,25 +779,21 @@ int benchFiles(char** fileNamesTable, int nbFiles)
         /* Check file existence */
         inFileName = fileNamesTable[fileIdx++];
         inFile = fopen( inFileName, "rb" );
-        if (inFile==NULL)
-        {
+        if (inFile==NULL) {
             DISPLAY( "Pb opening %s\n", inFileName);
             return 11;
         }
 
         /* Memory allocation & restrictions */
         inFileSize = BMK_GetFileSize(inFileName);
-        benchedSize = (size_t) BMK_findMaxMem(inFileSize*3) / 3;
+        benchedSize = BMK_findMaxMem(inFileSize*3) / 3;
         if ((U64)benchedSize > inFileSize) benchedSize = (size_t)inFileSize;
         if (benchedSize < inFileSize)
-        {
             DISPLAY("Not enough memory for '%s' full size; testing %i MB only...\n", inFileName, (int)(benchedSize>>20));
-        }
 
         /* Alloc */
         origBuff = (char*) malloc((size_t)benchedSize);
-        if(!origBuff)
-        {
+        if(!origBuff) {
             DISPLAY("\nError: not enough memory!\n");
             fclose(inFile);
             return 12;
@@ -835,8 +804,7 @@ int benchFiles(char** fileNamesTable, int nbFiles)
         readSize = fread(origBuff, 1, benchedSize, inFile);
         fclose(inFile);
 
-        if(readSize != benchedSize)
-        {
+        if(readSize != benchedSize) {
             DISPLAY("\nError: problem reading file '%s' !!    \n", inFileName);
             free(origBuff);
             return 13;
@@ -862,8 +830,7 @@ int optimizeForSize(char* inFileName)
 
     /* Check file existence */
     inFile = fopen( inFileName, "rb" );
-    if (inFile==NULL)
-    {
+    if (inFile==NULL) {
         DISPLAY( "Pb opening %s\n", inFileName);
         return 11;
     }
@@ -873,14 +840,11 @@ int optimizeForSize(char* inFileName)
     benchedSize = (size_t) BMK_findMaxMem(inFileSize*3) / 3;
     if ((U64)benchedSize > inFileSize) benchedSize = (size_t)inFileSize;
     if (benchedSize < inFileSize)
-    {
         DISPLAY("Not enough memory for '%s' full size; testing %i MB only...\n", inFileName, (int)(benchedSize>>20));
-    }
 
     /* Alloc */
     origBuff = (char*) malloc((size_t)benchedSize);
-    if(!origBuff)
-    {
+    if(!origBuff) {
         DISPLAY("\nError: not enough memory!\n");
         fclose(inFile);
         return 12;
@@ -891,8 +855,7 @@ int optimizeForSize(char* inFileName)
     readSize = fread(origBuff, 1, benchedSize, inFile);
     fclose(inFile);
 
-    if(readSize != benchedSize)
-    {
+    if(readSize != benchedSize) {
         DISPLAY("\nError: problem reading file '%s' !!    \n", inFileName);
         free(origBuff);
         return 13;
@@ -916,9 +879,8 @@ int optimizeForSize(char* inFileName)
 
         /* find best solution from default params */
         {
-            const int maxSeeds = g_noSeed ? 1 : ZSTD_MAX_CLEVEL;
-            for (i=1; i<=maxSeeds; i++)
-            {
+            const int maxSeeds = g_noSeed ? 1 : ZSTD_maxCLevel();
+            for (i=1; i<=maxSeeds; i++) {
                 params = ZSTD_getParams(i, blockSize);
                 BMK_benchParam(&candidate, origBuff, benchedSize, ctx, params);
                 if ( (candidate.cSize < winner.result.cSize)
@@ -927,16 +889,14 @@ int optimizeForSize(char* inFileName)
                     winner.params = params;
                     winner.result = candidate;
                     BMK_printWinner(stdout, i, winner.result, winner.params, benchedSize);
-                }
-            }
+            }   }
         }
         BMK_printWinner(stdout, 99, winner.result, winner.params, benchedSize);
 
         /* start tests */
         {
             const int milliStart = BMK_GetMilliStart();
-            do
-            {
+            do {
                 params = winner.params;
                 paramVariation(&params);
                 potentialRandomParams(&params, 16);
@@ -950,13 +910,11 @@ int optimizeForSize(char* inFileName)
 
                 /* improvement found => new winner */
                 if ( (candidate.cSize < winner.result.cSize)
-                   ||((candidate.cSize == winner.result.cSize) && (candidate.cSpeed > winner.result.cSpeed)) )
-                {
+                   ||((candidate.cSize == winner.result.cSize) && (candidate.cSpeed > winner.result.cSpeed)) ) {
                     winner.params = params;
                     winner.result = candidate;
                     BMK_printWinner(stdout, 99, winner.result, winner.params, benchedSize);
                 }
-
             } while (BMK_GetMilliSpan(milliStart) < g_grillDuration);
         }
 
@@ -972,7 +930,7 @@ int optimizeForSize(char* inFileName)
 }
 
 
-int usage(char* exename)
+static int usage(char* exename)
 {
     DISPLAY( "Usage :\n");
     DISPLAY( "      %s [arg] file\n", exename);
@@ -982,16 +940,17 @@ int usage(char* exename)
     return 0;
 }
 
-int usage_advanced(void)
+static int usage_advanced(void)
 {
     DISPLAY( "\nAdvanced options :\n");
     DISPLAY( " -i#    : iteration loops [1-9](default : %i)\n", NBLOOPS);
     DISPLAY( " -B#    : cut input into blocks of size # (default : single block)\n");
     DISPLAY( " -P#    : generated sample compressibility (default : %.1f%%)\n", COMPRESSIBILITY_DEFAULT * 100);
+    DISPLAY( " -S     : Single run\n");
     return 0;
 }
 
-int badusage(char* exename)
+static int badusage(char* exename)
 {
     DISPLAY("Wrong parameters\n");
     usage(exename);
@@ -1008,6 +967,12 @@ int main(int argc, char** argv)
     U32 optimizer = 0;
     U32 main_pause = 0;
 
+    /* checks */
+    if (NB_LEVELS_TRACKED <= ZSTD_maxCLevel()) {
+        DISPLAY("Error : NB_LEVELS_TRACKED <= ZSTD_maxCLevel() \n");
+        exit(1);
+    }
+
     /* Welcome message */
     DISPLAY(WELCOME_MESSAGE);
 
@@ -1022,12 +987,10 @@ int main(int argc, char** argv)
         if(!strcmp(argument,"--no-seed")) { g_noSeed = 1; continue; }
 
         /* Decode command (note : aggregated commands are allowed) */
-        if (argument[0]=='-')
-        {
+        if (argument[0]=='-') {
             argument++;
 
-            while (argument[0]!=0)
-            {
+            while (argument[0]!=0) {
 
                 switch(argument[0])
                 {
@@ -1050,8 +1013,7 @@ int main(int argc, char** argv)
                     argument++;
                     {
                         U32 proba32 = 0;
-                        while ((argument[0]>= '0') && (argument[0]<= '9'))
-                        {
+                        while ((argument[0]>= '0') && (argument[0]<= '9')) {
                             proba32 *= 10;
                             proba32 += argument[0] - '0';
                             argument++;
@@ -1070,8 +1032,7 @@ int main(int argc, char** argv)
                     g_singleRun = 1;
                     argument++;
                     g_params = ZSTD_getParams(2, g_blockSize);
-                    for ( ; ; )
-                    {
+                    for ( ; ; ) {
                         switch(*argument)
                         {
                         case 'w':
@@ -1104,14 +1065,16 @@ int main(int argc, char** argv)
                             while ((*argument>= '0') && (*argument<='9'))
                                 g_params.searchLength *= 10, g_params.searchLength += *argument++ - '0';
                             continue;
-                        case 't':  /* strategy */
-                            g_params.strategy = (ZSTD_strategy)0;
+                        case 't':  /* target length */
+                            g_params.targetLength = 0;
                             argument++;
                             while ((*argument>= '0') && (*argument<='9'))
-                            {
-                                g_params.strategy = (ZSTD_strategy)((U32)g_params.strategy *10);
-                                g_params.strategy = (ZSTD_strategy)((U32)g_params.strategy + *argument++ - '0');
-                            }
+                                g_params.targetLength *= 10, g_params.targetLength += *argument++ - '0';
+                            continue;
+                        case 'S':  /* strategy */
+                            argument++;
+                            while ((*argument>= '0') && (*argument<='9'))
+                                g_params.strategy = (ZSTD_strategy)(*argument++ - '0');
                             continue;
                         case 'L':
                             {
@@ -1132,8 +1095,7 @@ int main(int argc, char** argv)
                 case 'T':
                     argument++;
                     g_target = 0;
-                    while ((*argument >= '0') && (*argument <= '9'))
-                    {
+                    while ((*argument >= '0') && (*argument <= '9')) {
                         g_target *= 10;
                         g_target += *argument - '0';
                         argument++;
@@ -1167,8 +1129,7 @@ int main(int argc, char** argv)
 
     if (filenamesStart==0)
         result = benchSample();
-    else
-    {
+    else {
         if (optimizer)
             result = optimizeForSize(input_filename);
         else
@@ -1179,4 +1140,3 @@ int main(int argc, char** argv)
 
     return result;
 }
-
diff --git a/programs/playTests.sh b/programs/playTests.sh
index 5d641ec..ec625ee 100755
--- a/programs/playTests.sh
+++ b/programs/playTests.sh
@@ -16,28 +16,45 @@ roundTripTest() {
     rm -f tmp1 tmp2
     echo "roundTripTest: ./datagen $1 $p | $ZSTD -v$c | $ZSTD -d"
     ./datagen $1 $p | md5sum > tmp1
-    ./datagen $1 $p | $ZSTD -v$c | $ZSTD -d  | md5sum > tmp2
+    ./datagen $1 $p | $ZSTD -vq$c | $ZSTD -d  | md5sum > tmp2
     diff -q tmp1 tmp2
 }
 
 [ -n "$ZSTD" ] || die "ZSTD variable must be defined!"
 
-printf "\n**** frame concatenation **** "
+
+echo "\n**** simple tests **** "
+./datagen > tmp
+$ZSTD tmp
+$ZSTD -99 tmp && die "too large compression level undetected"
+$ZSTD tmp -c > tmpCompressed
+$ZSTD tmp --stdout > tmpCompressed
+$ZSTD -d tmpCompressed && die "wrong suffix error not detected!"
+$ZSTD -d tmpCompressed -c > tmpResult
+$ZSTD --decompress tmpCompressed -c > tmpResult
+$ZSTD --decompress tmpCompressed --stdout > tmpResult
+$ZSTD -q tmp && die "overwrite check failed!"
+$ZSTD -q -f tmp
+$ZSTD -q --force tmp
+
+
+echo "\n**** frame concatenation **** "
 
 echo "hello " > hello.tmp
 echo "world!" > world.tmp
 cat hello.tmp world.tmp > helloworld.tmp
-$ZSTD hello.tmp > hello.zstd
-$ZSTD world.tmp > world.zstd
+$ZSTD -c hello.tmp > hello.zstd
+$ZSTD -c world.tmp > world.zstd
 cat hello.zstd world.zstd > helloworld.zstd
-$ZSTD -df helloworld.zstd > result.tmp
+$ZSTD -dc helloworld.zstd > result.tmp
 cat result.tmp
 sdiff helloworld.tmp result.tmp
 rm ./*.tmp ./*.zstd
 
 echo frame concatenation test completed
 
-echo "**** flush write error test **** "
+
+echo "\n**** flush write error test **** "
 
 echo "echo foo | $ZSTD > /dev/full"
 echo foo | $ZSTD > /dev/full && die "write error not detected!"
@@ -45,30 +62,52 @@ echo "echo foo | $ZSTD | $ZSTD -d > /dev/full"
 echo foo | $ZSTD | $ZSTD -d > /dev/full && die "write error not detected!"
 
 
-echo "*** dictionary tests *** "
+echo "\n**** dictionary tests **** "
 
 ./datagen > tmpDict
 ./datagen -g1M | md5sum > tmp1
-./datagen -g1M | $ZSTD -D tmpDict | $ZSTD -D tmpDict -dv | md5sum > tmp2
+./datagen -g1M | $ZSTD -D tmpDict | $ZSTD -D tmpDict -dvq | md5sum > tmp2
 diff -q tmp1 tmp2
 
-echo "*** multiple files tests *** "
+echo "\n**** multiple files tests **** "
 
 ./datagen -s1        > tmp1 2> /dev/null
 ./datagen -s2 -g100K > tmp2 2> /dev/null
 ./datagen -s3 -g1M   > tmp3 2> /dev/null
-$ZSTD -f -m tmp*
+$ZSTD -f tmp*
+echo "compress tmp* : "
 ls -ls tmp*
 rm tmp1 tmp2 tmp3
-$ZSTD -df -m *.zst
+echo "decompress tmp* : "
+$ZSTD -df *.zst
 ls -ls tmp*
-$ZSTD -f -m tmp1 notHere tmp2 && die "missing file not detected!"
-rm tmp*
+echo "compress tmp* into stdout > tmpall : "
+$ZSTD -c tmp1 tmp2 tmp3 > tmpall
+ls -ls tmp*
+echo "decompress tmpall* into stdout > tmpdec : "
+cp tmpall tmpall2
+$ZSTD -dc tmpall* > tmpdec
+ls -ls tmp*
+echo "compress multiple files including a missing one (notHere) : "
+$ZSTD -f tmp1 notHere tmp2 && die "missing file not detected!"
 
-echo "**** zstd round-trip tests **** "
+echo "\n**** integrity tests **** "
+echo "test one file (tmp1.zst) "
+$ZSTD -t tmp1.zst
+$ZSTD --test tmp1.zst
+echo "test multiple files (*.zst) "
+$ZSTD -t *.zst
+echo "test good and bad files (*) "
+$ZSTD -t * && die "bad files not detected !"
+
+echo "\n**** zstd round-trip tests **** "
 
 roundTripTest
-roundTripTest '' 6
+roundTripTest -g512K 6    # greedy, hash chain
+roundTripTest -g512K 16   # btlazy2 
+roundTripTest -g512K 19   # btopt
+
+rm tmp*
 
 if [ "$1" != "--test-large-data" ]; then
     echo "Skipping large data tests"
@@ -102,3 +141,6 @@ roundTripTest -g50000000 -P94 19
 
 roundTripTest -g99000000 -P99 20
 roundTripTest -g6000000000 -P99 q
+
+rm tmp*
+
diff --git a/programs/xxhash.c b/programs/xxhash.c
index d33113f..352d1e5 100644
--- a/programs/xxhash.c
+++ b/programs/xxhash.c
@@ -175,7 +175,7 @@ static U64 XXH_read64(const void* memPtr)
     return val;
 }
 
-#endif // XXH_FORCE_DIRECT_MEMORY_ACCESS
+#endif   /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
 
 
 /* ****************************************
diff --git a/programs/zbufftest.c b/programs/zbufftest.c
index f9677f0..aa57b57 100644
--- a/programs/zbufftest.c
+++ b/programs/zbufftest.c
@@ -41,7 +41,7 @@
 #include <sys/timeb.h>   /* timeb */
 #include <string.h>      /* strcmp */
 #include "mem.h"
-#include "zstd_buffered.h"
+#include "zbuff.h"
 #include "zstd.h"        /* ZSTD_compressBound() */
 #include "datagen.h"     /* RDG_genBuffer */
 #include "xxhash.h"      /* XXH64 */
@@ -158,10 +158,9 @@ static int basicUnitTests(U32 seed, double compressibility)
 
     /* Basic compression test */
     DISPLAYLEVEL(4, "test%3i : compress %u bytes : ", testNb++, COMPRESSIBLE_NOISE_LENGTH);
-    ZBUFF_compressInit(zc, 1);
+    ZBUFF_compressInitDictionary(zc, CNBuffer, 128 KB, 1);
     readSize = CNBufferSize;
     genSize = compressedBufferSize;
-    ZBUFF_compressWithDictionary(zc, CNBuffer, 128 KB);
     result = ZBUFF_compressContinue(zc, compressedBuffer, &genSize, CNBuffer, &readSize);
     if (ZBUFF_isError(result)) goto _output_error;
     if (readSize != CNBufferSize) goto _output_error;   /* entire input should be consumed */
@@ -174,8 +173,7 @@ static int basicUnitTests(U32 seed, double compressibility)
 
     /* Basic decompression test */
     DISPLAYLEVEL(4, "test%3i : decompress %u bytes : ", testNb++, COMPRESSIBLE_NOISE_LENGTH);
-    ZBUFF_decompressInit(zd);
-    ZBUFF_decompressWithDictionary(zd, CNBuffer, 128 KB);
+    ZBUFF_decompressInitDictionary(zd, CNBuffer, 128 KB);
     readSize = cSize;
     genSize = CNBufferSize;
     result = ZBUFF_decompressContinue(zd, decodedBuffer, &genSize, compressedBuffer, &readSize);
@@ -318,7 +316,6 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit
         sampleSizeLog = FUZ_rand(&lseed) % maxSrcLog;
         maxTestSize = (size_t)1 << sampleSizeLog;
         maxTestSize += FUZ_rand(&lseed) & (maxTestSize-1);
-        ZBUFF_compressInit(zc, (FUZ_rand(&lseed) % (20 - (sampleSizeLog/3))) + 1);
 
         sampleSizeLog = FUZ_rand(&lseed) % maxSampleLog;
         sampleSize = (size_t)1 << sampleSizeLog;
@@ -326,7 +323,7 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit
         sampleStart = FUZ_rand(&lseed) % (srcBufferSize - sampleSize);
         dict = srcBuffer + sampleStart;
         dictSize = sampleSize;
-        ZBUFF_compressWithDictionary(zc, dict, dictSize);
+        ZBUFF_compressInitDictionary(zc, dict, dictSize, (FUZ_rand(&lseed) % (20 - (sampleSizeLog/3))) + 1);
 
         totalTestSize = 0;
         cSize = 0;
@@ -374,8 +371,7 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit
         crcOrig = XXH64_digest(xxh64);
 
         /* multi - fragments decompression test */
-        ZBUFF_decompressInit(zd);
-        ZBUFF_decompressWithDictionary(zd, dict, dictSize);
+        ZBUFF_decompressInitDictionary(zd, dict, dictSize);
         totalCSize = 0;
         totalGenSize = 0;
         while (totalCSize < cSize)
diff --git a/programs/zstd.1 b/programs/zstd.1
index 8d69c4d..27d607f 100644
--- a/programs/zstd.1
+++ b/programs/zstd.1
@@ -14,7 +14,7 @@
 
 .SH SYNOPSIS
 .TP 5
-\fBzstd\fR [\fBOPTIONS\fR] [-|INPUT-FILE] <OUTPUT-FILE>
+\fBzstd\fR [\fBOPTIONS\fR] [-|INPUT-FILE] [-o <OUTPUT-FILE>]
 .PP
 .B unzstd
 is equivalent to
@@ -28,15 +28,13 @@ is equivalent to
 .SH DESCRIPTION
 .PP
 \fBzstd\fR is a fast lossless compression algorithm.
-It is based on the \fBLZ77\fR family, with FSE & huff0 entropy stage.
-zstd offers compression speed > 200 MB/s per core.
-It also features a fast decoder, with speed > 500 MB/s per core.
+It is based on the \fBLZ77\fR family, with further FSE & huff0 entropy stages.
+\fBzstd\fR offers configurable compression speed, with fast modes at > 200 MB/s per core.
+It also features a very fast decoder, with speed > 500 MB/s per core.
 
 \fBzstd\fR command line is generally similar to gzip, but features the following differences :
  - Original files are preserved
- - By default, \fBzstd file1 file2\fR means : compress file1 \fBinto\fR file2.
-     Use \fB-m\fR command if you want : compress file1 into file1.zstd and file2 into file2.zst
- - By default, when compressing files, \fBzstd\fR displays advancement notification and result summary.
+ - By default, when compressing a single file, \fBzstd\fR displays progress notifications and result summary.
      Use \fB-q\fR to turn them off
 
 
@@ -45,21 +43,19 @@ It also features a fast decoder, with speed > 500 MB/s per core.
 .SH OPTIONS
 .TP
 .B \-#
- # compression level [1-19](default:1)
+ # compression level [1-21] (default:1)
 .TP
-.B \-d
+.BR \-d ", " --decompress
  decompression
 .TP
-.B \-f
- overwrite output without prompting
+.B \-D file
+ use `file` as Dictionary to compress or decompress FILE(s)
 .TP
-.BR \-m ", " --multiple
- multiple files mode
- In this mode, multiple files on the command line means compression or decompression of each named file
- Notifications are also turned off by default
+.B \-o file
+ save result into `file` (only possible with a single input FILE)
 .TP
-.B \-D
- Use next file as dictionary content for compress / decompression
+.BR \-f ", " --force
+ overwrite output without prompting
 .TP
 .BR \-h/\-H ", " --help
  display help/long help and exit
@@ -73,17 +69,47 @@ It also features a fast decoder, with speed > 500 MB/s per core.
 .BR \-q ", " --quiet
  suppress warnings and notifications; specify twice to suppress errors too
 .TP
-.B \-c 
+.BR \-c ", " --stdout
  force write to standard output, even if it is the console
+
+.SH DICTIONARY
+.PP
+\fBzstd\fR offers \fIdictionary\fR compression, useful for very small files and messages.
+It's possible to train \fBzstd\fR with some samples, the result of which is saved into a file called `dictionary`.
+Then, during compression and decompression, reference the same dictionary (see \fB-D\fR).
+It will improve compression ratio of small files.
+Typical gains range from ~10% (at 64KB) to x5 better (at <1KB).
+.TP
+.B \--train FILEs
+ use FILEs as training set to create a dictionary.
+ The training set should contain a lot of small files (> 100),
+ and weigh typically 100x the target dictionary size
+ (for example, 10 MB for a 100 KB dictionary)
+.TP
+.B \-o file
+ dictionary saved into `file` (default: dictionary)
 .TP
-.B \-z
- force compression
+.B \--maxdict #
+ limit dictionary to specified size (default : 112640) 
+.TP
+.B \-s#
+ dictionary selectivity level (default: 9)
+ the smaller the value, the denser the dictionary, improving its efficiency but reducing its possible maximum size.
+
+.SH BENCHMARK
 .TP
 .B \-b#
  benchmark file(s) using compression level #
 .TP
 .B \-i#
  iteration loops [1-9](default : 3), benchmark mode only
+.TP
+.B \-B#
+ cut file into independent blocks of size # (default: no block)
+.TP
+.B \-r#
+ test all compression levels from 1 to # (default: disabled)
+
 
 .SH BUGS
 Report bugs at:- https://github.com/Cyan4973/zstd/issues
diff --git a/programs/zstdcli.c b/programs/zstdcli.c
index 564686c..abe1301 100644
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@@ -1,6 +1,6 @@
 /*
   zstdcli - Command Line Interface (cli) for zstd
-  Copyright (C) Yann Collet 2014-2015
+  Copyright (C) Yann Collet 2014-2016
 
   GPL v2 License
 
@@ -19,25 +19,23 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
   You can contact the author at :
-  - zstd source repository : https://github.com/Cyan4973/zstd
-  - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
+  - zstd homepage : http://www.zstd.net/
 */
 /*
-  Note : this is user program.
-  It is not part of zstd compression library.
-  The license of this compression CLI program is GPLv2.
-  The license of zstd library is BSD.
+  Note : this is a user program, not part of libzstd.
+  The license of this command line program is GPLv2.
+  The license of libzstd is BSD.
 */
 
 
-/**************************************
+/*-************************************
 *  Compiler Options
 **************************************/
 #define _CRT_SECURE_NO_WARNINGS  /* Visual : removes warning from strcpy */
 #define _POSIX_SOURCE 1          /* triggers fileno() within <stdio.h> on unix */
 
 
-/**************************************
+/*-************************************
 *  Includes
 **************************************/
 #include <stdio.h>    /* fprintf, getchar */
@@ -47,18 +45,18 @@
 #ifndef ZSTD_NOBENCH
 #  include "bench.h"  /* BMK_benchFiles, BMK_SetNbIterations */
 #endif
-#include "zstd.h"     /* ZSTD version numbers */
+#include "zstd_static.h" /* ZSTD_maxCLevel, ZSTD version numbers  */
+#ifndef ZSTD_NODICT
+#  include "dibio.h"  /* dictionary builder */
+#endif
 
 
-/**************************************
+/*-************************************
 *  OS-specific Includes
 **************************************/
 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__)
 #  include <fcntl.h>    /* _O_BINARY */
 #  include <io.h>       /* _setmode, _isatty */
-#  ifdef __MINGW32__
-   /* int _fileno(FILE *stream);   // seems no longer useful // MINGW somehow forgets to include this windows declaration into <stdio.h> */
-#  endif
 #  define SET_BINARY_MODE(file) _setmode(_fileno(file), _O_BINARY)
 #  define IS_CONSOLE(stdStream) _isatty(_fileno(stdStream))
 #else
@@ -68,7 +66,7 @@
 #endif
 
 
-/**************************************
+/*-************************************
 *  Constants
 **************************************/
 #define COMPRESSOR_NAME "zstd command line interface"
@@ -78,7 +76,8 @@
 #  define ZSTD_VERSION "v" EXPAND_AND_QUOTE(ZSTD_VERSION_MAJOR) "." EXPAND_AND_QUOTE(ZSTD_VERSION_MINOR) "." EXPAND_AND_QUOTE(ZSTD_VERSION_RELEASE)
 #endif
 #define AUTHOR "Yann Collet"
-#define WELCOME_MESSAGE "*** %s %i-bits %s, by %s (%s) ***\n", COMPRESSOR_NAME, (int)(sizeof(void*)*8), ZSTD_VERSION, AUTHOR, __DATE__
+#define WELCOME_MESSAGE "*** %s %i-bits %s, by %s ***\n", COMPRESSOR_NAME, (int)(sizeof(void*)*8), ZSTD_VERSION, AUTHOR
+
 #define ZSTD_EXTENSION ".zst"
 #define ZSTD_CAT "zstdcat"
 #define ZSTD_UNZSTD "unzstd"
@@ -87,45 +86,36 @@
 #define MB *(1 <<20)
 #define GB *(1U<<30)
 
+static const char* g_defaultDictName = "dictionary";
+static const unsigned g_defaultMaxDictSize = 110 KB;
+static const unsigned g_defaultDictCLevel = 5;
+static const unsigned g_defaultSelectivityLevel = 9;
+
 
-/**************************************
+/*-************************************
 *  Display Macros
 **************************************/
 #define DISPLAY(...)           fprintf(displayOut, __VA_ARGS__)
 #define DISPLAYLEVEL(l, ...)   if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
 static FILE* displayOut;
-static unsigned displayLevel = 2;   // 0 : no display  // 1: errors  // 2 : + result + interaction + warnings ;  // 3 : + progression;  // 4 : + information
-
-
-/**************************************
-*  Exceptions
-**************************************/
-#define DEBUG 0
-#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
-#define EXM_THROW(error, ...)                                             \
-{                                                                         \
-    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
-    DISPLAYLEVEL(1, "Error %i : ", error);                                \
-    DISPLAYLEVEL(1, __VA_ARGS__);                                         \
-    DISPLAYLEVEL(1, "\n");                                                \
-    exit(error);                                                          \
-}
+static unsigned displayLevel = 2;   /* 0 : no display,  1: errors,  2 : + result + interaction + warnings,  3 : + progression,  4 : + information */
 
 
-/**************************************
+/*-************************************
 *  Command Line
 **************************************/
 static int usage(const char* programName)
 {
     DISPLAY( "Usage :\n");
-    DISPLAY( "      %s [arg] [input] [output]\n", programName);
+    DISPLAY( "      %s [args] [FILE(s)] [-o file]\n", programName);
     DISPLAY( "\n");
-    DISPLAY( "input   : a filename\n");
+    DISPLAY( "FILE    : a filename\n");
     DISPLAY( "          with no FILE, or when FILE is - , read standard input\n");
     DISPLAY( "Arguments :\n");
-    DISPLAY( " -#     : # compression level (1-19, default:1) \n");
-    DISPLAY( " -d     : decompression (default for %s extension)\n", ZSTD_EXTENSION);
-    //DISPLAY( " -z     : force compression\n");
+    DISPLAY( " -#     : # compression level (1-%u, default:1) \n", ZSTD_maxCLevel());
+    DISPLAY( " -d     : decompression \n");
+    DISPLAY( " -D file: use `file` as Dictionary \n");
+    DISPLAY( " -o file: result stored into `file` (only if 1 input file) \n");
     DISPLAY( " -f     : overwrite output without prompting \n");
     DISPLAY( " -h/-H  : display help/long help and exit\n");
     return 0;
@@ -138,17 +128,23 @@ static int usage_advanced(const char* programName)
     DISPLAY( "\n");
     DISPLAY( "Advanced arguments :\n");
     DISPLAY( " -V     : display Version number and exit\n");
+    DISPLAY( " -t     : test compressed file integrity \n");
     DISPLAY( " -v     : verbose mode\n");
     DISPLAY( " -q     : suppress warnings; specify twice to suppress errors too\n");
-    DISPLAY( " -m     : multiple input filenames mode \n");
     DISPLAY( " -c     : force write to standard output, even if it is the console\n");
-    DISPLAY( " -D file: use file content as Dictionary \n");
+#ifndef ZSTD_NODICT
+    DISPLAY( "Dictionary builder :\n");
+    DISPLAY( "--train : create a dictionary from a training set of files \n");
+    DISPLAY( " -o file: `file` is dictionary name (default: %s) \n", g_defaultDictName);
+    DISPLAY( "--maxdict:limit dictionary to specified size (default : %u) \n", g_defaultMaxDictSize);
+    DISPLAY( " -s#    : dictionary selectivity level (default: %u)\n", g_defaultSelectivityLevel);
+#endif
 #ifndef ZSTD_NOBENCH
     DISPLAY( "Benchmark arguments :\n");
     DISPLAY( " -b#    : benchmark file(s), using # compression level (default : 1) \n");
-    DISPLAY( " -B#    : cut file into independent blocks of size # (default : no block)\n");
     DISPLAY( " -i#    : iteration loops [1-9](default : 3)\n");
-    DISPLAY( " -r#    : test all compression levels from 1 to # (default : disabled)\n");
+    DISPLAY( " -B#    : cut file into independent blocks of size # (default: no block)\n");
+    DISPLAY( " -r#    : test all compression levels from 1 to # (default: disabled)\n");
 #endif
     return 0;
 }
@@ -178,8 +174,10 @@ int main(int argCount, const char** argv)
         forceStdout=0,
         main_pause=0,
         nextEntryIsDictionary=0,
-        multiple=0,
-        operationResult=0;
+        operationResult=0,
+        dictBuild=0,
+        nextArgumentIsOutFileName=0,
+        nextArgumentIsMaxDict=0;
     unsigned cLevel = 1;
     const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*));   /* argCount >= 1 */
     unsigned filenameIdx = 0;
@@ -187,11 +185,13 @@ int main(int argCount, const char** argv)
     const char* outFileName = NULL;
     const char* dictFileName = NULL;
     char* dynNameSpace = NULL;
-    const char extension[] = ZSTD_EXTENSION;
     int rangeBench = 1;
+    unsigned maxDictSize = g_defaultMaxDictSize;
+    unsigned dictCLevel = g_defaultDictCLevel;
+    unsigned dictSelect = g_defaultSelectivityLevel;
 
     /* init */
-    (void)rangeBench;   /* not used when ZSTD_NOBENCH set */
+    (void)rangeBench; (void)dictCLevel;   /* not used when ZSTD_NOBENCH / ZSTD_NODICT set */
     if (filenameTable==NULL) { DISPLAY("not enough memory\n"); exit(1); }
     displayOut = stderr;
     /* Pick out program name from path. Don't rely on stdlib because of conflicting behavior */
@@ -203,43 +203,46 @@ int main(int argCount, const char** argv)
     if (!strcmp(programName, ZSTD_CAT)) { decode=1; forceStdout=1; displayLevel=1; outFileName=stdoutmark; }
 
     /* command switches */
-    for(i=1; i<argCount; i++)
-    {
+    for(i=1; i<argCount; i++) {
         const char* argument = argv[i];
-
         if(!argument) continue;   /* Protection if argument empty */
 
         /* long commands (--long-word) */
+        if (!strcmp(argument, "--decompress")) { decode=1; continue; }
+        if (!strcmp(argument, "--force")) {  FIO_overwriteMode(); continue; }
         if (!strcmp(argument, "--version")) { displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; }
         if (!strcmp(argument, "--help")) { displayOut=stdout; return usage_advanced(programName); }
-        if (!strcmp(argument, "--multiple")) { multiple=1; continue; }
         if (!strcmp(argument, "--verbose")) { displayLevel=4; continue; }
         if (!strcmp(argument, "--quiet")) { displayLevel--; continue; }
+        if (!strcmp(argument, "--stdout")) { forceStdout=1; outFileName=stdoutmark; displayLevel=1; continue; }
+        if (!strcmp(argument, "--test")) { decode=1; outFileName=nulmark; FIO_overwriteMode(); continue; }
+        if (!strcmp(argument, "--train")) { dictBuild=1; outFileName=g_defaultDictName; continue; }
+        if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; continue; }
+        if (!strcmp(argument, "--keep")) { continue; }   /* does nothing, since preserving input is default; for gzip/xz compatibility */
+
+        /* '-' means stdin/stdout */
+        if (!strcmp(argument, "-")){
+            if (!filenameIdx) { filenameIdx=1, filenameTable[0]=stdinmark; continue; }
+            outFileName=stdoutmark; continue;
+        }
 
         /* Decode commands (note : aggregated commands are allowed) */
-        if (argument[0]=='-')
-        {
-            /* '-' means stdin/stdout */
-            if (argument[1]==0)
-            {
-                if (!filenameIdx) { filenameIdx=1, filenameTable[0]=stdinmark; continue; }
-                outFileName=stdoutmark; continue;
-            }
-
+        if (argument[0]=='-') {
             argument++;
 
-            while (argument[0]!=0)
-            {
+            while (argument[0]!=0) {
+
                 /* compression Level */
-                if ((*argument>='0') && (*argument<='9'))
-                {
+                if ((*argument>='0') && (*argument<='9')) {
                     cLevel = 0;
-                    while ((*argument >= '0') && (*argument <= '9'))
-                    {
+                    while ((*argument >= '0') && (*argument <= '9')) {
                         cLevel *= 10;
                         cLevel += *argument - '0';
                         argument++;
                     }
+                    dictCLevel = cLevel;
+                    if (dictCLevel > ZSTD_maxCLevel())
+                        return badusage(programName);
                     continue;
                 }
 
@@ -250,24 +253,15 @@ int main(int argCount, const char** argv)
                 case 'H':
                 case 'h': displayOut=stdout; return usage_advanced(programName);
 
-                    /* Compression (default) */
-                //case 'z': forceCompress = 1; break;
-
-                    /* Decoding */
+                     /* Decoding */
                 case 'd': decode=1; argument++; break;
 
-                    /* Multiple input files */
-                case 'm': multiple=1; argument++; break;
-
                     /* Force stdout, even if stdout==console */
                 case 'c': forceStdout=1; outFileName=stdoutmark; displayLevel=1; argument++; break;
 
                     /* Use file content as dictionary */
                 case 'D': nextEntryIsDictionary = 1; argument++; break;
 
-                    /* Test -- not implemented */
-                /* case 't': decode=1; LZ4IO_setOverwrite(1); output_filename=nulmark; break; */
-
                     /* Overwrite */
                 case 'f': FIO_overwriteMode(); argument++; break;
 
@@ -280,6 +274,12 @@ int main(int argCount, const char** argv)
                     /* keep source file (default anyway, so useless; for gzip/xz compatibility) */
                 case 'k': argument++; break;
 
+                    /* test compressed file */
+                case 't': decode=1; outFileName=nulmark; FIO_overwriteMode(); argument++; break;
+
+                    /* dictionary name */
+                case 'o': nextArgumentIsOutFileName=1; argument++; break;
+
 #ifndef ZSTD_NOBENCH
                     /* Benchmark */
                 case 'b': bench=1; argument++; break;
@@ -316,6 +316,13 @@ int main(int argCount, const char** argv)
                         break;
 #endif   /* ZSTD_NOBENCH */
 
+                    /* Selection level */
+                case 's': argument++;
+                    dictSelect = 0;
+                    while ((*argument >= '0') && (*argument <= '9'))
+                        dictSelect *= 10, dictSelect += *argument++ - '0';
+                    break;
+
                     /* Pause at the end (hidden option) */
                 case 'p': main_pause=1; argument++; break;
 
@@ -326,14 +333,29 @@ int main(int argCount, const char** argv)
             continue;
         }
 
-        /* dictionary */
-        if (nextEntryIsDictionary)
-        {
+        if (nextEntryIsDictionary) {
             nextEntryIsDictionary = 0;
             dictFileName = argument;
             continue;
         }
 
+        if (nextArgumentIsOutFileName) {
+            nextArgumentIsOutFileName = 0;
+            outFileName = argument;
+            if (!strcmp(outFileName, "-")) outFileName = stdoutmark;
+            continue;
+        }
+
+        if (nextArgumentIsMaxDict) {
+            nextArgumentIsMaxDict = 0;
+            maxDictSize = 0;
+            while ((*argument>='0') && (*argument<='9'))
+                maxDictSize = maxDictSize * 10 + (*argument - '0'), argument++;
+            if (*argument=='k' || *argument=='K')
+                maxDictSize <<= 10;
+            continue;
+        }
+
         /* add filename to list */
         filenameTable[filenameIdx++] = argument;
     }
@@ -342,81 +364,54 @@ int main(int argCount, const char** argv)
     DISPLAYLEVEL(3, WELCOME_MESSAGE);
 
     /* Check if benchmark is selected */
-    if (bench)
-    {
+    if (bench) {
 #ifndef ZSTD_NOBENCH
         BMK_benchFiles(filenameTable, filenameIdx, dictFileName, cLevel*rangeBench);
 #endif
         goto _end;
     }
 
-    /* No input filename ==> use stdin */
-    if(!filenameIdx) filenameIdx=1, filenameTable[0]=stdinmark;
-
-    /* Check if input defined as console; trigger an error in this case */
-    if (!strcmp(filenameTable[0], stdinmark) && IS_CONSOLE(stdin) ) return badusage(programName);
-
-    /* No output filename ==> try to select one automatically (when possible) */
-    if (filenameIdx>=2) outFileName = filenameTable[1];
-    while (!outFileName)   /* while : just to allow break statement */
-    {
-        if (!IS_CONSOLE(stdout)) { outFileName=stdoutmark; break; }   /* Default to stdout whenever possible (i.e. not a console) */
-        if (!decode)   /* compression to file */
-        {
-            size_t l = strlen(filenameTable[0]);
-            dynNameSpace = (char*)calloc(1,l+5);
-            if (dynNameSpace==NULL) { DISPLAY("not enough memory\n"); exit(1); }
-            strcpy(dynNameSpace, filenameTable[0]);
-            strcpy(dynNameSpace+l, ZSTD_EXTENSION);
-            outFileName = dynNameSpace;
-            DISPLAYLEVEL(2, "Compressed filename will be : %s \n", outFileName);
-            break;
-        }
-        /* decompression to file (automatic name will work only if input filename has correct format extension) */
-        {
-            size_t filenameSize = strlen(filenameTable[0]);
-            if (strcmp(filenameTable[0] + (filenameSize-4), extension))
-            {
-                 DISPLAYLEVEL(1, "unknown suffix - cannot determine destination filename\n");
-                 return badusage(programName);
-            }
-            dynNameSpace = (char*)calloc(1,filenameSize+1);
-            if (dynNameSpace==NULL) { DISPLAY("not enough memory\n"); exit(1); }
-            outFileName = dynNameSpace;
-            strcpy(dynNameSpace, filenameTable[0]);
-            dynNameSpace[filenameSize-4]=0;
-            DISPLAYLEVEL(2, "Decoding file %s \n", outFileName);
-        }
+    /* Check if dictionary builder is selected */
+    if (dictBuild) {
+#ifndef ZSTD_NODICT
+        ZDICT_params_t dictParams;
+        dictParams.compressionLevel = dictCLevel;
+        dictParams.selectivityLevel = dictSelect;
+        dictParams.notificationLevel = displayLevel;
+        DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, dictParams);
+#endif
+        goto _end;
     }
 
-    /* Check if output is defined as console; trigger an error in this case */
-    if (!strcmp(outFileName,stdoutmark) && IS_CONSOLE(stdout) && !forceStdout) return badusage(programName);
+    /* No input filename ==> use stdin and stdout */
+    if(!filenameIdx) filenameIdx=1, filenameTable[0]=stdinmark, outFileName=stdoutmark;
 
-    /* No warning message in pure pipe mode (stdin + stdout) or multiple mode */
-    if (!strcmp(filenameTable[0], stdinmark) && !strcmp(outFileName,stdoutmark) && (displayLevel==2)) displayLevel=1;
-    if (multiple && (displayLevel==2)) displayLevel=1;
+    /* Check if input/output defined as console; trigger an error in this case */
+    if (!strcmp(filenameTable[0], stdinmark) && IS_CONSOLE(stdin) ) return badusage(programName);
+    if (outFileName && !strcmp(outFileName, stdoutmark) && IS_CONSOLE(stdout) && !forceStdout) return badusage(programName);
 
-    if ((!multiple) && (filenameIdx>2))
-    {
-        DISPLAY("Too many files on the command line (%u > 2). Do you mean -m ? \n", filenameIdx);
+    /* user-selected output filename, only possible with a single file */
+    if (outFileName && strcmp(outFileName,stdoutmark) && strcmp(outFileName,nulmark) && (filenameIdx>1)) {
+        DISPLAY("Too many files (%u) on the command line. \n", filenameIdx);
         return filenameIdx;
     }
 
+    /* No warning message in pipe mode (stdin + stdout) or multiple mode */
+    if (!strcmp(filenameTable[0], stdinmark) && !strcmp(outFileName,stdoutmark) && (displayLevel==2)) displayLevel=1;
+    if ((filenameIdx>1) && (displayLevel==2)) displayLevel=1;
+
     /* IO Stream/File */
     FIO_setNotificationLevel(displayLevel);
-    if (decode)
-    {
-      if (multiple)
-        operationResult = FIO_decompressMultipleFilenames(filenameTable, filenameIdx, ZSTD_EXTENSION, dictFileName);
-      else
+    if (decode) {
+      if (filenameIdx==1 && outFileName)
         operationResult = FIO_decompressFilename(outFileName, filenameTable[0], dictFileName);
-    }
-    else
-    {
-        if (multiple)
-          operationResult = FIO_compressMultipleFilenames(filenameTable, filenameIdx, ZSTD_EXTENSION, dictFileName, cLevel);
-        else
+      else
+        operationResult = FIO_decompressMultipleFilenames(filenameTable, filenameIdx, outFileName ? outFileName : ZSTD_EXTENSION, dictFileName);
+    } else {  /* compression */
+        if (filenameIdx==1 && outFileName)
           operationResult = FIO_compressFilename(outFileName, filenameTable[0], dictFileName, cLevel);
+        else
+          operationResult = FIO_compressMultipleFilenames(filenameTable, filenameIdx, outFileName ? outFileName : ZSTD_EXTENSION, dictFileName, cLevel);
     }
 
 _end:
diff --git a/visual/2013/fullbench/fullbench.vcxproj b/visual/2013/fullbench/fullbench.vcxproj
index c0d7376..3797960 100644
--- a/visual/2013/fullbench/fullbench.vcxproj
+++ b/visual/2013/fullbench/fullbench.vcxproj
@@ -161,9 +161,6 @@
   <ItemGroup>
     <ClCompile Include="..\..\..\lib\fse.c" />
     <ClCompile Include="..\..\..\lib\huff0.c" />
-    <ClCompile Include="..\..\..\lib\legacy\zstd_v01.c" />
-    <ClCompile Include="..\..\..\lib\legacy\zstd_v02.c" />
-    <ClCompile Include="..\..\..\lib\legacy\zstd_v03.c" />
     <ClCompile Include="..\..\..\lib\zstd_compress.c" />
     <ClCompile Include="..\..\..\lib\zstd_decompress.c" />
     <ClCompile Include="..\..\..\programs\datagen.c" />
diff --git a/visual/2013/fullbench/fullbench.vcxproj.filters b/visual/2013/fullbench/fullbench.vcxproj.filters
index c3db197..3a82000 100644
--- a/visual/2013/fullbench/fullbench.vcxproj.filters
+++ b/visual/2013/fullbench/fullbench.vcxproj.filters
@@ -24,24 +24,15 @@
     <ClCompile Include="..\..\..\programs\datagen.c">
       <Filter>Fichiers sources</Filter>
     </ClCompile>
-    <ClCompile Include="..\..\..\lib\legacy\zstd_v01.c">
-      <Filter>Fichiers sources</Filter>
-    </ClCompile>
     <ClCompile Include="..\..\..\lib\huff0.c">
       <Filter>Fichiers sources</Filter>
     </ClCompile>
-    <ClCompile Include="..\..\..\lib\legacy\zstd_v02.c">
-      <Filter>Fichiers sources</Filter>
-    </ClCompile>
     <ClCompile Include="..\..\..\lib\zstd_compress.c">
       <Filter>Fichiers sources</Filter>
     </ClCompile>
     <ClCompile Include="..\..\..\lib\zstd_decompress.c">
       <Filter>Fichiers sources</Filter>
     </ClCompile>
-    <ClCompile Include="..\..\..\lib\legacy\zstd_v03.c">
-      <Filter>Fichiers sources</Filter>
-    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\..\lib\fse.h">
diff --git a/visual/2013/zstd.sln b/visual/2013/zstd.sln
index 4f2447f..3186fc6 100644
--- a/visual/2013/zstd.sln
+++ b/visual/2013/zstd.sln
@@ -1,7 +1,7 @@
 
 Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio 14
-VisualStudioVersion = 14.0.24720.0
+# Visual Studio 2013
+VisualStudioVersion = 12.0.40629.0
 MinimumVisualStudioVersion = 10.0.40219.1
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "zstd", "zstd\zstd.vcxproj", "{4E52A41A-F33B-4C7A-8C36-A1A6B4F4277C}"
 EndProject
diff --git a/visual/2013/zstd/zstd.vcxproj b/visual/2013/zstd/zstd.vcxproj
index 2ab1285..4531976 100644
--- a/visual/2013/zstd/zstd.vcxproj
+++ b/visual/2013/zstd/zstd.vcxproj
@@ -19,22 +19,27 @@
     </ProjectConfiguration>
   </ItemGroup>
   <ItemGroup>
+    <ClCompile Include="..\..\..\lib\divsufsort.c" />
     <ClCompile Include="..\..\..\lib\fse.c" />
     <ClCompile Include="..\..\..\lib\huff0.c" />
     <ClCompile Include="..\..\..\lib\legacy\zstd_v01.c" />
     <ClCompile Include="..\..\..\lib\legacy\zstd_v02.c" />
     <ClCompile Include="..\..\..\lib\legacy\zstd_v03.c" />
-    <ClCompile Include="..\..\..\lib\zstd_buffered.c" />
+    <ClCompile Include="..\..\..\lib\legacy\zstd_v04.c" />
+    <ClCompile Include="..\..\..\lib\zbuff.c" />
+    <ClCompile Include="..\..\..\lib\zdict.c" />
     <ClCompile Include="..\..\..\lib\zstd_compress.c" />
     <ClCompile Include="..\..\..\lib\zstd_decompress.c" />
     <ClCompile Include="..\..\..\programs\bench.c" />
     <ClCompile Include="..\..\..\programs\datagen.c" />
+    <ClCompile Include="..\..\..\programs\dibio.c" />
     <ClCompile Include="..\..\..\programs\fileio.c" />
     <ClCompile Include="..\..\..\programs\legacy\fileio_legacy.c" />
     <ClCompile Include="..\..\..\programs\xxhash.c" />
     <ClCompile Include="..\..\..\programs\zstdcli.c" />
   </ItemGroup>
   <ItemGroup>
+    <ClInclude Include="..\..\..\lib\divsufsort.h" />
     <ClInclude Include="..\..\..\lib\fse.h" />
     <ClInclude Include="..\..\..\lib\fse_static.h" />
     <ClInclude Include="..\..\..\lib\huff0.h" />
@@ -43,6 +48,11 @@
     <ClInclude Include="..\..\..\lib\legacy\zstd_v01.h" />
     <ClInclude Include="..\..\..\lib\legacy\zstd_v02.h" />
     <ClInclude Include="..\..\..\lib\legacy\zstd_v03.h" />
+    <ClInclude Include="..\..\..\lib\legacy\zstd_v04.h" />
+    <ClInclude Include="..\..\..\lib\zbuff.h" />
+    <ClInclude Include="..\..\..\lib\zbuff_static.h" />
+    <ClInclude Include="..\..\..\lib\zdict.h" />
+    <ClInclude Include="..\..\..\lib\zdict_static.h" />
     <ClInclude Include="..\..\..\lib\zstd.h" />
     <ClInclude Include="..\..\..\lib\zstd_buffered.h" />
     <ClInclude Include="..\..\..\lib\zstd_buffered_static.h" />
@@ -50,6 +60,7 @@
     <ClInclude Include="..\..\..\lib\zstd_static.h" />
     <ClInclude Include="..\..\..\programs\bench.h" />
     <ClInclude Include="..\..\..\programs\datagen.h" />
+    <ClInclude Include="..\..\..\programs\dibio.h" />
     <ClInclude Include="..\..\..\programs\fileio.h" />
     <ClInclude Include="..\..\..\programs\legacy\fileio_legacy.h" />
     <ClInclude Include="..\..\..\programs\xxhash.h" />
diff --git a/visual/2013/zstd/zstd.vcxproj.filters b/visual/2013/zstd/zstd.vcxproj.filters
index 49bae35..31a9780 100644
--- a/visual/2013/zstd/zstd.vcxproj.filters
+++ b/visual/2013/zstd/zstd.vcxproj.filters
@@ -48,15 +48,27 @@
     <ClCompile Include="..\..\..\lib\zstd_decompress.c">
       <Filter>Fichiers sources</Filter>
     </ClCompile>
-    <ClCompile Include="..\..\..\lib\zstd_buffered.c">
-      <Filter>Fichiers sources</Filter>
-    </ClCompile>
     <ClCompile Include="..\..\..\lib\legacy\zstd_v03.c">
       <Filter>Fichiers sources</Filter>
     </ClCompile>
     <ClCompile Include="..\..\..\programs\datagen.c">
       <Filter>Fichiers sources</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\..\lib\legacy\zstd_v04.c">
+      <Filter>Fichiers sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\..\lib\divsufsort.c">
+      <Filter>Fichiers sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\..\lib\zbuff.c">
+      <Filter>Fichiers sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\..\lib\zdict.c">
+      <Filter>Fichiers sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\..\programs\dibio.c">
+      <Filter>Fichiers sources</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\..\lib\fse.h">
@@ -113,5 +125,26 @@
     <ClInclude Include="..\..\..\programs\datagen.h">
       <Filter>Fichiers d%27en-tête</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\..\lib\legacy\zstd_v04.h">
+      <Filter>Fichiers d%27en-tête</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\lib\divsufsort.h">
+      <Filter>Fichiers sources</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\lib\zbuff.h">
+      <Filter>Fichiers sources</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\lib\zbuff_static.h">
+      <Filter>Fichiers sources</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\lib\zdict.h">
+      <Filter>Fichiers sources</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\lib\zdict_static.h">
+      <Filter>Fichiers sources</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\programs\dibio.h">
+      <Filter>Fichiers d%27en-tête</Filter>
+    </ClInclude>
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/visual/2013/zstdlib/zstdlib.vcxproj b/visual/2013/zstdlib/zstdlib.vcxproj
index a580048..b13bc98 100644
--- a/visual/2013/zstdlib/zstdlib.vcxproj
+++ b/visual/2013/zstdlib/zstdlib.vcxproj
@@ -21,20 +21,22 @@
   <ItemGroup>
     <ClCompile Include="..\..\..\lib\fse.c" />
     <ClCompile Include="..\..\..\lib\huff0.c" />
-    <ClCompile Include="..\..\..\lib\zstd_buffered.c" />
+    <ClCompile Include="..\..\..\lib\zbuff.c" />
     <ClCompile Include="..\..\..\lib\zstd_compress.c" />
     <ClCompile Include="..\..\..\lib\zstd_decompress.c" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\..\lib\bitstream.h" />
+    <ClInclude Include="..\..\..\lib\error_private.h" />
+    <ClInclude Include="..\..\..\lib\error_public.h" />
     <ClInclude Include="..\..\..\lib\fse.h" />
     <ClInclude Include="..\..\..\lib\fse_static.h" />
     <ClInclude Include="..\..\..\lib\huff0.h" />
     <ClInclude Include="..\..\..\lib\huff0_static.h" />
     <ClInclude Include="..\..\..\lib\mem.h" />
+    <ClInclude Include="..\..\..\lib\zbuff.h" />
+    <ClInclude Include="..\..\..\lib\zbuff_static.h" />
     <ClInclude Include="..\..\..\lib\zstd.h" />
-    <ClInclude Include="..\..\..\lib\zstd_buffered.h" />
-    <ClInclude Include="..\..\..\lib\zstd_buffered_static.h" />
     <ClInclude Include="..\..\..\lib\zstd_internal.h" />
     <ClInclude Include="..\..\..\lib\zstd_static.h" />
     <ClInclude Include="resource.h" />
diff --git a/visual/2013/zstdlib/zstdlib.vcxproj.filters b/visual/2013/zstdlib/zstdlib.vcxproj.filters
index 8600c5f..ffb457b 100644
--- a/visual/2013/zstdlib/zstdlib.vcxproj.filters
+++ b/visual/2013/zstdlib/zstdlib.vcxproj.filters
@@ -21,15 +21,15 @@
     <ClCompile Include="..\..\..\lib\huff0.c">
       <Filter>Source Files</Filter>
     </ClCompile>
-    <ClCompile Include="..\..\..\lib\zstd_buffered.c">
-      <Filter>Source Files</Filter>
-    </ClCompile>
     <ClCompile Include="..\..\..\lib\zstd_compress.c">
       <Filter>Source Files</Filter>
     </ClCompile>
     <ClCompile Include="..\..\..\lib\zstd_decompress.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\..\lib\zbuff.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\..\lib\fse.h">
@@ -59,13 +59,19 @@
     <ClInclude Include="..\..\..\lib\zstd_internal.h">
       <Filter>Header Files</Filter>
     </ClInclude>
-    <ClInclude Include="..\..\..\lib\zstd_buffered.h">
+    <ClInclude Include="..\..\..\lib\mem.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\lib\error_private.h">
       <Filter>Header Files</Filter>
     </ClInclude>
-    <ClInclude Include="..\..\..\lib\zstd_buffered_static.h">
+    <ClInclude Include="..\..\..\lib\error_public.h">
       <Filter>Header Files</Filter>
     </ClInclude>
-    <ClInclude Include="..\..\..\lib\mem.h">
+    <ClInclude Include="..\..\..\lib\zbuff.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\lib\zbuff_static.h">
       <Filter>Header Files</Filter>
     </ClInclude>
   </ItemGroup>

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/libzstd.git