[med-svn] [gmap] 08/11: Imported Upstream version 2015-07-23

Alex Mestiashvili malex-guest at moszumanska.debian.org
Sat Aug 22 06:25:59 UTC 2015


This is an automated email from the git hooks/post-receive script.

malex-guest pushed a commit to branch master
in repository gmap.

commit 1f0c49c841cbde61523d576a683b91f91c9846d7
Author: Alexandre Mestiashvili <alex at biotec.tu-dresden.de>
Date:   Fri Aug 21 20:22:34 2015 +0200

    Imported Upstream version 2015-07-23
---
 ChangeLog               |  117 ++
 Makefile.in             |    1 +
 README                  |   25 +-
 VERSION                 |    2 +-
 acinclude.m4            |    1 +
 config/shm-flags.m4     |   18 +
 configure               |   93 +-
 configure.ac            |    1 +
 mpi/Makefile.in         |    1 +
 src/Makefile.am         |    4 +-
 src/Makefile.in         |    5 +-
 src/access.c            |   20 +-
 src/bigendian.c         |  528 ++----
 src/bigendian.h         |   28 +-
 src/bitpack64-access.c  |  712 ++++----
 src/bitpack64-read.c    | 4234 +++++++++++++++++++++++++++++------------------
 src/bitpack64-readtwo.c | 4128 +++++++++++++++++++++++++--------------------
 src/bytecoding.c        |  154 +-
 src/bytecoding.h        |    5 +-
 src/compress.c          |  315 ++--
 src/compress.h          |    8 +-
 src/config.h.in         |    3 +
 src/dynprog_genome.c    |   50 +-
 src/genome-write.c      |    6 +-
 src/genome.c            |  124 +-
 src/genome.h            |    5 +-
 src/genome128_hr.c      | 2188 +++++++++++++-----------
 src/get-genome.c        |   73 +-
 src/gmap.c              |   64 +-
 src/gsnap.c             |  102 +-
 src/iit-read-univ.c     |   28 +-
 src/indexdb.c           |   61 +-
 src/indexdb_hr.c        |   18 +-
 src/mode.h              |    4 +-
 src/oligoindex_hr.c     | 2448 ++++++++++++++++++---------
 src/sarray-read.c       |  261 ++-
 src/sarray-write.c      |   67 +-
 src/sequence.c          |   42 +-
 src/sequence.h          |    6 +-
 src/snpindex.c          |   64 +-
 src/splice.c            |   36 +-
 src/stage1hr.c          | 3444 ++++++++++++++++++--------------------
 src/stage3hr.c          |   14 +-
 src/substring.c         |   95 +-
 src/types.h             |    9 +-
 src/uniqscan.c          |   56 +-
 src/univinterval.h      |    6 +-
 tests/Makefile.in       |    1 +
 util/Makefile.in        |    1 +
 49 files changed, 11504 insertions(+), 8172 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 62843c5..bcab131 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,122 @@
+2015-07-23  twu
+
+    * VERSION: Updated version number
+
+    * stage1hr.c: Removed an abort command from debugging
+
+    * sarray-read.c: Using new interface to Bytecoding lcp_next function. 
+      Commented out code that is not used when SUBDIVIDE_ENDS is not defined.
+
+    * bytecoding.c, bytecoding.h: Call to lcp_next now returns child_next
+
+    * VERSION: Updated version number
+
+    * dynprog_genome.c: Fixed boundaries that led to negative coordinates for
+      splice site candidates.
+
+    * stage1hr.c: Removed unused variables
+
+    * stage1hr.c: Removed allvalidp as parameter to align_end and align_pair.
+
+2015-07-22  twu
+
+    * stage1hr.c: Setting spanningsetp and completesetp to false if querylength
+      < min_kmer_readlength
+
+    * stage1hr.c: Removed restriction on min_readlength.  Running only suffix
+      array, if possible, if reads are too short.
+
+    * access.c: Changed user message
+
+    * sarray-write.c: Changing plcp[n] to be 0 instead of -1
+
+    * sarray-read.c: Improved debugging results
+
+    * access.c: Printing user message if shmem fails
+
+2015-07-17  twu
+
+    * get-genome.c, sequence.c, sequence.h: Added flags for --stream-chars and
+      --stream-ints
+
+2015-06-26  twu
+
+    * 2015-statgen, Makefile.gsnaptoo.am, algorithm.tex, discussion.tex,
+      features.tex, introduction.tex, trunk, util: Modified mergeinfo
+
+    * config.site.rescomp.tst: Updated version
+
+    * index.html: Updated for version 2015-06-23
+
+    * archive.html: Updated for version 2014-12-31
+
+    * README: Removed references to Goby
+
+    * access.c, bigendian.c, bigendian.h, bitpack64-access.c, bitpack64-read.c,
+      bitpack64-readtwo.c, bytecoding.c, compress.c, compress.h, genome-write.c,
+      genome.c, genome.h, genome128_hr.c, iit-read-univ.c, indexdb.c,
+      indexdb_hr.c, sarray-read.c, sarray-write.c, snpindex.c, src, types.h,
+      univinterval.h: Merged revisions 167282 through 168383 from
+      branches/2015-06-10-bigendian to support bigendian architectures
+
+    * Makefile.dna.am, Makefile.util.am: Added instructions for check-bigendian
+
+2015-06-24  twu
+
+    * VERSION, config.site.rescomp.tst: Updated version number
+
+    * algorithm.tex, biblio.bib, discussion.tex, features.tex, introduction.tex,
+      toplevel.tex: Final version
+
+    * stage1hr.c: Added comments
+
+    * gmap.c: Removed message about different batch levels
+
+    * gsnap.c: Added option --master-is-worker for MPI version
+
+    * access.c: Using malloc whenever shmget fails
+
+2015-06-15  twu
+
+    * stage1hr.c: Removed extra #endif statements
+
+    * 2015-statgen, Ambiguous-splicing.eps, Hierarchical-GMAP.eps,
+      Large-hash-table.eps, Makefile.gsnaptoo.am, Overlapping-alignment.eps,
+      VERSION, biblio.bib, config.site.rescomp.tst, toplevel.tex, trunk, util:
+      Updated version number
+
+    * stage1hr.c: Fixed indentation
+
+    * genome.c, genome128_hr.c, gmap.c, gsnap.c, indexdb.c, mode.h,
+      sarray-read.c, src, stage1hr.c, substring.c, uniqscan.c: Merged revisions
+      165630 through 167691 from branches/2015-05-13-ttoc to implement ttoc mode
+
+    * splice.c: Applied revision 167580 from releases/public-2014-12-17.  In
+      group_by_segmenti_aux and group_by_segmentj_aux, checking plusp for each
+      individual hit in deciding whether to group donor or acceptor.
+
+    * bitpack64-readtwo.c: Added debugging statements
+
+    * sarray-read.c: Defining a variable for debugging
+
+    * oligoindex_hr.c: Defining reverse_nt for machines without SSE4.1
+
 2015-06-11  twu
 
+    * stage3hr.c: Changed occurrences of Uintlist_next to Uint8list_next for
+      LARGE_GENOMES
+
+    * oligoindex_hr.c: Providing alternative to _mm_extract_epi32 for machines
+      without SSE4.1
+
+    * access.c, acinclude.m4, configure.ac, shm-flags.m4: Including check for
+      SHM_NORESERVE
+
+    * Makefile.gsnaptoo.am: Removed -lrt
+
+    * sarray-read.c: Initializing chromosome values to be those for chrnum 1 to
+      handle left == 0
+
     * VERSION, index.html: Updated version number
 
     * sarray-write.c: Removing rankfile
diff --git a/Makefile.in b/Makefile.in
index c92bdf8..651b3b2 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -51,6 +51,7 @@ am__aclocal_m4_deps = $(top_srcdir)/config/libtool.m4 \
 	$(top_srcdir)/config/mmap-flags.m4 \
 	$(top_srcdir)/config/acx_mmap_fixed.m4 \
 	$(top_srcdir)/config/acx_mmap_variable.m4 \
+	$(top_srcdir)/config/shm-flags.m4 \
 	$(top_srcdir)/config/ax_mpi.m4 \
 	$(top_srcdir)/config/acx_pthread.m4 \
 	$(top_srcdir)/config/builtin-popcount.m4 \
diff --git a/README b/README
index 9e7ee80..fac3a4c 100644
--- a/README
+++ b/README
@@ -89,15 +89,6 @@ However, to disable this feature, you can add "--disable-bzlib" to the
 "disable_bzlib".
 
 
-Note 6: GSNAP optionally supports the Goby input and output file
-formats.  To implement this functionality, you need to obtain and
-compile the libraries from http://campagnelab.org/software/goby.  If
-the resulting header files are located in /path/to/goby/include and
-the library files are in /path/to/goby/lib, you can then add the flag
-"--with-goby=/path/to/goby" to your ./configure command or edit your
-config.site file to have this directory as the value for "with_goby".
-
-
 2.  Possible issues during compilation
 ======================================
 
@@ -122,13 +113,13 @@ instructions work, but popcnt is so widely implemented that they
 generally do not cause any problems.)
 
 In that case, you may need to compile your program for the lowest
-common denominator by disabling SSE2 instructions by providing
---disable-sse4.1 or --disable-sse2 to ./configure as necessary.
-Alternatively, your computer cluster may have the ability to detect
-the capabilities of each computer when it receives a job.  Then, you
-may want to create different compiled versions of GMAP and GSNAP, and
-call the appropriate binary for that particular job.  You will have to
-work with your system administrator if you want to accomplish this.
+common denominator by providing --disable-avx, --disable-sse4.1, or
+--disable-sse2 to ./configure as necessary.  Alternatively, your
+computer cluster may have the ability to detect the capabilities of
+each computer when it receives a job.  Then, you may want to create
+different compiled versions of GMAP and GSNAP, and call the
+appropriate binary for that particular job.  You will have to work
+with your system administrator if you want to accomplish this.
 
 
 Compiler issue 2.  The most recent versions of GSNAP (starting with
@@ -288,7 +279,7 @@ all other chromosomes in numeric/alphabetical order.  If you don't
 want this sort, provide the "-s none" flag to gmap_build.  Other sort
 options besides "none" and "chrom" are "alpha" and "numeric-alpha".
 
-You can type "gmap_setup --help" to see the full set of options.  We
+You can type "gmap_build --help" to see the full set of options.  We
 discuss some specific situations below.
 
 
diff --git a/VERSION b/VERSION
index 00ee667..b93269f 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2015-06-10
\ No newline at end of file
+2015-07-23
\ No newline at end of file
diff --git a/acinclude.m4 b/acinclude.m4
index 3bbc4c4..602b9cd 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -7,6 +7,7 @@ m4_include([config/madvise-flags.m4])
 m4_include([config/mmap-flags.m4])
 m4_include([config/acx_mmap_fixed.m4])
 m4_include([config/acx_mmap_variable.m4])
+m4_include([config/shm-flags.m4])
 
 m4_include([config/ax_mpi.m4])
 m4_include([config/acx_pthread.m4])
diff --git a/config/shm-flags.m4 b/config/shm-flags.m4
new file mode 100644
index 0000000..04297b0
--- /dev/null
+++ b/config/shm-flags.m4
@@ -0,0 +1,18 @@
+
+AC_DEFUN([ACX_SHM_FLAGS], [
+AC_LANG_SAVE
+AC_LANG(C)
+
+AC_MSG_CHECKING(for SHM_NORESERVE in shmget)
+AC_COMPILE_IFELSE(
+  [AC_LANG_PROGRAM([[#include <sys/ipc.h>
+#include <sys/shm.h>]],
+                   [[int flags = SHM_NORESERVE;]])],
+  [AC_MSG_RESULT(yes)
+   AC_DEFINE([HAVE_SHM_NORESERVE],[1],[Define to 1 if SHM_NORESERVE available for shmget.])],
+  [AC_MSG_RESULT(no)])
+
+AC_LANG_RESTORE
+])
+
+
diff --git a/configure b/configure
index 9689771..9e9988e 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.63 for gmap 2015-06-10.
+# Generated by GNU Autoconf 2.63 for gmap 2015-07-23.
 #
 # Report bugs to <Thomas Wu <twu at gene.com>>.
 #
@@ -745,8 +745,8 @@ SHELL=${CONFIG_SHELL-/bin/sh}
 # Identity of this package.
 PACKAGE_NAME='gmap'
 PACKAGE_TARNAME='gmap'
-PACKAGE_VERSION='2015-06-10'
-PACKAGE_STRING='gmap 2015-06-10'
+PACKAGE_VERSION='2015-07-23'
+PACKAGE_STRING='gmap 2015-07-23'
 PACKAGE_BUGREPORT='Thomas Wu <twu at gene.com>'
 
 ac_unique_file="src/gmap.c"
@@ -1513,7 +1513,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures gmap 2015-06-10 to adapt to many kinds of systems.
+\`configure' configures gmap 2015-07-23 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1584,7 +1584,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of gmap 2015-06-10:";;
+     short | recursive ) echo "Configuration of gmap 2015-07-23:";;
    esac
   cat <<\_ACEOF
 
@@ -1721,7 +1721,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-gmap configure 2015-06-10
+gmap configure 2015-07-23
 generated by GNU Autoconf 2.63
 
 Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@@ -1735,7 +1735,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by gmap $as_me 2015-06-10, which was
+It was created by gmap $as_me 2015-07-23, which was
 generated by GNU Autoconf 2.63.  Invocation command line was
 
   $ $0 $@
@@ -2105,8 +2105,8 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
 { $as_echo "$as_me:$LINENO: checking package version" >&5
 $as_echo_n "checking package version... " >&6; }
-{ $as_echo "$as_me:$LINENO: result: 2015-06-10" >&5
-$as_echo "2015-06-10" >&6; }
+{ $as_echo "$as_me:$LINENO: result: 2015-07-23" >&5
+$as_echo "2015-07-23" >&6; }
 
 
 ### Read defaults
@@ -4172,7 +4172,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='gmap'
- VERSION='2015-06-10'
+ VERSION='2015-07-23'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -18793,6 +18793,75 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
 
 
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+{ $as_echo "$as_me:$LINENO: checking for SHM_NORESERVE in shmget" >&5
+$as_echo_n "checking for SHM_NORESERVE in shmget... " >&6; }
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <sys/ipc.h>
+#include <sys/shm.h>
+int
+main ()
+{
+int flags = SHM_NORESERVE;
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  { $as_echo "$as_me:$LINENO: result: yes" >&5
+$as_echo "yes" >&6; }
+
+cat >>confdefs.h <<\_ACEOF
+#define HAVE_SHM_NORESERVE 1
+_ACEOF
+
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	{ $as_echo "$as_me:$LINENO: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+
+
 
 
 
@@ -26522,7 +26591,7 @@ exec 6>&1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by gmap $as_me 2015-06-10, which was
+This file was extended by gmap $as_me 2015-07-23, which was
 generated by GNU Autoconf 2.63.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -26585,7 +26654,7 @@ Report bugs to <bug-autoconf at gnu.org>."
 _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_version="\\
-gmap config.status 2015-06-10
+gmap config.status 2015-07-23
 configured by $0, generated by GNU Autoconf 2.63,
   with options \\"`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
 
diff --git a/configure.ac b/configure.ac
index 44b328d..153c7df 100644
--- a/configure.ac
+++ b/configure.ac
@@ -298,6 +298,7 @@ fi
 
 ACX_MMAP_FLAGS
 ACX_MADVISE_FLAGS
+ACX_SHM_FLAGS
 
 AC_CHECK_FUNCS([ceil floor index log madvise memcpy memmove memset munmap pow rint stat64 strtoul sysconf sysctl sigaction \
                 shmget shmctl shmat shmdt semget semctl semop])
diff --git a/mpi/Makefile.in b/mpi/Makefile.in
index 941269f..eaafa03 100644
--- a/mpi/Makefile.in
+++ b/mpi/Makefile.in
@@ -49,6 +49,7 @@ am__aclocal_m4_deps = $(top_srcdir)/config/libtool.m4 \
 	$(top_srcdir)/config/mmap-flags.m4 \
 	$(top_srcdir)/config/acx_mmap_fixed.m4 \
 	$(top_srcdir)/config/acx_mmap_variable.m4 \
+	$(top_srcdir)/config/shm-flags.m4 \
 	$(top_srcdir)/config/ax_mpi.m4 \
 	$(top_srcdir)/config/acx_pthread.m4 \
 	$(top_srcdir)/config/builtin-popcount.m4 \
diff --git a/src/Makefile.am b/src/Makefile.am
index 88b6c45..836fbff 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -150,12 +150,12 @@ GSNAP_FILES = fopen.h bool.h types.h separator.h comp.h \
 
 
 # Note: dist_ commands get read by bootstrap, and don't follow the flags
-# -lrt is needed for shm_open
+# Previously included -lrt for shm_open, but we are not calling that
 
 gsnap_CC = $(PTHREAD_CC)
 gsnap_CFLAGS = $(AM_CFLAGS) $(PTHREAD_CFLAGS) $(POPCNT_CFLAGS) $(SIMD_CFLAGS) -DTARGET=\"$(target)\" -DGMAPDB=\"$(GMAPDB)\" -DMAX_READLENGTH=$(MAX_READLENGTH) -DGSNAP=1
 gsnap_LDFLAGS = $(AM_LDFLAGS) $(STATIC_LDFLAG)
-gsnap_LDADD = $(PTHREAD_LIBS) $(ZLIB_LIBS) $(BZLIB_LIBS) -lrt
+gsnap_LDADD = $(PTHREAD_LIBS) $(ZLIB_LIBS) $(BZLIB_LIBS)
 
 dist_gsnap_SOURCES = $(GSNAP_FILES)
 
diff --git a/src/Makefile.in b/src/Makefile.in
index 55726b0..708e7f3 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -54,6 +54,7 @@ am__aclocal_m4_deps = $(top_srcdir)/config/libtool.m4 \
 	$(top_srcdir)/config/mmap-flags.m4 \
 	$(top_srcdir)/config/acx_mmap_fixed.m4 \
 	$(top_srcdir)/config/acx_mmap_variable.m4 \
+	$(top_srcdir)/config/shm-flags.m4 \
 	$(top_srcdir)/config/ax_mpi.m4 \
 	$(top_srcdir)/config/acx_pthread.m4 \
 	$(top_srcdir)/config/builtin-popcount.m4 \
@@ -879,11 +880,11 @@ GSNAP_FILES = fopen.h bool.h types.h separator.h comp.h \
 
 
 # Note: dist_ commands get read by bootstrap, and don't follow the flags
-# -lrt is needed for shm_open
+# Previously included -lrt for shm_open, but we are not calling that
 gsnap_CC = $(PTHREAD_CC)
 gsnap_CFLAGS = $(AM_CFLAGS) $(PTHREAD_CFLAGS) $(POPCNT_CFLAGS) $(SIMD_CFLAGS) -DTARGET=\"$(target)\" -DGMAPDB=\"$(GMAPDB)\" -DMAX_READLENGTH=$(MAX_READLENGTH) -DGSNAP=1
 gsnap_LDFLAGS = $(AM_LDFLAGS) $(STATIC_LDFLAG)
-gsnap_LDADD = $(PTHREAD_LIBS) $(ZLIB_LIBS) $(BZLIB_LIBS) -lrt
+gsnap_LDADD = $(PTHREAD_LIBS) $(ZLIB_LIBS) $(BZLIB_LIBS)
 dist_gsnap_SOURCES = $(GSNAP_FILES)
 GSNAPL_FILES = fopen.h bool.h types.h separator.h comp.h \
  except.c except.h assert.c assert.h mem.c mem.h \
diff --git a/src/access.c b/src/access.c
index 9be6d68..e819902 100644
--- a/src/access.c
+++ b/src/access.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: access.c 165967 2015-05-20 00:15:27Z twu $";
+static char rcsid[] = "$Id: access.c 170327 2015-07-22 17:50:11Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -592,7 +592,11 @@ shmem_attach (int *shmid, char *filename, off_t filesize, size_t eltsize) {
      others wait.  They will be woken up when the semaphore is
      removed. */
 
-  if ((*shmid = shmget(key,filesize,IPC_CREAT | IPC_EXCL | SHM_NORESERVE | 0666)) != -1) {
+  if ((*shmid = shmget(key,filesize,IPC_CREAT | IPC_EXCL |
+#ifdef HAVE_SHM_NORESERVE
+		       SHM_NORESERVE | 
+#endif
+		       0666)) != -1) {
     /* Created new shared memory */
     if ((memory = shmat(*shmid,NULL,0)) == (void *) -1) {
       fprintf(stderr,"Error with shmat.  Error %d: %s\n",errno,strerror(errno));
@@ -614,8 +618,8 @@ shmem_attach (int *shmid, char *filename, off_t filesize, size_t eltsize) {
     }
     
   } else {
-    fprintf(stderr,"Error with shmget.  Error %d: %s\n",errno,strerror(errno));
-    abort();
+    fprintf(stderr,"Using malloc instead of shmget for file %s\n",filename);
+    memory = (void *) NULL;
   }
 
   /* The process that proceeded removes the semaphore here, allowing
@@ -677,7 +681,13 @@ Access_allocate (int *shmid, size_t *len, double *seconds, char *filename, size_
     MPI_Win_allocate_shared(*len,/*disp_unit*/1,MPI_INFO_NULL,comm,&memory,&win);
     MPI_Win_free(&win);
 #else
-    memory = shmem_attach(&(*shmid),filename,/*filesize*/*len,eltsize);
+    if ((memory = shmem_attach(&(*shmid),filename,/*filesize*/*len,eltsize)) == NULL) {
+      fprintf(stderr,"shm_attach not working on file %s, so using malloc instead on %lu bytes\n",
+	      filename,*len);
+      *shmid = 0;
+      memory = (void *) MALLOC(*len);
+      copy_memory_from_file(memory,filename,/*filesize*/*len,eltsize);
+    }
 #endif
   } else {
     *shmid = 0;
diff --git a/src/bigendian.c b/src/bigendian.c
index 886de05..f010f94 100644
--- a/src/bigendian.c
+++ b/src/bigendian.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: bigendian.c 99737 2013-06-27 19:33:03Z twu $";
+static char rcsid[] = "$Id: bigendian.c 168395 2015-06-26 17:13:13Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -7,22 +7,21 @@ static char rcsid[] = "$Id: bigendian.c 99737 2013-06-27 19:33:03Z twu $";
 #include <unistd.h>		/* For read() */
 
 
-/*************************************************************************
- *  OUTPUT_BIGENDIAN provided to test bigendian code on a littleendian
- *  machine.  To use, compile all programs with WORDS_BIGENDIAN defined
- *  in config.h and define OUTPUT_BIGENDIAN here.
- ************************************************************************/
+/* Same as Littleendian_write_char */
+void
+Bigendian_write_char (unsigned char value, int fd) {
+  unsigned char buf[1];
+
+  buf[0] = value;
+  write(fd,buf,1);
+
+  return;
+}
 
 /************************************************************************
  *   Int
  ************************************************************************/
 
-#ifdef OUTPUT_BIGENDIAN
-int
-Bigendian_convert_int (int littleendian) {
-  return littleendian;
-}
-#else
 int
 Bigendian_convert_int (int littleendian) {
   int bigendian;
@@ -37,10 +36,8 @@ Bigendian_convert_int (int littleendian) {
 
   return bigendian;
 }
-#endif
 
 
-#ifdef OUTPUT_BIGENDIAN
 size_t
 Bigendian_fwrite_int (int value, FILE *fp) {
   unsigned char buf[4];
@@ -56,25 +53,8 @@ Bigendian_fwrite_int (int value, FILE *fp) {
     return 1;
   }
 }
-#else
-size_t
-Bigendian_fwrite_int (int value, FILE *fp) {
-  unsigned char buf[4];
 
-  buf[0] = (unsigned char) (value & 0xff);
-  buf[1] = (unsigned char) ((value >>= 8) & 0xff);
-  buf[2] = (unsigned char) ((value >>= 8) & 0xff);
-  buf[3] = (unsigned char) ((value >>= 8) & 0xff);
-  if (fwrite(buf,sizeof(unsigned char),4,fp) == 0) {
-    /* Should set error indicator for stream and set errno */
-    return 0;
-  } else {
-    return 1;
-  }
-}
-#endif
 
-#ifdef OUTPUT_BIGENDIAN
 size_t
 Bigendian_fwrite_ints (int *array, int n, FILE *fp) {
   unsigned char buf[4];
@@ -93,28 +73,8 @@ Bigendian_fwrite_ints (int *array, int n, FILE *fp) {
   }
   return n;
 }
-#else
-size_t
-Bigendian_fwrite_ints (int *array, int n, FILE *fp) {
-  unsigned char buf[4];
-  int value, i;
 
-  for (i = 0; i < n; i++) {
-    value = array[i];
-    buf[0] = (unsigned char) (value & 0xff);
-    buf[1] = (unsigned char) ((value >>= 8) & 0xff);
-    buf[2] = (unsigned char) ((value >>= 8) & 0xff);
-    buf[3] = (unsigned char) ((value >>= 8) & 0xff);
-    if (fwrite(buf,sizeof(unsigned char),4,fp) == 0) {
-      /* Should set error indicator for stream and set errno */
-      return 0;
-    }
-  }
-  return n;
-}
-#endif
 
-#ifdef OUTPUT_BIGENDIAN
 size_t
 Bigendian_fread_int (int *value, FILE *fp) {
   unsigned char buf[4];
@@ -123,87 +83,49 @@ Bigendian_fread_int (int *value, FILE *fp) {
     /* Should set error indicator for stream and set errno */
     return 0;
   } else {
-    *value = (buf[0] & 0xff);
+#if 0
+    *value = buf[0];
     *value <<= 8;
-    *value |= (buf[1] & 0xff);
+    *value |= buf[1];
     *value <<= 8;
-    *value |= (buf[2] & 0xff);
+    *value |= buf[2];
     *value <<= 8;
-    *value |= (buf[3] & 0xff);
-    return 1;
-  }
-}
+    *value |= buf[3];
 #else
-size_t
-Bigendian_fread_int (int *value, FILE *fp) {
-  unsigned char buf[4];
-
-  if (fread(buf,sizeof(unsigned char),4,fp) < 4) {
-    /* Should set error indicator for stream and set errno */
-    return 0;
-  } else {
-#if 0
-    fprintf(stderr,"Reading %2X %2X %2X %2X, and using last as most sig\n",buf[0],buf[1],buf[2],buf[3]);
+    *value = ((int) buf[0] << 24) | ((int) buf[1] << 16) | ((int) buf[2] << 8) | (int) buf[3];
 #endif
-    *value = (buf[3] & 0xff);
-    *value <<= 8;
-    *value |= (buf[2] & 0xff);
-    *value <<= 8;
-    *value |= (buf[1] & 0xff);
-    *value <<= 8;
-    *value |= (buf[0] & 0xff);
     return 1;
   }
 }
-#endif
 
-#ifdef OUTPUT_BIGENDIAN
-size_t
-Bigendian_fread_ints (int *array, int n, FILE *fp) {
-  unsigned char buf[4];
-  int value, i;
 
-  for (i = 0; i < n; i++) {
-    if (fread(buf,sizeof(unsigned char),4,fp) < 4) {
-      /* Should set error indicator for stream and set errno */
-      return 0;
-    } else {
-      value = (buf[0] & 0xff);
-      value <<= 8;
-      value |= (buf[1] & 0xff);
-      value <<= 8;
-      value |= (buf[2] & 0xff);
-      value <<= 8;
-      value |= (buf[3] & 0xff);
-      array[i] = value;
-    }
-  }
-  return n;
-}
-#else
 size_t
 Bigendian_fread_ints (int *array, int n, FILE *fp) {
   unsigned char buf[4];
-  int value, i;
+  /* int value; */
+  int i;
 
   for (i = 0; i < n; i++) {
     if (fread(buf,sizeof(unsigned char),4,fp) < 4) {
       /* Should set error indicator for stream and set errno */
       return 0;
     } else {
-      value = (buf[3] & 0xff);
+#if 0
+      value = buf[0];
       value <<= 8;
-      value |= (buf[2] & 0xff);
+      value |= buf[1];
       value <<= 8;
-      value |= (buf[1] & 0xff);
+      value |= buf[2];
       value <<= 8;
-      value |= (buf[0] & 0xff);
+      value |= buf[3];
       array[i] = value;
+#else
+      array[i] = ((int) buf[0] << 24) | ((int) buf[1] << 16) | ((int) buf[2] << 8) | (int) buf[3];
+#endif
     }
   }
   return n;
 }
-#endif
 
 
 /************************************************************************
@@ -226,7 +148,6 @@ Bigendian_convert_uint (unsigned int littleendian) {
 }
 
 
-#ifdef OUTPUT_BIGENDIAN
 size_t
 Bigendian_fwrite_uint (unsigned int value, FILE *fp) {
   unsigned char buf[4];
@@ -242,26 +163,8 @@ Bigendian_fwrite_uint (unsigned int value, FILE *fp) {
     return 1;
   }
 }
-#else
-size_t
-Bigendian_fwrite_uint (unsigned int value, FILE *fp) {
-  unsigned char buf[4];
 
-  buf[0] = (unsigned char) (value & 0xff);
-  buf[1] = (unsigned char) ((value >>= 8) & 0xff);
-  buf[2] = (unsigned char) ((value >>= 8) & 0xff);
-  buf[3] = (unsigned char) ((value >>= 8) & 0xff);
-  if (fwrite(buf,sizeof(unsigned char),4,fp) == 0) {
-    /* Should set error indicator for stream and set errno */
-    return 0;
-  } else {
-    return 1;
-  }
-}
-#endif
 
-
-#ifdef OUTPUT_BIGENDIAN
 void
 Bigendian_write_uint (unsigned int value, int fd) {
   unsigned char buf[4];
@@ -273,20 +176,8 @@ Bigendian_write_uint (unsigned int value, int fd) {
   write(fd,buf,4);
   return;
 }
-#else
-void
-Bigendian_write_uint (unsigned int value, int fd) {
-  unsigned char buf[4];
 
-  buf[0] = (unsigned char) (value & 0xff);
-  buf[1] = (unsigned char) ((value >>= 8) & 0xff);
-  buf[2] = (unsigned char) ((value >>= 8) & 0xff);
-  buf[3] = (unsigned char) ((value >>= 8) & 0xff);
-  write(fd,buf,4);
-}
-#endif
 
-#ifdef OUTPUT_BIGENDIAN
 size_t
 Bigendian_fwrite_uints (unsigned int *array, int n, FILE *fp) {
   unsigned char buf[4];
@@ -306,30 +197,8 @@ Bigendian_fwrite_uints (unsigned int *array, int n, FILE *fp) {
   }
   return n;
 }
-#else
-size_t
-Bigendian_fwrite_uints (unsigned int *array, int n, FILE *fp) {
-  unsigned char buf[4];
-  unsigned int value;
-  int i;
-  
-  for (i = 0; i < n; i++) {
-    value = array[i];
-    buf[0] = (unsigned char) (value & 0xff);
-    buf[1] = (unsigned char) ((value >>= 8) & 0xff);
-    buf[2] = (unsigned char) ((value >>= 8) & 0xff);
-    buf[3] = (unsigned char) ((value >>= 8) & 0xff);
-    if (fwrite(buf,sizeof(unsigned char),4,fp) == 0) {
-      /* Should set error indicator for stream and set errno */
-      return 0;
-    }
-  }
-  return n;
-}
-#endif
 
 
-#ifdef OUTPUT_BIGENDIAN
 size_t
 Bigendian_fread_uint (unsigned int *value, FILE *fp) {
   unsigned char buf[4];
@@ -338,66 +207,26 @@ Bigendian_fread_uint (unsigned int *value, FILE *fp) {
     /* Should set error indicator for stream and set errno */
     return 0;
   } else {
-    *value = (buf[0] & 0xff);
+#if 0
+    *value = buf[0];
     *value <<= 8;
-    *value |= (buf[1] & 0xff);
+    *value |= buf[1];
     *value <<= 8;
-    *value |= (buf[2] & 0xff);
+    *value |= buf[2];
     *value <<= 8;
-    *value |= (buf[3] & 0xff);
-    return 1;
-  }
-}
+    *value |= buf[3];
 #else
-size_t
-Bigendian_fread_uint (unsigned int *value, FILE *fp) {
-  unsigned char buf[4];
-
-  if (fread(buf,sizeof(unsigned char),4,fp) < 4) {
-    /* Should set error indicator for stream and set errno */
-    return 0;
-  } else {
-    *value = (buf[3] & 0xff);
-    *value <<= 8;
-    *value |= (buf[2] & 0xff);
-    *value <<= 8;
-    *value |= (buf[1] & 0xff);
-    *value <<= 8;
-    *value |= (buf[0] & 0xff);
+    *value = ((unsigned int) buf[0] << 24) | ((unsigned int) buf[1] << 16) | ((unsigned int) buf[2] << 8) | (unsigned int) buf[3];
+#endif
     return 1;
   }
 }
-#endif
 
-#ifdef OUTPUT_BIGENDIAN
-size_t
-Bigendian_fread_uints (unsigned int *array, int n, FILE *fp) {
-  unsigned char buf[4];
-  unsigned int value;
-  int i;
 
-  for (i = 0; i < n; i++) {
-    if (fread(buf,sizeof(unsigned char),4,fp) < 4) {
-      /* Should set error indicator for stream and set errno */
-      return 0;
-    } else {
-      value = (buf[0] & 0xff);
-      value <<= 8;
-      value |= (buf[1] & 0xff);
-      value <<= 8;
-      value |= (buf[2] & 0xff);
-      value <<= 8;
-      value |= (buf[3] & 0xff);
-      array[i] = value;
-    }
-  }
-  return n;
-}
-#else
 size_t
 Bigendian_fread_uints (unsigned int *array, int n, FILE *fp) {
   unsigned char buf[4];
-  unsigned int value;
+  /* unsigned int value; */
   int i;
 
   for (i = 0; i < n; i++) {
@@ -405,54 +234,43 @@ Bigendian_fread_uints (unsigned int *array, int n, FILE *fp) {
       /* Should set error indicator for stream and set errno */
       return 0;
     } else {
-      value = (buf[3] & 0xff);
+#if 0
+      value = buf[0];
       value <<= 8;
-      value |= (buf[2] & 0xff);
+      value |= buf[1];
       value <<= 8;
-      value |= (buf[1] & 0xff);
+      value |= buf[2];
       value <<= 8;
-      value |= (buf[0] & 0xff);
+      value |= buf[3];
       array[i] = value;
+#else
+      array[i] = ((unsigned int) buf[0] << 24) | ((unsigned int) buf[1] << 16) | ((unsigned int) buf[2] << 8) | (unsigned int) buf[3];
+#endif
     }
   }
   return n;
 }
-#endif
 
 
-#ifdef OUTPUT_BIGENDIAN
 unsigned int
 Bigendian_fileio_read_uint (int fd) {
   unsigned int value = 0U;
   unsigned char buf[4];
 
   read(fd,buf,4);
-  value = (buf[0] & 0xff);
+#if 0
+  value = buf[0];
   value <<= 8;
-  value |= (buf[1] & 0xff);
+  value |= buf[1];
   value <<= 8;
-  value |= (buf[2] & 0xff);
+  value |= buf[2];
   value <<= 8;
-  value |= (buf[3] & 0xff);
-  return value;
-}
+  value |= buf[3];
 #else
-unsigned int
-Bigendian_fileio_read_uint (int fd) {
-  unsigned int value = 0U;
-  unsigned char buf[4];
-
-  read(fd,buf,4);
-  value = (buf[3] & 0xff);
-  value <<= 8;
-  value |= (buf[2] & 0xff);
-  value <<= 8;
-  value |= (buf[1] & 0xff);
-  value <<= 8;
-  value |= (buf[0] & 0xff);
+  value = ((unsigned int) buf[0] << 24) | ((unsigned int) buf[1] << 16) | ((unsigned int) buf[2] << 8) | (unsigned int) buf[3];
+#endif
   return value;
 }
-#endif
 
 
 /************************************************************************
@@ -495,7 +313,6 @@ Bigendian_convert_uint8 (UINT8 littleendian) {
 }
 
 
-#ifdef OUTPUT_BIGENDIAN
 void
 Bigendian_write_uint8 (UINT8 value, int fd) {
   unsigned char buf[8];
@@ -511,26 +328,8 @@ Bigendian_write_uint8 (UINT8 value, int fd) {
   write(fd,buf,8);
   return;
 }
-#else
-void
-Bigendian_write_uint8 (UINT8 value, int fd) {
-  unsigned char buf[8];
-
-  buf[0] = (unsigned char) (value & 0xff);
-  buf[1] = (unsigned char) ((value >>= 8) & 0xff);
-  buf[2] = (unsigned char) ((value >>= 8) & 0xff);
-  buf[3] = (unsigned char) ((value >>= 8) & 0xff);
-  buf[4] = (unsigned char) ((value >>= 8) & 0xff);
-  buf[5] = (unsigned char) ((value >>= 8) & 0xff);
-  buf[6] = (unsigned char) ((value >>= 8) & 0xff);
-  buf[7] = (unsigned char) ((value >>= 8) & 0xff);
-  write(fd,buf,8);
-}
-#endif
 
 
-
-#ifdef OUTPUT_BIGENDIAN
 size_t
 Bigendian_fwrite_uint8 (UINT8 value, FILE *fp) {
   unsigned char buf[8];
@@ -550,29 +349,8 @@ Bigendian_fwrite_uint8 (UINT8 value, FILE *fp) {
     return 1;
   }
 }
-#else
-size_t
-Bigendian_fwrite_uint8 (UINT8 value, FILE *fp) {
-  unsigned char buf[8];
 
-  buf[0] = value & 0xff;
-  buf[1] = (value >>= 8) & 0xff;
-  buf[2] = (value >>= 8) & 0xff;
-  buf[3] = (value >>= 8) & 0xff;
-  buf[4] = (value >>= 8) & 0xff;
-  buf[5] = (value >>= 8) & 0xff;
-  buf[6] = (value >>= 8) & 0xff;
-  buf[7] = (value >>= 8) & 0xff;
-  if (fwrite(buf,sizeof(unsigned char),8,fp) == 0) {
-    /* Should set error indicator for stream and set errno */
-    return 0;
-  } else {
-    return 1;
-  }
-}
-#endif
 
-#ifdef OUTPUT_BIGENDIAN
 size_t
 Bigendian_fwrite_uint8s (UINT8 *array, int n, FILE *fp) {
   unsigned char buf[8];
@@ -596,34 +374,8 @@ Bigendian_fwrite_uint8s (UINT8 *array, int n, FILE *fp) {
   }
   return n;
 }
-#else
-size_t
-Bigendian_fwrite_uint8s (UINT8 *array, int n, FILE *fp) {
-  unsigned char buf[8];
-  UINT8 value;
-  int i;
-  
-  for (i = 0; i < n; i++) {
-    value = array[i];
-    buf[0] = value & 0xff;
-    buf[1] = (value >>= 8) & 0xff;
-    buf[2] = (value >>= 8) & 0xff;
-    buf[3] = (value >>= 8) & 0xff;
-    buf[4] = (value >>= 8) & 0xff;
-    buf[5] = (value >>= 8) & 0xff;
-    buf[6] = (value >>= 8) & 0xff;
-    buf[7] = (value >>= 8) & 0xff;
-    if (fwrite(buf,sizeof(unsigned char),8,fp) == 0) {
-      /* Should set error indicator for stream and set errno */
-      return 0;
-    }
-  }
-  return n;
-}
-#endif
 
 
-#ifdef OUTPUT_BIGENDIAN
 size_t
 Bigendian_fread_uint8 (UINT8 *value, FILE *fp) {
   unsigned char buf[8];
@@ -632,55 +384,26 @@ Bigendian_fread_uint8 (UINT8 *value, FILE *fp) {
     /* Should set error indicator for stream and set errno */
     return 0;
   } else {
-    *value = (buf[0] & 0xff);
+    *value = (UINT8) buf[0];
     *value <<= 8;
-    *value |= (buf[1] & 0xff);
+    *value |= (UINT8) buf[1];
     *value <<= 8;
-    *value |= (buf[2] & 0xff);
+    *value |= (UINT8) buf[2];
     *value <<= 8;
-    *value |= (buf[3] & 0xff);
+    *value |= (UINT8) buf[3];
     *value <<= 8;
-    *value |= (buf[4] & 0xff);
+    *value |= (UINT8) buf[4];
     *value <<= 8;
-    *value |= (buf[5] & 0xff);
+    *value |= (UINT8) buf[5];
     *value <<= 8;
-    *value |= (buf[6] & 0xff);
+    *value |= (UINT8) buf[6];
     *value <<= 8;
-    *value |= (buf[7] & 0xff);
+    *value |= (UINT8) buf[7];
     return 1;
   }
 }
-#else
-size_t
-Bigendian_fread_uint8 (UINT8 *value, FILE *fp) {
-  unsigned char buf[8];
-
-  if (fread(buf,sizeof(unsigned char),8,fp) < 8) {
-    /* Should set error indicator for stream and set errno */
-    return 0;
-  } else {
-    *value = (buf[7] & 0xff);
-    *value <<= 8;
-    *value = (buf[6] & 0xff);
-    *value <<= 8;
-    *value = (buf[5] & 0xff);
-    *value <<= 8;
-    *value = (buf[4] & 0xff);
-    *value <<= 8;
-    *value = (buf[3] & 0xff);
-    *value <<= 8;
-    *value |= (buf[2] & 0xff);
-    *value <<= 8;
-    *value |= (buf[1] & 0xff);
-    *value <<= 8;
-    *value |= (buf[0] & 0xff);
-    return 1;
-  }
-}
-#endif
 
 
-#ifdef OUTPUT_BIGENDIAN
 size_t
 Bigendian_fread_uint8s (UINT8 *array, int n, FILE *fp) {
   unsigned char buf[8];
@@ -692,112 +415,97 @@ Bigendian_fread_uint8s (UINT8 *array, int n, FILE *fp) {
       /* Should set error indicator for stream and set errno */
       return 0;
     } else {
-      value = (buf[0] & 0xff);
+      value = (UINT8) buf[0];
       value <<= 8;
-      value |= (buf[1] & 0xff);
+      value |= (UINT8) buf[1];
       value <<= 8;
-      value |= (buf[2] & 0xff);
+      value |= (UINT8) buf[2];
       value <<= 8;
-      value |= (buf[3] & 0xff);
+      value |= (UINT8) buf[3];
       value <<= 8;
-      value |= (buf[4] & 0xff);
+      value |= (UINT8) buf[4];
       value <<= 8;
-      value |= (buf[5] & 0xff);
+      value |= (UINT8) buf[5];
       value <<= 8;
-      value |= (buf[6] & 0xff);
+      value |= (UINT8) buf[6];
       value <<= 8;
-      value |= (buf[7] & 0xff);
-
+      value |= (UINT8) buf[7];
       array[i] = value;
     }
   }
   return n;
 }
-#else
-size_t
-Bigendian_fread_uint8s (UINT8 *array, int n, FILE *fp) {
-  unsigned char buf[8];
-  UINT8 value;
-  int i;
 
-  for (i = 0; i < n; i++) {
-    if (fread(buf,sizeof(unsigned char),8,fp) < 8) {
-      /* Should set error indicator for stream and set errno */
-      return 0;
-    } else {
-      value = (buf[7] & 0xff);
-      value <<= 8;
-      value = (buf[6] & 0xff);
-      value <<= 8;
-      value = (buf[5] & 0xff);
-      value <<= 8;
-      value = (buf[4] & 0xff);
-      value <<= 8;
-      value = (buf[3] & 0xff);
-      value <<= 8;
-      value |= (buf[2] & 0xff);
-      value <<= 8;
-      value |= (buf[1] & 0xff);
-      value <<= 8;
-      value |= (buf[0] & 0xff);
-      array[i] = value;
-    }
-  }
-  return n;
-}
-#endif
 
-
-#ifdef OUTPUT_BIGENDIAN
 UINT8
 Bigendian_fileio_read_uint8 (int fd) {
   UINT8 value = 0LU;
   unsigned char buf[8];
 
   read(fd,buf,8);
-  value = (buf[0] & 0xff);
+  value = (UINT8) buf[0];
   value <<= 8;
-  value |= (buf[1] & 0xff);
+  value |= (UINT8) buf[1];
   value <<= 8;
-  value |= (buf[2] & 0xff);
+  value |= (UINT8) buf[2];
   value <<= 8;
-  value |= (buf[3] & 0xff);
+  value |= (UINT8) buf[3];
   value <<= 8;
-  value |= (buf[4] & 0xff);
+  value |= (UINT8) buf[4];
   value <<= 8;
-  value |= (buf[5] & 0xff);
+  value |= (UINT8) buf[5];
   value <<= 8;
-  value |= (buf[6] & 0xff);
+  value |= (UINT8) buf[6];
   value <<= 8;
-  value |= (buf[7] & 0xff);
+  value |= (UINT8) buf[7];
   return value;
 }
-#else
-UINT8
-Bigendian_fileio_read_uint8 (int fd) {
-  UINT8 value = 0LU;
-  unsigned char buf[8];
 
-  read(fd,buf,8);
-  value = (buf[7] & 0xff);
-  value <<= 8;
-  value = (buf[6] & 0xff);
-  value <<= 8;
-  value = (buf[5] & 0xff);
-  value <<= 8;
-  value = (buf[4] & 0xff);
-  value <<= 8;
-  value = (buf[3] & 0xff);
-  value <<= 8;
-  value |= (buf[2] & 0xff);
-  value <<= 8;
-  value |= (buf[1] & 0xff);
-  value <<= 8;
-  value |= (buf[0] & 0xff);
-  return value;
+#endif /* HAVE_64_BIT */
+
+
+/************************************************************************
+ *   Double
+ ************************************************************************/
+
+/* Writes value to fp with its bytes in reverse of their in-memory
+   order.  Returns sizeof(double)/4 on success (NOTE(review): presumably
+   a count of 4-byte words -- confirm against callers), or 0 if fewer
+   than sizeof(double) bytes could be written. */
+size_t
+Bigendian_fwrite_double (double value, FILE *fp) {
+  unsigned char buf[sizeof(double)], *ptr = (unsigned char *) &value;
+  size_t i, j;
+
+  i = 0;
+  j = sizeof(double);
+  while (i < sizeof(double)) {
+    buf[i++] = ptr[--j];
+  }
+
+  /* fwrite returns the number of bytes written; a short count is an error */
+  if (fwrite(buf,sizeof(unsigned char),sizeof(double),fp) < sizeof(double)) {
+    return 0;
+  } else {
+    return sizeof(double)/4;
+  }
+}
-#endif
 
 
-#endif /* HAVE_64_BIT */
+/* Returns value with the order of its bytes reversed. */
+double
+Bigendian_convert_double (double value) {
+  unsigned char *ptr = (unsigned char *) &value, temp;
+  size_t i = 0, j = sizeof(double);
+
+  /* Stop at the midpoint: continuing to sizeof(double) would swap each
+     pair a second time and hand back the value unchanged. */
+  while (i < j) {
+    temp = ptr[--j];
+    ptr[j] = ptr[i];
+    ptr[i++] = temp;
+  }
+
+  return value;
+}
 
diff --git a/src/bigendian.h b/src/bigendian.h
index 7c0528f..46f15e3 100644
--- a/src/bigendian.h
+++ b/src/bigendian.h
@@ -1,4 +1,4 @@
-/* $Id: bigendian.h 157223 2015-01-22 18:43:01Z twu $ */
+/* $Id: bigendian.h 168395 2015-06-26 17:13:13Z twu $ */
 #ifndef BIGENDIAN_INCLUDED
 #define BIGENDIAN_INCLUDED
 #ifdef HAVE_CONFIG_H
@@ -9,6 +9,11 @@
 #include <stddef.h>
 #include "types.h"
 
+extern void
+Bigendian_write_char (unsigned char value, int fd);
+
+
+
 extern int
 Bigendian_convert_int (int littleendian);
 extern size_t
@@ -49,9 +54,27 @@ extern size_t
 Bigendian_fread_uint8s (UINT8 *array, int n, FILE *fp);
 extern UINT8
 Bigendian_fileio_read_uint8 (int fd);
+
+#ifdef UTILITYP
+#define Bigendian_convert_univcoord Bigendian_convert_uint8
+#elif defined(LARGE_GENOMES)
+#define Bigendian_convert_univcoord Bigendian_convert_uint8
+#else
+#define Bigendian_convert_univcoord Bigendian_convert_uint
 #endif
 
+#else
+#define Bigendian_convert_univcoord Bigendian_convert_uint
+#endif
+
+
+extern double
+Bigendian_convert_double (double value);
+extern size_t
+Bigendian_fwrite_double (double value, FILE *fp);
+
 
+#define FREAD_CHAR(p,fp) fread(p,sizeof(unsigned char),1,fp)
 #define FREAD_INT(p,fp) Bigendian_fread_int(p,fp)
 #define FREAD_UINT(p,fp) Bigendian_fread_uint(p,fp)
 #define FREAD_INTS(a,n,fp) Bigendian_fread_ints(a,n,fp)
@@ -61,8 +84,11 @@ Bigendian_fileio_read_uint8 (int fd);
 #define FREAD_UINT8S(a,n,fp) Bigendian_fread_uint8s(a,n,fp)
 #endif
 
+#define FWRITE_CHAR(x,fp) fwrite(&(x),sizeof(unsigned char),1,fp)
 #define FWRITE_INT(x,fp) Bigendian_fwrite_int(x,fp)
 #define FWRITE_UINT(x,fp) Bigendian_fwrite_uint(x,fp)
+#define FWRITE_DOUBLE(x,fp) Bigendian_fwrite_double(x,fp)
+#define WRITE_CHAR(x,fd) Bigendian_write_char(x,fd)
 #define WRITE_UINT(x,fd) Bigendian_write_uint(x,fd)
 #define WRITE_UINT8(x,fd) Bigendian_write_uin8t(x,fd)
 #define FWRITE_INTS(a,n,fp) Bigendian_fwrite_ints(a,n,fp)
diff --git a/src/bitpack64-access.c b/src/bitpack64-access.c
index c4621dd..d69cc43 100644
--- a/src/bitpack64-access.c
+++ b/src/bitpack64-access.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: bitpack64-access.c 132144 2014-04-02 16:02:28Z twu $";
+static char rcsid[] = "$Id: bitpack64-access.c 168395 2015-06-26 17:13:13Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -8,6 +8,14 @@ static char rcsid[] = "$Id: bitpack64-access.c 132144 2014-04-02 16:02:28Z twu $
 #include <stdio.h>
 #include <stdlib.h>
 
+/* CONVERT(x): x passed through Bigendian_convert_uint if WORDS_BIGENDIAN, else x itself */
+#ifdef WORDS_BIGENDIAN
+#include "bigendian.h"
+#define CONVERT(x) Bigendian_convert_uint(x)
+#else
+#define CONVERT(x) (x)
+#endif
+
 #ifdef DEBUG
 #define debug(x) x
 #else
@@ -35,232 +43,232 @@ access_00 (const UINT4 *in) {
 
 static UINT4
 access_02_00 (const UINT4 *in) {
-  return ( (*in) >>  0  )   % (1U << 2 ) ;
+  return ( CONVERT(*in) >>  0  )   % (1U << 2 ) ;
 }
 
 static UINT4
 access_02_01 (const UINT4 *in) {
-  return ( (*in) >>  2  )   % (1U << 2 ) ;
+  return ( CONVERT(*in) >>  2  )   % (1U << 2 ) ;
 }
 
 static UINT4
 access_02_02 (const UINT4 *in) {
-  return ( (*in) >>  4  )   % (1U << 2 ) ;
+  return ( CONVERT(*in) >>  4  )   % (1U << 2 ) ;
 }
 
 static UINT4
 access_02_03 (const UINT4 *in) {
-  return ( (*in) >>  6  )   % (1U << 2 ) ;
+  return ( CONVERT(*in) >>  6  )   % (1U << 2 ) ;
 }
 
 static UINT4
 access_02_04 (const UINT4 *in) {
-  return ( (*in) >>  8  )   % (1U << 2 ) ;
+  return ( CONVERT(*in) >>  8  )   % (1U << 2 ) ;
 }
 
 static UINT4
 access_02_05 (const UINT4 *in) {
-  return ( (*in) >>  10  )   % (1U << 2 ) ;
+  return ( CONVERT(*in) >>  10  )   % (1U << 2 ) ;
 }
 
 static UINT4
 access_02_06 (const UINT4 *in) {
-  return ( (*in) >>  12  )   % (1U << 2 ) ;
+  return ( CONVERT(*in) >>  12  )   % (1U << 2 ) ;
 }
 
 static UINT4
 access_02_07 (const UINT4 *in) {
-  return ( (*in) >>  14  )   % (1U << 2 ) ;
+  return ( CONVERT(*in) >>  14  )   % (1U << 2 ) ;
 }
 
 static UINT4
 access_02_08 (const UINT4 *in) {
-  return ( (*in) >>  16  )   % (1U << 2 ) ;
+  return ( CONVERT(*in) >>  16  )   % (1U << 2 ) ;
 }
 
 static UINT4
 access_02_09 (const UINT4 *in) {
-  return ( (*in) >>  18  )   % (1U << 2 ) ;
+  return ( CONVERT(*in) >>  18  )   % (1U << 2 ) ;
 }
 
 static UINT4
 access_02_10 (const UINT4 *in) {
-  return ( (*in) >>  20  )   % (1U << 2 ) ;
+  return ( CONVERT(*in) >>  20  )   % (1U << 2 ) ;
 }
 
 static UINT4
 access_02_11 (const UINT4 *in) {
-  return ( (*in) >>  22  )   % (1U << 2 ) ;
+  return ( CONVERT(*in) >>  22  )   % (1U << 2 ) ;
 }
 
 static UINT4
 access_02_12 (const UINT4 *in) {
-  return ( (*in) >>  24  )   % (1U << 2 ) ;
+  return ( CONVERT(*in) >>  24  )   % (1U << 2 ) ;
 }
 
 static UINT4
 access_02_13 (const UINT4 *in) {
-  return ( (*in) >>  26  )   % (1U << 2 ) ;
+  return ( CONVERT(*in) >>  26  )   % (1U << 2 ) ;
 }
 
 static UINT4
 access_02_14 (const UINT4 *in) {
-  return ( (*in) >>  28  )   % (1U << 2 ) ;
+  return ( CONVERT(*in) >>  28  )   % (1U << 2 ) ;
 }
 
 static UINT4
 access_02_15 (const UINT4 *in) {
-  return ( (*in) >>  30  )   % (1U << 2 ) ;
+  return ( CONVERT(*in) >>  30  )   % (1U << 2 ) ;
 }
 
 
 
 static UINT4
 access_04_00 (const UINT4 *in) {
-  return ( (*in) >> 0 )   % (1U << 4 ) ;
+  return ( CONVERT(*in) >> 0 )   % (1U << 4 ) ;
 }
 
 static UINT4
 access_04_01 (const UINT4 *in) {
-  return ( (*in) >> 4 )   % (1U << 4 ) ;
+  return ( CONVERT(*in) >> 4 )   % (1U << 4 ) ;
 }
 
 static UINT4
 access_04_02 (const UINT4 *in) {
-  return ( (*in) >> 8 )   % (1U << 4 ) ;
+  return ( CONVERT(*in) >> 8 )   % (1U << 4 ) ;
 }
 
 static UINT4
 access_04_03 (const UINT4 *in) {
-  return ( (*in) >> 12 )   % (1U << 4 ) ;
+  return ( CONVERT(*in) >> 12 )   % (1U << 4 ) ;
 }
 
 static UINT4
 access_04_04 (const UINT4 *in) {
-  return ( (*in) >> 16 )   % (1U << 4 ) ;
+  return ( CONVERT(*in) >> 16 )   % (1U << 4 ) ;
 }
 
 static UINT4
 access_04_05 (const UINT4 *in) {
-  return ( (*in) >> 20 )   % (1U << 4 ) ;
+  return ( CONVERT(*in) >> 20 )   % (1U << 4 ) ;
 }
 
 static UINT4
 access_04_06 (const UINT4 *in) {
-  return ( (*in) >> 24 )   % (1U << 4 ) ;
+  return ( CONVERT(*in) >> 24 )   % (1U << 4 ) ;
 }
 
 static UINT4
 access_04_07 (const UINT4 *in) {
-  return ( (*in) >> 28 )   % (1U << 4 ) ;
+  return ( CONVERT(*in) >> 28 )   % (1U << 4 ) ;
 }
 
 static UINT4
 access_04_08 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >> 0 )   % (1U << 4 ) ;
+  return ( CONVERT(*in) >> 0 )   % (1U << 4 ) ;
 }
 
 static UINT4
 access_04_09 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >> 4 )   % (1U << 4 ) ;
+  return ( CONVERT(*in) >> 4 )   % (1U << 4 ) ;
 }
 
 static UINT4
 access_04_10 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >> 8 )   % (1U << 4 ) ;
+  return ( CONVERT(*in) >> 8 )   % (1U << 4 ) ;
 }
 
 static UINT4
 access_04_11 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >> 12 )   % (1U << 4 ) ;
+  return ( CONVERT(*in) >> 12 )   % (1U << 4 ) ;
 }
 
 static UINT4
 access_04_12 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >> 16 )   % (1U << 4 ) ;
+  return ( CONVERT(*in) >> 16 )   % (1U << 4 ) ;
 }
 
 static UINT4
 access_04_13 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >> 20 )   % (1U << 4 ) ;
+  return ( CONVERT(*in) >> 20 )   % (1U << 4 ) ;
 }
 
 static UINT4
 access_04_14 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >> 24 )   % (1U << 4 ) ;
+  return ( CONVERT(*in) >> 24 )   % (1U << 4 ) ;
 }
 
 static UINT4
 access_04_15 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >> 28 )   % (1U << 4 ) ;
+  return ( CONVERT(*in) >> 28 )   % (1U << 4 ) ;
 }
 
 
 static UINT4
 access_06_00 (const UINT4 *in) {
-  return ( (*in) >>  0  )   % (1U << 6 ) ;
+  return ( CONVERT(*in) >>  0  )   % (1U << 6 ) ;
 }
 
 static UINT4
 access_06_01 (const UINT4 *in) {
-  return ( (*in) >>  6  )   % (1U << 6 ) ;
+  return ( CONVERT(*in) >>  6  )   % (1U << 6 ) ;
 }
 
 static UINT4
 access_06_02 (const UINT4 *in) {
-  return ( (*in) >>  12  )   % (1U << 6 ) ;
+  return ( CONVERT(*in) >>  12  )   % (1U << 6 ) ;
 }
 
 static UINT4
 access_06_03 (const UINT4 *in) {
-  return ( (*in) >>  18  )   % (1U << 6 ) ;
+  return ( CONVERT(*in) >>  18  )   % (1U << 6 ) ;
 }
 
 static UINT4
 access_06_04 (const UINT4 *in) {
-  return ( (*in) >>  24  )   % (1U << 6 ) ;
+  return ( CONVERT(*in) >>  24  )   % (1U << 6 ) ;
 }
 
 static UINT4
 access_06_05 (const UINT4 *in) {
   UINT4 out;
 
-  out = ( (*in) >>  30  )   % (1U << 6 ) ;
+  out = ( CONVERT(*in) >>  30  )   % (1U << 6 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 4 ))<<( 6 - 4 );
+  out |= (CONVERT(*in) % (1U<< 4 ))<<( 6 - 4 );
   return out;
 }
 
 static UINT4
 access_06_06 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >>  4  )   % (1U << 6 ) ;
+  return ( CONVERT(*in) >>  4  )   % (1U << 6 ) ;
 }
 
 static UINT4
 access_06_07 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >>  10  )   % (1U << 6 ) ;
+  return ( CONVERT(*in) >>  10  )   % (1U << 6 ) ;
 }
 
 static UINT4
 access_06_08 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >>  16  )   % (1U << 6 ) ;
+  return ( CONVERT(*in) >>  16  )   % (1U << 6 ) ;
 }
 
 static UINT4
 access_06_09 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >>  22  )   % (1U << 6 ) ;
+  return ( CONVERT(*in) >>  22  )   % (1U << 6 ) ;
 }
 
 static UINT4
@@ -268,171 +276,171 @@ access_06_10 (const UINT4 *in) {
   UINT4 out;
 
   in += 1 * WORD_INCR;
-  out = ( (*in) >>  28  )   % (1U << 6 ) ;
+  out = ( CONVERT(*in) >>  28  )   % (1U << 6 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 2 ))<<( 6 - 2 );
+  out |= (CONVERT(*in) % (1U<< 2 ))<<( 6 - 2 );
   return out;
 }
 
 static UINT4
 access_06_11 (const UINT4 *in) {
   in += 2 * WORD_INCR;
-  return ( (*in) >>  2  )   % (1U << 6 ) ;
+  return ( CONVERT(*in) >>  2  )   % (1U << 6 ) ;
 }
 
 static UINT4
 access_06_12 (const UINT4 *in) {
   in += 2 * WORD_INCR;
-  return ( (*in) >>  8  )   % (1U << 6 ) ;
+  return ( CONVERT(*in) >>  8  )   % (1U << 6 ) ;
 }
 
 static UINT4
 access_06_13 (const UINT4 *in) {
   in += 2 * WORD_INCR;
-  return ( (*in) >>  14  )   % (1U << 6 ) ;
+  return ( CONVERT(*in) >>  14  )   % (1U << 6 ) ;
 }
 
 static UINT4
 access_06_14 (const UINT4 *in) {
   in += 2 * WORD_INCR;
-  return ( (*in) >>  20  )   % (1U << 6 ) ;
+  return ( CONVERT(*in) >>  20  )   % (1U << 6 ) ;
 }
 
 static UINT4
 access_06_15 (const UINT4 *in) {
   in += 2 * WORD_INCR;
-  return ( (*in) >>  26  )   % (1U << 6 ) ;
+  return ( CONVERT(*in) >>  26  )   % (1U << 6 ) ;
 }
 
 
 static UINT4
 access_08_00 (const UINT4 *in) {
-  return ( (*in) >> 0 )   % (1U << 8 ) ;
+  return ( CONVERT(*in) >> 0 )   % (1U << 8 ) ;
 }
 
 static UINT4
 access_08_01 (const UINT4 *in) {
-  return ( (*in) >> 8 )   % (1U << 8 ) ;
+  return ( CONVERT(*in) >> 8 )   % (1U << 8 ) ;
 }
 
 static UINT4
 access_08_02 (const UINT4 *in) {
-  return ( (*in) >> 16 )   % (1U << 8 ) ;
+  return ( CONVERT(*in) >> 16 )   % (1U << 8 ) ;
 }
 
 static UINT4
 access_08_03 (const UINT4 *in) {
-  return ( (*in) >> 24 )   % (1U << 8 ) ;
+  return ( CONVERT(*in) >> 24 )   % (1U << 8 ) ;
 }
 
 static UINT4
 access_08_04 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >> 0 )   % (1U << 8 ) ;
+  return ( CONVERT(*in) >> 0 )   % (1U << 8 ) ;
 }
 
 static UINT4
 access_08_05 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >> 8 )   % (1U << 8 ) ;
+  return ( CONVERT(*in) >> 8 )   % (1U << 8 ) ;
 }
 
 static UINT4
 access_08_06 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >> 16 )   % (1U << 8 ) ;
+  return ( CONVERT(*in) >> 16 )   % (1U << 8 ) ;
 }
 
 static UINT4
 access_08_07 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >> 24 )   % (1U << 8 ) ;
+  return ( CONVERT(*in) >> 24 )   % (1U << 8 ) ;
 }
 
 static UINT4
 access_08_08 (const UINT4 *in) {
   in += 2 * WORD_INCR;
-  return ( (*in) >> 0 )   % (1U << 8 ) ;
+  return ( CONVERT(*in) >> 0 )   % (1U << 8 ) ;
 }
 
 static UINT4
 access_08_09 (const UINT4 *in) {
   in += 2 * WORD_INCR;
-  return ( (*in) >> 8 )   % (1U << 8 ) ;
+  return ( CONVERT(*in) >> 8 )   % (1U << 8 ) ;
 }
 
 static UINT4
 access_08_10 (const UINT4 *in) {
   in += 2 * WORD_INCR;
-  return ( (*in) >> 16 )   % (1U << 8 ) ;
+  return ( CONVERT(*in) >> 16 )   % (1U << 8 ) ;
 }
 
 static UINT4
 access_08_11 (const UINT4 *in) {
   in += 2 * WORD_INCR;
-  return ( (*in) >> 24 )   % (1U << 8 ) ;
+  return ( CONVERT(*in) >> 24 )   % (1U << 8 ) ;
 }
 
 static UINT4
 access_08_12 (const UINT4 *in) {
   in += 3 * WORD_INCR;
-  return ( (*in) >> 0 )   % (1U << 8 ) ;
+  return ( CONVERT(*in) >> 0 )   % (1U << 8 ) ;
 }
 
 static UINT4
 access_08_13 (const UINT4 *in) {
   in += 3 * WORD_INCR;
-  return ( (*in) >> 8 )   % (1U << 8 ) ;
+  return ( CONVERT(*in) >> 8 )   % (1U << 8 ) ;
 }
 
 static UINT4
 access_08_14 (const UINT4 *in) {
   in += 3 * WORD_INCR;
-  return ( (*in) >> 16 )   % (1U << 8 ) ;
+  return ( CONVERT(*in) >> 16 )   % (1U << 8 ) ;
 }
 
 static UINT4
 access_08_15 (const UINT4 *in) {
   in += 3 * WORD_INCR;
-  return ( (*in) >> 24 )   % (1U << 8 ) ;
+  return ( CONVERT(*in) >> 24 )   % (1U << 8 ) ;
 }
 
 
 static UINT4
 access_10_00 (const UINT4 *in) {
-  return ( (*in) >>  0  )   % (1U << 10 ) ;
+  return ( CONVERT(*in) >>  0  )   % (1U << 10 ) ;
 }
 
 static UINT4
 access_10_01 (const UINT4 *in) {
-  return ( (*in) >>  10  )   % (1U << 10 ) ;
+  return ( CONVERT(*in) >>  10  )   % (1U << 10 ) ;
 }
 
 static UINT4
 access_10_02 (const UINT4 *in) {
-  return ( (*in) >>  20  )   % (1U << 10 ) ;
+  return ( CONVERT(*in) >>  20  )   % (1U << 10 ) ;
 }
 
 static UINT4
 access_10_03 (const UINT4 *in) {
   UINT4 out;
 
-  out = ( (*in) >>  30  )   % (1U << 10 ) ;
+  out = ( CONVERT(*in) >>  30  )   % (1U << 10 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 8 ))<<( 10 - 8 );
+  out |= (CONVERT(*in) % (1U<< 8 ))<<( 10 - 8 );
   return out;
 }
 
 static UINT4
 access_10_04 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >>  8  )   % (1U << 10 ) ;
+  return ( CONVERT(*in) >>  8  )   % (1U << 10 ) ;
 }
 
 static UINT4
 access_10_05 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >>  18  )   % (1U << 10 ) ;
+  return ( CONVERT(*in) >>  18  )   % (1U << 10 ) ;
 }
 
 static UINT4
@@ -440,22 +448,22 @@ access_10_06 (const UINT4 *in) {
   UINT4 out;
   
   in += 1 * WORD_INCR;
-  out = ( (*in) >>  28  )   % (1U << 10 ) ;
+  out = ( CONVERT(*in) >>  28  )   % (1U << 10 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 6 ))<<( 10 - 6 );
+  out |= (CONVERT(*in) % (1U<< 6 ))<<( 10 - 6 );
   return out;
 }
 
 static UINT4
 access_10_07 (const UINT4 *in) {
   in += 2 * WORD_INCR;
-  return ( (*in) >>  6  )   % (1U << 10 ) ;
+  return ( CONVERT(*in) >>  6  )   % (1U << 10 ) ;
 }
 
 static UINT4
 access_10_08 (const UINT4 *in) {
   in += 2 * WORD_INCR;
-  return ( (*in) >>  16  )   % (1U << 10 ) ;
+  return ( CONVERT(*in) >>  16  )   % (1U << 10 ) ;
 }
 
 static UINT4
@@ -463,22 +471,22 @@ access_10_09 (const UINT4 *in) {
   UINT4 out;
 
   in += 2 * WORD_INCR;
-  out = ( (*in) >>  26  )   % (1U << 10 ) ;
+  out = ( CONVERT(*in) >>  26  )   % (1U << 10 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 4 ))<<( 10 - 4 );
+  out |= (CONVERT(*in) % (1U<< 4 ))<<( 10 - 4 );
   return out;
 }
 
 static UINT4
 access_10_10 (const UINT4 *in) {
   in += 3 * WORD_INCR;
-  return ( (*in) >>  4  )   % (1U << 10 ) ;
+  return ( CONVERT(*in) >>  4  )   % (1U << 10 ) ;
 }
 
 static UINT4
 access_10_11 (const UINT4 *in) {
   in += 3 * WORD_INCR;
-  return ( (*in) >>  14  )   % (1U << 10 ) ;
+  return ( CONVERT(*in) >>  14  )   % (1U << 10 ) ;
 }
 
 static UINT4
@@ -486,61 +494,61 @@ access_10_12 (const UINT4 *in) {
   UINT4 out;
 
   in += 3 * WORD_INCR;
-  out = ( (*in) >>  24  )   % (1U << 10 ) ;
+  out = ( CONVERT(*in) >>  24  )   % (1U << 10 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 2 ))<<( 10 - 2 );
+  out |= (CONVERT(*in) % (1U<< 2 ))<<( 10 - 2 );
   return out;
 }
 
 static UINT4
 access_10_13 (const UINT4 *in) {
   in += 4 * WORD_INCR;
-  return ( (*in) >>  2  )   % (1U << 10 ) ;
+  return ( CONVERT(*in) >>  2  )   % (1U << 10 ) ;
 }
 
 static UINT4
 access_10_14 (const UINT4 *in) {
   in += 4 * WORD_INCR;
-  return ( (*in) >>  12  )   % (1U << 10 ) ;
+  return ( CONVERT(*in) >>  12  )   % (1U << 10 ) ;
 }
 
 static UINT4
 access_10_15 (const UINT4 *in) {
   in += 4 * WORD_INCR;
-  return ( (*in) >>  22  )   % (1U << 10 ) ;
+  return ( CONVERT(*in) >>  22  )   % (1U << 10 ) ;
 }
 
 
 static UINT4
 access_12_00 (const UINT4 *in) {
-    return ( (*in) >>  0  )   % (1U << 12 ) ;
+    return ( CONVERT(*in) >>  0  )   % (1U << 12 ) ;
 }
 
 static UINT4
 access_12_01 (const UINT4 *in) {
-  return ( (*in) >>  12  )   % (1U << 12 ) ;
+  return ( CONVERT(*in) >>  12  )   % (1U << 12 ) ;
 }
 
 static UINT4
 access_12_02 (const UINT4 *in) {
   UINT4 out;
 
-  out = ( (*in) >>  24  )   % (1U << 12 ) ;
+  out = ( CONVERT(*in) >>  24  )   % (1U << 12 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 4 ))<<( 12 - 4 );
+  out |= (CONVERT(*in) % (1U<< 4 ))<<( 12 - 4 );
   return out;
 }
 
 static UINT4
 access_12_03 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >>  4  )   % (1U << 12 ) ;
+  return ( CONVERT(*in) >>  4  )   % (1U << 12 ) ;
 }
 
 static UINT4
 access_12_04 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >>  16  )   % (1U << 12 ) ;
+  return ( CONVERT(*in) >>  16  )   % (1U << 12 ) ;
 }
 
 static UINT4
@@ -548,34 +556,34 @@ access_12_05 (const UINT4 *in) {
   UINT4 out;
 
   in += 1 * WORD_INCR;
-  out = ( (*in) >>  28  )   % (1U << 12 ) ;
+  out = ( CONVERT(*in) >>  28  )   % (1U << 12 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 8 ))<<( 12 - 8 );
+  out |= (CONVERT(*in) % (1U<< 8 ))<<( 12 - 8 );
   return out;
 }
 
 static UINT4
 access_12_06 (const UINT4 *in) {
   in += 2 * WORD_INCR;
-  return ( (*in) >>  8  )   % (1U << 12 ) ;
+  return ( CONVERT(*in) >>  8  )   % (1U << 12 ) ;
 }
 
 static UINT4
 access_12_07 (const UINT4 *in) {
   in += 2 * WORD_INCR;
-  return ( (*in) >>  20  )   % (1U << 12 ) ;
+  return ( CONVERT(*in) >>  20  )   % (1U << 12 ) ;
 }
 
 static UINT4
 access_12_08 (const UINT4 *in) {
   in += 3 * WORD_INCR;
-  return ( (*in) >>  0  )   % (1U << 12 ) ;
+  return ( CONVERT(*in) >>  0  )   % (1U << 12 ) ;
 }
 
 static UINT4
 access_12_09 (const UINT4 *in) {
   in += 3 * WORD_INCR;
-  return ( (*in) >>  12  )   % (1U << 12 ) ;
+  return ( CONVERT(*in) >>  12  )   % (1U << 12 ) ;
 }
 
 static UINT4
@@ -583,22 +591,22 @@ access_12_10 (const UINT4 *in) {
   UINT4 out;
 
   in += 3 * WORD_INCR;
-  out = ( (*in) >>  24  )   % (1U << 12 ) ;
+  out = ( CONVERT(*in) >>  24  )   % (1U << 12 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 4 ))<<( 12 - 4 );
+  out |= (CONVERT(*in) % (1U<< 4 ))<<( 12 - 4 );
   return out;
 }
 
 static UINT4
 access_12_11 (const UINT4 *in) {
   in += 4 * WORD_INCR;
-  return ( (*in) >>  4  )   % (1U << 12 ) ;
+  return ( CONVERT(*in) >>  4  )   % (1U << 12 ) ;
 }
 
 static UINT4
 access_12_12 (const UINT4 *in) {
   in += 4 * WORD_INCR;
-  return ( (*in) >>  16  )   % (1U << 12 ) ;
+  return ( CONVERT(*in) >>  16  )   % (1U << 12 ) ;
 }
 
 static UINT4
@@ -606,49 +614,49 @@ access_12_13 (const UINT4 *in) {
   UINT4 out;
 
   in += 4 * WORD_INCR;
-  out = ( (*in) >>  28  )   % (1U << 12 ) ;
+  out = ( CONVERT(*in) >>  28  )   % (1U << 12 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 8 ))<<( 12 - 8 );
+  out |= (CONVERT(*in) % (1U<< 8 ))<<( 12 - 8 );
   return out;
 }
 
 static UINT4
 access_12_14 (const UINT4 *in) {
   in += 5 * WORD_INCR;
-  return ( (*in) >>  8  )   % (1U << 12 ) ;
+  return ( CONVERT(*in) >>  8  )   % (1U << 12 ) ;
 }
 
 static UINT4
 access_12_15 (const UINT4 *in) {
   in += 5 * WORD_INCR;
-  return ( (*in) >>  20  )   % (1U << 12 ) ;
+  return ( CONVERT(*in) >>  20  )   % (1U << 12 ) ;
 }
 
 
 static UINT4
 access_14_00 (const UINT4 *in) {
-  return ( (*in) >>  0  )   % (1U << 14 ) ;
+  return ( CONVERT(*in) >>  0  )   % (1U << 14 ) ;
 }
 
 static UINT4
 access_14_01 (const UINT4 *in) {
-  return ( (*in) >>  14  )   % (1U << 14 ) ;
+  return ( CONVERT(*in) >>  14  )   % (1U << 14 ) ;
 }
 
 static UINT4
 access_14_02 (const UINT4 *in) {
   UINT4 out;
 
-  out = ( (*in) >>  28  )   % (1U << 14 ) ;
+  out = ( CONVERT(*in) >>  28  )   % (1U << 14 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 10 ))<<( 14 - 10 );
+  out |= (CONVERT(*in) % (1U<< 10 ))<<( 14 - 10 );
   return out;
 }
 
 static UINT4
 access_14_03 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >>  10  )   % (1U << 14 ) ;
+  return ( CONVERT(*in) >>  10  )   % (1U << 14 ) ;
 }
 
 static UINT4
@@ -656,16 +664,16 @@ access_14_04 (const UINT4 *in) {
   UINT4 out;
 
   in += 1 * WORD_INCR;
-  out = ( (*in) >>  24  )   % (1U << 14 ) ;
+  out = ( CONVERT(*in) >>  24  )   % (1U << 14 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 6 ))<<( 14 - 6 );
+  out |= (CONVERT(*in) % (1U<< 6 ))<<( 14 - 6 );
   return out;
 }
 
 static UINT4
 access_14_05 (const UINT4 *in) {
   in += 2 * WORD_INCR;
-  return ( (*in) >>  6  )   % (1U << 14 ) ;
+  return ( CONVERT(*in) >>  6  )   % (1U << 14 ) ;
 }
 
 static UINT4
@@ -673,22 +681,22 @@ access_14_06 (const UINT4 *in) {
   UINT4 out;
   in += 2 * WORD_INCR;
 
-  out = ( (*in) >>  20  )   % (1U << 14 ) ;
+  out = ( CONVERT(*in) >>  20  )   % (1U << 14 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 2 ))<<( 14 - 2 );
+  out |= (CONVERT(*in) % (1U<< 2 ))<<( 14 - 2 );
   return out;
 }
 
 static UINT4
 access_14_07 (const UINT4 *in) {
   in += 3 * WORD_INCR;
-  return ( (*in) >>  2  )   % (1U << 14 ) ;
+  return ( CONVERT(*in) >>  2  )   % (1U << 14 ) ;
 }
 
 static UINT4
 access_14_08 (const UINT4 *in) {
   in += 3 * WORD_INCR;
-  return ( (*in) >>  16  )   % (1U << 14 ) ;
+  return ( CONVERT(*in) >>  16  )   % (1U << 14 ) ;
 }
 
 static UINT4
@@ -696,16 +704,16 @@ access_14_09 (const UINT4 *in) {
   UINT4 out;
 
   in += 3 * WORD_INCR;
-  out = ( (*in) >>  30  )   % (1U << 14 ) ;
+  out = ( CONVERT(*in) >>  30  )   % (1U << 14 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 12 ))<<( 14 - 12 );
+  out |= (CONVERT(*in) % (1U<< 12 ))<<( 14 - 12 );
   return out;
 }
 
 static UINT4
 access_14_10 (const UINT4 *in) {
   in += 4 * WORD_INCR;
-  return ( (*in) >>  12  )   % (1U << 14 ) ;
+  return ( CONVERT(*in) >>  12  )   % (1U << 14 ) ;
 }
 
 static UINT4
@@ -713,16 +721,16 @@ access_14_11 (const UINT4 *in) {
   UINT4 out;
 
   in += 4 * WORD_INCR;
-  out = ( (*in) >>  26  )   % (1U << 14 ) ;
+  out = ( CONVERT(*in) >>  26  )   % (1U << 14 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 8 ))<<( 14 - 8 );
+  out |= (CONVERT(*in) % (1U<< 8 ))<<( 14 - 8 );
   return out;
 }
 
 static UINT4
 access_14_12 (const UINT4 *in) {
   in += 5 * WORD_INCR;
-  return ( (*in) >>  8  )   % (1U << 14 ) ;
+  return ( CONVERT(*in) >>  8  )   % (1U << 14 ) ;
 }
 
 static UINT4
@@ -730,139 +738,139 @@ access_14_13 (const UINT4 *in) {
   UINT4 out;
 
   in += 5 * WORD_INCR;
-  out = ( (*in) >>  22  )   % (1U << 14 ) ;
+  out = ( CONVERT(*in) >>  22  )   % (1U << 14 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 4 ))<<( 14 - 4 );
+  out |= (CONVERT(*in) % (1U<< 4 ))<<( 14 - 4 );
   return out;
 }
 
 static UINT4
 access_14_14 (const UINT4 *in) {
   in += 6 * WORD_INCR;
-  return ( (*in) >>  4  )   % (1U << 14 ) ;
+  return ( CONVERT(*in) >>  4  )   % (1U << 14 ) ;
 }
 
 static UINT4
 access_14_15 (const UINT4 *in) {
   in += 6 * WORD_INCR;
-  return ( (*in) >>  18  )   % (1U << 14 ) ;
+  return ( CONVERT(*in) >>  18  )   % (1U << 14 ) ;
 }
 
 
 static UINT4
 access_16_00 (const UINT4 *in) {
-  return ( (*in) >> 0 )   % (1U << 16 ) ;
+  return ( CONVERT(*in) >> 0 )   % (1U << 16 ) ;
 }
 
 static UINT4
 access_16_01 (const UINT4 *in) {
-  return ( (*in) >> 16 )   % (1U << 16 ) ;
+  return ( CONVERT(*in) >> 16 )   % (1U << 16 ) ;
 }
 
 static UINT4
 access_16_02 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >> 0 )   % (1U << 16 ) ;
+  return ( CONVERT(*in) >> 0 )   % (1U << 16 ) ;
 }
 
 static UINT4
 access_16_03 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >> 16 )   % (1U << 16 ) ;
+  return ( CONVERT(*in) >> 16 )   % (1U << 16 ) ;
 }
 
 static UINT4
 access_16_04 (const UINT4 *in) {
   in += 2 * WORD_INCR;
-  return ( (*in) >> 0 )   % (1U << 16 ) ;
+  return ( CONVERT(*in) >> 0 )   % (1U << 16 ) ;
 }
 
 static UINT4
 access_16_05 (const UINT4 *in) {
   in += 2 * WORD_INCR;
-  return ( (*in) >> 16 )   % (1U << 16 ) ;
+  return ( CONVERT(*in) >> 16 )   % (1U << 16 ) ;
 }
 
 static UINT4
 access_16_06 (const UINT4 *in) {
   in += 3 * WORD_INCR;
-  return ( (*in) >> 0 )   % (1U << 16 ) ;
+  return ( CONVERT(*in) >> 0 )   % (1U << 16 ) ;
 }
 
 static UINT4
 access_16_07 (const UINT4 *in) {
   in += 3 * WORD_INCR;
-  return ( (*in) >> 16 )   % (1U << 16 ) ;
+  return ( CONVERT(*in) >> 16 )   % (1U << 16 ) ;
 }
 
 static UINT4
 access_16_08 (const UINT4 *in) {
   in += 4 * WORD_INCR;
-  return ( (*in) >> 0 )   % (1U << 16 ) ;
+  return ( CONVERT(*in) >> 0 )   % (1U << 16 ) ;
 }
 
 static UINT4
 access_16_09 (const UINT4 *in) {
   in += 4 * WORD_INCR;
-  return ( (*in) >> 16 )   % (1U << 16 ) ;
+  return ( CONVERT(*in) >> 16 )   % (1U << 16 ) ;
 }
 
 static UINT4
 access_16_10 (const UINT4 *in) {
   in += 5 * WORD_INCR;
-  return ( (*in) >> 0 )   % (1U << 16 ) ;
+  return ( CONVERT(*in) >> 0 )   % (1U << 16 ) ;
 }
 
 static UINT4
 access_16_11 (const UINT4 *in) {
   in += 5 * WORD_INCR;
-  return ( (*in) >> 16 )   % (1U << 16 ) ;
+  return ( CONVERT(*in) >> 16 )   % (1U << 16 ) ;
 }
 
 static UINT4
 access_16_12 (const UINT4 *in) {
   in += 6 * WORD_INCR;
-  return ( (*in) >> 0 )   % (1U << 16 ) ;
+  return ( CONVERT(*in) >> 0 )   % (1U << 16 ) ;
 }
 
 static UINT4
 access_16_13 (const UINT4 *in) {
   in += 6 * WORD_INCR;
-  return ( (*in) >> 16 )   % (1U << 16 ) ;
+  return ( CONVERT(*in) >> 16 )   % (1U << 16 ) ;
 }
 
 static UINT4
 access_16_14 (const UINT4 *in) {
   in += 7 * WORD_INCR;
-  return ( (*in) >> 0 )   % (1U << 16 ) ;
+  return ( CONVERT(*in) >> 0 )   % (1U << 16 ) ;
 }
 
 static UINT4
 access_16_15 (const UINT4 *in) {
   in += 7 * WORD_INCR;
-  return ( (*in) >> 16 )   % (1U << 16 ) ;
+  return ( CONVERT(*in) >> 16 )   % (1U << 16 ) ;
 }
 
 
 static UINT4
 access_18_00 (const UINT4 *in) {
-  return ( (*in) >>  0  )   % (1U << 18 ) ;
+  return ( CONVERT(*in) >>  0  )   % (1U << 18 ) ;
 }
 
 static UINT4
 access_18_01 (const UINT4 *in) {
   UINT4 out;
 
-  out = ( (*in) >>  18  )   % (1U << 18 ) ;
+  out = ( CONVERT(*in) >>  18  )   % (1U << 18 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 4 ))<<( 18 - 4 );
+  out |= (CONVERT(*in) % (1U<< 4 ))<<( 18 - 4 );
   return out;
 }
 
 static UINT4
 access_18_02 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >>  4  )   % (1U << 18 ) ;
+  return ( CONVERT(*in) >>  4  )   % (1U << 18 ) ;
 }
 
 static UINT4
@@ -870,16 +878,16 @@ access_18_03 (const UINT4 *in) {
   UINT4 out;
 
   in += 1 * WORD_INCR;
-  out = ( (*in) >>  22  )   % (1U << 18 ) ;
+  out = ( CONVERT(*in) >>  22  )   % (1U << 18 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 8 ))<<( 18 - 8 );
+  out |= (CONVERT(*in) % (1U<< 8 ))<<( 18 - 8 );
   return out;
 }
 
 static UINT4
 access_18_04 (const UINT4 *in) {
   in += 2 * WORD_INCR;
-  return ( (*in) >>  8  )   % (1U << 18 ) ;
+  return ( CONVERT(*in) >>  8  )   % (1U << 18 ) ;
 }
 
 static UINT4
@@ -887,16 +895,16 @@ access_18_05 (const UINT4 *in) {
   UINT4 out;
 
   in += 2 * WORD_INCR;
-  out = ( (*in) >>  26  )   % (1U << 18 ) ;
+  out = ( CONVERT(*in) >>  26  )   % (1U << 18 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 12 ))<<( 18 - 12 );
+  out |= (CONVERT(*in) % (1U<< 12 ))<<( 18 - 12 );
   return out;
 }
 
 static UINT4
 access_18_06 (const UINT4 *in) {
   in += 3 * WORD_INCR;
-  return ( (*in) >>  12  )   % (1U << 18 ) ;
+  return ( CONVERT(*in) >>  12  )   % (1U << 18 ) ;
 }
 
 static UINT4
@@ -904,9 +912,9 @@ access_18_07 (const UINT4 *in) {
   UINT4 out;
 
   in += 3 * WORD_INCR;
-  out = ( (*in) >>  30  )   % (1U << 18 ) ;
+  out = ( CONVERT(*in) >>  30  )   % (1U << 18 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 16 ))<<( 18 - 16 );
+  out |= (CONVERT(*in) % (1U<< 16 ))<<( 18 - 16 );
   return out;
 }
 
@@ -915,16 +923,16 @@ access_18_08 (const UINT4 *in) {
   UINT4 out;
 
   in += 4 * WORD_INCR;
-  out = ( (*in) >>  16  )   % (1U << 18 ) ;
+  out = ( CONVERT(*in) >>  16  )   % (1U << 18 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 2 ))<<( 18 - 2 );
+  out |= (CONVERT(*in) % (1U<< 2 ))<<( 18 - 2 );
   return out;
 }
 
 static UINT4
 access_18_09 (const UINT4 *in) {
   in += 5 * WORD_INCR;
-  return ( (*in) >>  2  )   % (1U << 18 ) ;
+  return ( CONVERT(*in) >>  2  )   % (1U << 18 ) ;
 }
 
 static UINT4
@@ -932,16 +940,16 @@ access_18_10 (const UINT4 *in) {
   UINT4 out;
 
   in += 5 * WORD_INCR;
-  out = ( (*in) >>  20  )   % (1U << 18 ) ;
+  out = ( CONVERT(*in) >>  20  )   % (1U << 18 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 6 ))<<( 18 - 6 );
+  out |= (CONVERT(*in) % (1U<< 6 ))<<( 18 - 6 );
   return out;
 }
 
 static UINT4
 access_18_11 (const UINT4 *in) {
   in += 6 * WORD_INCR;
-  return ( (*in) >>  6  )   % (1U << 18 ) ;
+  return ( CONVERT(*in) >>  6  )   % (1U << 18 ) ;
 }
 
 static UINT4
@@ -949,16 +957,16 @@ access_18_12 (const UINT4 *in) {
   UINT4 out;
 
   in += 6 * WORD_INCR;
-  out = ( (*in) >>  24  )   % (1U << 18 ) ;
+  out = ( CONVERT(*in) >>  24  )   % (1U << 18 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 10 ))<<( 18 - 10 );
+  out |= (CONVERT(*in) % (1U<< 10 ))<<( 18 - 10 );
   return out;
 }
 
 static UINT4
 access_18_13 (const UINT4 *in) {
   in += 7 * WORD_INCR;
-  return ( (*in) >>  10  )   % (1U << 18 ) ;
+  return ( CONVERT(*in) >>  10  )   % (1U << 18 ) ;
 }
 
 static UINT4
@@ -966,38 +974,38 @@ access_18_14 (const UINT4 *in) {
   UINT4 out;
 
   in += 7 * WORD_INCR;
-  out = ( (*in) >>  28  )   % (1U << 18 ) ;
+  out = ( CONVERT(*in) >>  28  )   % (1U << 18 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 14 ))<<( 18 - 14 );
+  out |= (CONVERT(*in) % (1U<< 14 ))<<( 18 - 14 );
   return out;
 }
 
 static UINT4
 access_18_15 (const UINT4 *in) {
   in += 8 * WORD_INCR;
-  return ( (*in) >>  14  )   % (1U << 18 ) ;
+  return ( CONVERT(*in) >>  14  )   % (1U << 18 ) ;
 }
 
 
 static UINT4
 access_20_00 (const UINT4 *in) {
-    return ( (*in) >>  0  )   % (1U << 20 ) ;
+    return ( CONVERT(*in) >>  0  )   % (1U << 20 ) ;
 }
 
 static UINT4
 access_20_01 (const UINT4 *in) {
   UINT4 out;
 
-  out = ( (*in) >>  20  )   % (1U << 20 ) ;
+  out = ( CONVERT(*in) >>  20  )   % (1U << 20 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 8 ))<<( 20 - 8 );
+  out |= (CONVERT(*in) % (1U<< 8 ))<<( 20 - 8 );
   return out;
 }
 
 static UINT4
 access_20_02 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return ( (*in) >>  8  )   % (1U << 20 ) ;
+  return ( CONVERT(*in) >>  8  )   % (1U << 20 ) ;
 }
 
 static UINT4
@@ -1005,9 +1013,9 @@ access_20_03 (const UINT4 *in) {
   UINT4 out;
 
   in += 1 * WORD_INCR;
-  out = ( (*in) >>  28  )   % (1U << 20 ) ;
+  out = ( CONVERT(*in) >>  28  )   % (1U << 20 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 16 ))<<( 20 - 16 );
+  out |= (CONVERT(*in) % (1U<< 16 ))<<( 20 - 16 );
   return out;
 }
 
@@ -1016,16 +1024,16 @@ access_20_04 (const UINT4 *in) {
   UINT4 out;
 
   in += 2 * WORD_INCR;
-  out = ( (*in) >>  16  )   % (1U << 20 ) ;
+  out = ( CONVERT(*in) >>  16  )   % (1U << 20 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 4 ))<<( 20 - 4 );
+  out |= (CONVERT(*in) % (1U<< 4 ))<<( 20 - 4 );
   return out;
 }
 
 static UINT4
 access_20_05 (const UINT4 *in) {
   in += 3 * WORD_INCR;
-  return ( (*in) >>  4  )   % (1U << 20 ) ;
+  return ( CONVERT(*in) >>  4  )   % (1U << 20 ) ;
 }
 
 static UINT4
@@ -1033,22 +1041,22 @@ access_20_06 (const UINT4 *in) {
   UINT4 out;
 
   in += 3 * WORD_INCR;
-  out = ( (*in) >>  24  )   % (1U << 20 ) ;
+  out = ( CONVERT(*in) >>  24  )   % (1U << 20 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 12 ))<<( 20 - 12 );
+  out |= (CONVERT(*in) % (1U<< 12 ))<<( 20 - 12 );
   return out;
 }
 
 static UINT4
 access_20_07 (const UINT4 *in) {
   in += 4 * WORD_INCR;
-  return ( (*in) >>  12  )   % (1U << 20 ) ;
+  return ( CONVERT(*in) >>  12  )   % (1U << 20 ) ;
 }
 
 static UINT4
 access_20_08 (const UINT4 *in) {
   in += 5 * WORD_INCR;
-  return ( (*in) >>  0  )   % (1U << 20 ) ;
+  return ( CONVERT(*in) >>  0  )   % (1U << 20 ) ;
 }
 
 static UINT4
@@ -1056,16 +1064,16 @@ access_20_09 (const UINT4 *in) {
   UINT4 out;
 
   in += 5 * WORD_INCR;
-  out = ( (*in) >>  20  )   % (1U << 20 ) ;
+  out = ( CONVERT(*in) >>  20  )   % (1U << 20 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 8 ))<<( 20 - 8 );
+  out |= (CONVERT(*in) % (1U<< 8 ))<<( 20 - 8 );
   return out;
 }
 
 static UINT4
 access_20_10 (const UINT4 *in) {
   in += 6 * WORD_INCR;
-  return ( (*in) >>  8  )   % (1U << 20 ) ;
+  return ( CONVERT(*in) >>  8  )   % (1U << 20 ) ;
 }
 
 static UINT4
@@ -1073,9 +1081,9 @@ access_20_11 (const UINT4 *in) {
   UINT4 out;
 
   in += 6 * WORD_INCR;
-  out = ( (*in) >>  28  )   % (1U << 20 ) ;
+  out = ( CONVERT(*in) >>  28  )   % (1U << 20 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 16 ))<<( 20 - 16 );
+  out |= (CONVERT(*in) % (1U<< 16 ))<<( 20 - 16 );
   return out;
 }
 
@@ -1084,16 +1092,16 @@ access_20_12 (const UINT4 *in) {
   UINT4 out;
 
   in += 7 * WORD_INCR;
-  out = ( (*in) >>  16  )   % (1U << 20 ) ;
+  out = ( CONVERT(*in) >>  16  )   % (1U << 20 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 4 ))<<( 20 - 4 );
+  out |= (CONVERT(*in) % (1U<< 4 ))<<( 20 - 4 );
   return out;
 }
 
 static UINT4
 access_20_13 (const UINT4 *in) {
   in += 8 * WORD_INCR;
-  return ( (*in) >>  4  )   % (1U << 20 ) ;
+  return ( CONVERT(*in) >>  4  )   % (1U << 20 ) ;
 }
 
 static UINT4
@@ -1101,31 +1109,31 @@ access_20_14 (const UINT4 *in) {
   UINT4 out;
 
   in += 8 * WORD_INCR;
-  out = ( (*in) >>  24  )   % (1U << 20 ) ;
+  out = ( CONVERT(*in) >>  24  )   % (1U << 20 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 12 ))<<( 20 - 12 );
+  out |= (CONVERT(*in) % (1U<< 12 ))<<( 20 - 12 );
   return out;
 }
 
 static UINT4
 access_20_15 (const UINT4 *in) {
   in += 9 * WORD_INCR;
-  return ( (*in) >>  12  )   % (1U << 20 ) ;
+  return ( CONVERT(*in) >>  12  )   % (1U << 20 ) ;
 }
 
 
 static UINT4
 access_22_00 (const UINT4 *in) {
-  return ( (*in) >>  0  )   % (1U << 22 ) ;
+  return ( CONVERT(*in) >>  0  )   % (1U << 22 ) ;
 }
 
 static UINT4
 access_22_01 (const UINT4 *in) {
   UINT4 out;
 
-  out = ( (*in) >>  22  )   % (1U << 22 ) ;
+  out = ( CONVERT(*in) >>  22  )   % (1U << 22 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 12 ))<<( 22 - 12 );
+  out |= (CONVERT(*in) % (1U<< 12 ))<<( 22 - 12 );
   return out;
 }
 
@@ -1134,16 +1142,16 @@ access_22_02 (const UINT4 *in) {
   UINT4 out;
 
   in += 1 * WORD_INCR;
-  out = ( (*in) >>  12  )   % (1U << 22 ) ;
+  out = ( CONVERT(*in) >>  12  )   % (1U << 22 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 2 ))<<( 22 - 2 );
+  out |= (CONVERT(*in) % (1U<< 2 ))<<( 22 - 2 );
   return out;
 }
 
 static UINT4
 access_22_03 (const UINT4 *in) {
   in += 2 * WORD_INCR;
-  return ( (*in) >>  2  )   % (1U << 22 ) ;
+  return ( CONVERT(*in) >>  2  )   % (1U << 22 ) ;
 }
 
 static UINT4
@@ -1151,9 +1159,9 @@ access_22_04 (const UINT4 *in) {
   UINT4 out;
 
   in += 2 * WORD_INCR;
-  out = ( (*in) >>  24  )   % (1U << 22 ) ;
+  out = ( CONVERT(*in) >>  24  )   % (1U << 22 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 14 ))<<( 22 - 14 );
+  out |= (CONVERT(*in) % (1U<< 14 ))<<( 22 - 14 );
   return out;
 }
 
@@ -1162,16 +1170,16 @@ access_22_05 (const UINT4 *in) {
   UINT4 out;
 
   in += 3 * WORD_INCR;
-  out = ( (*in) >>  14  )   % (1U << 22 ) ;
+  out = ( CONVERT(*in) >>  14  )   % (1U << 22 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 4 ))<<( 22 - 4 );
+  out |= (CONVERT(*in) % (1U<< 4 ))<<( 22 - 4 );
   return out;
 }
 
 static UINT4
 access_22_06 (const UINT4 *in) {
   in += 4 * WORD_INCR;
-  return ( (*in) >>  4  )   % (1U << 22 ) ;
+  return ( CONVERT(*in) >>  4  )   % (1U << 22 ) ;
 }
 
 static UINT4
@@ -1179,9 +1187,9 @@ access_22_07 (const UINT4 *in) {
   UINT4 out;
 
   in += 4 * WORD_INCR;
-  out = ( (*in) >>  26  )   % (1U << 22 ) ;
+  out = ( CONVERT(*in) >>  26  )   % (1U << 22 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 16 ))<<( 22 - 16 );
+  out |= (CONVERT(*in) % (1U<< 16 ))<<( 22 - 16 );
   return out;
 }
 
@@ -1190,16 +1198,16 @@ access_22_08 (const UINT4 *in) {
   UINT4 out;
 
   in += 5 * WORD_INCR;
-  out = ( (*in) >>  16  )   % (1U << 22 ) ;
+  out = ( CONVERT(*in) >>  16  )   % (1U << 22 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 6 ))<<( 22 - 6 );
+  out |= (CONVERT(*in) % (1U<< 6 ))<<( 22 - 6 );
   return out;
 }
 
 static UINT4
 access_22_09 (const UINT4 *in) {
   in += 6 * WORD_INCR;
-  return ( (*in) >>  6  )   % (1U << 22 ) ;
+  return ( CONVERT(*in) >>  6  )   % (1U << 22 ) ;
 }
 
 static UINT4
@@ -1207,9 +1215,9 @@ access_22_10 (const UINT4 *in) {
   UINT4 out;
 
   in += 6 * WORD_INCR;
-  out = ( (*in) >>  28  )   % (1U << 22 ) ;
+  out = ( CONVERT(*in) >>  28  )   % (1U << 22 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 18 ))<<( 22 - 18 );
+  out |= (CONVERT(*in) % (1U<< 18 ))<<( 22 - 18 );
   return out;
 }
 
@@ -1218,16 +1226,16 @@ access_22_11 (const UINT4 *in) {
   UINT4 out;
 
   in += 7 * WORD_INCR;
-  out = ( (*in) >>  18  )   % (1U << 22 ) ;
+  out = ( CONVERT(*in) >>  18  )   % (1U << 22 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 8 ))<<( 22 - 8 );
+  out |= (CONVERT(*in) % (1U<< 8 ))<<( 22 - 8 );
   return out;
 }
 
 static UINT4
 access_22_12 (const UINT4 *in) {
   in += 8 * WORD_INCR;
-  return ( (*in) >>  8  )   % (1U << 22 ) ;
+  return ( CONVERT(*in) >>  8  )   % (1U << 22 ) ;
 }
 
 static UINT4
@@ -1235,9 +1243,9 @@ access_22_13 (const UINT4 *in) {
   UINT4 out;
 
   in += 8 * WORD_INCR;
-  out = ( (*in) >>  30  )   % (1U << 22 ) ;
+  out = ( CONVERT(*in) >>  30  )   % (1U << 22 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 20 ))<<( 22 - 20 );
+  out |= (CONVERT(*in) % (1U<< 20 ))<<( 22 - 20 );
   return out;
 }
 
@@ -1246,32 +1254,32 @@ access_22_14 (const UINT4 *in) {
   UINT4 out;
 
   in += 9 * WORD_INCR;
-  out = ( (*in) >>  20  )   % (1U << 22 ) ;
+  out = ( CONVERT(*in) >>  20  )   % (1U << 22 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 10 ))<<( 22 - 10 );
+  out |= (CONVERT(*in) % (1U<< 10 ))<<( 22 - 10 );
   return out;
 }
 
 static UINT4
 access_22_15 (const UINT4 *in) {
   in += 10 * WORD_INCR;
-  return ( (*in) >>  10  )   % (1U << 22 ) ;
+  return ( CONVERT(*in) >>  10  )   % (1U << 22 ) ;
 }
 
 
 
 static UINT4
 access_24_00 (const UINT4 *in) {
-  return ( (*in) >>  0  )   % (1U << 24 ) ;
+  return ( CONVERT(*in) >>  0  )   % (1U << 24 ) ;
 }
 
 static UINT4
 access_24_01 (const UINT4 *in) {
   UINT4 out;
 
-  out = ( (*in) >>  24  )   % (1U << 24 ) ;
+  out = ( CONVERT(*in) >>  24  )   % (1U << 24 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+  out |= (CONVERT(*in) % (1U<< 16 ))<<( 24 - 16 );
   return out;
 }
 
@@ -1280,22 +1288,22 @@ access_24_02 (const UINT4 *in) {
   UINT4 out;
 
   in += 1 * WORD_INCR;
-  out = ( (*in) >>  16  )   % (1U << 24 ) ;
+  out = ( CONVERT(*in) >>  16  )   % (1U << 24 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+  out |= (CONVERT(*in) % (1U<< 8 ))<<( 24 - 8 );
   return out;
 }
 
 static UINT4
 access_24_03 (const UINT4 *in) {
   in += 2 * WORD_INCR;
-  return ( (*in) >>  8  )   % (1U << 24 ) ;
+  return ( CONVERT(*in) >>  8  )   % (1U << 24 ) ;
 }
 
 static UINT4
 access_24_04 (const UINT4 *in) {
   in += 3 * WORD_INCR;
-  return ( (*in) >>  0  )   % (1U << 24 ) ;
+  return ( CONVERT(*in) >>  0  )   % (1U << 24 ) ;
 }
 
 static UINT4
@@ -1303,9 +1311,9 @@ access_24_05 (const UINT4 *in) {
   UINT4 out;
 
   in += 3 * WORD_INCR;
-  out = ( (*in) >>  24  )   % (1U << 24 ) ;
+  out = ( CONVERT(*in) >>  24  )   % (1U << 24 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+  out |= (CONVERT(*in) % (1U<< 16 ))<<( 24 - 16 );
   return out;
 }
 
@@ -1314,22 +1322,22 @@ access_24_06 (const UINT4 *in) {
   UINT4 out;
 
   in += 4 * WORD_INCR;
-  out = ( (*in) >>  16  )   % (1U << 24 ) ;
+  out = ( CONVERT(*in) >>  16  )   % (1U << 24 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+  out |= (CONVERT(*in) % (1U<< 8 ))<<( 24 - 8 );
   return out;
 }
 
 static UINT4
 access_24_07 (const UINT4 *in) {
   in += 5 * WORD_INCR;
-  return ( (*in) >>  8  )   % (1U << 24 ) ;
+  return ( CONVERT(*in) >>  8  )   % (1U << 24 ) ;
 }
 
 static UINT4
 access_24_08 (const UINT4 *in) {
   in += 6 * WORD_INCR;
-  return ( (*in) >>  0  )   % (1U << 24 ) ;
+  return ( CONVERT(*in) >>  0  )   % (1U << 24 ) ;
 }
 
 static UINT4
@@ -1337,9 +1345,9 @@ access_24_09 (const UINT4 *in) {
   UINT4 out;
 
   in += 6 * WORD_INCR;
-  out = ( (*in) >>  24  )   % (1U << 24 ) ;
+  out = ( CONVERT(*in) >>  24  )   % (1U << 24 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+  out |= (CONVERT(*in) % (1U<< 16 ))<<( 24 - 16 );
   return out;
 }
 
@@ -1348,22 +1356,22 @@ access_24_10 (const UINT4 *in) {
   UINT4 out;
 
   in += 7 * WORD_INCR;
-  out = ( (*in) >>  16  )   % (1U << 24 ) ;
+  out = ( CONVERT(*in) >>  16  )   % (1U << 24 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+  out |= (CONVERT(*in) % (1U<< 8 ))<<( 24 - 8 );
   return out;
 }
 
 static UINT4
 access_24_11 (const UINT4 *in) {
   in += 8 * WORD_INCR;
-  return ( (*in) >>  8  )   % (1U << 24 ) ;
+  return ( CONVERT(*in) >>  8  )   % (1U << 24 ) ;
 }
 
 static UINT4
 access_24_12 (const UINT4 *in) {
   in += 9 * WORD_INCR;
-  return ( (*in) >>  0  )   % (1U << 24 ) ;
+  return ( CONVERT(*in) >>  0  )   % (1U << 24 ) ;
 }
 
 static UINT4
@@ -1371,9 +1379,9 @@ access_24_13 (const UINT4 *in) {
   UINT4 out;
 
   in += 9 * WORD_INCR;
-  out = ( (*in) >>  24  )   % (1U << 24 ) ;
+  out = ( CONVERT(*in) >>  24  )   % (1U << 24 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+  out |= (CONVERT(*in) % (1U<< 16 ))<<( 24 - 16 );
   return out;
 }
 
@@ -1382,32 +1390,32 @@ access_24_14 (const UINT4 *in) {
   UINT4 out;
 
   in += 10 * WORD_INCR;
-  out = ( (*in) >>  16  )   % (1U << 24 ) ;
+  out = ( CONVERT(*in) >>  16  )   % (1U << 24 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+  out |= (CONVERT(*in) % (1U<< 8 ))<<( 24 - 8 );
   return out;
 }
 
 static UINT4
 access_24_15 (const UINT4 *in) {
   in += 11 * WORD_INCR;
-  return ( (*in) >>  8  )   % (1U << 24 ) ;
+  return ( CONVERT(*in) >>  8  )   % (1U << 24 ) ;
 }
 
 
 
 static UINT4
 access_26_00 (const UINT4 *in) {
-  return ( (*in) >>  0  )   % (1U << 26 ) ;
+  return ( CONVERT(*in) >>  0  )   % (1U << 26 ) ;
 }
 
 static UINT4
 access_26_01 (const UINT4 *in) {
   UINT4 out;
 
-  out = ( (*in) >>  26  )   % (1U << 26 ) ;
+  out = ( CONVERT(*in) >>  26  )   % (1U << 26 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 20 ))<<( 26 - 20 );
+  out |= (CONVERT(*in) % (1U<< 20 ))<<( 26 - 20 );
   return out;
 }
 
@@ -1416,9 +1424,9 @@ access_26_02 (const UINT4 *in) {
   UINT4 out;
 
   in += 1 * WORD_INCR;
-  out = ( (*in) >>  20  )   % (1U << 26 ) ;
+  out = ( CONVERT(*in) >>  20  )   % (1U << 26 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 14 ))<<( 26 - 14 );
+  out |= (CONVERT(*in) % (1U<< 14 ))<<( 26 - 14 );
   return out;
 }
 
@@ -1427,9 +1435,9 @@ access_26_03 (const UINT4 *in) {
   UINT4 out;
 
   in += 2 * WORD_INCR;
-  out = ( (*in) >>  14  )   % (1U << 26 ) ;
+  out = ( CONVERT(*in) >>  14  )   % (1U << 26 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 8 ))<<( 26 - 8 );
+  out |= (CONVERT(*in) % (1U<< 8 ))<<( 26 - 8 );
   return out;
 }
 
@@ -1438,16 +1446,16 @@ access_26_04 (const UINT4 *in) {
   UINT4 out;
 
   in += 3 * WORD_INCR;
-  out = ( (*in) >>  8  )   % (1U << 26 ) ;
+  out = ( CONVERT(*in) >>  8  )   % (1U << 26 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 2 ))<<( 26 - 2 );
+  out |= (CONVERT(*in) % (1U<< 2 ))<<( 26 - 2 );
   return out;
 }
 
 static UINT4
 access_26_05 (const UINT4 *in) {
   in += 4 * WORD_INCR;
-  return ( (*in) >>  2  )   % (1U << 26 ) ;
+  return ( CONVERT(*in) >>  2  )   % (1U << 26 ) ;
 }
 
 static UINT4
@@ -1455,9 +1463,9 @@ access_26_06 (const UINT4 *in) {
   UINT4 out;
 
   in += 4 * WORD_INCR;
-  out = ( (*in) >>  28  )   % (1U << 26 ) ;
+  out = ( CONVERT(*in) >>  28  )   % (1U << 26 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 22 ))<<( 26 - 22 );
+  out |= (CONVERT(*in) % (1U<< 22 ))<<( 26 - 22 );
   return out;
 }
 
@@ -1466,9 +1474,9 @@ access_26_07 (const UINT4 *in) {
   UINT4 out;
 
   in += 5 * WORD_INCR;
-  out = ( (*in) >>  22  )   % (1U << 26 ) ;
+  out = ( CONVERT(*in) >>  22  )   % (1U << 26 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 16 ))<<( 26 - 16 );
+  out |= (CONVERT(*in) % (1U<< 16 ))<<( 26 - 16 );
   return out;
 }
 
@@ -1477,9 +1485,9 @@ access_26_08 (const UINT4 *in) {
   UINT4 out;
 
   in += 6 * WORD_INCR;
-  out = ( (*in) >>  16  )   % (1U << 26 ) ;
+  out = ( CONVERT(*in) >>  16  )   % (1U << 26 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 10 ))<<( 26 - 10 );
+  out |= (CONVERT(*in) % (1U<< 10 ))<<( 26 - 10 );
   return out;
 }
 
@@ -1488,16 +1496,16 @@ access_26_09 (const UINT4 *in) {
   UINT4 out;
 
   in += 7 * WORD_INCR;
-  out = ( (*in) >>  10  )   % (1U << 26 ) ;
+  out = ( CONVERT(*in) >>  10  )   % (1U << 26 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 4 ))<<( 26 - 4 );
+  out |= (CONVERT(*in) % (1U<< 4 ))<<( 26 - 4 );
   return out;
 }
 
 static UINT4
 access_26_10 (const UINT4 *in) {
   in += 8 * WORD_INCR;
-  return ( (*in) >>  4  )   % (1U << 26 ) ;
+  return ( CONVERT(*in) >>  4  )   % (1U << 26 ) ;
 }
 
 static UINT4
@@ -1505,9 +1513,9 @@ access_26_11 (const UINT4 *in) {
   UINT4 out;
 
   in += 8 * WORD_INCR;
-  out = ( (*in) >>  30  )   % (1U << 26 ) ;
+  out = ( CONVERT(*in) >>  30  )   % (1U << 26 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 24 ))<<( 26 - 24 );
+  out |= (CONVERT(*in) % (1U<< 24 ))<<( 26 - 24 );
   return out;
 }
 
@@ -1516,9 +1524,9 @@ access_26_12 (const UINT4 *in) {
   UINT4 out;
 
   in += 9 * WORD_INCR;
-  out = ( (*in) >>  24  )   % (1U << 26 ) ;
+  out = ( CONVERT(*in) >>  24  )   % (1U << 26 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 18 ))<<( 26 - 18 );
+  out |= (CONVERT(*in) % (1U<< 18 ))<<( 26 - 18 );
   return out;
 }
 
@@ -1527,9 +1535,9 @@ access_26_13 (const UINT4 *in) {
   UINT4 out;
 
   in += 10 * WORD_INCR;
-  out = ( (*in) >>  18  )   % (1U << 26 ) ;
+  out = ( CONVERT(*in) >>  18  )   % (1U << 26 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 12 ))<<( 26 - 12 );
+  out |= (CONVERT(*in) % (1U<< 12 ))<<( 26 - 12 );
   return out;
 }
 
@@ -1538,31 +1546,31 @@ access_26_14 (const UINT4 *in) {
   UINT4 out;
 
   in += 11 * WORD_INCR;
-  out = ( (*in) >>  12  )   % (1U << 26 ) ;
+  out = ( CONVERT(*in) >>  12  )   % (1U << 26 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 6 ))<<( 26 - 6 );
+  out |= (CONVERT(*in) % (1U<< 6 ))<<( 26 - 6 );
   return out;
 }
 
 static UINT4
 access_26_15 (const UINT4 *in) {
   in += 12 * WORD_INCR;
-  return ( (*in) >>  6  )   % (1U << 26 ) ;
+  return ( CONVERT(*in) >>  6  )   % (1U << 26 ) ;
 }
 
 
 static UINT4
 access_28_00 (const UINT4 *in) {
-  return ( (*in) >>  0  )   % (1U << 28 ) ;
+  return ( CONVERT(*in) >>  0  )   % (1U << 28 ) ;
 }
 
 static UINT4
 access_28_01 (const UINT4 *in) {
   UINT4 out;
 
-  out = ( (*in) >>  28  )   % (1U << 28 ) ;
+  out = ( CONVERT(*in) >>  28  )   % (1U << 28 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 24 ))<<( 28 - 24 );
+  out |= (CONVERT(*in) % (1U<< 24 ))<<( 28 - 24 );
   return out;
 }
 
@@ -1571,9 +1579,9 @@ access_28_02 (const UINT4 *in) {
   UINT4 out;
 
   in += 1 * WORD_INCR;
-  out = ( (*in) >>  24  )   % (1U << 28 ) ;
+  out = ( CONVERT(*in) >>  24  )   % (1U << 28 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 20 ))<<( 28 - 20 );
+  out |= (CONVERT(*in) % (1U<< 20 ))<<( 28 - 20 );
   return out;
 }
 
@@ -1582,9 +1590,9 @@ access_28_03 (const UINT4 *in) {
   UINT4 out;
 
   in += 2 * WORD_INCR;
-  out = ( (*in) >>  20  )   % (1U << 28 ) ;
+  out = ( CONVERT(*in) >>  20  )   % (1U << 28 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 16 ))<<( 28 - 16 );
+  out |= (CONVERT(*in) % (1U<< 16 ))<<( 28 - 16 );
   return out;
 }
 
@@ -1593,9 +1601,9 @@ access_28_04 (const UINT4 *in) {
   UINT4 out;
 
   in += 3 * WORD_INCR;
-  out = ( (*in) >>  16  )   % (1U << 28 ) ;
+  out = ( CONVERT(*in) >>  16  )   % (1U << 28 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 12 ))<<( 28 - 12 );
+  out |= (CONVERT(*in) % (1U<< 12 ))<<( 28 - 12 );
   return out;
 }
 
@@ -1604,9 +1612,9 @@ access_28_05 (const UINT4 *in) {
   UINT4 out;
 
   in += 4 * WORD_INCR;
-  out = ( (*in) >>  12  )   % (1U << 28 ) ;
+  out = ( CONVERT(*in) >>  12  )   % (1U << 28 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 8 ))<<( 28 - 8 );
+  out |= (CONVERT(*in) % (1U<< 8 ))<<( 28 - 8 );
   return out;
 }
 
@@ -1615,22 +1623,22 @@ access_28_06 (const UINT4 *in) {
   UINT4 out;
 
   in += 5 * WORD_INCR;
-  out = ( (*in) >>  8  )   % (1U << 28 ) ;
+  out = ( CONVERT(*in) >>  8  )   % (1U << 28 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 4 ))<<( 28 - 4 );
+  out |= (CONVERT(*in) % (1U<< 4 ))<<( 28 - 4 );
   return out;
 }
 
 static UINT4
 access_28_07 (const UINT4 *in) {
   in += 6 * WORD_INCR;
-  return ( (*in) >>  4  )   % (1U << 28 ) ;
+  return ( CONVERT(*in) >>  4  )   % (1U << 28 ) ;
 }
 
 static UINT4
 access_28_08 (const UINT4 *in) {
   in += 7 * WORD_INCR;
-  return ( (*in) >>  0  )   % (1U << 28 ) ;
+  return ( CONVERT(*in) >>  0  )   % (1U << 28 ) ;
 }
 
 static UINT4
@@ -1638,9 +1646,9 @@ access_28_09 (const UINT4 *in) {
   UINT4 out;
 
   in += 7 * WORD_INCR;
-  out = ( (*in) >>  28  )   % (1U << 28 ) ;
+  out = ( CONVERT(*in) >>  28  )   % (1U << 28 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 24 ))<<( 28 - 24 );
+  out |= (CONVERT(*in) % (1U<< 24 ))<<( 28 - 24 );
   return out;
 }
 
@@ -1649,9 +1657,9 @@ access_28_10 (const UINT4 *in) {
   UINT4 out;
 
   in += 8 * WORD_INCR;
-  out = ( (*in) >>  24  )   % (1U << 28 ) ;
+  out = ( CONVERT(*in) >>  24  )   % (1U << 28 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 20 ))<<( 28 - 20 );
+  out |= (CONVERT(*in) % (1U<< 20 ))<<( 28 - 20 );
   return out;
 }
 
@@ -1660,9 +1668,9 @@ access_28_11 (const UINT4 *in) {
   UINT4 out;
 
   in += 9 * WORD_INCR;
-  out = ( (*in) >>  20  )   % (1U << 28 ) ;
+  out = ( CONVERT(*in) >>  20  )   % (1U << 28 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 16 ))<<( 28 - 16 );
+  out |= (CONVERT(*in) % (1U<< 16 ))<<( 28 - 16 );
   return out;
 }
 
@@ -1671,9 +1679,9 @@ access_28_12 (const UINT4 *in) {
   UINT4 out;
 
   in += 10 * WORD_INCR;
-  out = ( (*in) >>  16  )   % (1U << 28 ) ;
+  out = ( CONVERT(*in) >>  16  )   % (1U << 28 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 12 ))<<( 28 - 12 );
+  out |= (CONVERT(*in) % (1U<< 12 ))<<( 28 - 12 );
   return out;
 }
 
@@ -1682,9 +1690,9 @@ access_28_13 (const UINT4 *in) {
   UINT4 out;
 
   in += 11 * WORD_INCR;
-  out = ( (*in) >>  12  )   % (1U << 28 ) ;
+  out = ( CONVERT(*in) >>  12  )   % (1U << 28 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 8 ))<<( 28 - 8 );
+  out |= (CONVERT(*in) % (1U<< 8 ))<<( 28 - 8 );
   return out;
 }
 
@@ -1693,31 +1701,31 @@ access_28_14 (const UINT4 *in) {
   UINT4 out;
 
   in += 12 * WORD_INCR;
-  out = ( (*in) >>  8  )   % (1U << 28 ) ;
+  out = ( CONVERT(*in) >>  8  )   % (1U << 28 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 4 ))<<( 28 - 4 );
+  out |= (CONVERT(*in) % (1U<< 4 ))<<( 28 - 4 );
   return out;
 }
 
 static UINT4
 access_28_15 (const UINT4 *in) {
   in += 13 * WORD_INCR;
-  return ( (*in) >>  4  )   % (1U << 28 ) ;
+  return ( CONVERT(*in) >>  4  )   % (1U << 28 ) ;
 }
 
 
 static UINT4
 access_30_00 (const UINT4 *in) {
-  return ( (*in) >>  0  )   % (1U << 30 ) ;
+  return ( CONVERT(*in) >>  0  )   % (1U << 30 ) ;
 }
 
 static UINT4
 access_30_01 (const UINT4 *in) {
   UINT4 out;
 
-  out = ( (*in) >>  30  )   % (1U << 30 ) ;
+  out = ( CONVERT(*in) >>  30  )   % (1U << 30 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 28 ))<<( 30 - 28 );
+  out |= (CONVERT(*in) % (1U<< 28 ))<<( 30 - 28 );
   return out;
 }
 
@@ -1726,9 +1734,9 @@ access_30_02 (const UINT4 *in) {
   UINT4 out;
 
   in += 1 * WORD_INCR;
-  out = ( (*in) >>  28  )   % (1U << 30 ) ;
+  out = ( CONVERT(*in) >>  28  )   % (1U << 30 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 26 ))<<( 30 - 26 );
+  out |= (CONVERT(*in) % (1U<< 26 ))<<( 30 - 26 );
   return out;
 }
 
@@ -1737,9 +1745,9 @@ access_30_03 (const UINT4 *in) {
   UINT4 out;
 
   in += 2 * WORD_INCR;
-  out = ( (*in) >>  26  )   % (1U << 30 ) ;
+  out = ( CONVERT(*in) >>  26  )   % (1U << 30 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 24 ))<<( 30 - 24 );
+  out |= (CONVERT(*in) % (1U<< 24 ))<<( 30 - 24 );
   return out;
 }
 
@@ -1748,9 +1756,9 @@ access_30_04 (const UINT4 *in) {
   UINT4 out;
 
   in += 3 * WORD_INCR;
-  out = ( (*in) >>  24  )   % (1U << 30 ) ;
+  out = ( CONVERT(*in) >>  24  )   % (1U << 30 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 22 ))<<( 30 - 22 );
+  out |= (CONVERT(*in) % (1U<< 22 ))<<( 30 - 22 );
   return out;
 }
 
@@ -1759,9 +1767,9 @@ access_30_05 (const UINT4 *in) {
   UINT4 out;
 
   in += 4 * WORD_INCR;
-  out = ( (*in) >>  22  )   % (1U << 30 ) ;
+  out = ( CONVERT(*in) >>  22  )   % (1U << 30 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 20 ))<<( 30 - 20 );
+  out |= (CONVERT(*in) % (1U<< 20 ))<<( 30 - 20 );
   return out;
 }
 
@@ -1770,9 +1778,9 @@ access_30_06 (const UINT4 *in) {
   UINT4 out;
 
   in += 5 * WORD_INCR;
-  out = ( (*in) >>  20  )   % (1U << 30 ) ;
+  out = ( CONVERT(*in) >>  20  )   % (1U << 30 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 18 ))<<( 30 - 18 );
+  out |= (CONVERT(*in) % (1U<< 18 ))<<( 30 - 18 );
   return out;
 }
 
@@ -1781,9 +1789,9 @@ access_30_07 (const UINT4 *in) {
   UINT4 out;
 
   in += 6 * WORD_INCR;
-  out = ( (*in) >>  18  )   % (1U << 30 ) ;
+  out = ( CONVERT(*in) >>  18  )   % (1U << 30 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 16 ))<<( 30 - 16 );
+  out |= (CONVERT(*in) % (1U<< 16 ))<<( 30 - 16 );
   return out;
 }
 
@@ -1792,9 +1800,9 @@ access_30_08 (const UINT4 *in) {
   UINT4 out;
 
   in += 7 * WORD_INCR;
-  out = ( (*in) >>  16  )   % (1U << 30 ) ;
+  out = ( CONVERT(*in) >>  16  )   % (1U << 30 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 14 ))<<( 30 - 14 );
+  out |= (CONVERT(*in) % (1U<< 14 ))<<( 30 - 14 );
   return out;
 }
 
@@ -1803,9 +1811,9 @@ access_30_09 (const UINT4 *in) {
   UINT4 out;
 
   in += 8 * WORD_INCR;
-  out = ( (*in) >>  14  )   % (1U << 30 ) ;
+  out = ( CONVERT(*in) >>  14  )   % (1U << 30 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 12 ))<<( 30 - 12 );
+  out |= (CONVERT(*in) % (1U<< 12 ))<<( 30 - 12 );
   return out;
 }
 
@@ -1814,9 +1822,9 @@ access_30_10 (const UINT4 *in) {
   UINT4 out;
 
   in += 9 * WORD_INCR;
-  out = ( (*in) >>  12  )   % (1U << 30 ) ;
+  out = ( CONVERT(*in) >>  12  )   % (1U << 30 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 10 ))<<( 30 - 10 );
+  out |= (CONVERT(*in) % (1U<< 10 ))<<( 30 - 10 );
   return out;
 }
 
@@ -1825,9 +1833,9 @@ access_30_11 (const UINT4 *in) {
   UINT4 out;
 
   in += 10 * WORD_INCR;
-  out = ( (*in) >>  10  )   % (1U << 30 ) ;
+  out = ( CONVERT(*in) >>  10  )   % (1U << 30 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 8 ))<<( 30 - 8 );
+  out |= (CONVERT(*in) % (1U<< 8 ))<<( 30 - 8 );
   return out;
 }
 
@@ -1836,9 +1844,9 @@ access_30_12 (const UINT4 *in) {
   UINT4 out;
 
   in += 11 * WORD_INCR;
-  out = ( (*in) >>  8  )   % (1U << 30 ) ;
+  out = ( CONVERT(*in) >>  8  )   % (1U << 30 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 6 ))<<( 30 - 6 );
+  out |= (CONVERT(*in) % (1U<< 6 ))<<( 30 - 6 );
   return out;
 }
 
@@ -1847,9 +1855,9 @@ access_30_13 (const UINT4 *in) {
   UINT4 out;
 
   in += 12 * WORD_INCR;
-  out = ( (*in) >>  6  )   % (1U << 30 ) ;
+  out = ( CONVERT(*in) >>  6  )   % (1U << 30 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 4 ))<<( 30 - 4 );
+  out |= (CONVERT(*in) % (1U<< 4 ))<<( 30 - 4 );
   return out;
 }
 
@@ -1858,112 +1866,112 @@ access_30_14 (const UINT4 *in) {
   UINT4 out;
 
   in += 13 * WORD_INCR;
-  out = ( (*in) >>  4  )   % (1U << 30 ) ;
+  out = ( CONVERT(*in) >>  4  )   % (1U << 30 ) ;
   in += 1 * WORD_INCR;
-  out |= ((*in) % (1U<< 2 ))<<( 30 - 2 );
+  out |= (CONVERT(*in) % (1U<< 2 ))<<( 30 - 2 );
   return out;
 }
 
 static UINT4
 access_30_15 (const UINT4 *in) {
   in += 14 * WORD_INCR;
-  return ( (*in) >>  2  )   % (1U << 30 ) ;
+  return ( CONVERT(*in) >>  2  )   % (1U << 30 ) ;
 }
 
 
 static UINT4
 access_32_00 (const UINT4 *in) {
-  return *in;
+  return CONVERT(*in);
 }
 
 static UINT4
 access_32_01 (const UINT4 *in) {
   in += 1 * WORD_INCR;
-  return *in;
+  return CONVERT(*in);
 }
 
 static UINT4
 access_32_02 (const UINT4 *in) {
   in += 2 * WORD_INCR;
-  return *in;
+  return CONVERT(*in);
 }
 
 static UINT4
 access_32_03 (const UINT4 *in) {
   in += 3 * WORD_INCR;
-  return *in;
+  return CONVERT(*in);
 }
 
 static UINT4
 access_32_04 (const UINT4 *in) {
   in += 4 * WORD_INCR;
-  return *in;
+  return CONVERT(*in);
 }
 
 static UINT4
 access_32_05 (const UINT4 *in) {
   in += 5 * WORD_INCR;
-  return *in;
+  return CONVERT(*in);
 }
 
 static UINT4
 access_32_06 (const UINT4 *in) {
   in += 6 * WORD_INCR;
-  return *in;
+  return CONVERT(*in);
 }
 
 static UINT4
 access_32_07 (const UINT4 *in) {
   in += 7 * WORD_INCR;
-  return *in;
+  return CONVERT(*in);
 }
 
 static UINT4
 access_32_08 (const UINT4 *in) {
   in += 8 * WORD_INCR;
-  return *in;
+  return CONVERT(*in);
 }
 
 static UINT4
 access_32_09 (const UINT4 *in) {
   in += 9 * WORD_INCR;
-  return *in;
+  return CONVERT(*in);
 }
 
 static UINT4
 access_32_10 (const UINT4 *in) {
   in += 10 * WORD_INCR;
-  return *in;
+  return CONVERT(*in);
 }
 
 static UINT4
 access_32_11 (const UINT4 *in) {
   in += 11 * WORD_INCR;
-  return *in;
+  return CONVERT(*in);
 }
 
 static UINT4
 access_32_12 (const UINT4 *in) {
   in += 12 * WORD_INCR;
-  return *in;
+  return CONVERT(*in);
 }
 
 static UINT4
 access_32_13 (const UINT4 *in) {
   in += 13 * WORD_INCR;
-  return *in;
+  return CONVERT(*in);
 }
 
 static UINT4
 access_32_14 (const UINT4 *in) {
   in += 14 * WORD_INCR;
-  return *in;
+  return CONVERT(*in);
 }
 
 static UINT4
 access_32_15 (const UINT4 *in) {
   in += 15 * WORD_INCR;
-  return *in;
+  return CONVERT(*in);
 }
 
 
@@ -2074,9 +2082,16 @@ Bitpack64_access (UINT4 position, UINT4 *ptrs, UINT4 *comp) {
 #endif
 
   info = &(ptrs[position/BLOCKSIZE * DIRECT_METAINFO_SIZE]);
+
+#ifdef WORDS_BIGENDIAN
+  start = Bigendian_convert_uint(info[0]);
+  bitpack = (UINT4 *) &(comp[start*4]);
+  nwritten = Bigendian_convert_uint(info[1]) - start;	/* In 128-bit registers */
+#else
   start = info[0];
   bitpack = (UINT4 *) &(comp[start*4]);
   nwritten = info[1] - start;	/* In 128-bit registers */
+#endif
 
   remainder = position % BLOCKSIZE;
   index = nwritten*16 + remainder % 16;
@@ -2109,9 +2124,16 @@ Bitpack64_access (UINT4 position, UINT4 *ptrs, UINT4 *comp) {
 #endif
 
   info = &(ptrs[position/BLOCKSIZE * DIRECT_METAINFO_SIZE]);
+
+#ifdef WORDS_BIGENDIAN
+  start = Bigendian_convert_uint(info[0]);
+  bitpack = (UINT4 *) &(comp[start*4]);
+  nwritten = Bigendian_convert_uint(info[1]) - start;	/* In 128-bit registers */
+#else
   start = info[0];
   bitpack = (UINT4 *) &(comp[start*4]);
   nwritten = info[1] - start;	/* In 128-bit registers */
+#endif
 
   remainder = position % BLOCKSIZE;
   index = nwritten*16 + remainder/4;
diff --git a/src/bitpack64-read.c b/src/bitpack64-read.c
index 306697e..d611400 100644
--- a/src/bitpack64-read.c
+++ b/src/bitpack64-read.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: bitpack64-read.c 153955 2014-11-24 17:54:45Z twu $";
+static char rcsid[] = "$Id: bitpack64-read.c 168395 2015-06-26 17:13:13Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -8,7 +8,9 @@ static char rcsid[] = "$Id: bitpack64-read.c 153955 2014-11-24 17:54:45Z twu $";
 #include <stdio.h>
 #include <stdlib.h>
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
+#include "bigendian.h"
+#elif defined(HAVE_SSE2)
 #include <emmintrin.h>
 #endif
 
@@ -126,7 +128,19 @@ Bitpack64_read_setup () {
 #endif
 
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+static void
+unpack_00 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  int i;
+
+  for (i = 0; i < BLOCKSIZE; i++) {
+    *out++ = 0;
+  }
+
+  return;
+}
+
+#else
 static void
 unpack_00 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
   __m128i total = _mm_set1_epi32(0U);
@@ -184,21 +198,10 @@ unpack_00_2 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
   return;
 }
 
-
-#else
-static void
-unpack_00 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  int i;
-
-  for (i = 0; i < BLOCKSIZE; i++) {
-    *out++ = 0;
-  }
-
-  return;
-}
 #endif
 
 
+
 #ifdef ALLOW_ODD_PACKSIZES
 static void
 unpack_01 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
@@ -274,7 +277,100 @@ unpack_01 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
+static void
+unpack_02 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  2  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  6  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  10  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  14  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  18  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  22  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  26  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  30  )   % (1U << 2 ) ;
+    out++;
+  }
+
+  return;
+}
+
+
+#elif !defined(HAVE_SSE2)
+static void
+unpack_02 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( (*in) >>  0  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  2  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  6  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  10  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  14  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  18  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  22  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  26  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  30  )   % (1U << 2 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#else
 static void
 unpack_02_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     __m128i InReg = _mm_load_si128(in);
@@ -743,52 +839,6 @@ unpack_02_rev_8 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     return;
 }
 
-#else
-
-static void
-unpack_02 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  2  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  6  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  10  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  14  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  18  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  22  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  26  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  30  )   % (1U << 2 ) ;
-    out++;
-  }
-
-  return;
-}
 #endif
 
 
@@ -875,7 +925,50 @@ unpack_03 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
+
+#ifdef WORDS_BIGENDIAN
+static void
+unpack_04 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  UINT4 outer, inwordpointer;
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    for (outer = 0; outer < 2 ; outer++) {
+      for (inwordpointer = 0; inwordpointer < 32; inwordpointer +=  4) {
+	*(out++) = ( Bigendian_convert_uint(*in) >> inwordpointer )   % (1U << 4 ) ;
+      }
+      in += 4;
+    }
+  }
+
+  return;
+}
+
+#elif !defined(HAVE_SSE2)
+static void
+unpack_04 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  UINT4 outer, inwordpointer;
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    for (outer = 0; outer < 2 ; outer++) {
+      for (inwordpointer = 0; inwordpointer < 32; inwordpointer +=  4) {
+	*(out++) = ( (*in) >> inwordpointer )   % (1U << 4 ) ;
+      }
+      in += 4;
+    }
+  }
+
+  return;
+}
+
+#else
 static void
 unpack_04_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     __m128i InReg = _mm_load_si128(in);
@@ -1343,26 +1436,6 @@ unpack_04_rev_8 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     return;
 }
 
-#else
-static void
-unpack_04 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  UINT4 outer, inwordpointer;
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    for (outer = 0; outer < 2 ; outer++) {
-      for (inwordpointer = 0; inwordpointer < 32; inwordpointer +=  4) {
-	*(out++) = ( (*in) >> inwordpointer )   % (1U << 4 ) ;
-      }
-      in += 4;
-    }
-  }
-
-  return;
-}
 #endif
 
 
@@ -1455,29 +1528,129 @@ unpack_05 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
 static void
-unpack_06_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i InReg = _mm_load_si128(in);
-    __m128i OutReg;
-    const __m128i mask6 =  _mm_set1_epi32(63U);
-
-    OutReg = _mm_and_si128( InReg , mask6);
-    _mm_store_si128(out++, OutReg);
+unpack_06 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,6) , mask6);
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
 
-    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,12) , mask6);
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  6  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  18  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  30  )   % (1U << 6 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 6 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  10  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  22  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 6 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 2 ))<<( 6 - 2 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  2  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  14  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  26  )   % (1U << 6 ) ;
+    out++;
+  }
 
-    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,18) , mask6);
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+  return;
+}
 
-    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,24) , mask6);
+#elif !defined(HAVE_SSE2)
+static void
+unpack_06 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( (*in) >>  0  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  6  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  18  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  30  )   % (1U << 6 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 6 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  10  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  22  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 6 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 2 ))<<( 6 - 2 );
+    out++;
+    *out = ( (*in) >>  2  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  14  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  26  )   % (1U << 6 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#else
+static void
+unpack_06_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i InReg = _mm_load_si128(in);
+    __m128i OutReg;
+    const __m128i mask6 =  _mm_set1_epi32(63U);
+
+    OutReg = _mm_and_si128( InReg , mask6);
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,6) , mask6);
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,12) , mask6);
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,18) , mask6);
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,24) , mask6);
     /* total = _mm_add_epi32(total, OutReg); */
     _mm_store_si128(out++, OutReg);
 
@@ -1962,55 +2135,6 @@ unpack_06_rev_8 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     return;
 }
 
-#else
-static void
-unpack_06 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  6  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  18  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  30  )   % (1U << 6 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 6 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  10  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  22  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 6 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 2 ))<<( 6 - 2 );
-    out++;
-    *out = ( (*in) >>  2  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  14  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  26  )   % (1U << 6 ) ;
-    out++;
-  }
-
-  return;
-}
 #endif
 
 
@@ -2111,7 +2235,49 @@ unpack_07 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
 
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
+static void
+unpack_08 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  UINT4 outer, inwordpointer;
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    for (outer = 0; outer < 4; outer++) {
+      for (inwordpointer = 0; inwordpointer < 32; inwordpointer += 8) {
+	*(out++) = ( Bigendian_convert_uint(*in) >> inwordpointer )   % (1U << 8 ) ;
+      }
+      in += 4;
+    }
+  }
+
+  return;
+}
+
+#elif !defined(HAVE_SSE2)
+static void
+unpack_08 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  UINT4 outer, inwordpointer;
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    for (outer = 0; outer < 4; outer++) {
+      for (inwordpointer = 0; inwordpointer < 32; inwordpointer += 8) {
+	*(out++) = ( (*in) >> inwordpointer )   % (1U << 8 ) ;
+      }
+      in += 4;
+    }
+  }
+
+  return;
+}
+
+#else
 static void
 unpack_08_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     __m128i InReg = _mm_load_si128(in);
@@ -2599,26 +2765,6 @@ unpack_08_rev_8 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 }
 
 
-#else
-static void
-unpack_08 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  UINT4 outer, inwordpointer;
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    for (outer = 0; outer < 4; outer++) {
-      for (inwordpointer = 0; inwordpointer < 32; inwordpointer += 8) {
-	*(out++) = ( (*in) >> inwordpointer )   % (1U << 8 ) ;
-      }
-      in += 4;
-    }
-  }
-
-  return;
-}
 #endif
 
 
@@ -2727,53 +2873,161 @@ unpack_09 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
 
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
 static void
-unpack_10_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i InReg = _mm_load_si128(in);
-    __m128i OutReg;
-    const __m128i mask10 =  _mm_set1_epi32(1023U);
-
-    OutReg = _mm_and_si128( InReg , mask10);
-    _mm_store_si128(out++, OutReg);
-
-    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,10) , mask10);
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,20) , mask10);
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+unpack_10 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    OutReg =   _mm_srli_epi32(InReg,30) ;
-    InReg = _mm_load_si128(++in);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
 
-#ifdef MULTIMASK
-    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask8), 10-8));
-#else
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask10));
-#endif
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 10 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  10  )   % (1U << 10 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 10 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  30  )   % (1U << 10 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 10 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 10 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  18  )   % (1U << 10 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 10 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 6 ))<<( 10 - 6 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  6  )   % (1U << 10 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 10 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  26  )   % (1U << 10 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 10 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 10 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  14  )   % (1U << 10 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 10 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 2 ))<<( 10 - 2 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  2  )   % (1U << 10 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 10 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  22  )   % (1U << 10 ) ;
+    out++;
 
-    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,8) , mask10);
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+  }
+  return;
+}
 
-    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,18) , mask10);
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+#elif !defined(HAVE_SSE2)
+static void
+unpack_10 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    OutReg =   _mm_srli_epi32(InReg,28) ;
-    InReg = _mm_load_si128(++in);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
 
-#ifdef MULTIMASK
-    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask6), 10-6));
-#else
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask10));
-#endif
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+    *out = ( (*in) >>  0  )   % (1U << 10 ) ;
+    out++;
+    *out = ( (*in) >>  10  )   % (1U << 10 ) ;
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 10 ) ;
+    out++;
+    *out = ( (*in) >>  30  )   % (1U << 10 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 10 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 10 ) ;
+    out++;
+    *out = ( (*in) >>  18  )   % (1U << 10 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 10 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 6 ))<<( 10 - 6 );
+    out++;
+    *out = ( (*in) >>  6  )   % (1U << 10 ) ;
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 10 ) ;
+    out++;
+    *out = ( (*in) >>  26  )   % (1U << 10 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 10 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 10 ) ;
+    out++;
+    *out = ( (*in) >>  14  )   % (1U << 10 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 10 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 2 ))<<( 10 - 2 );
+    out++;
+    *out = ( (*in) >>  2  )   % (1U << 10 ) ;
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 10 ) ;
+    out++;
+    *out = ( (*in) >>  22  )   % (1U << 10 ) ;
+    out++;
+
+  }
+  return;
+}
+
+#else
+static void
+unpack_10_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i InReg = _mm_load_si128(in);
+    __m128i OutReg;
+    const __m128i mask10 =  _mm_set1_epi32(1023U);
+
+    OutReg = _mm_and_si128( InReg , mask10);
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,10) , mask10);
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,20) , mask10);
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,30) ;
+    InReg = _mm_load_si128(++in);
+
+#ifdef MULTIMASK
+    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask8), 10-8));
+#else
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask10));
+#endif
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,8) , mask10);
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,18) , mask10);
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,28) ;
+    InReg = _mm_load_si128(++in);
+
+#ifdef MULTIMASK
+    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask6), 10-6));
+#else
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask10));
+#endif
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
 
     OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,6) , mask10);
     /* total = _mm_add_epi32(total, OutReg); */
@@ -3274,59 +3528,6 @@ unpack_10_rev_8 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     return;
 }
 
-#else
-static void
-unpack_10 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 10 ) ;
-    out++;
-    *out = ( (*in) >>  10  )   % (1U << 10 ) ;
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 10 ) ;
-    out++;
-    *out = ( (*in) >>  30  )   % (1U << 10 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 10 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 10 ) ;
-    out++;
-    *out = ( (*in) >>  18  )   % (1U << 10 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 10 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 6 ))<<( 10 - 6 );
-    out++;
-    *out = ( (*in) >>  6  )   % (1U << 10 ) ;
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 10 ) ;
-    out++;
-    *out = ( (*in) >>  26  )   % (1U << 10 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 10 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 10 ) ;
-    out++;
-    *out = ( (*in) >>  14  )   % (1U << 10 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 10 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 2 ))<<( 10 - 2 );
-    out++;
-    *out = ( (*in) >>  2  )   % (1U << 10 ) ;
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 10 ) ;
-    out++;
-    *out = ( (*in) >>  22  )   % (1U << 10 ) ;
-    out++;
-
-  }
-  return;
-}
 #endif
 
 
@@ -3440,7 +3641,117 @@ unpack_11 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
+static void
+unpack_12 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 12 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 12 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 12 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 12 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 12 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 12 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 12 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 12 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 12 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 12 ) ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 12 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 12 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 12 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 12 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 12 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 12 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 12 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 12 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 12 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 12 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#elif !defined(HAVE_SSE2)
+static void
+unpack_12 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( (*in) >>  0  )   % (1U << 12 ) ;
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 12 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 12 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 12 ) ;
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 12 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 12 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 12 ) ;
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 12 ) ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   % (1U << 12 ) ;
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 12 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 12 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 12 ) ;
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 12 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 12 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 12 ) ;
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 12 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#else
 static void
 unpack_12_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     __m128i InReg = _mm_load_si128(in);
@@ -3989,72 +4300,18 @@ unpack_12_rev_8 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     return;
 }
 
-#else
+#endif
+
+
+#ifdef ALLOW_ODD_PACKSIZES
 static void
-unpack_12 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
+unpack_13 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i InReg = _mm_load_si128(in);
+    __m128i OutReg;
+    const __m128i mask13 =  _mm_set1_epi32(8191U);
 
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 12 ) ;
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 12 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 12 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 12 ) ;
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 12 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 12 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 12 ) ;
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 12 ) ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   % (1U << 12 ) ;
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 12 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 12 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 12 ) ;
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 12 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 12 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 12 ) ;
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 12 ) ;
-    out++;
-  }
-
-  return;
-}
-#endif
-
-
-#ifdef ALLOW_ODD_PACKSIZES
-static void
-unpack_13 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i InReg = _mm_load_si128(in);
-    __m128i OutReg;
-    const __m128i mask13 =  _mm_set1_epi32(8191U);
-
-    OutReg = _mm_and_si128( InReg , mask13);
-    _mm_store_si128(out++, OutReg);
+    OutReg = _mm_and_si128( InReg , mask13);
+    _mm_store_si128(out++, OutReg);
 
     OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,13) , mask13);
     /* total = _mm_add_epi32(total, OutReg); */
@@ -4163,7 +4420,123 @@ unpack_13 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
+static void
+unpack_14 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 14 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  14  )   % (1U << 14 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 10 ))<<( 14 - 10 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  10  )   % (1U << 14 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 6 ))<<( 14 - 6 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  6  )   % (1U << 14 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 2 ))<<( 14 - 2 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  2  )   % (1U << 14 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 14 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  30  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 12 ))<<( 14 - 12 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 14 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  26  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 14 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 14 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  22  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 14 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 14 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  18  )   % (1U << 14 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#elif !defined(HAVE_SSE2)
+static void
+unpack_14 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( (*in) >>  0  )   % (1U << 14 ) ;
+    out++;
+    *out = ( (*in) >>  14  )   % (1U << 14 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 );
+    out++;
+    *out = ( (*in) >>  10  )   % (1U << 14 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 );
+    out++;
+    *out = ( (*in) >>  6  )   % (1U << 14 ) ;
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 2 ))<<( 14 - 2 );
+    out++;
+    *out = ( (*in) >>  2  )   % (1U << 14 ) ;
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 14 ) ;
+    out++;
+    *out = ( (*in) >>  30  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 12 ))<<( 14 - 12 );
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 14 ) ;
+    out++;
+    *out = ( (*in) >>  26  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 14 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 14 ) ;
+    out++;
+    *out = ( (*in) >>  22  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 14 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 14 ) ;
+    out++;
+    *out = ( (*in) >>  18  )   % (1U << 14 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#else
 static void
 unpack_14_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     __m128i InReg = _mm_load_si128(in);
@@ -4747,63 +5120,6 @@ unpack_14_rev_8 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     return;
 }
 
-#else
-static void
-unpack_14 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 14 ) ;
-    out++;
-    *out = ( (*in) >>  14  )   % (1U << 14 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 14 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 );
-    out++;
-    *out = ( (*in) >>  10  )   % (1U << 14 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 14 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 );
-    out++;
-    *out = ( (*in) >>  6  )   % (1U << 14 ) ;
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 14 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 2 ))<<( 14 - 2 );
-    out++;
-    *out = ( (*in) >>  2  )   % (1U << 14 ) ;
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 14 ) ;
-    out++;
-    *out = ( (*in) >>  30  )   % (1U << 14 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 12 ))<<( 14 - 12 );
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 14 ) ;
-    out++;
-    *out = ( (*in) >>  26  )   % (1U << 14 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 14 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 14 ) ;
-    out++;
-    *out = ( (*in) >>  22  )   % (1U << 14 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 14 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 14 ) ;
-    out++;
-    *out = ( (*in) >>  18  )   % (1U << 14 ) ;
-    out++;
-  }
-
-  return;
-}
 #endif
 
 
@@ -4932,7 +5248,49 @@ unpack_15 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
+static void
+unpack_16 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  UINT4 outer, inwordpointer;
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    for (outer = 0; outer < 8; outer++) {
+      for(inwordpointer =  0; inwordpointer <32; inwordpointer += 16) {
+	*(out++) = ( Bigendian_convert_uint(*in) >> inwordpointer )   % (1U << 16 ) ;
+      }
+      in += 4;
+    }
+  }
+
+  return;
+}
+
+#elif !defined(HAVE_SSE2)
+static void
+unpack_16 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  UINT4 outer, inwordpointer;
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    for (outer = 0; outer < 8; outer++) {
+      for(inwordpointer =  0; inwordpointer <32; inwordpointer += 16) {
+	*(out++) = ( (*in) >> inwordpointer )   % (1U << 16 ) ;
+      }
+      in += 4;
+    }
+  }
+
+  return;
+}
+
+#else
 static void
 unpack_16_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     __m128i InReg = _mm_load_si128(in);
@@ -5437,27 +5795,7 @@ unpack_16_rev_8 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     return;
 }
 
-#else
-static void
-unpack_16 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  UINT4 outer, inwordpointer;
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    for (outer = 0; outer < 8; outer++) {
-      for(inwordpointer =  0; inwordpointer <32; inwordpointer += 16) {
-	*(out++) = ( (*in) >> inwordpointer )   % (1U << 16 ) ;
-      }
-      in += 4;
-    }
-  }
-
-  return;
-}
-#endif
+#endif
 
 
 #ifdef ALLOW_ODD_PACKSIZES
@@ -5591,7 +5929,131 @@ unpack_17 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
+static void
+unpack_18 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 18 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  18  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 18 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 18 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  22  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 18 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 18 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  26  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 12 ))<<( 18 - 12 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 18 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  30  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 18 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 2 ))<<( 18 - 2 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  2  )   % (1U << 18 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 6 ))<<( 18 - 6 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  6  )   % (1U << 18 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 10 ))<<( 18 - 10 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  10  )   % (1U << 18 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 14 ))<<( 18 - 14 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  14  )   % (1U << 18 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#elif !defined(HAVE_SSE2)
+static void
+unpack_18 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( (*in) >>  0  )   % (1U << 18 ) ;
+    out++;
+    *out = ( (*in) >>  18  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 18 ) ;
+    out++;
+    *out = ( (*in) >>  22  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 18 ) ;
+    out++;
+    *out = ( (*in) >>  26  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 );
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 18 ) ;
+    out++;
+    *out = ( (*in) >>  30  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 );
+    out++;
+    *out = ( (*in) >>  2  )   % (1U << 18 ) ;
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 );
+    out++;
+    *out = ( (*in) >>  6  )   % (1U << 18 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 );
+    out++;
+    *out = ( (*in) >>  10  )   % (1U << 18 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 );
+    out++;
+    *out = ( (*in) >>  14  )   % (1U << 18 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#else
 static void
 unpack_18_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     __m128i InReg = _mm_load_si128(in);
@@ -6212,67 +6674,6 @@ unpack_18_rev_8 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     return;
 }
 
-#else
-static void
-unpack_18 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 18 ) ;
-    out++;
-    *out = ( (*in) >>  18  )   % (1U << 18 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 18 ) ;
-    out++;
-    *out = ( (*in) >>  22  )   % (1U << 18 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 18 ) ;
-    out++;
-    *out = ( (*in) >>  26  )   % (1U << 18 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 );
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 18 ) ;
-    out++;
-    *out = ( (*in) >>  30  )   % (1U << 18 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 18 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 );
-    out++;
-    *out = ( (*in) >>  2  )   % (1U << 18 ) ;
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 18 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 );
-    out++;
-    *out = ( (*in) >>  6  )   % (1U << 18 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 18 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 );
-    out++;
-    *out = ( (*in) >>  10  )   % (1U << 18 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 18 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 );
-    out++;
-    *out = ( (*in) >>  14  )   % (1U << 18 ) ;
-    out++;
-  }
-
-  return;
-}
 #endif
 
 
@@ -6414,58 +6815,184 @@ unpack_19 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
 static void
-unpack_20_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i InReg = _mm_load_si128(in);
-    __m128i OutReg;
-    const __m128i mask20 =  _mm_set1_epi32(1048575U);
-
-    OutReg = _mm_and_si128( InReg , mask20);
-    _mm_store_si128(out++, OutReg);
-
-    OutReg =   _mm_srli_epi32(InReg,20) ;
-    InReg = _mm_load_si128(++in);
-
-#ifdef MULTIMASK
-    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask8), 20-8));
-#else
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask20));
-#endif
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,8) , mask20);
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+unpack_20 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    OutReg =   _mm_srli_epi32(InReg,28) ;
-    InReg = _mm_load_si128(++in);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
 
-#ifdef MULTIMASK
-    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask16), 20-16));
-#else
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask20));
-#endif
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 20 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 20 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 20 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 20 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 20 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 20 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 12 ))<<( 20 - 12 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 20 ) ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 20 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 20 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 20 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 20 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 20 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 20 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 12 ))<<( 20 - 12 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 20 ) ;
+    out++;
+  }
 
-    OutReg =   _mm_srli_epi32(InReg,16) ;
-    InReg = _mm_load_si128(++in);
+  return;
+}
 
-#ifdef MULTIMASK
-    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask4), 20-4));
-#else
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask20));
-#endif
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+#elif !defined(HAVE_SSE2)
+static void
+unpack_20 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,4) , mask20);
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
 
-    OutReg =   _mm_srli_epi32(InReg,24) ;
+    *out = ( (*in) >>  0  )   % (1U << 20 ) ;
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 20 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 20 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 );
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 20 ) ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   % (1U << 20 ) ;
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 20 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 20 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 );
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 20 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#else
+static void
+unpack_20_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i InReg = _mm_load_si128(in);
+    __m128i OutReg;
+    const __m128i mask20 =  _mm_set1_epi32(1048575U);
+
+    OutReg = _mm_and_si128( InReg , mask20);
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,20) ;
+    InReg = _mm_load_si128(++in);
+
+#ifdef MULTIMASK
+    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask8), 20-8));
+#else
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask20));
+#endif
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,8) , mask20);
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,28) ;
+    InReg = _mm_load_si128(++in);
+
+#ifdef MULTIMASK
+    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask16), 20-16));
+#else
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask20));
+#endif
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,16) ;
+    InReg = _mm_load_si128(++in);
+
+#ifdef MULTIMASK
+    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask4), 20-4));
+#else
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask20));
+#endif
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,4) , mask20);
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,24) ;
     InReg = _mm_load_si128(++in);
 
 #ifdef MULTIMASK
@@ -7037,68 +7564,6 @@ unpack_20_rev_8 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 }
 
 
-#else
-static void
-unpack_20 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 20 ) ;
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 20 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 20 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 20 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 20 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 20 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 20 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 );
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 20 ) ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   % (1U << 20 ) ;
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 20 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 20 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 20 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 20 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 20 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 20 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 );
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 20 ) ;
-    out++;
-  }
-
-  return;
-}
 #endif
 
 
@@ -7247,61 +7712,193 @@ unpack_21 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
 static void
-unpack_22_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i InReg = _mm_load_si128(in);
-    __m128i OutReg;
-    const __m128i mask22 =  _mm_set1_epi32(4194303U);
-
-    OutReg = _mm_and_si128( InReg , mask22);
-    _mm_store_si128(out++, OutReg);
-
-    OutReg =   _mm_srli_epi32(InReg,22) ;
-    InReg = _mm_load_si128(++in);
-
-#ifdef MULTIMASK
-    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask12), 22-12));
-#else
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask22));
-#endif
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg =   _mm_srli_epi32(InReg,12) ;
-    InReg = _mm_load_si128(++in);
+unpack_22 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-#ifdef MULTIMASK
-    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask2), 22-2));
-#else
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask22));
-#endif
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
 
-    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,2) , mask22);
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 22 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  22  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 12 ))<<( 22 - 12 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 2 ))<<( 22 - 2 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  2  )   % (1U << 22 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 14 ))<<( 22 - 14 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  14  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 22 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 22 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  26  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 22 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 6 ))<<( 22 - 6 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  6  )   % (1U << 22 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 18 ))<<( 22 - 18 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  18  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 22 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 22 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  30  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 20 ))<<( 22 - 20 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 10 ))<<( 22 - 10 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  10  )   % (1U << 22 ) ;
+    out++;
+  }
 
-    OutReg =   _mm_srli_epi32(InReg,24) ;
-    InReg = _mm_load_si128(++in);
+  return;
+}
 
-#ifdef MULTIMASK
-    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask14), 22-14));
-#else
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask22));
-#endif
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+#elif !defined(HAVE_SSE2)
+static void
+unpack_22 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    OutReg =   _mm_srli_epi32(InReg,14) ;
-    InReg = _mm_load_si128(++in);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
 
-#ifdef MULTIMASK
-    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask4), 22-4));
-#else
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask22));
-#endif
+    *out = ( (*in) >>  0  )   % (1U << 22 ) ;
+    out++;
+    *out = ( (*in) >>  22  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 );
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 );
+    out++;
+    *out = ( (*in) >>  2  )   % (1U << 22 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 );
+    out++;
+    *out = ( (*in) >>  14  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 22 ) ;
+    out++;
+    *out = ( (*in) >>  26  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 );
+    out++;
+    *out = ( (*in) >>  6  )   % (1U << 22 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 );
+    out++;
+    *out = ( (*in) >>  18  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 22 ) ;
+    out++;
+    *out = ( (*in) >>  30  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 20 ))<<( 22 - 20 );
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 );
+    out++;
+    *out = ( (*in) >>  10  )   % (1U << 22 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#else
+static void
+unpack_22_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i InReg = _mm_load_si128(in);
+    __m128i OutReg;
+    const __m128i mask22 =  _mm_set1_epi32(4194303U);
+
+    OutReg = _mm_and_si128( InReg , mask22);
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,22) ;
+    InReg = _mm_load_si128(++in);
+
+#ifdef MULTIMASK
+    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask12), 22-12));
+#else
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask22));
+#endif
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,12) ;
+    InReg = _mm_load_si128(++in);
+
+#ifdef MULTIMASK
+    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask2), 22-2));
+#else
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask22));
+#endif
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,2) , mask22);
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,24) ;
+    InReg = _mm_load_si128(++in);
+
+#ifdef MULTIMASK
+    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask14), 22-14));
+#else
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask22));
+#endif
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,14) ;
+    InReg = _mm_load_si128(++in);
+
+#ifdef MULTIMASK
+    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask4), 22-4));
+#else
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask22));
+#endif
     /* total = _mm_add_epi32(total, OutReg); */
     _mm_store_si128(out++, OutReg);
 
@@ -7903,71 +8500,6 @@ unpack_22_rev_8 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     return;
 }
 
-#else
-static void
-unpack_22 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 22 ) ;
-    out++;
-    *out = ( (*in) >>  22  )   % (1U << 22 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 );
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 22 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 );
-    out++;
-    *out = ( (*in) >>  2  )   % (1U << 22 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 22 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 );
-    out++;
-    *out = ( (*in) >>  14  )   % (1U << 22 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 22 ) ;
-    out++;
-    *out = ( (*in) >>  26  )   % (1U << 22 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 22 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 );
-    out++;
-    *out = ( (*in) >>  6  )   % (1U << 22 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 22 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 );
-    out++;
-    *out = ( (*in) >>  18  )   % (1U << 22 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 22 ) ;
-    out++;
-    *out = ( (*in) >>  30  )   % (1U << 22 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 20 ))<<( 22 - 20 );
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 22 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 );
-    out++;
-    *out = ( (*in) >>  10  )   % (1U << 22 ) ;
-    out++;
-  }
-
-  return;
-}
 #endif
 
 
@@ -8123,50 +8655,180 @@ unpack_23 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
 static void
-unpack_24_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i InReg = _mm_load_si128(in);
-    __m128i OutReg;
-    const __m128i mask24 =  _mm_set1_epi32(16777215U);
-
-    OutReg = _mm_and_si128( InReg , mask24);
-    _mm_store_si128(out++, OutReg);
-
-    OutReg =   _mm_srli_epi32(InReg,24) ;
-    InReg = _mm_load_si128(++in);
-
-#ifdef MULTIMASK
-    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask16), 24-16));
-#else
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask24));
-#endif
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg =   _mm_srli_epi32(InReg,16) ;
-    InReg = _mm_load_si128(++in);
+unpack_24 (UINT4* __restrict__ out, const UINT4* __restrict__ in) { /* scalar unpack of 24-bit values: 4 columns x 16 values = 64 UINT4s stored to out */
+  unsigned int column;
+  const UINT4 *bitpack = in; /* keep the base pointer; in is re-aimed at each column below */
 
-#ifdef MULTIMASK
-    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask8), 24-8));
-#else
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask24));
-#endif
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+  for (column = 0; column < 4; column++) { /* column c reads words c, c+4, ..., c+44 (12 words) */
+    in = &(bitpack[column]);
 
-    OutReg =   _mm_srli_epi32(InReg,8) ;
-    InReg = _mm_load_si128(++in);
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 24 ) ; /* NOTE(review): Bigendian_convert_uint is defined outside this view; presumably a byte swap to host order - confirm */
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 24 ) ; /* top 8 bits of this word become the value's low 8 bits */
+    in += 4; /* next word of this column (stride 4) */
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 24 - 16 ); /* low 16 bits of the next word supply the value's upper 16 bits */
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 24 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 24 ) ;
+    in += 4; /* word fully consumed; the next value starts at bit 0 of a fresh word */
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 24 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 24 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 24 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 24 ) ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 24 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 24 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 24 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 24 ) ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 24 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 24 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 24 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 24 ) ; /* 16th value of the column; no further word is read */
+    out++;
+  }
 
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+  return;
+}
 
-    OutReg = _mm_and_si128( InReg , mask24);
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+#elif !defined(HAVE_SSE2)
+static void
+unpack_24 (UINT4* __restrict__ out, const UINT4* __restrict__ in) { /* scalar unpack of 24-bit values: 4 columns x 16 values = 64 UINT4s stored to out */
+  unsigned int column;
+  const UINT4 *bitpack = in; /* keep the base pointer; in is re-aimed at each column below */
 
-    OutReg =   _mm_srli_epi32(InReg,24) ;
-    InReg = _mm_load_si128(++in);
+  for (column = 0; column < 4; column++) { /* column c reads words c, c+4, ..., c+44 (12 words) */
+    in = &(bitpack[column]);
+
+    *out = ( (*in) >>  0  )   % (1U << 24 ) ; /* % (1U << 24) keeps the low 24 bits */
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 24 ) ; /* top 8 bits of this word become the value's low 8 bits */
+    in += 4; /* next word of this column (stride 4) */
+    *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); /* low 16 bits of the next word supply the value's upper 16 bits */
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 24 ) ;
+    in += 4; /* word fully consumed; the next value starts at bit 0 of a fresh word */
+    out++;
+    *out = ( (*in) >>  0  )   % (1U << 24 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 24 ) ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   % (1U << 24 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 24 ) ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   % (1U << 24 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 24 ) ; /* 16th value of the column; no further word is read */
+    out++;
+  }
+
+  return;
+}
+
+#else
+static void
+unpack_24_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i InReg = _mm_load_si128(in);
+    __m128i OutReg;
+    const __m128i mask24 =  _mm_set1_epi32(16777215U);
+
+    OutReg = _mm_and_si128( InReg , mask24);
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,24) ;
+    InReg = _mm_load_si128(++in);
+
+#ifdef MULTIMASK
+    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask16), 24-16));
+#else
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask24));
+#endif
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,16) ;
+    InReg = _mm_load_si128(++in);
+
+#ifdef MULTIMASK
+    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask8), 24-8));
+#else
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask24));
+#endif
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,8) ;
+    InReg = _mm_load_si128(++in);
+
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128( InReg , mask24);
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,24) ;
+    InReg = _mm_load_si128(++in);
 
 #ifdef MULTIMASK
     OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask16), 24-16));
@@ -8753,70 +9415,6 @@ unpack_24_rev_8 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 }
 
 
-#else
-static void
-unpack_24 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 24 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 24 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 24 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 24 ) ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   % (1U << 24 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 24 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 24 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 24 ) ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   % (1U << 24 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 24 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 24 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 24 ) ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   % (1U << 24 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 24 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 24 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 24 ) ;
-    out++;
-  }
-
-  return;
-}
 #endif
 
 
@@ -8979,59 +9577,199 @@ unpack_25 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
 static void
-unpack_26_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i InReg = _mm_load_si128(in);
-    __m128i OutReg;
-    const __m128i mask26 =  _mm_set1_epi32(67108863U);
-
-    OutReg = _mm_and_si128( InReg , mask26);
-    _mm_store_si128(out++, OutReg);
-
-    OutReg =   _mm_srli_epi32(InReg,26) ;
-    InReg = _mm_load_si128(++in);
-
-#ifdef MULTIMASK
-    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask20), 26-20));
-#else
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask26));
-#endif
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg =   _mm_srli_epi32(InReg,20) ;
-    InReg = _mm_load_si128(++in);
-
-#ifdef MULTIMASK
-    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask14), 26-14));
-#else
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask26));
-#endif
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+unpack_26 (UINT4* __restrict__ out, const UINT4* __restrict__ in) { /* scalar unpack of 26-bit values: 4 columns x 16 values = 64 UINT4s stored to out */
+  unsigned int column;
+  const UINT4 *bitpack = in; /* keep the base pointer; in is re-aimed at each column below */
 
-    OutReg =   _mm_srli_epi32(InReg,14) ;
-    InReg = _mm_load_si128(++in);
+  for (column = 0; column < 4; column++) { /* column c reads words c, c+4, ..., c+48 (13 words) */
+    in = &(bitpack[column]);
 
-#ifdef MULTIMASK
-    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask8), 26-8));
-#else
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask26));
-#endif
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 26 ) ; /* NOTE(review): Bigendian_convert_uint is defined outside this view; presumably a byte swap to host order - confirm */
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  26  )   % (1U << 26 ) ; /* top 6 bits of this word become the value's low 6 bits */
+    in += 4; /* next word of this column (stride 4) */
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 20 ))<<( 26 - 20 ); /* low 20 bits of the next word supply the value's upper 20 bits */
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 14 ))<<( 26 - 14 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  14  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 26 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 2 ))<<( 26 - 2 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  2  )   % (1U << 26 ) ; /* value lies wholly inside this word (bits 2..27), so no word advance */
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 22 ))<<( 26 - 22 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  22  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 26 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 10 ))<<( 26 - 10 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  10  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 26 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 26 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  30  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 24 ))<<( 26 - 24 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 18 ))<<( 26 - 18 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  18  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 12 ))<<( 26 - 12 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 6 ))<<( 26 - 6 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  6  )   % (1U << 26 ) ; /* 16th value of the column; no further word is read */
+    out++;
+  }
+  
+  return;
+}
 
-    OutReg =   _mm_srli_epi32(InReg,8) ;
-    InReg = _mm_load_si128(++in);
+#elif !defined(HAVE_SSE2)
+static void
+unpack_26 (UINT4* __restrict__ out, const UINT4* __restrict__ in) { /* scalar unpack of 26-bit values: 4 columns x 16 values = 64 UINT4s stored to out */
+  unsigned int column;
+  const UINT4 *bitpack = in; /* keep the base pointer; in is re-aimed at each column below */
 
-#ifdef MULTIMASK
-    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask2), 26-2));
-#else
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask26));
-#endif
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+  for (column = 0; column < 4; column++) { /* column c reads words c, c+4, ..., c+48 (13 words) */
+    in = &(bitpack[column]);
+
+    *out = ( (*in) >>  0  )   % (1U << 26 ) ; /* % (1U << 26) keeps the low 26 bits */
+    out++;
+    *out = ( (*in) >>  26  )   % (1U << 26 ) ; /* top 6 bits of this word become the value's low 6 bits */
+    in += 4; /* next word of this column (stride 4) */
+    *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); /* low 20 bits of the next word supply the value's upper 20 bits */
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 );
+    out++;
+    *out = ( (*in) >>  14  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 );
+    out++;
+    *out = ( (*in) >>  2  )   % (1U << 26 ) ; /* value lies wholly inside this word (bits 2..27), so no word advance */
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 );
+    out++;
+    *out = ( (*in) >>  22  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 );
+    out++;
+    *out = ( (*in) >>  10  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 26 ) ;
+    out++;
+    *out = ( (*in) >>  30  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 );
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 );
+    out++;
+    *out = ( (*in) >>  18  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 );
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 );
+    out++;
+    *out = ( (*in) >>  6  )   % (1U << 26 ) ; /* 16th value of the column; no further word is read */
+    out++;
+  }
+  
+  return;
+}
+
+#else
+static void
+unpack_26_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i InReg = _mm_load_si128(in);
+    __m128i OutReg;
+    const __m128i mask26 =  _mm_set1_epi32(67108863U);
+
+    OutReg = _mm_and_si128( InReg , mask26);
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,26) ;
+    InReg = _mm_load_si128(++in);
+
+#ifdef MULTIMASK
+    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask20), 26-20));
+#else
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask26));
+#endif
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,20) ;
+    InReg = _mm_load_si128(++in);
+
+#ifdef MULTIMASK
+    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask14), 26-14));
+#else
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask26));
+#endif
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,14) ;
+    InReg = _mm_load_si128(++in);
+
+#ifdef MULTIMASK
+    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask8), 26-8));
+#else
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask26));
+#endif
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,8) ;
+    InReg = _mm_load_si128(++in);
+
+#ifdef MULTIMASK
+    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask2), 26-2));
+#else
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask26));
+#endif
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
 
     OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,2) , mask26);
     /* total = _mm_add_epi32(total, OutReg); */
@@ -9670,75 +10408,6 @@ unpack_26_rev_8 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     return;
 }
 
-#else
-static void
-unpack_26 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 26 ) ;
-    out++;
-    *out = ( (*in) >>  26  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 );
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 );
-    out++;
-    *out = ( (*in) >>  14  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 );
-    out++;
-    *out = ( (*in) >>  2  )   % (1U << 26 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 );
-    out++;
-    *out = ( (*in) >>  22  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 );
-    out++;
-    *out = ( (*in) >>  10  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 26 ) ;
-    out++;
-    *out = ( (*in) >>  30  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 );
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 );
-    out++;
-    *out = ( (*in) >>  18  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 );
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 );
-    out++;
-    *out = ( (*in) >>  6  )   % (1U << 26 ) ;
-    out++;
-  }
-  
-  return;
-}
 #endif
 
 
@@ -9908,42 +10577,184 @@ unpack_27 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
 static void
-unpack_28_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i InReg = _mm_load_si128(in);
-    __m128i OutReg;
-    const __m128i mask28 =  _mm_set1_epi32(268435455U);
-
-    OutReg = _mm_and_si128( InReg , mask28);
-    _mm_store_si128(out++, OutReg);
+unpack_28 (UINT4* __restrict__ out, const UINT4* __restrict__ in) { /* scalar unpack of 28-bit values: 4 columns x 16 values = 64 UINT4s stored to out */
+  unsigned int column;
+  const UINT4 *bitpack = in; /* keep the base pointer; in is re-aimed at each column below */
 
-    OutReg =   _mm_srli_epi32(InReg,28) ;
-    InReg = _mm_load_si128(++in);
+  for (column = 0; column < 4; column++) { /* column c reads words c, c+4, ..., c+52 (14 words) */
+    in = &(bitpack[column]);
 
-#ifdef MULTIMASK
-    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask24), 28-24));
-#else
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask28));
-#endif
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 28 ) ; /* NOTE(review): Bigendian_convert_uint is defined outside this view; presumably a byte swap to host order - confirm */
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 28 ) ; /* top 4 bits of this word become the value's low 4 bits */
+    in += 4; /* next word of this column (stride 4) */
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 24 ))<<( 28 - 24 ); /* low 24 bits of the next word supply the value's upper 24 bits */
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 20 ))<<( 28 - 20 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 28 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 12 ))<<( 28 - 12 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 28 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 28 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 28 ) ;
+    in += 4; /* word fully consumed; the next value starts at bit 0 of a fresh word */
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 28 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 24 ))<<( 28 - 24 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 20 ))<<( 28 - 20 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 28 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 12 ))<<( 28 - 12 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 28 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 28 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 28 ) ; /* 16th value of the column; no further word is read */
+    out++;
+  }
 
-    OutReg =   _mm_srli_epi32(InReg,24) ;
-    InReg = _mm_load_si128(++in);
+  return;
+}
 
-#ifdef MULTIMASK
-    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask20), 28-20));
-#else
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask28));
-#endif
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+#elif !defined(HAVE_SSE2)
+static void
+unpack_28 (UINT4* __restrict__ out, const UINT4* __restrict__ in) { /* scalar unpack of 28-bit values: 4 columns x 16 values = 64 UINT4s stored to out */
+  unsigned int column;
+  const UINT4 *bitpack = in; /* keep the base pointer; in is re-aimed at each column below */
 
-    OutReg =   _mm_srli_epi32(InReg,20) ;
-    InReg = _mm_load_si128(++in);
+  for (column = 0; column < 4; column++) { /* column c reads words c, c+4, ..., c+52 (14 words) */
+    in = &(bitpack[column]);
 
-#ifdef MULTIMASK
+    *out = ( (*in) >>  0  )   % (1U << 28 ) ; /* % (1U << 28) keeps the low 28 bits */
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 28 ) ; /* top 4 bits of this word become the value's low 4 bits */
+    in += 4; /* next word of this column (stride 4) */
+    *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); /* low 24 bits of the next word supply the value's upper 24 bits */
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 );
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 );
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 28 ) ;
+    in += 4; /* word fully consumed; the next value starts at bit 0 of a fresh word */
+    out++;
+    *out = ( (*in) >>  0  )   % (1U << 28 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 );
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 );
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 );
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 28 ) ; /* 16th value of the column; no further word is read */
+    out++;
+  }
+
+  return;
+}
+
+#else
+static void
+unpack_28_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i InReg = _mm_load_si128(in);
+    __m128i OutReg;
+    const __m128i mask28 =  _mm_set1_epi32(268435455U);
+
+    OutReg = _mm_and_si128( InReg , mask28);
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,28) ;
+    InReg = _mm_load_si128(++in);
+
+#ifdef MULTIMASK
+    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask24), 28-24));
+#else
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask28));
+#endif
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,24) ;
+    InReg = _mm_load_si128(++in);
+
+#ifdef MULTIMASK
+    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask20), 28-20));
+#else
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask28));
+#endif
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,20) ;
+    InReg = _mm_load_si128(++in);
+
+#ifdef MULTIMASK
     OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask16), 28-16));
 #else
     OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask28));
@@ -10599,76 +11410,6 @@ unpack_28_rev_8 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     return;
 }
 
-#else
-static void
-unpack_28 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 28 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 );
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 );
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 );
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 28 ) ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   % (1U << 28 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 );
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 );
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 );
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 28 ) ;
-    out++;
-  }
-
-  return;
-}
 #endif
 
 
@@ -10845,57 +11586,205 @@ unpack_29 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
 static void
-unpack_30_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i InReg = _mm_load_si128(in);
-    __m128i OutReg;
-    const __m128i mask30 =  _mm_set1_epi32(1073741823U);
-
-    OutReg = _mm_and_si128( InReg , mask30);
-    _mm_store_si128(out++, OutReg);
-
-    OutReg =   _mm_srli_epi32(InReg,30) ;
-    InReg = _mm_load_si128(++in);
-
-#ifdef MULTIMASK
-    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask28), 30-28));
-#else
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask30));
-#endif
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+unpack_30 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    OutReg =   _mm_srli_epi32(InReg,28) ;
-    InReg = _mm_load_si128(++in);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
 
-#ifdef MULTIMASK
-    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask26), 30-26));
-#else
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask30));
-#endif
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 30 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  30  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 28 ))<<( 30 - 28 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 26 ))<<( 30 - 26 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  26  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 24 ))<<( 30 - 24 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 22 ))<<( 30 - 22 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  22  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 20 ))<<( 30 - 20 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 18 ))<<( 30 - 18 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  18  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 30 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 14 ))<<( 30 - 14 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  14  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 12 ))<<( 30 - 12 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 10 ))<<( 30 - 10 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  10  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 30 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 6 ))<<( 30 - 6 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  6  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 30 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 2 ))<<( 30 - 2 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  2  )   % (1U << 30 ) ;
+    out++;
+  }
 
-    OutReg =   _mm_srli_epi32(InReg,26) ;
-    InReg = _mm_load_si128(++in);
+  return;
+}
 
-#ifdef MULTIMASK
-    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask24), 30-24));
-#else
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask30));
-#endif
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+#elif !defined(HAVE_SSE2)
+static void
+unpack_30 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    OutReg =   _mm_srli_epi32(InReg,24) ;
-    InReg = _mm_load_si128(++in);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
 
-#ifdef MULTIMASK
-    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask22), 30-22));
-#else
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask30));
-#endif
+    *out = ( (*in) >>  0  )   % (1U << 30 ) ;
+    out++;
+    *out = ( (*in) >>  30  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 );
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 );
+    out++;
+    *out = ( (*in) >>  26  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 );
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 );
+    out++;
+    *out = ( (*in) >>  22  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 );
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 );
+    out++;
+    *out = ( (*in) >>  18  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 );
+    out++;
+    *out = ( (*in) >>  14  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 );
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 );
+    out++;
+    *out = ( (*in) >>  10  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 );
+    out++;
+    *out = ( (*in) >>  6  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 );
+    out++;
+    *out = ( (*in) >>  2  )   % (1U << 30 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#else
+static void
+unpack_30_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i InReg = _mm_load_si128(in);
+    __m128i OutReg;
+    const __m128i mask30 =  _mm_set1_epi32(1073741823U);
+
+    OutReg = _mm_and_si128( InReg , mask30);
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,30) ;
+    InReg = _mm_load_si128(++in);
+
+#ifdef MULTIMASK
+    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask28), 30-28));
+#else
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask30));
+#endif
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,28) ;
+    InReg = _mm_load_si128(++in);
+
+#ifdef MULTIMASK
+    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask26), 30-26));
+#else
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask30));
+#endif
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,26) ;
+    InReg = _mm_load_si128(++in);
+
+#ifdef MULTIMASK
+    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask24), 30-24));
+#else
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask30));
+#endif
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,24) ;
+    InReg = _mm_load_si128(++in);
+
+#ifdef MULTIMASK
+    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(_mm_and_si128(InReg, mask22), 30-22));
+#else
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask30));
+#endif
     /* total = _mm_add_epi32(total, OutReg); */
     _mm_store_si128(out++, OutReg);
 
@@ -11572,79 +12461,6 @@ unpack_30_rev_8 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 }
 
 
-#else
-static void
-unpack_30 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 30 ) ;
-    out++;
-    *out = ( (*in) >>  30  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 );
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 );
-    out++;
-    *out = ( (*in) >>  26  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 );
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 );
-    out++;
-    *out = ( (*in) >>  22  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 );
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 );
-    out++;
-    *out = ( (*in) >>  18  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 );
-    out++;
-    *out = ( (*in) >>  14  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 );
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 );
-    out++;
-    *out = ( (*in) >>  10  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 );
-    out++;
-    *out = ( (*in) >>  6  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 );
-    out++;
-    *out = ( (*in) >>  2  )   % (1U << 30 ) ;
-    out++;
-  }
-
-  return;
-}
 #endif
 
 
@@ -11829,35 +12645,157 @@ unpack_31 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
 
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
 static void
-unpack_32_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i OutReg;
-
-    OutReg = _mm_load_si128(in++);
-    _mm_store_si128(out++, OutReg);
+unpack_32 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    OutReg = _mm_load_si128(in++);
-    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
-    _mm_store_si128(out++, OutReg);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
 
-    OutReg = _mm_load_si128(in++);
-    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
-    _mm_store_si128(out++, OutReg);
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    out++;
+  }
 
-    OutReg = _mm_load_si128(in++);
-    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
-    _mm_store_si128(out++, OutReg);
+  return;
+}
 
-    OutReg = _mm_load_si128(in++);
-    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
-    _mm_store_si128(out++, OutReg);
+#elif !defined(HAVE_SSE2)
+static void
+unpack_32 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    OutReg = _mm_load_si128(in++);
-    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
-    _mm_store_si128(out++, OutReg);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
 
-    OutReg = _mm_load_si128(in++);
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    out++;
+  }
+
+  return;
+}
+
+#else
+static void
+unpack_32_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i OutReg;
+
+    OutReg = _mm_load_si128(in++);
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_load_si128(in++);
+    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_load_si128(in++);
+    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_load_si128(in++);
+    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_load_si128(in++);
+    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_load_si128(in++);
+    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_load_si128(in++);
     /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
     _mm_store_si128(out++, OutReg);
 
@@ -12234,66 +13172,6 @@ unpack_32_rev_8 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
 
 
-#else
-static void
-unpack_32 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    out++;
-  }
-
-  return;
-}
 #endif
 
 
@@ -12392,9 +13270,10 @@ unpack_32 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+
 static void
-vertical_order_fwd (UINT4 *vertical, UINT4 *columnar) {
+vertical_order (UINT4 *vertical, UINT4 *columnar) {
 
   vertical[0] = columnar[0];		/* remainder 1 */
   vertical[4] = columnar[1];		/* remainder 5 */
@@ -12432,54 +13311,47 @@ vertical_order_fwd (UINT4 *vertical, UINT4 *columnar) {
   vertical[27] = columnar[30];		/* remainder 28 */
   vertical[31] = columnar[31];		/* remainder 32 */
 
-  return;
-}
-
-static void
-vertical_order_rev (UINT4 *vertical, UINT4 *columnar) {
-
-  vertical[0] = columnar[0];		/* remainder 63 */
-  vertical[4] = columnar[1];		/* remainder 59 */
-  vertical[8] = columnar[2];		/* remainder 55 */
-  vertical[12] = columnar[3];		/* remainder 51 */
-  vertical[16] = columnar[4];		/* remainder 47 */
-  vertical[20] = columnar[5];		/* remainder 43 */
-  vertical[24] = columnar[6];		/* remainder 39 */
-  vertical[28] = columnar[7];		/* remainder 35 */
+  vertical[32] = columnar[32];		/* remainder 63 */
+  vertical[36] = columnar[33];		/* remainder 59 */
+  vertical[40] = columnar[34];		/* remainder 55 */
+  vertical[44] = columnar[35];		/* remainder 51 */
+  vertical[48] = columnar[36];		/* remainder 47 */
+  vertical[52] = columnar[37];		/* remainder 43 */
+  vertical[56] = columnar[38];		/* remainder 39 */
+  vertical[60] = columnar[39];		/* remainder 35 */
 
-  vertical[1] = columnar[8];		/* remainder 62 */
-  vertical[5] = columnar[9];		/* remainder 58 */
-  vertical[9] = columnar[10];		/* remainder 54 */
-  vertical[13] = columnar[11];		/* remainder 50 */
-  vertical[17] = columnar[12];		/* remainder 46 */
-  vertical[21] = columnar[13];		/* remainder 42 */
-  vertical[25] = columnar[14];		/* remainder 38 */
-  vertical[29] = columnar[15];		/* remainder 34 */
+  vertical[33] = columnar[40];		/* remainder 62 */
+  vertical[37] = columnar[41];		/* remainder 58 */
+  vertical[41] = columnar[42];		/* remainder 54 */
+  vertical[45] = columnar[43];		/* remainder 50 */
+  vertical[49] = columnar[44];		/* remainder 46 */
+  vertical[53] = columnar[45];		/* remainder 42 */
+  vertical[57] = columnar[46];		/* remainder 38 */
+  vertical[61] = columnar[47];		/* remainder 34 */
 
-  vertical[2] = columnar[16];		/* remainder 61 */
-  vertical[6] = columnar[17];		/* remainder 57 */
-  vertical[10] = columnar[18];		/* remainder 53 */
-  vertical[14] = columnar[19];		/* remainder 49 */
-  vertical[18] = columnar[20];		/* remainder 45 */
-  vertical[22] = columnar[21];		/* remainder 41 */
-  vertical[26] = columnar[22];		/* remainder 37 */
-  vertical[30] = columnar[23];		/* remainder 33 */
+  vertical[34] = columnar[48];		/* remainder 61 */
+  vertical[38] = columnar[49];		/* remainder 57 */
+  vertical[42] = columnar[50];		/* remainder 53 */
+  vertical[46] = columnar[51];		/* remainder 49 */
+  vertical[50] = columnar[52];		/* remainder 45 */
+  vertical[54] = columnar[53];		/* remainder 41 */
+  vertical[58] = columnar[54];		/* remainder 37 */
+  vertical[62] = columnar[55];		/* remainder 33 */
 
-  vertical[3] = columnar[24];		/* remainder 60 */
-  vertical[7] = columnar[25];		/* remainder 56 */
-  vertical[11] = columnar[26];		/* remainder 52 */
-  vertical[15] = columnar[27];		/* remainder 48 */
-  vertical[19] = columnar[28];		/* remainder 44 */
-  vertical[23] = columnar[29];		/* remainder 40 */
-  vertical[27] = columnar[30];		/* remainder 36 */
-  vertical[31] = columnar[31];		/* remainder 32 */
+  vertical[35] = columnar[56];		/* remainder 60 */
+  vertical[39] = columnar[57];		/* remainder 56 */
+  vertical[43] = columnar[58];		/* remainder 52 */
+  vertical[47] = columnar[59];		/* remainder 48 */
+  vertical[51] = columnar[60];		/* remainder 44 */
+  vertical[55] = columnar[61];		/* remainder 40 */
+  vertical[59] = columnar[62];		/* remainder 36 */
+  vertical[63] = columnar[63];		/* remainder 32 */
 
   return;
 }
 
-#if defined(HAVE_64_BIT) && (defined(UTILITYP) || defined(LARGE_GENOMES))
 static void
-vertical_order_huge_fwd (UINT8 *vertical, UINT4 *columnar) {
+vertical_order_huge (UINT8 *vertical, UINT4 *columnar) {
 
   vertical[0] = (UINT8) columnar[0];		/* remainder 1 */
   vertical[4] = (UINT8) columnar[1];		/* remainder 5 */
@@ -12517,58 +13389,49 @@ vertical_order_huge_fwd (UINT8 *vertical, UINT4 *columnar) {
   vertical[27] = (UINT8) columnar[30];		/* remainder 28 */
   vertical[31] = (UINT8) columnar[31];		/* remainder 32 */
 
-  return;
-}
-#endif
+  vertical[32] = (UINT8) columnar[32];		/* remainder 63 */
+  vertical[36] = (UINT8) columnar[33];		/* remainder 59 */
+  vertical[40] = (UINT8) columnar[34];		/* remainder 55 */
+  vertical[44] = (UINT8) columnar[35];		/* remainder 51 */
+  vertical[48] = (UINT8) columnar[36];		/* remainder 47 */
+  vertical[52] = (UINT8) columnar[37];		/* remainder 43 */
+  vertical[56] = (UINT8) columnar[38];		/* remainder 39 */
+  vertical[60] = (UINT8) columnar[39];		/* remainder 35 */
 
-#if defined(HAVE_64_BIT) && (defined(UTILITYP) || defined(LARGE_GENOMES))
-static void
-vertical_order_huge_rev (UINT8 *vertical, UINT4 *columnar) {
+  vertical[33] = (UINT8) columnar[40];		/* remainder 62 */
+  vertical[37] = (UINT8) columnar[41];		/* remainder 58 */
+  vertical[41] = (UINT8) columnar[42];		/* remainder 54 */
+  vertical[45] = (UINT8) columnar[43];		/* remainder 50 */
+  vertical[49] = (UINT8) columnar[44];		/* remainder 46 */
+  vertical[53] = (UINT8) columnar[45];		/* remainder 42 */
+  vertical[57] = (UINT8) columnar[46];		/* remainder 38 */
+  vertical[61] = (UINT8) columnar[47];		/* remainder 34 */
 
-  vertical[0] = (UINT8) columnar[0];		/* remainder 63 */
-  vertical[4] = (UINT8) columnar[1];		/* remainder 59 */
-  vertical[8] = (UINT8) columnar[2];		/* remainder 55 */
-  vertical[12] = (UINT8) columnar[3];		/* remainder 51 */
-  vertical[16] = (UINT8) columnar[4];		/* remainder 47 */
-  vertical[20] = (UINT8) columnar[5];		/* remainder 43 */
-  vertical[24] = (UINT8) columnar[6];		/* remainder 39 */
-  vertical[28] = (UINT8) columnar[7];		/* remainder 35 */
+  vertical[34] = (UINT8) columnar[48];		/* remainder 61 */
+  vertical[38] = (UINT8) columnar[49];		/* remainder 57 */
+  vertical[42] = (UINT8) columnar[50];		/* remainder 53 */
+  vertical[46] = (UINT8) columnar[51];		/* remainder 49 */
+  vertical[50] = (UINT8) columnar[52];		/* remainder 45 */
+  vertical[54] = (UINT8) columnar[53];		/* remainder 41 */
+  vertical[58] = (UINT8) columnar[54];		/* remainder 37 */
+  vertical[62] = (UINT8) columnar[55];		/* remainder 33 */
 
-  vertical[1] = (UINT8) columnar[8];		/* remainder 62 */
-  vertical[5] = (UINT8) columnar[9];		/* remainder 58 */
-  vertical[9] = (UINT8) columnar[10];		/* remainder 54 */
-  vertical[13] = (UINT8) columnar[11];		/* remainder 50 */
-  vertical[17] = (UINT8) columnar[12];		/* remainder 46 */
-  vertical[21] = (UINT8) columnar[13];		/* remainder 42 */
-  vertical[25] = (UINT8) columnar[14];		/* remainder 38 */
-  vertical[29] = (UINT8) columnar[15];		/* remainder 34 */
-
-  vertical[2] = (UINT8) columnar[16];		/* remainder 61 */
-  vertical[6] = (UINT8) columnar[17];		/* remainder 57 */
-  vertical[10] = (UINT8) columnar[18];		/* remainder 53 */
-  vertical[14] = (UINT8) columnar[19];		/* remainder 49 */
-  vertical[18] = (UINT8) columnar[20];		/* remainder 45 */
-  vertical[22] = (UINT8) columnar[21];		/* remainder 41 */
-  vertical[26] = (UINT8) columnar[22];		/* remainder 37 */
-  vertical[30] = (UINT8) columnar[23];		/* remainder 33 */
-
-  vertical[3] = (UINT8) columnar[24];		/* remainder 60 */
-  vertical[7] = (UINT8) columnar[25];		/* remainder 56 */
-  vertical[11] = (UINT8) columnar[26];		/* remainder 52 */
-  vertical[15] = (UINT8) columnar[27];		/* remainder 48 */
-  vertical[19] = (UINT8) columnar[28];		/* remainder 44 */
-  vertical[23] = (UINT8) columnar[29];		/* remainder 40 */
-  vertical[27] = (UINT8) columnar[30];		/* remainder 36 */
-  vertical[31] = (UINT8) columnar[31];		/* remainder 32 */
+  vertical[35] = (UINT8) columnar[56];		/* remainder 60 */
+  vertical[39] = (UINT8) columnar[57];		/* remainder 56 */
+  vertical[43] = (UINT8) columnar[58];		/* remainder 52 */
+  vertical[47] = (UINT8) columnar[59];		/* remainder 48 */
+  vertical[51] = (UINT8) columnar[60];		/* remainder 44 */
+  vertical[55] = (UINT8) columnar[61];		/* remainder 40 */
+  vertical[59] = (UINT8) columnar[62];		/* remainder 36 */
+  vertical[63] = (UINT8) columnar[63];		/* remainder 32 */
 
   return;
 }
-#endif
 
-#else
 
+#else
 static void
-vertical_order (UINT4 *vertical, UINT4 *columnar) {
+vertical_order_fwd (UINT4 *vertical, UINT4 *columnar) {
 
   vertical[0] = columnar[0];		/* remainder 1 */
   vertical[4] = columnar[1];		/* remainder 5 */
@@ -12606,47 +13469,54 @@ vertical_order (UINT4 *vertical, UINT4 *columnar) {
   vertical[27] = columnar[30];		/* remainder 28 */
   vertical[31] = columnar[31];		/* remainder 32 */
 
-  vertical[32] = columnar[32];		/* remainder 63 */
-  vertical[36] = columnar[33];		/* remainder 59 */
-  vertical[40] = columnar[34];		/* remainder 55 */
-  vertical[44] = columnar[35];		/* remainder 51 */
-  vertical[48] = columnar[36];		/* remainder 47 */
-  vertical[52] = columnar[37];		/* remainder 43 */
-  vertical[56] = columnar[38];		/* remainder 39 */
-  vertical[60] = columnar[39];		/* remainder 35 */
+  return;
+}
 
-  vertical[33] = columnar[40];		/* remainder 62 */
-  vertical[37] = columnar[41];		/* remainder 58 */
-  vertical[41] = columnar[42];		/* remainder 54 */
-  vertical[45] = columnar[43];		/* remainder 50 */
-  vertical[49] = columnar[44];		/* remainder 46 */
-  vertical[53] = columnar[45];		/* remainder 42 */
-  vertical[57] = columnar[46];		/* remainder 38 */
-  vertical[61] = columnar[47];		/* remainder 34 */
+static void
+vertical_order_rev (UINT4 *vertical, UINT4 *columnar) {
 
-  vertical[34] = columnar[48];		/* remainder 61 */
-  vertical[38] = columnar[49];		/* remainder 57 */
-  vertical[42] = columnar[50];		/* remainder 53 */
-  vertical[46] = columnar[51];		/* remainder 49 */
-  vertical[50] = columnar[52];		/* remainder 45 */
-  vertical[54] = columnar[53];		/* remainder 41 */
-  vertical[58] = columnar[54];		/* remainder 37 */
-  vertical[62] = columnar[55];		/* remainder 33 */
+  vertical[0] = columnar[0];		/* remainder 63 */
+  vertical[4] = columnar[1];		/* remainder 59 */
+  vertical[8] = columnar[2];		/* remainder 55 */
+  vertical[12] = columnar[3];		/* remainder 51 */
+  vertical[16] = columnar[4];		/* remainder 47 */
+  vertical[20] = columnar[5];		/* remainder 43 */
+  vertical[24] = columnar[6];		/* remainder 39 */
+  vertical[28] = columnar[7];		/* remainder 35 */
 
-  vertical[35] = columnar[56];		/* remainder 60 */
-  vertical[39] = columnar[57];		/* remainder 56 */
-  vertical[43] = columnar[58];		/* remainder 52 */
-  vertical[47] = columnar[59];		/* remainder 48 */
-  vertical[51] = columnar[60];		/* remainder 44 */
-  vertical[55] = columnar[61];		/* remainder 40 */
-  vertical[59] = columnar[62];		/* remainder 36 */
-  vertical[63] = columnar[63];		/* remainder 32 */
+  vertical[1] = columnar[8];		/* remainder 62 */
+  vertical[5] = columnar[9];		/* remainder 58 */
+  vertical[9] = columnar[10];		/* remainder 54 */
+  vertical[13] = columnar[11];		/* remainder 50 */
+  vertical[17] = columnar[12];		/* remainder 46 */
+  vertical[21] = columnar[13];		/* remainder 42 */
+  vertical[25] = columnar[14];		/* remainder 38 */
+  vertical[29] = columnar[15];		/* remainder 34 */
+
+  vertical[2] = columnar[16];		/* remainder 61 */
+  vertical[6] = columnar[17];		/* remainder 57 */
+  vertical[10] = columnar[18];		/* remainder 53 */
+  vertical[14] = columnar[19];		/* remainder 49 */
+  vertical[18] = columnar[20];		/* remainder 45 */
+  vertical[22] = columnar[21];		/* remainder 41 */
+  vertical[26] = columnar[22];		/* remainder 37 */
+  vertical[30] = columnar[23];		/* remainder 33 */
+
+  vertical[3] = columnar[24];		/* remainder 60 */
+  vertical[7] = columnar[25];		/* remainder 56 */
+  vertical[11] = columnar[26];		/* remainder 52 */
+  vertical[15] = columnar[27];		/* remainder 48 */
+  vertical[19] = columnar[28];		/* remainder 44 */
+  vertical[23] = columnar[29];		/* remainder 40 */
+  vertical[27] = columnar[30];		/* remainder 36 */
+  vertical[31] = columnar[31];		/* remainder 32 */
 
   return;
 }
 
+#if defined(HAVE_64_BIT) && (defined(UTILITYP) || defined(LARGE_GENOMES))
 static void
-vertical_order_huge (UINT8 *vertical, UINT4 *columnar) {
+vertical_order_huge_fwd (UINT8 *vertical, UINT4 *columnar) {
 
   vertical[0] = (UINT8) columnar[0];		/* remainder 1 */
   vertical[4] = (UINT8) columnar[1];		/* remainder 5 */
@@ -12684,53 +13554,61 @@ vertical_order_huge (UINT8 *vertical, UINT4 *columnar) {
   vertical[27] = (UINT8) columnar[30];		/* remainder 28 */
   vertical[31] = (UINT8) columnar[31];		/* remainder 32 */
 
-  vertical[32] = (UINT8) columnar[32];		/* remainder 63 */
-  vertical[36] = (UINT8) columnar[33];		/* remainder 59 */
-  vertical[40] = (UINT8) columnar[34];		/* remainder 55 */
-  vertical[44] = (UINT8) columnar[35];		/* remainder 51 */
-  vertical[48] = (UINT8) columnar[36];		/* remainder 47 */
-  vertical[52] = (UINT8) columnar[37];		/* remainder 43 */
-  vertical[56] = (UINT8) columnar[38];		/* remainder 39 */
-  vertical[60] = (UINT8) columnar[39];		/* remainder 35 */
+  return;
+}
+#endif
 
-  vertical[33] = (UINT8) columnar[40];		/* remainder 62 */
-  vertical[37] = (UINT8) columnar[41];		/* remainder 58 */
-  vertical[41] = (UINT8) columnar[42];		/* remainder 54 */
-  vertical[45] = (UINT8) columnar[43];		/* remainder 50 */
-  vertical[49] = (UINT8) columnar[44];		/* remainder 46 */
-  vertical[53] = (UINT8) columnar[45];		/* remainder 42 */
-  vertical[57] = (UINT8) columnar[46];		/* remainder 38 */
-  vertical[61] = (UINT8) columnar[47];		/* remainder 34 */
+#if defined(HAVE_64_BIT) && (defined(UTILITYP) || defined(LARGE_GENOMES))
+static void
+vertical_order_huge_rev (UINT8 *vertical, UINT4 *columnar) {
 
-  vertical[34] = (UINT8) columnar[48];		/* remainder 61 */
-  vertical[38] = (UINT8) columnar[49];		/* remainder 57 */
-  vertical[42] = (UINT8) columnar[50];		/* remainder 53 */
-  vertical[46] = (UINT8) columnar[51];		/* remainder 49 */
-  vertical[50] = (UINT8) columnar[52];		/* remainder 45 */
-  vertical[54] = (UINT8) columnar[53];		/* remainder 41 */
-  vertical[58] = (UINT8) columnar[54];		/* remainder 37 */
-  vertical[62] = (UINT8) columnar[55];		/* remainder 33 */
+  vertical[0] = (UINT8) columnar[0];		/* remainder 63 */
+  vertical[4] = (UINT8) columnar[1];		/* remainder 59 */
+  vertical[8] = (UINT8) columnar[2];		/* remainder 55 */
+  vertical[12] = (UINT8) columnar[3];		/* remainder 51 */
+  vertical[16] = (UINT8) columnar[4];		/* remainder 47 */
+  vertical[20] = (UINT8) columnar[5];		/* remainder 43 */
+  vertical[24] = (UINT8) columnar[6];		/* remainder 39 */
+  vertical[28] = (UINT8) columnar[7];		/* remainder 35 */
 
-  vertical[35] = (UINT8) columnar[56];		/* remainder 60 */
-  vertical[39] = (UINT8) columnar[57];		/* remainder 56 */
-  vertical[43] = (UINT8) columnar[58];		/* remainder 52 */
-  vertical[47] = (UINT8) columnar[59];		/* remainder 48 */
-  vertical[51] = (UINT8) columnar[60];		/* remainder 44 */
-  vertical[55] = (UINT8) columnar[61];		/* remainder 40 */
-  vertical[59] = (UINT8) columnar[62];		/* remainder 36 */
-  vertical[63] = (UINT8) columnar[63];		/* remainder 32 */
+  vertical[1] = (UINT8) columnar[8];		/* remainder 62 */
+  vertical[5] = (UINT8) columnar[9];		/* remainder 58 */
+  vertical[9] = (UINT8) columnar[10];		/* remainder 54 */
+  vertical[13] = (UINT8) columnar[11];		/* remainder 50 */
+  vertical[17] = (UINT8) columnar[12];		/* remainder 46 */
+  vertical[21] = (UINT8) columnar[13];		/* remainder 42 */
+  vertical[25] = (UINT8) columnar[14];		/* remainder 38 */
+  vertical[29] = (UINT8) columnar[15];		/* remainder 34 */
+
+  vertical[2] = (UINT8) columnar[16];		/* remainder 61 */
+  vertical[6] = (UINT8) columnar[17];		/* remainder 57 */
+  vertical[10] = (UINT8) columnar[18];		/* remainder 53 */
+  vertical[14] = (UINT8) columnar[19];		/* remainder 49 */
+  vertical[18] = (UINT8) columnar[20];		/* remainder 45 */
+  vertical[22] = (UINT8) columnar[21];		/* remainder 41 */
+  vertical[26] = (UINT8) columnar[22];		/* remainder 37 */
+  vertical[30] = (UINT8) columnar[23];		/* remainder 33 */
+
+  vertical[3] = (UINT8) columnar[24];		/* remainder 60 */
+  vertical[7] = (UINT8) columnar[25];		/* remainder 56 */
+  vertical[11] = (UINT8) columnar[26];		/* remainder 52 */
+  vertical[15] = (UINT8) columnar[27];		/* remainder 48 */
+  vertical[19] = (UINT8) columnar[28];		/* remainder 44 */
+  vertical[23] = (UINT8) columnar[29];		/* remainder 40 */
+  vertical[27] = (UINT8) columnar[30];		/* remainder 36 */
+  vertical[31] = (UINT8) columnar[31];		/* remainder 32 */
 
   return;
 }
-
 #endif
 
+#endif
 
 
-#ifdef HAVE_SSE2
-typedef void (*Unpacker_T) (__m128i* __restrict__, const __m128i* __restrict__);
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
 typedef void (*Unpacker_T) (UINT4* __restrict__, const UINT4* __restrict__);
+#else
+typedef void (*Unpacker_T) (__m128i* __restrict__, const __m128i* __restrict__);
 #endif
 
 
@@ -12745,8 +13623,20 @@ static Unpacker_T unpacker_table[33] =
    unpack_21, unpack_22, unpack_23, unpack_24,
    unpack_25, unpack_26, unpack_27, unpack_28,
    unpack_29, unpack_30, unpack_31, unpack_32};
+
+#elif defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+static Unpacker_T unpacker_all_table[33] =	/* Scalar unpackers for the big-endian / non-SSE2 build, one slot per index 0..32 */
+  {unpack_00,
+   unpack_00, unpack_02, unpack_00, unpack_04,	/* Even indices 2, 4, ..., 32 hold unpack_02, unpack_04, ..., unpack_32; odd indices hold unpack_00 */
+   unpack_00, unpack_06, unpack_00, unpack_08,
+   unpack_00, unpack_10, unpack_00, unpack_12,
+   unpack_00, unpack_14, unpack_00, unpack_16,
+   unpack_00, unpack_18, unpack_00, unpack_20,
+   unpack_00, unpack_22, unpack_00, unpack_24,
+   unpack_00, unpack_26, unpack_00, unpack_28,
+   unpack_00, unpack_30, unpack_00, unpack_32};	/* Callers in this file index with packsize_div2*2 or packsize, i.e. an even value */
+
 #else
-#ifdef HAVE_SSE2
 static Unpacker_T unpacker_all_table[34] =
   {unpack_00, unpack_00,
    unpack_02_fwd, unpack_02_rev, unpack_04_fwd, unpack_04_rev,
@@ -12864,18 +13754,6 @@ static Unpacker_T unpacker_table[17][17] =
 
 };
    
-#else
-static Unpacker_T unpacker_all_table[33] =
-  {unpack_00,
-   unpack_00, unpack_02, unpack_00, unpack_04,
-   unpack_00, unpack_06, unpack_00, unpack_08,
-   unpack_00, unpack_10, unpack_00, unpack_12,
-   unpack_00, unpack_14, unpack_00, unpack_16,
-   unpack_00, unpack_18, unpack_00, unpack_20,
-   unpack_00, unpack_22, unpack_00, unpack_24,
-   unpack_00, unpack_26, unpack_00, unpack_28,
-   unpack_00, unpack_30, unpack_00, unpack_32};
-#endif
 #endif
 
 
@@ -13786,7 +14664,11 @@ Bitpack64_read_one (Storedoligomer_T oligo, UINT4 *bitpackptrs, UINT4 *bitpackco
   Storedoligomer_T bmer;
   UINT4 *info, nwritten, packsize_div2;
   int delta, remainder, quarter_block, column, row;
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+  UINT4 ptr;
+  UINT4 diffs[BLOCKSIZE+1], *bitpack;
+  int k, i;
+#else
 #ifdef BRANCH_FREE_ROW_SUM
   __m128i diffs[3];
 #else
@@ -13797,10 +14679,6 @@ Bitpack64_read_one (Storedoligomer_T oligo, UINT4 *bitpackptrs, UINT4 *bitpackco
 #endif
   __m128i *bitpack;
   UINT4 *_diffs;
-#else
-  UINT4 ptr;
-  UINT4 diffs[BLOCKSIZE+1], *bitpack;
-  int k, i;
 #endif
 #ifdef DEBUG
   UINT4 offsets[BLOCKSIZE+1];
@@ -13812,72 +14690,37 @@ Bitpack64_read_one (Storedoligomer_T oligo, UINT4 *bitpackptrs, UINT4 *bitpackco
 
   debug(printf("Entered Bitpack64_read_one with oligo %u => bmer %u\n",oligo,bmer));
 
+#if defined(WORDS_BIGENDIAN)
+  nwritten = Bigendian_convert_uint(info[0]);		/* In 128-bit registers */
+  bitpack = (UINT4 *) &(bitpackcomp[nwritten*4]);
+  packsize_div2 = (Bigendian_convert_uint(info[DIFFERENTIAL_METAINFO_SIZE]) - nwritten);
+
+#elif !defined(HAVE_SSE2)
   nwritten = info[0];		/* In 128-bit registers */
-#ifdef HAVE_SSE2  
-  bitpack = (__m128i *) &(bitpackcomp[nwritten*4]);
-#else
   bitpack = (UINT4 *) &(bitpackcomp[nwritten*4]);
-#endif
+  packsize_div2 = (info[DIFFERENTIAL_METAINFO_SIZE] - nwritten);
 
+#else
+  nwritten = info[0];		/* In 128-bit registers */
+  bitpack = (__m128i *) &(bitpackcomp[nwritten*4]);
   /* packsize = (info[DIFFERENTIAL_METAINFO_SIZE] - nwritten)*2; */
   packsize_div2 = (info[DIFFERENTIAL_METAINFO_SIZE] - nwritten);
+#endif
 
   remainder = oligo % BLOCKSIZE;
   quarter_block = remainder / 16;
 
-#ifdef HAVE_SSE2
-  _diffs = (UINT4 *) diffs;	/* Assumes a dummy register in diffs[0] */
-
-#ifdef BRANCH_FREE_QTR_BLOCK
-  psums[0] = psums[1] = info[1];
-  psums[2] = psums[3] = info[DIFFERENTIAL_METAINFO_SIZE+1];
-
-  delta = 31 - abs(remainder - 32);
-  column = get_column(delta);
-  row = get_row(delta);
-  debug(printf("quarter-block %d, delta %d, column %d, row %d\n",quarter_block,delta,column,row));
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
 
-  (unpacker_table[packsize_div2][column*4 + quarter_block])(diffs,bitpack);
-  return psums[quarter_block] + _diffs[row+1] + _diffs[row+2] + _diffs[row+3] + _diffs[row+4];
+  /* Unpack all 64 diffs for non-SIMD */
+  (unpacker_all_table[packsize_div2*2])(&(diffs[1]),bitpack);
 
+  if (remainder <= 16) {
+#ifdef WORDS_BIGENDIAN
+    ptr = Bigendian_convert_uint(/*offset0*/info[1]);
 #else
-
-  if (quarter_block <= 1) {
-    delta = remainder - 1;
-    column = get_column(delta);
-    row = get_row(delta);
-    debug(printf("quarter-block %d, delta %d, column %d, row %d\n",quarter_block,delta,column,row));
-
-    (unpacker_table[packsize_div2][column*4 + quarter_block])(diffs,bitpack);
-#ifdef BRANCH_FREE_ROW_SUM
-    return info[1] + _diffs[row+1] + _diffs[row+2] + _diffs[row+3] + _diffs[row+4];
-#else
-    return_sum_fwd(info[1],_diffs,row);
-#endif
-
-  } else {
-    delta = 63 - remainder;
-    column = get_column(delta);
-    row = get_row(delta);
-    debug(printf("quarter-block %d, delta %d, column %d, row %d\n",quarter_block,delta,column,row));
-
-    (unpacker_table[packsize_div2][column*4 + quarter_block])(diffs,bitpack);
-#ifdef BRANCH_FREE_ROW_SUM
-    return info[DIFFERENTIAL_METAINFO_SIZE+1] - _diffs[row+1] - _diffs[row+2] - _diffs[row+3] - _diffs[row+4];
-#else
-    return_sum_rev(info[DIFFERENTIAL_METAINFO_SIZE+1],_diffs,row);
-#endif
-  }
-
-#endif
-
-#else
-
-  /* Unpack all 64 diffs for non-SIMD */
-  (unpacker_all_table[packsize_div2*2])(&(diffs[1]),bitpack);
-
-  if (remainder <= 16) {
-    ptr = /*offset0*/info[1];
+    ptr = /*offset0*/info[1];
+#endif
 
     delta = remainder - 1;
     column = get_column(delta);
@@ -13890,7 +14733,11 @@ Bitpack64_read_one (Storedoligomer_T oligo, UINT4 *bitpackptrs, UINT4 *bitpackco
     }
 
   } else if (remainder <= 32) {
+#ifdef WORDS_BIGENDIAN
+    ptr = Bigendian_convert_uint(/*offset0*/info[1]);
+#else
     ptr = /*offset0*/info[1];
+#endif
 
     delta = remainder - 1;
     column = get_column(delta);
@@ -13908,7 +14755,11 @@ Bitpack64_read_one (Storedoligomer_T oligo, UINT4 *bitpackptrs, UINT4 *bitpackco
     }
 
   } else if (remainder <= 48) {
+#ifdef WORDS_BIGENDIAN
+    ptr = Bigendian_convert_uint(/*offset1*/info[DIFFERENTIAL_METAINFO_SIZE+1]);
+#else
     ptr = /*offset1*/info[DIFFERENTIAL_METAINFO_SIZE+1];
+#endif
 
     delta = 63 - remainder;
     column = get_column(delta);
@@ -13926,7 +14777,11 @@ Bitpack64_read_one (Storedoligomer_T oligo, UINT4 *bitpackptrs, UINT4 *bitpackco
     }
 
   } else {
+#ifdef WORDS_BIGENDIAN
+    ptr = Bigendian_convert_uint(/*offset1*/info[DIFFERENTIAL_METAINFO_SIZE+1]);
+#else
     ptr = /*offset1*/info[DIFFERENTIAL_METAINFO_SIZE+1];
+#endif
 
     delta = 63 - remainder;
     column = get_column(delta);
@@ -13941,7 +14796,52 @@ Bitpack64_read_one (Storedoligomer_T oligo, UINT4 *bitpackptrs, UINT4 *bitpackco
 
   return ptr;
 
-#endif	/* HAVE_SSE2 */
+#else  /* littleendian and SSE2 */
+  _diffs = (UINT4 *) diffs;	/* Assumes a dummy register in diffs[0] */
+
+#ifdef BRANCH_FREE_QTR_BLOCK
+  psums[0] = psums[1] = info[1];
+  psums[2] = psums[3] = info[DIFFERENTIAL_METAINFO_SIZE+1];
+
+  delta = 31 - abs(remainder - 32);
+  column = get_column(delta);
+  row = get_row(delta);
+  debug(printf("quarter-block %d, delta %d, column %d, row %d\n",quarter_block,delta,column,row));
+
+  (unpacker_table[packsize_div2][column*4 + quarter_block])(diffs,bitpack);
+  return psums[quarter_block] + _diffs[row+1] + _diffs[row+2] + _diffs[row+3] + _diffs[row+4];
+
+#else
+
+  if (quarter_block <= 1) {
+    delta = remainder - 1;
+    column = get_column(delta);
+    row = get_row(delta);
+    debug(printf("quarter-block %d, delta %d, column %d, row %d\n",quarter_block,delta,column,row));
+
+    (unpacker_table[packsize_div2][column*4 + quarter_block])(diffs,bitpack);
+#ifdef BRANCH_FREE_ROW_SUM
+    return info[1] + _diffs[row+1] + _diffs[row+2] + _diffs[row+3] + _diffs[row+4];
+#else
+    return_sum_fwd(info[1],_diffs,row);
+#endif
+
+  } else {
+    delta = 63 - remainder;
+    column = get_column(delta);
+    row = get_row(delta);
+    debug(printf("quarter-block %d, delta %d, column %d, row %d\n",quarter_block,delta,column,row));
+
+    (unpacker_table[packsize_div2][column*4 + quarter_block])(diffs,bitpack);
+#ifdef BRANCH_FREE_ROW_SUM
+    return info[DIFFERENTIAL_METAINFO_SIZE+1] - _diffs[row+1] - _diffs[row+2] - _diffs[row+3] - _diffs[row+4];
+#else
+    return_sum_rev(info[DIFFERENTIAL_METAINFO_SIZE+1],_diffs,row);
+#endif
+  }
+
+#endif	/* BRANCH_FREE_QTR_BLOCK */
+#endif	/* littleendian and SSE2 */
 }
 
 
@@ -13955,7 +14855,11 @@ Bitpack64_read_one_huge (Storedoligomer_T oligo, UINT4 *bitpackpages,
   UINT4 *info, nwritten, packsize_div2;
   UINT8 offset0, offset1;
   int delta, remainder, quarter_block, column, row;
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+  UINT8 ptr;
+  UINT4 diffs[BLOCKSIZE+1], *bitpack;
+  int k;
+#else
 #ifdef BRANCH_FREE_ROW_SUM
   __m128i diffs[3];
 #else
@@ -13966,10 +14870,6 @@ Bitpack64_read_one_huge (Storedoligomer_T oligo, UINT4 *bitpackpages,
 #endif
   __m128i *bitpack;
   UINT4 *_diffs;
-#else
-  UINT8 ptr;
-  UINT4 diffs[BLOCKSIZE+1], *bitpack;
-  int k;
 #endif
   int i;
 
@@ -13979,15 +14879,23 @@ Bitpack64_read_one_huge (Storedoligomer_T oligo, UINT4 *bitpackpages,
 
   debug(printf("Entered Bitpack64_read_one_huge with oligo %u => bmer %u\n",oligo,bmer));
 
+#ifdef WORDS_BIGENDIAN
+  nwritten = Bigendian_convert_uint(info[0]);		/* In 128-bit registers */
+  bitpack = (UINT4 *) &(bitpackcomp[nwritten*4]);
+  packsize_div2 = (Bigendian_convert_uint(info[DIFFERENTIAL_METAINFO_SIZE]) - nwritten);
+
+#elif !defined(HAVE_SSE2)
   nwritten = info[0];		/* In 128-bit registers */
-#ifdef HAVE_SSE2  
-  bitpack = (__m128i *) &(bitpackcomp[nwritten*4]);
-#else
   bitpack = (UINT4 *) &(bitpackcomp[nwritten*4]);
-#endif
+  packsize_div2 = (info[DIFFERENTIAL_METAINFO_SIZE] - nwritten);
 
+#else
+  nwritten = info[0];		/* In 128-bit registers */
+  bitpack = (__m128i *) &(bitpackcomp[nwritten*4]);
   /* packsize = (info[DIFFERENTIAL_METAINFO_SIZE] - nwritten)*2; */
   packsize_div2 = (info[DIFFERENTIAL_METAINFO_SIZE] - nwritten);
+#endif
+
 
 #ifdef DEBUG
   printf("bitpack (for packsize %d):\n",packsize_div2*2);
@@ -14000,7 +14908,152 @@ Bitpack64_read_one_huge (Storedoligomer_T oligo, UINT4 *bitpackpages,
   remainder = oligo % BLOCKSIZE;
   quarter_block = remainder / 16;
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+
+  /* Unpack all 64 diffs for non-SIMD */
+  (unpacker_all_table[packsize_div2*2])(&(diffs[1]),bitpack);
+
+  if ((remainder = oligo % BLOCKSIZE) == 0) {
+#ifdef WORDS_BIGENDIAN
+    ptr = Bigendian_convert_uint(/*offset0*/info[1]);
+#else
+    ptr = /*offset0*/info[1];
+#endif
+
+  } else if (remainder <= 16) {
+#ifdef WORDS_BIGENDIAN
+    ptr = Bigendian_convert_uint(/*offset0*/info[1]);
+    if (bitpackpages != NULL) {
+      pageptr = bitpackpages;
+      while (bmer+1 >= Bigendian_convert_uint(*pageptr)) {
+	ptr += POSITIONS_PAGE;
+	pageptr++;
+      }
+    }
+#else
+    ptr = /*offset0*/info[1];
+    if (bitpackpages != NULL) {
+      pageptr = bitpackpages;
+      while (bmer+1 >= *pageptr) {
+	ptr += POSITIONS_PAGE;
+	pageptr++;
+      }
+    }
+#endif
+
+    column = (remainder - 1) % 4; /* Goes from 0 to 3 */
+    row = (remainder - 1) / 4;
+    debug(printf("column %d, row %d\n",column,row));
+    
+    for (k = column*2 + 1, i = 0; i <= row; k += BLOCKSIZE/4, i++) {
+      debug(printf("Adding diffs[%d] = %u\n",k,diffs[k]));
+      ptr += diffs[k];
+    }
+
+  } else if (remainder <= 32) {
+#ifdef WORDS_BIGENDIAN
+    ptr = Bigendian_convert_uint(/*offset0*/info[1]);
+    if (bitpackpages != NULL) {
+      pageptr = bitpackpages;
+      while (bmer+1 >= Bigendian_convert_uint(*pageptr)) {
+	ptr += POSITIONS_PAGE;
+	pageptr++;
+      }
+    }
+#else
+    ptr = /*offset0*/info[1];
+    if (bitpackpages != NULL) {
+      pageptr = bitpackpages;
+      while (bmer+1 >= *pageptr) {
+	ptr += POSITIONS_PAGE;
+	pageptr++;
+      }
+    }
+#endif
+
+    column = (remainder - 1) % 4; /* Goes from 0 to 3 */
+    row = (remainder - 1) / 4;
+    debug(printf("column %d, row %d\n",column,row));
+    
+    for (k = column*2 + 1, i = 0; i < 4; k += BLOCKSIZE/4, i++) {
+      debug(printf("Adding diffs[%d] = %u\n",k,diffs[k]));
+      ptr += diffs[k];
+    }
+
+    for (k = column*2 + 2; i <= row; k += BLOCKSIZE/4, i++) {
+      debug(printf("Adding diffs[%d] = %u\n",k,diffs[k]));
+      ptr += diffs[k];
+    }
+
+  } else if (remainder <= 48) {
+#ifdef WORDS_BIGENDIAN
+    ptr = Bigendian_convert_uint(/*offset1*/info[DIFFERENTIAL_METAINFO_SIZE+1]);
+    if (bitpackpages != NULL) {
+      pageptr = bitpackpages;
+      while (bmer+1 >= Bigendian_convert_uint(*pageptr)) {
+	ptr += POSITIONS_PAGE;
+	pageptr++;
+      }
+    }
+#else
+    ptr = /*offset1*/info[DIFFERENTIAL_METAINFO_SIZE+1];
+    if (bitpackpages != NULL) {
+      pageptr = bitpackpages;
+      while (bmer+1 >= *pageptr) {
+	ptr += POSITIONS_PAGE;
+	pageptr++;
+      }
+    }
+#endif
+
+    column = (63 - remainder) % 4; /* Goes from 0 to 3.  Assert remainder < 64 */
+    row = (63 - remainder) / 4;
+    debug(printf("column %d, row %d\n",column,row));
+
+    for (k = column*2 + 9, i = 0; i < 4; k += BLOCKSIZE/4, i++) {
+      debug(printf("Subtracting diffs[%d] = %u\n",k,diffs[k]));
+      ptr -= diffs[k];
+    }
+
+    for (k = column*2 + 10; i <= row; k += BLOCKSIZE/4, i++) {
+      debug(printf("Subtracting diffs[%d] = %u\n",k,diffs[k]));
+      ptr -= diffs[k];
+    }
+
+  } else {
+#ifdef WORDS_BIGENDIAN
+    ptr = Bigendian_convert_uint(/*offset1*/info[DIFFERENTIAL_METAINFO_SIZE+1]);
+    if (bitpackpages != NULL) {
+      pageptr = bitpackpages;
+      while (bmer+1 >= Bigendian_convert_uint(*pageptr)) {
+	ptr += POSITIONS_PAGE;
+	pageptr++;
+      }
+    }
+#else
+    ptr = /*offset1*/info[DIFFERENTIAL_METAINFO_SIZE+1];
+    if (bitpackpages != NULL) {
+      pageptr = bitpackpages;
+      while (bmer+1 >= *pageptr) {
+	ptr += POSITIONS_PAGE;
+	pageptr++;
+      }
+    }
+#endif
+
+    column = (63 - remainder) % 4; /* Goes from 0 to 3.  Assert remainder < 64 */
+    row = (63 - remainder) / 4;
+    debug(printf("column %d, row %d\n",column,row));
+
+    for (k = column*2 + 9, i = 0; i <= row; k += BLOCKSIZE/4, i++) {
+      debug(printf("Subtracting diffs[%d] = %u\n",k,diffs[k]));
+      ptr -= diffs[k];
+    }
+  }
+
+  return ptr;
+
+#else			    /* littleendian and SSE2 */
   _diffs = (UINT4 *) diffs;	/* Assumes a dummy register in diffs[0] */
 
 #ifdef BRANCH_FREE_QTR_BLOCK
@@ -14107,106 +15160,8 @@ Bitpack64_read_one_huge (Storedoligomer_T oligo, UINT4 *bitpackpages,
 #endif
   }
 
-#endif
-
-#else
-
-  /* Unpack all 64 diffs for non-SIMD */
-  (unpacker_all_table[packsize_div2*2])(&(diffs[1]),bitpack);
-
-  if ((remainder = oligo % BLOCKSIZE) == 0) {
-    ptr = /*offset0*/info[1];
-
-  } else if (remainder <= 16) {
-    ptr = /*offset0*/info[1];
-    if (bitpackpages != NULL) {
-      pageptr = bitpackpages;
-      while (bmer+1 >= *pageptr) {
-	ptr += POSITIONS_PAGE;
-	pageptr++;
-      }
-    }
-
-    column = (remainder - 1) % 4; /* Goes from 0 to 3 */
-    row = (remainder - 1) / 4;
-    debug(printf("column %d, row %d\n",column,row));
-    
-    for (k = column*2 + 1, i = 0; i <= row; k += BLOCKSIZE/4, i++) {
-      debug(printf("Adding diffs[%d] = %u\n",k,diffs[k]));
-      ptr += diffs[k];
-    }
-
-  } else if (remainder <= 32) {
-    ptr = /*offset0*/info[1];
-    if (bitpackpages != NULL) {
-      pageptr = bitpackpages;
-      while (bmer+1 >= *pageptr) {
-	ptr += POSITIONS_PAGE;
-	pageptr++;
-      }
-    }
-
-    column = (remainder - 1) % 4; /* Goes from 0 to 3 */
-    row = (remainder - 1) / 4;
-    debug(printf("column %d, row %d\n",column,row));
-    
-    for (k = column*2 + 1, i = 0; i < 4; k += BLOCKSIZE/4, i++) {
-      debug(printf("Adding diffs[%d] = %u\n",k,diffs[k]));
-      ptr += diffs[k];
-    }
-
-    for (k = column*2 + 2; i <= row; k += BLOCKSIZE/4, i++) {
-      debug(printf("Adding diffs[%d] = %u\n",k,diffs[k]));
-      ptr += diffs[k];
-    }
-
-  } else if (remainder <= 48) {
-    ptr = /*offset1*/info[DIFFERENTIAL_METAINFO_SIZE+1];
-    if (bitpackpages != NULL) {
-      pageptr = bitpackpages;
-      while (bmer+1 >= *pageptr) {
-	ptr += POSITIONS_PAGE;
-	pageptr++;
-      }
-    }
-
-    column = (63 - remainder) % 4; /* Goes from 0 to 3.  Assert remainder < 64 */
-    row = (63 - remainder) / 4;
-    debug(printf("column %d, row %d\n",column,row));
-
-    for (k = column*2 + 9, i = 0; i < 4; k += BLOCKSIZE/4, i++) {
-      debug(printf("Subtracting diffs[%d] = %u\n",k,diffs[k]));
-      ptr -= diffs[k];
-    }
-
-    for (k = column*2 + 10; i <= row; k += BLOCKSIZE/4, i++) {
-      debug(printf("Subtracting diffs[%d] = %u\n",k,diffs[k]));
-      ptr -= diffs[k];
-    }
-
-  } else {
-    ptr = /*offset1*/info[DIFFERENTIAL_METAINFO_SIZE+1];
-    if (bitpackpages != NULL) {
-      pageptr = bitpackpages;
-      while (bmer+1 >= *pageptr) {
-	ptr += POSITIONS_PAGE;
-	pageptr++;
-      }
-    }
-
-    column = (63 - remainder) % 4; /* Goes from 0 to 3.  Assert remainder < 64 */
-    row = (63 - remainder) / 4;
-    debug(printf("column %d, row %d\n",column,row));
-
-    for (k = column*2 + 9, i = 0; i <= row; k += BLOCKSIZE/4, i++) {
-      debug(printf("Subtracting diffs[%d] = %u\n",k,diffs[k]));
-      ptr -= diffs[k];
-    }
-  }
-
-  return ptr;
-
-#endif
+#endif	/* BRANCH_FREE_QTR_BLOCK */
+#endif	/* littleendian and SSE2 */
 }
 
 
@@ -14218,12 +15173,12 @@ Bitpack64_block_offsets (UINT4 *offsets, Storedoligomer_T oligo,
   UINT4 *info, nwritten;
   UINT4 offset0, offset1, temp;
   int packsize, k;
-#ifdef HAVE_SSE2
-  __m128i diffs[8], *bitpack;
-  UINT4 *_diffs;
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
   int column, row;
   UINT4 diffs[BLOCKSIZE], columnar[BLOCKSIZE], *bitpack, *vertical;
+#else
+  __m128i diffs[8], *bitpack;
+  UINT4 *_diffs;
 #endif
 #ifdef DEBUG
   int i;
@@ -14231,52 +15186,36 @@ Bitpack64_block_offsets (UINT4 *offsets, Storedoligomer_T oligo,
 
 
   info = &(bitpackptrs[oligo/BLOCKSIZE * DIFFERENTIAL_METAINFO_SIZE]);
+#ifdef WORDS_BIGENDIAN
+  nwritten = Bigendian_convert_uint(info[0]);
+  bitpack = (UINT4 *) &(bitpackcomp[nwritten*4]);
+  offset0 = Bigendian_convert_uint(info[1]);
+  offset1 = Bigendian_convert_uint(info[DIFFERENTIAL_METAINFO_SIZE+1]);
+  packsize = (Bigendian_convert_uint(info[DIFFERENTIAL_METAINFO_SIZE]) - nwritten)*2;
+
+#elif !defined(HAVE_SSE2)
   nwritten = info[0];		/* In 128-bit registers */
-#ifdef HAVE_SSE2
-  bitpack = (__m128i *) &(bitpackcomp[nwritten*4]);
-#else
   bitpack = (UINT4 *) &(bitpackcomp[nwritten*4]);
-#endif
   offset0 = info[1];
   offset1 = info[DIFFERENTIAL_METAINFO_SIZE+1];
-
   packsize = (info[DIFFERENTIAL_METAINFO_SIZE] - nwritten)*2;
 
-#ifdef DEBUG
-  printf("oligo: %08X, nwritten %u, offset0 %u, offset1 %u, packsize %d\n",
-	 oligo,nwritten,offset0,offset1,packsize);
+#else
+  nwritten = info[0];		/* In 128-bit registers */
+  bitpack = (__m128i *) &(bitpackcomp[nwritten*4]);
+  offset0 = info[1];
+  offset1 = info[DIFFERENTIAL_METAINFO_SIZE+1];
+  packsize = (info[DIFFERENTIAL_METAINFO_SIZE] - nwritten)*2;
 #endif
 
-#ifdef HAVE_SSE2
-#ifdef DEBUG
-  printf("bitpack:\n");
-  for (i = 0; i < packsize/2; i++) {
-    print_vector_hex(bitpack[i]);
-  }
-  printf("\n");
-#endif  
-
-  _diffs = (UINT4 *) &(diffs[0]);
-
-  /* Unpack fwd 32 cumulative sums under SIMD */
-  (unpacker_all_table[packsize])(&(diffs[0]),bitpack);
-  vertical_order_fwd(&(offsets[1]),_diffs);
-
-  /* Unpack rev 32 cumulative sums under SIMD */
-  (unpacker_all_table[packsize+1])(&(diffs[0]),bitpack);
-  vertical_order_rev(&(offsets[33]),_diffs);
 
 #ifdef DEBUG
-  printf("%u\n",offsets[i]);
-  for (i = 1; i <= BLOCKSIZE; i += 4) {
-    printf("%u %u %u %u\n",offsets[i],offsets[i+1],offsets[i+2],offsets[i+3]);
-  }
-  printf("end of diffs vertical\n");
+  printf("oligo: %08X, nwritten %u, offset0 %u, offset1 %u, packsize %d\n",
+	 oligo,nwritten,offset0,offset1,packsize);
 #endif
 
 
-#else
-
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
   /* Unpack all 64 diffs for non-SIMD */
   (unpacker_all_table[packsize])(&(diffs[0]),bitpack);
 
@@ -14311,7 +15250,35 @@ Bitpack64_block_offsets (UINT4 *offsets, Storedoligomer_T oligo,
   printf("end of diffs vertical\n");
 #endif
 
-#endif	/* HAVE_SSE2 */
+#else  /* littleendian and SSE2 */
+
+#ifdef DEBUG
+  printf("bitpack:\n");
+  for (i = 0; i < packsize/2; i++) {
+    print_vector_hex(bitpack[i]);
+  }
+  printf("\n");
+#endif  
+
+  _diffs = (UINT4 *) &(diffs[0]);
+
+  /* Unpack fwd 32 cumulative sums under SIMD */
+  (unpacker_all_table[packsize])(&(diffs[0]),bitpack);
+  vertical_order_fwd(&(offsets[1]),_diffs);
+
+  /* Unpack rev 32 cumulative sums under SIMD */
+  (unpacker_all_table[packsize+1])(&(diffs[0]),bitpack);
+  vertical_order_rev(&(offsets[33]),_diffs);
+
+#ifdef DEBUG
+  printf("%u\n",offsets[i]);
+  for (i = 1; i <= BLOCKSIZE; i += 4) {
+    printf("%u %u %u %u\n",offsets[i],offsets[i+1],offsets[i+2],offsets[i+3]);
+  }
+  printf("end of diffs vertical\n");
+#endif
+
+#endif	/* littleendian and SSE2 */
 
   /* Perform cumulative sum */
   offsets[0] = offset0;
@@ -14365,12 +15332,12 @@ Bitpack64_block_offsets_huge (UINT8 *offsets, Storedoligomer_T oligo,
   Storedoligomer_T bmer;
   UINT8 offset0, offset1, temp;
   int packsize, k;
-#ifdef HAVE_SSE2
-  __m128i diffs[8], *bitpack;
-  UINT4 *_diffs;
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
   int column, row;
   UINT4 diffs[BLOCKSIZE], columnar[BLOCKSIZE], *bitpack, *vertical;
+#else
+  __m128i diffs[8], *bitpack;
+  UINT4 *_diffs;
 #endif
 #ifdef DEBUG
   int i;
@@ -14379,13 +15346,33 @@ Bitpack64_block_offsets_huge (UINT8 *offsets, Storedoligomer_T oligo,
   bmer = oligo/BLOCKSIZE;
 
   info = &(bitpackptrs[bmer * DIFFERENTIAL_METAINFO_SIZE]);
+
+#ifdef WORDS_BIGENDIAN
+  nwritten = Bigendian_convert_uint(info[0]); /* In 128-bit registers */
+  bitpack = (UINT4 *) &(bitpackcomp[nwritten*4]);
+
+#elif !defined(HAVE_SSE2)
   nwritten = info[0];		/* In 128-bit registers */
-#ifdef HAVE_SSE2
-  bitpack = (__m128i *) &(bitpackcomp[nwritten*4]);
-#else
   bitpack = (UINT4 *) &(bitpackcomp[nwritten*4]);
+
+#else
+  nwritten = info[0];		/* In 128-bit registers */
+  bitpack = (__m128i *) &(bitpackcomp[nwritten*4]);
 #endif
 
+#ifdef WORDS_BIGENDIAN
+  offset0 = offset1 = 0UL;
+  pageptr = bitpackpages;
+  while (bmer >= Bigendian_convert_uint(*pageptr)) {
+    offset0 += POSITIONS_PAGE;
+    pageptr++;
+  }
+
+  offset1 = offset0;
+  if (bmer+1 >= Bigendian_convert_uint(*pageptr)) {
+    offset1 += POSITIONS_PAGE;
+  }
+#else
   offset0 = offset1 = 0UL;
   pageptr = bitpackpages;
   while (bmer >= *pageptr) {
@@ -14397,11 +15384,18 @@ Bitpack64_block_offsets_huge (UINT8 *offsets, Storedoligomer_T oligo,
   if (bmer+1 >= *pageptr) {
     offset1 += POSITIONS_PAGE;
   }
+#endif
 
+
+#ifdef WORDS_BIGENDIAN
+  offset0 += Bigendian_convert_uint(info[1]);
+  offset1 += Bigendian_convert_uint(info[DIFFERENTIAL_METAINFO_SIZE+1]);
+  packsize = (Bigendian_convert_uint(info[DIFFERENTIAL_METAINFO_SIZE]) - nwritten)*2;
+#else
   offset0 += info[1];
   offset1 += info[DIFFERENTIAL_METAINFO_SIZE+1];
-
   packsize = (info[DIFFERENTIAL_METAINFO_SIZE] - nwritten)*2;
+#endif
 
 
 #ifdef DEBUG
@@ -14409,36 +15403,8 @@ Bitpack64_block_offsets_huge (UINT8 *offsets, Storedoligomer_T oligo,
 	 oligo,nwritten,offset0,offset1,packsize);
 #endif
 
-#ifdef HAVE_SSE2
-#ifdef DEBUG
-  printf("bitpack:\n");
-  for (i = 0; i < packsize/2; i++) {
-    print_vector_hex(bitpack[i]);
-  }
-  printf("\n");
-#endif
-
-  _diffs = (UINT4 *) &(diffs[0]);
-
-  /* Unpack fwd 32 cumulative sums under SIMD */
-  (unpacker_all_table[packsize])(&(diffs[0]),bitpack);
-  vertical_order_huge_fwd(&(offsets[1]),_diffs);
-
-  /* Unpack rev 32 cumulative sums under SIMD */
-  (unpacker_all_table[packsize+1])(&(diffs[0]),bitpack);
-  vertical_order_huge_rev(&(offsets[33]),_diffs);
-
-#ifdef DEBUG
-  printf("%u\n",offsets[i]);
-  for (i = 1; i <= 64; i += 4) {
-    printf("%u %u %u %u\n",offsets[i],offsets[i+1],offsets[i+2],offsets[i+3]);
-  }
-  printf("end of diffs vertical\n");
-#endif
-
-
-#else
 
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
   /* Unpack all 64 diffs for non-SIMD */
   (unpacker_all_table[packsize])(&(diffs[0]),bitpack);
 
@@ -14473,6 +15439,34 @@ Bitpack64_block_offsets_huge (UINT8 *offsets, Storedoligomer_T oligo,
   printf("end of diffs vertical\n");
 #endif
 
+
+#else
+#ifdef DEBUG
+  printf("bitpack:\n");
+  for (i = 0; i < packsize/2; i++) {
+    print_vector_hex(bitpack[i]);
+  }
+  printf("\n");
+#endif
+
+  _diffs = (UINT4 *) &(diffs[0]);
+
+  /* Unpack fwd 32 cumulative sums under SIMD */
+  (unpacker_all_table[packsize])(&(diffs[0]),bitpack);
+  vertical_order_huge_fwd(&(offsets[1]),_diffs);
+
+  /* Unpack rev 32 cumulative sums under SIMD */
+  (unpacker_all_table[packsize+1])(&(diffs[0]),bitpack);
+  vertical_order_huge_rev(&(offsets[33]),_diffs);
+
+#ifdef DEBUG
+  printf("%u\n",offsets[i]);
+  for (i = 1; i <= 64; i += 4) {
+    printf("%u %u %u %u\n",offsets[i],offsets[i+1],offsets[i+2],offsets[i+3]);
+  }
+  printf("end of diffs vertical\n");
+#endif
+
 #endif	/* HAVE_SSE2 */
 
   /* Perform cumulative sum */
diff --git a/src/bitpack64-readtwo.c b/src/bitpack64-readtwo.c
index afc5e72..5e143e9 100644
--- a/src/bitpack64-readtwo.c
+++ b/src/bitpack64-readtwo.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: bitpack64-readtwo.c 153955 2014-11-24 17:54:45Z twu $";
+static char rcsid[] = "$Id: bitpack64-readtwo.c 168395 2015-06-26 17:13:13Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -8,7 +8,9 @@ static char rcsid[] = "$Id: bitpack64-readtwo.c 153955 2014-11-24 17:54:45Z twu
 #include <stdio.h>
 #include <stdlib.h>
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
+#include "bigendian.h"
+#elif defined(HAVE_SSE2)
 #include <emmintrin.h>
 #endif
 
@@ -41,8 +43,9 @@ static char rcsid[] = "$Id: bitpack64-readtwo.c 153955 2014-11-24 17:54:45Z twu
 /* #define BRANCH_FREE_ROW_SUM 1 -- Not supported here */
 /* #define BRANCH_FREE_QTR_BLOCK 1 */
 
-#ifdef HAVE_SSE2
 #ifdef DEBUG
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
 /* For debugging */
 static void
 print_vector_hex (__m128i x) {
@@ -63,21 +66,6 @@ print_vector (__m128i x) {
 #endif
 
 
-#if 0
-#ifdef HAVE_SSE2
-#ifdef ALLOW_ODD_PACKSIZES
-static __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7, mask8,
-  mask9, mask10, mask11, mask12, mask13, mask14, mask15, mask16,
-  mask17, mask18, mask19, mask20, mask21, mask22, mask23, mask24,
-  mask25, mask26, mask27, mask28, mask29, mask30, mask31;
-#else
-static __m128i mask2, mask4, mask6, mask8, mask10, mask12, mask14, mask16,
-  mask18, mask20, mask22, mask24, mask26, mask28, mask30;
-#endif
-#endif
-#endif
-
-
 #define BLOCKSIZE 64
 
 #if 0
@@ -125,7 +113,19 @@ Bitpack64_read_setup () {
 #endif
 
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+static void			/* Scalar unpack_00: fills one output block with zeros */
+unpack_00 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  int i;
+  /* Writes BLOCKSIZE zeros through out; in is never read */
+  for (i = 0; i < BLOCKSIZE; i++) {
+    *out++ = 0;
+  }
+
+  return;
+}
+
+#else
 static void
 unpack_00 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
   __m128i total = _mm_set1_epi32(0U);
@@ -182,19 +182,6 @@ unpack_00_2_4 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
   return;
 }
-
-
-#else
-static void
-unpack_00 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  int i;
-
-  for (i = 0; i < BLOCKSIZE; i++) {
-    *out++ = 0;
-  }
-
-  return;
-}
 #endif
 
 
@@ -273,7 +260,99 @@ unpack_01 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
+static void			/* Big-endian scalar unpack of 64 two-bit values (4 words x 16 fields) */
+unpack_02 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;	/* Base of the packed words; in is re-pointed for each column */
+  /* One 32-bit word per column: bitpack[0] through bitpack[3] */
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+    /* Emit the word's 16 two-bit fields, lowest bits first.  Bigendian_convert_uint is applied before each shift (presumably a byte swap of the stored word -- confirm in bigendian.h); % (1U << 2) keeps the low 2 bits */
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  2  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  6  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  10  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  14  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  18  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  22  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  26  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 2 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  30  )   % (1U << 2 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#elif !defined(HAVE_SSE2)
+static void			/* Little-endian scalar (non-SSE2) unpack of 64 two-bit values (4 words x 16 fields) */
+unpack_02 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;	/* Base of the packed words; in is re-pointed for each column */
+  /* One 32-bit word per column: bitpack[0] through bitpack[3] */
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+    /* Emit the word's 16 two-bit fields, lowest bits first; % (1U << 2) keeps the low 2 bits of each shifted value */
+    *out = ( (*in) >>  0  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  2  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  6  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  10  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  14  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  18  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  22  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  26  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 2 ) ;
+    out++;
+    *out = ( (*in) >>  30  )   % (1U << 2 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#else
 static void
 unpack_02_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     __m128i InReg = _mm_load_si128(in);
@@ -831,56 +910,11 @@ unpack_02_rev_8_2 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     return;
 }
 
-#else
-
-static void
-unpack_02 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  2  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  6  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  10  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  14  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  18  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  22  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  26  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 2 ) ;
-    out++;
-    *out = ( (*in) >>  30  )   % (1U << 2 ) ;
-    out++;
-  }
-
-  return;
-}
 #endif
 
 
 
+
 #ifdef ALLOW_ODD_PACKSIZES
 static void
 unpack_03 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
@@ -959,7 +993,50 @@ unpack_03 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
+
+#ifdef WORDS_BIGENDIAN
+static void
+unpack_04 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  UINT4 outer, inwordpointer;
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    for (outer = 0; outer < 2 ; outer++) {
+      for (inwordpointer = 0; inwordpointer < 32; inwordpointer +=  4) {
+	*(out++) = ( Bigendian_convert_uint(*in) >> inwordpointer )   % (1U << 4 ) ;
+      }
+      in += 4;
+    }
+  }
+
+  return;
+}
+
+#elif !defined(HAVE_SSE2)
+static void
+unpack_04 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  UINT4 outer, inwordpointer;
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    for (outer = 0; outer < 2 ; outer++) {
+      for (inwordpointer = 0; inwordpointer < 32; inwordpointer +=  4) {
+	*(out++) = ( (*in) >> inwordpointer )   % (1U << 4 ) ;
+      }
+      in += 4;
+    }
+  }
+
+  return;
+}
+
+#else
 static void
 unpack_04_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     __m128i InReg = _mm_load_si128(in);
@@ -1514,27 +1591,6 @@ unpack_04_rev_8_2 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
     return;
 }
-
-#else
-static void
-unpack_04 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  UINT4 outer, inwordpointer;
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    for (outer = 0; outer < 2 ; outer++) {
-      for (inwordpointer = 0; inwordpointer < 32; inwordpointer +=  4) {
-	*(out++) = ( (*in) >> inwordpointer )   % (1U << 4 ) ;
-      }
-      in += 4;
-    }
-  }
-
-  return;
-}
 #endif
 
 
@@ -1619,34 +1675,135 @@ unpack_05 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
-static void
-unpack_06_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i InReg = _mm_load_si128(in);
-    __m128i OutReg;
-    const __m128i mask6 =  _mm_set1_epi32(63U);
-
-    OutReg = _mm_and_si128( InReg , mask6);
-    _mm_store_si128(out++, OutReg);
-
-    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,6) , mask6);
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,12) , mask6);
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,18) , mask6);
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
 
-    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,24) , mask6);
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+#ifdef WORDS_BIGENDIAN
+static void
+unpack_06 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    OutReg =   _mm_srli_epi32(InReg,30) ;
-    InReg = _mm_load_si128(++in);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  6  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  18  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  30  )   % (1U << 6 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 6 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  10  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  22  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 6 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 2 ))<<( 6 - 2 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  2  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  14  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 6 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  26  )   % (1U << 6 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#elif !defined(HAVE_SSE2)
+static void
+unpack_06 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( (*in) >>  0  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  6  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  18  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  30  )   % (1U << 6 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 6 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  10  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  22  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 6 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 2 ))<<( 6 - 2 );
+    out++;
+    *out = ( (*in) >>  2  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  14  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 6 ) ;
+    out++;
+    *out = ( (*in) >>  26  )   % (1U << 6 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#else
+static void
+unpack_06_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i InReg = _mm_load_si128(in);
+    __m128i OutReg;
+    const __m128i mask6 =  _mm_set1_epi32(63U);
+
+    OutReg = _mm_and_si128( InReg , mask6);
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,6) , mask6);
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,12) , mask6);
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,18) , mask6);
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,24) , mask6);
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,30) ;
+    InReg = _mm_load_si128(++in);
 
     OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask6));
     /* total = _mm_add_epi32(total, OutReg); */
@@ -2204,56 +2361,6 @@ unpack_06_rev_8_2 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
     return;
 }
-
-#else
-static void
-unpack_06 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  6  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  18  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  30  )   % (1U << 6 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 6 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  10  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  22  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 6 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 2 ))<<( 6 - 2 );
-    out++;
-    *out = ( (*in) >>  2  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  14  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 6 ) ;
-    out++;
-    *out = ( (*in) >>  26  )   % (1U << 6 ) ;
-    out++;
-  }
-
-  return;
-}
 #endif
 
 
@@ -2342,7 +2449,49 @@ unpack_07 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
 
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
+static void
+unpack_08 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  UINT4 outer, inwordpointer;
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    for (outer = 0; outer < 4; outer++) {
+      for (inwordpointer = 0; inwordpointer < 32; inwordpointer += 8) {
+	*(out++) = ( Bigendian_convert_uint(*in) >> inwordpointer )   % (1U << 8 ) ;
+      }
+      in += 4;
+    }
+  }
+
+  return;
+}
+
+#elif !defined(HAVE_SSE2)
+static void
+unpack_08 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  UINT4 outer, inwordpointer;
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    for (outer = 0; outer < 4; outer++) {
+      for (inwordpointer = 0; inwordpointer < 32; inwordpointer += 8) {
+	*(out++) = ( (*in) >> inwordpointer )   % (1U << 8 ) ;
+      }
+      in += 4;
+    }
+  }
+
+  return;
+}
+
+#else
 static void
 unpack_08_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     __m128i InReg = _mm_load_si128(in);
@@ -2933,28 +3082,6 @@ unpack_08_rev_8_2 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
     return;
 }
-
-
-#else
-static void
-unpack_08 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  UINT4 outer, inwordpointer;
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    for (outer = 0; outer < 4; outer++) {
-      for (inwordpointer = 0; inwordpointer < 32; inwordpointer += 8) {
-	*(out++) = ( (*in) >> inwordpointer )   % (1U << 8 ) ;
-      }
-      in += 4;
-    }
-  }
-
-  return;
-}
 #endif
 
 
@@ -3047,25 +3174,133 @@ unpack_09 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
 
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
 static void
-unpack_10_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i InReg = _mm_load_si128(in);
-    __m128i OutReg;
-    const __m128i mask10 =  _mm_set1_epi32(1023U);
+unpack_10 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    OutReg = _mm_and_si128( InReg , mask10);
-    _mm_store_si128(out++, OutReg);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
 
-    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,10) , mask10);
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 10 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  10  )   % (1U << 10 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 10 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  30  )   % (1U << 10 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 10 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 10 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  18  )   % (1U << 10 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 10 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 6 ))<<( 10 - 6 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  6  )   % (1U << 10 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 10 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  26  )   % (1U << 10 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 10 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 10 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  14  )   % (1U << 10 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 10 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 2 ))<<( 10 - 2 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  2  )   % (1U << 10 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 10 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  22  )   % (1U << 10 ) ;
+    out++;
 
-    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,20) , mask10);
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+  }
+  return;
+}
 
-    OutReg =   _mm_srli_epi32(InReg,30) ;
+#elif !defined(HAVE_SSE2)
+static void
+unpack_10 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( (*in) >>  0  )   % (1U << 10 ) ;
+    out++;
+    *out = ( (*in) >>  10  )   % (1U << 10 ) ;
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 10 ) ;
+    out++;
+    *out = ( (*in) >>  30  )   % (1U << 10 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 10 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 10 ) ;
+    out++;
+    *out = ( (*in) >>  18  )   % (1U << 10 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 10 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 6 ))<<( 10 - 6 );
+    out++;
+    *out = ( (*in) >>  6  )   % (1U << 10 ) ;
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 10 ) ;
+    out++;
+    *out = ( (*in) >>  26  )   % (1U << 10 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 10 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 10 ) ;
+    out++;
+    *out = ( (*in) >>  14  )   % (1U << 10 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 10 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 2 ))<<( 10 - 2 );
+    out++;
+    *out = ( (*in) >>  2  )   % (1U << 10 ) ;
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 10 ) ;
+    out++;
+    *out = ( (*in) >>  22  )   % (1U << 10 ) ;
+    out++;
+
+  }
+  return;
+}
+
+#else
+static void
+unpack_10_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i InReg = _mm_load_si128(in);
+    __m128i OutReg;
+    const __m128i mask10 =  _mm_set1_epi32(1023U);
+
+    OutReg = _mm_and_si128( InReg , mask10);
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,10) , mask10);
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,20) , mask10);
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,30) ;
     InReg = _mm_load_si128(++in);
 
     OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask10));
@@ -3676,60 +3911,6 @@ unpack_10_rev_8_2 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
     return;
 }
-
-#else
-static void
-unpack_10 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 10 ) ;
-    out++;
-    *out = ( (*in) >>  10  )   % (1U << 10 ) ;
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 10 ) ;
-    out++;
-    *out = ( (*in) >>  30  )   % (1U << 10 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 10 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 10 ) ;
-    out++;
-    *out = ( (*in) >>  18  )   % (1U << 10 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 10 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 6 ))<<( 10 - 6 );
-    out++;
-    *out = ( (*in) >>  6  )   % (1U << 10 ) ;
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 10 ) ;
-    out++;
-    *out = ( (*in) >>  26  )   % (1U << 10 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 10 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 10 ) ;
-    out++;
-    *out = ( (*in) >>  14  )   % (1U << 10 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 10 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 2 ))<<( 10 - 2 );
-    out++;
-    *out = ( (*in) >>  2  )   % (1U << 10 ) ;
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 10 ) ;
-    out++;
-    *out = ( (*in) >>  22  )   % (1U << 10 ) ;
-    out++;
-
-  }
-  return;
-}
 #endif
 
 
@@ -3823,7 +4004,117 @@ unpack_11 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
+static void
+unpack_12 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 12 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 12 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 12 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 12 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 12 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 12 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 12 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 12 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 12 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 12 ) ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 12 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 12 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 12 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 12 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 12 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 12 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 12 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 12 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 12 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 12 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#elif !defined(HAVE_SSE2)
+static void
+unpack_12 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( (*in) >>  0  )   % (1U << 12 ) ;
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 12 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 12 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 12 ) ;
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 12 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 12 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 12 ) ;
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 12 ) ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   % (1U << 12 ) ;
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 12 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 12 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 12 ) ;
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 12 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 12 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 12 ) ;
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 12 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#else
 static void
 unpack_12_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     __m128i InReg = _mm_load_si128(in);
@@ -4454,73 +4745,18 @@ unpack_12_rev_8_2 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
     return;
 }
+#endif
 
-#else
+
+#ifdef ALLOW_ODD_PACKSIZES
 static void
-unpack_12 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
+unpack_13 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i InReg = _mm_load_si128(in);
+    __m128i OutReg;
+    const __m128i mask13 =  _mm_set1_epi32(8191U);
 
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 12 ) ;
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 12 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 12 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 12 ) ;
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 12 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 12 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 12 ) ;
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 12 ) ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   % (1U << 12 ) ;
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 12 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 12 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 12 ) ;
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 12 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 12 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 12 ) ;
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 12 ) ;
-    out++;
-  }
-
-  return;
-}
-#endif
-
-
-#ifdef ALLOW_ODD_PACKSIZES
-static void
-unpack_13 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i InReg = _mm_load_si128(in);
-    __m128i OutReg;
-    const __m128i mask13 =  _mm_set1_epi32(8191U);
-
-    OutReg = _mm_and_si128( InReg , mask13);
-    _mm_store_si128(out++, OutReg);
+    OutReg = _mm_and_si128( InReg , mask13);
+    _mm_store_si128(out++, OutReg);
 
     OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,13) , mask13);
     /* total = _mm_add_epi32(total, OutReg); */
@@ -4605,7 +4841,123 @@ unpack_13 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
+static void
+unpack_14 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 14 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  14  )   % (1U << 14 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 10 ))<<( 14 - 10 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  10  )   % (1U << 14 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 6 ))<<( 14 - 6 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  6  )   % (1U << 14 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 2 ))<<( 14 - 2 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  2  )   % (1U << 14 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 14 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  30  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 12 ))<<( 14 - 12 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 14 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  26  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 14 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 14 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  22  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 14 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 14 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  18  )   % (1U << 14 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#elif !defined(HAVE_SSE2)
+static void
+unpack_14 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( (*in) >>  0  )   % (1U << 14 ) ;
+    out++;
+    *out = ( (*in) >>  14  )   % (1U << 14 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 );
+    out++;
+    *out = ( (*in) >>  10  )   % (1U << 14 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 );
+    out++;
+    *out = ( (*in) >>  6  )   % (1U << 14 ) ;
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 2 ))<<( 14 - 2 );
+    out++;
+    *out = ( (*in) >>  2  )   % (1U << 14 ) ;
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 14 ) ;
+    out++;
+    *out = ( (*in) >>  30  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 12 ))<<( 14 - 12 );
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 14 ) ;
+    out++;
+    *out = ( (*in) >>  26  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 14 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 14 ) ;
+    out++;
+    *out = ( (*in) >>  22  )   % (1U << 14 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 14 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 14 ) ;
+    out++;
+    *out = ( (*in) >>  18  )   % (1U << 14 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#else
 static void
 unpack_14_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     __m128i InReg = _mm_load_si128(in);
@@ -5263,64 +5615,6 @@ unpack_14_rev_8_2 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
     return;
 }
-
-#else
-static void
-unpack_14 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 14 ) ;
-    out++;
-    *out = ( (*in) >>  14  )   % (1U << 14 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 14 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 );
-    out++;
-    *out = ( (*in) >>  10  )   % (1U << 14 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 14 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 );
-    out++;
-    *out = ( (*in) >>  6  )   % (1U << 14 ) ;
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 14 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 2 ))<<( 14 - 2 );
-    out++;
-    *out = ( (*in) >>  2  )   % (1U << 14 ) ;
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 14 ) ;
-    out++;
-    *out = ( (*in) >>  30  )   % (1U << 14 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 12 ))<<( 14 - 12 );
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 14 ) ;
-    out++;
-    *out = ( (*in) >>  26  )   % (1U << 14 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 14 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 14 ) ;
-    out++;
-    *out = ( (*in) >>  22  )   % (1U << 14 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 14 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 14 ) ;
-    out++;
-    *out = ( (*in) >>  18  )   % (1U << 14 ) ;
-    out++;
-  }
-
-  return;
-}
 #endif
 
 
@@ -5421,7 +5715,50 @@ unpack_15 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
+
+#ifdef WORDS_BIGENDIAN
+static void
+unpack_16 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  UINT4 outer, inwordpointer;
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    for (outer = 0; outer < 8; outer++) {
+      for(inwordpointer =  0; inwordpointer <32; inwordpointer += 16) {
+	*(out++) = ( Bigendian_convert_uint(*in) >> inwordpointer )   % (1U << 16 ) ;
+      }
+      in += 4;
+    }
+  }
+
+  return;
+}
+
+#elif !defined(HAVE_SSE2)
+static void
+unpack_16 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  UINT4 outer, inwordpointer;
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    for (outer = 0; outer < 8; outer++) {
+      for(inwordpointer =  0; inwordpointer <32; inwordpointer += 16) {
+	*(out++) = ( (*in) >> inwordpointer )   % (1U << 16 ) ;
+      }
+      in += 4;
+    }
+  }
+
+  return;
+}
+
+#else
 static void
 unpack_16_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
     __m128i InReg = _mm_load_si128(in);
@@ -6063,27 +6400,6 @@ unpack_16_rev_8_2 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
     return;
 }
-
-#else
-static void
-unpack_16 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  UINT4 outer, inwordpointer;
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    for (outer = 0; outer < 8; outer++) {
-      for(inwordpointer =  0; inwordpointer <32; inwordpointer += 16) {
-	*(out++) = ( (*in) >> inwordpointer )   % (1U << 16 ) ;
-      }
-      in += 4;
-    }
-  }
-
-  return;
-}
 #endif
 
 
@@ -6186,15 +6502,140 @@ unpack_17 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
-static void
-unpack_18_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i InReg = _mm_load_si128(in);
-    __m128i OutReg;
-    const __m128i mask18 =  _mm_set1_epi32(262143U);
 
-    OutReg = _mm_and_si128( InReg , mask18);
-    _mm_store_si128(out++, OutReg);
+#ifdef WORDS_BIGENDIAN
+static void
+unpack_18 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 18 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  18  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 18 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 18 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  22  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 18 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 18 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  26  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 12 ))<<( 18 - 12 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 18 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  30  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 18 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 2 ))<<( 18 - 2 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  2  )   % (1U << 18 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 6 ))<<( 18 - 6 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  6  )   % (1U << 18 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 10 ))<<( 18 - 10 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  10  )   % (1U << 18 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 14 ))<<( 18 - 14 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  14  )   % (1U << 18 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#elif !defined(HAVE_SSE2)
+static void
+unpack_18 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( (*in) >>  0  )   % (1U << 18 ) ;
+    out++;
+    *out = ( (*in) >>  18  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 18 ) ;
+    out++;
+    *out = ( (*in) >>  22  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 18 ) ;
+    out++;
+    *out = ( (*in) >>  26  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 );
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 18 ) ;
+    out++;
+    *out = ( (*in) >>  30  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 );
+    out++;
+    *out = ( (*in) >>  2  )   % (1U << 18 ) ;
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 );
+    out++;
+    *out = ( (*in) >>  6  )   % (1U << 18 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 );
+    out++;
+    *out = ( (*in) >>  10  )   % (1U << 18 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 18 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 );
+    out++;
+    *out = ( (*in) >>  14  )   % (1U << 18 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#else
+static void
+unpack_18_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i InReg = _mm_load_si128(in);
+    __m128i OutReg;
+    const __m128i mask18 =  _mm_set1_epi32(262143U);
+
+    OutReg = _mm_and_si128( InReg , mask18);
+    _mm_store_si128(out++, OutReg);
 
     OutReg =   _mm_srli_epi32(InReg,18) ;
     InReg = _mm_load_si128(++in);
@@ -6872,68 +7313,6 @@ unpack_18_rev_8_2 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
     return;
 }
-
-#else
-static void
-unpack_18 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 18 ) ;
-    out++;
-    *out = ( (*in) >>  18  )   % (1U << 18 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 18 ) ;
-    out++;
-    *out = ( (*in) >>  22  )   % (1U << 18 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 18 ) ;
-    out++;
-    *out = ( (*in) >>  26  )   % (1U << 18 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 );
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 18 ) ;
-    out++;
-    *out = ( (*in) >>  30  )   % (1U << 18 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 18 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 );
-    out++;
-    *out = ( (*in) >>  2  )   % (1U << 18 ) ;
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 18 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 );
-    out++;
-    *out = ( (*in) >>  6  )   % (1U << 18 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 18 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 );
-    out++;
-    *out = ( (*in) >>  10  )   % (1U << 18 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 18 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 );
-    out++;
-    *out = ( (*in) >>  14  )   % (1U << 18 ) ;
-    out++;
-  }
-
-  return;
-}
 #endif
 
 
@@ -7039,64 +7418,191 @@ unpack_19 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
-static void
-unpack_20_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i InReg = _mm_load_si128(in);
-    __m128i OutReg;
-    const __m128i mask20 =  _mm_set1_epi32(1048575U);
-
-    OutReg = _mm_and_si128( InReg , mask20);
-    _mm_store_si128(out++, OutReg);
-
-    OutReg =   _mm_srli_epi32(InReg,20) ;
-    InReg = _mm_load_si128(++in);
-
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask20));
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,8) , mask20);
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg =   _mm_srli_epi32(InReg,28) ;
-    InReg = _mm_load_si128(++in);
-
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask20));
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg =   _mm_srli_epi32(InReg,16) ;
-    InReg = _mm_load_si128(++in);
-
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask20));
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,4) , mask20);
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
 
-    OutReg =   _mm_srli_epi32(InReg,24) ;
-    InReg = _mm_load_si128(++in);
+#ifdef WORDS_BIGENDIAN
+static void
+unpack_20 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask20));
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
 
-    OutReg =   _mm_srli_epi32(InReg,12) ;
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 20 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 20 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 20 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 20 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 20 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 20 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 12 ))<<( 20 - 12 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 20 ) ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 20 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 20 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 20 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 20 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 20 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 20 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 12 ))<<( 20 - 12 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 20 ) ;
+    out++;
+  }
 
-    return;
+  return;
 }
 
+#elif !defined(HAVE_SSE2)
 static void
-unpack_20_fwd_1_3 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i InReg;
-    __m128i total;
-    const __m128i mask20 =  _mm_set1_epi32(1048575U);
+unpack_20 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( (*in) >>  0  )   % (1U << 20 ) ;
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 20 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 20 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 );
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 20 ) ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   % (1U << 20 ) ;
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 20 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 20 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 20 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 );
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 20 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#else
+static void
+unpack_20_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i InReg = _mm_load_si128(in);
+    __m128i OutReg;
+    const __m128i mask20 =  _mm_set1_epi32(1048575U);
+
+    OutReg = _mm_and_si128( InReg , mask20);
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,20) ;
+    InReg = _mm_load_si128(++in);
+
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask20));
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,8) , mask20);
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,28) ;
+    InReg = _mm_load_si128(++in);
+
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask20));
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,16) ;
+    InReg = _mm_load_si128(++in);
+
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask20));
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,4) , mask20);
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,24) ;
+    InReg = _mm_load_si128(++in);
+
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask20));
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,12) ;
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    return;
+}
+
+static void
+unpack_20_fwd_1_3 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i InReg;
+    __m128i total;
+    const __m128i mask20 =  _mm_set1_epi32(1048575U);
 
     /* 1 */
     InReg = _mm_load_si128(in);
@@ -7727,70 +8233,6 @@ unpack_20_rev_8_2 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
     return;
 }
-
-
-#else
-static void
-unpack_20 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 20 ) ;
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 20 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 20 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 20 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 20 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 20 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 20 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 );
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 20 ) ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   % (1U << 20 ) ;
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 20 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 20 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 20 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 20 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 20 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 20 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 );
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 20 ) ;
-    out++;
-  }
-
-  return;
-}
 #endif
 
 
@@ -7899,49 +8341,181 @@ unpack_21 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
 static void
-unpack_22_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i InReg = _mm_load_si128(in);
-    __m128i OutReg;
-    const __m128i mask22 =  _mm_set1_epi32(4194303U);
-
-    OutReg = _mm_and_si128( InReg , mask22);
-    _mm_store_si128(out++, OutReg);
-
-    OutReg =   _mm_srli_epi32(InReg,22) ;
-    InReg = _mm_load_si128(++in);
-
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask22));
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg =   _mm_srli_epi32(InReg,12) ;
-    InReg = _mm_load_si128(++in);
-
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask22));
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+unpack_22 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,2) , mask22);
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
 
-    OutReg =   _mm_srli_epi32(InReg,24) ;
-    InReg = _mm_load_si128(++in);
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 22 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  22  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 12 ))<<( 22 - 12 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 2 ))<<( 22 - 2 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  2  )   % (1U << 22 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 14 ))<<( 22 - 14 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  14  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 22 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 22 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  26  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 22 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 6 ))<<( 22 - 6 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  6  )   % (1U << 22 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 18 ))<<( 22 - 18 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  18  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 22 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 22 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  30  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 20 ))<<( 22 - 20 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 10 ))<<( 22 - 10 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  10  )   % (1U << 22 ) ;
+    out++;
+  }
 
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask22));
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+  return;
+}
 
-    OutReg =   _mm_srli_epi32(InReg,14) ;
-    InReg = _mm_load_si128(++in);
+#elif !defined(HAVE_SSE2)
+static void
+unpack_22 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask22));
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
 
-    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,4) , mask22);
+    *out = ( (*in) >>  0  )   % (1U << 22 ) ;
+    out++;
+    *out = ( (*in) >>  22  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 );
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 );
+    out++;
+    *out = ( (*in) >>  2  )   % (1U << 22 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 );
+    out++;
+    *out = ( (*in) >>  14  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 22 ) ;
+    out++;
+    *out = ( (*in) >>  26  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 );
+    out++;
+    *out = ( (*in) >>  6  )   % (1U << 22 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 );
+    out++;
+    *out = ( (*in) >>  18  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 22 ) ;
+    out++;
+    *out = ( (*in) >>  30  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 20 ))<<( 22 - 20 );
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 22 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 );
+    out++;
+    *out = ( (*in) >>  10  )   % (1U << 22 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#else
+static void
+unpack_22_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i InReg = _mm_load_si128(in);
+    __m128i OutReg;
+    const __m128i mask22 =  _mm_set1_epi32(4194303U);
+
+    OutReg = _mm_and_si128( InReg , mask22);
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,22) ;
+    InReg = _mm_load_si128(++in);
+
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask22));
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,12) ;
+    InReg = _mm_load_si128(++in);
+
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask22));
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,2) , mask22);
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,24) ;
+    InReg = _mm_load_si128(++in);
+
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask22));
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,14) ;
+    InReg = _mm_load_si128(++in);
+
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask22));
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128(  _mm_srli_epi32(InReg,4) , mask22);
     /* total = _mm_add_epi32(total, OutReg); */
     _mm_store_si128(out++, OutReg);
 
@@ -8610,72 +9184,6 @@ unpack_22_rev_8_2 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
     return;
 }
-
-#else
-static void
-unpack_22 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 22 ) ;
-    out++;
-    *out = ( (*in) >>  22  )   % (1U << 22 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 );
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 22 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 );
-    out++;
-    *out = ( (*in) >>  2  )   % (1U << 22 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 22 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 );
-    out++;
-    *out = ( (*in) >>  14  )   % (1U << 22 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 22 ) ;
-    out++;
-    *out = ( (*in) >>  26  )   % (1U << 22 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 22 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 );
-    out++;
-    *out = ( (*in) >>  6  )   % (1U << 22 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 22 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 );
-    out++;
-    *out = ( (*in) >>  18  )   % (1U << 22 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 22 ) ;
-    out++;
-    *out = ( (*in) >>  30  )   % (1U << 22 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 20 ))<<( 22 - 20 );
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 22 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 );
-    out++;
-    *out = ( (*in) >>  10  )   % (1U << 22 ) ;
-    out++;
-  }
-
-  return;
-}
 #endif
 
 
@@ -8787,68 +9295,199 @@ unpack_23 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
-static void
-unpack_24_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i InReg = _mm_load_si128(in);
-    __m128i OutReg;
-    const __m128i mask24 =  _mm_set1_epi32(16777215U);
-
-    OutReg = _mm_and_si128( InReg , mask24);
-    _mm_store_si128(out++, OutReg);
-
-    OutReg =   _mm_srli_epi32(InReg,24) ;
-    InReg = _mm_load_si128(++in);
-
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask24));
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg =   _mm_srli_epi32(InReg,16) ;
-    InReg = _mm_load_si128(++in);
-
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask24));
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg =   _mm_srli_epi32(InReg,8) ;
-    InReg = _mm_load_si128(++in);
-
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg = _mm_and_si128( InReg , mask24);
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg =   _mm_srli_epi32(InReg,24) ;
-    InReg = _mm_load_si128(++in);
-
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask24));
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
 
-    OutReg =   _mm_srli_epi32(InReg,16) ;
-    InReg = _mm_load_si128(++in);
+#ifdef WORDS_BIGENDIAN
+static void
+unpack_24 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask24));
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
 
-    OutReg =   _mm_srli_epi32(InReg,8) ;
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 24 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 24 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 24 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 24 ) ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 24 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 24 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 24 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 24 ) ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 24 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 24 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 24 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 24 ) ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 24 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 24 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 24 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 24 ) ;
+    out++;
+  }
 
-    return;
+  return;
 }
 
+#elif !defined(HAVE_SSE2)
 static void
-unpack_24_fwd_1_3 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i InReg;
-    __m128i OutReg, total;
-    const __m128i mask24 =  _mm_set1_epi32(16777215U);
-
-    /* 1 */
+unpack_24 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( (*in) >>  0  )   % (1U << 24 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 24 ) ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   % (1U << 24 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 24 ) ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   % (1U << 24 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 24 ) ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   % (1U << 24 ) ;
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 24 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 24 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#else
+static void
+unpack_24_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i InReg = _mm_load_si128(in);
+    __m128i OutReg;
+    const __m128i mask24 =  _mm_set1_epi32(16777215U);
+
+    OutReg = _mm_and_si128( InReg , mask24);
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,24) ;
+    InReg = _mm_load_si128(++in);
+
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask24));
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,16) ;
+    InReg = _mm_load_si128(++in);
+
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask24));
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,8) ;
+    InReg = _mm_load_si128(++in);
+
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_and_si128( InReg , mask24);
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,24) ;
+    InReg = _mm_load_si128(++in);
+
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask24));
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,16) ;
+    InReg = _mm_load_si128(++in);
+
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask24));
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,8) ;
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    return;
+}
+
+static void
+unpack_24_fwd_1_3 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i InReg;
+    __m128i OutReg, total;
+    const __m128i mask24 =  _mm_set1_epi32(16777215U);
+
+    /* 1 */
     InReg = _mm_load_si128(in);
 
     total = /* OutReg = */ _mm_and_si128( InReg , mask24);
@@ -9489,72 +10128,6 @@ unpack_24_rev_8_2 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
     return;
 }
-
-
-#else
-static void
-unpack_24 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 24 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 24 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 24 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 24 ) ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   % (1U << 24 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 24 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 24 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 24 ) ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   % (1U << 24 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 24 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 24 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 24 ) ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   % (1U << 24 ) ;
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 24 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 24 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 24 ) ;
-    out++;
-  }
-
-  return;
-}
 #endif
 
 
@@ -9669,32 +10242,173 @@ unpack_25 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
-static void
-unpack_26_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i InReg = _mm_load_si128(in);
-    __m128i OutReg;
-    const __m128i mask26 =  _mm_set1_epi32(67108863U);
-
-    OutReg = _mm_and_si128( InReg , mask26);
-    _mm_store_si128(out++, OutReg);
 
-    OutReg =   _mm_srli_epi32(InReg,26) ;
-    InReg = _mm_load_si128(++in);
+#ifdef WORDS_BIGENDIAN
+static void
+unpack_26 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask26));
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
 
-    OutReg =   _mm_srli_epi32(InReg,20) ;
-    InReg = _mm_load_si128(++in);
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 26 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  26  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 20 ))<<( 26 - 20 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 14 ))<<( 26 - 14 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  14  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 26 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 2 ))<<( 26 - 2 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  2  )   % (1U << 26 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 22 ))<<( 26 - 22 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  22  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 26 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 10 ))<<( 26 - 10 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  10  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 26 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 26 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  30  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 24 ))<<( 26 - 24 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 18 ))<<( 26 - 18 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  18  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 12 ))<<( 26 - 12 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 6 ))<<( 26 - 6 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  6  )   % (1U << 26 ) ;
+    out++;
+  }
+  
+  return;
+}
 
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask26));
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+#elif !defined(HAVE_SSE2)
+static void
+unpack_26 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    OutReg =   _mm_srli_epi32(InReg,14) ;
-    InReg = _mm_load_si128(++in);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( (*in) >>  0  )   % (1U << 26 ) ;
+    out++;
+    *out = ( (*in) >>  26  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 );
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 );
+    out++;
+    *out = ( (*in) >>  14  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 );
+    out++;
+    *out = ( (*in) >>  2  )   % (1U << 26 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 );
+    out++;
+    *out = ( (*in) >>  22  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 );
+    out++;
+    *out = ( (*in) >>  10  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 26 ) ;
+    out++;
+    *out = ( (*in) >>  30  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 );
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 );
+    out++;
+    *out = ( (*in) >>  18  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 );
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 26 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 );
+    out++;
+    *out = ( (*in) >>  6  )   % (1U << 26 ) ;
+    out++;
+  }
+  
+  return;
+}
+
+#else
+static void
+unpack_26_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i InReg = _mm_load_si128(in);
+    __m128i OutReg;
+    const __m128i mask26 =  _mm_set1_epi32(67108863U);
+
+    OutReg = _mm_and_si128( InReg , mask26);
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,26) ;
+    InReg = _mm_load_si128(++in);
+
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask26));
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,20) ;
+    InReg = _mm_load_si128(++in);
+
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask26));
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,14) ;
+    InReg = _mm_load_si128(++in);
 
     OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask26));
     /* total = _mm_add_epi32(total, OutReg); */
@@ -10404,76 +11118,6 @@ unpack_26_rev_8_2 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
     return;
 }
-
-#else
-static void
-unpack_26 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 26 ) ;
-    out++;
-    *out = ( (*in) >>  26  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 );
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 );
-    out++;
-    *out = ( (*in) >>  14  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 );
-    out++;
-    *out = ( (*in) >>  2  )   % (1U << 26 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 );
-    out++;
-    *out = ( (*in) >>  22  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 );
-    out++;
-    *out = ( (*in) >>  10  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 26 ) ;
-    out++;
-    *out = ( (*in) >>  30  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 );
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 );
-    out++;
-    *out = ( (*in) >>  18  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 );
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 26 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 );
-    out++;
-    *out = ( (*in) >>  6  )   % (1U << 26 ) ;
-    out++;
-  }
-  
-  return;
-}
 #endif
 
 
@@ -10591,72 +11235,215 @@ unpack_27 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
-static void
-unpack_28_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i InReg = _mm_load_si128(in);
-    __m128i OutReg;
-    const __m128i mask28 =  _mm_set1_epi32(268435455U);
-
-    OutReg = _mm_and_si128( InReg , mask28);
-    _mm_store_si128(out++, OutReg);
-
-    OutReg =   _mm_srli_epi32(InReg,28) ;
-    InReg = _mm_load_si128(++in);
-
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask28));
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg =   _mm_srli_epi32(InReg,24) ;
-    InReg = _mm_load_si128(++in);
-
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask28));
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg =   _mm_srli_epi32(InReg,20) ;
-    InReg = _mm_load_si128(++in);
-
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask28));
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg =   _mm_srli_epi32(InReg,16) ;
-    InReg = _mm_load_si128(++in);
-
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask28));
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg =   _mm_srli_epi32(InReg,12) ;
-    InReg = _mm_load_si128(++in);
-
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask28));
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
 
-    OutReg =   _mm_srli_epi32(InReg,8) ;
-    InReg = _mm_load_si128(++in);
+#ifdef WORDS_BIGENDIAN
+static void
+unpack_28 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask28));
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
 
-    OutReg =   _mm_srli_epi32(InReg,4) ;
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 28 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 24 ))<<( 28 - 24 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 20 ))<<( 28 - 20 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 28 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 12 ))<<( 28 - 12 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 28 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 28 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 28 ) ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 28 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 24 ))<<( 28 - 24 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 20 ))<<( 28 - 20 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 28 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 12 ))<<( 28 - 12 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 28 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 28 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 28 ) ;
+    out++;
+  }
 
-    return;
+  return;
 }
 
+#elif !defined(HAVE_SSE2)
 static void
-unpack_28_fwd_1_3 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i InReg;
-    __m128i OutReg, total;
-    const __m128i mask28 =  _mm_set1_epi32(268435455U);
+unpack_28 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    /* 1 */
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( (*in) >>  0  )   % (1U << 28 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 );
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 );
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 );
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 28 ) ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   % (1U << 28 ) ;
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 );
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 );
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 );
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 28 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 28 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#else
+static void
+unpack_28_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i InReg = _mm_load_si128(in);
+    __m128i OutReg;
+    const __m128i mask28 =  _mm_set1_epi32(268435455U);
+
+    OutReg = _mm_and_si128( InReg , mask28);
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,28) ;
+    InReg = _mm_load_si128(++in);
+
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask28));
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,24) ;
+    InReg = _mm_load_si128(++in);
+
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask28));
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,20) ;
+    InReg = _mm_load_si128(++in);
+
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask28));
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,16) ;
+    InReg = _mm_load_si128(++in);
+
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask28));
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,12) ;
+    InReg = _mm_load_si128(++in);
+
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask28));
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,8) ;
+    InReg = _mm_load_si128(++in);
+
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask28));
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,4) ;
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    return;
+}
+
+static void
+unpack_28_fwd_1_3 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i InReg;
+    __m128i OutReg, total;
+    const __m128i mask28 =  _mm_set1_epi32(268435455U);
+
+    /* 1 */
     InReg = _mm_load_si128(in);
 
     total = /* OutReg = */ _mm_and_si128( InReg , mask28);
@@ -11328,77 +12115,6 @@ unpack_28_rev_8_2 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
     return;
 }
-
-#else
-static void
-unpack_28 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 28 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 );
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 );
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 );
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 28 ) ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   % (1U << 28 ) ;
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 );
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 );
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 );
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 28 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 28 ) ;
-    out++;
-  }
-
-  return;
-}
 #endif
 
 
@@ -11519,25 +12235,174 @@ unpack_29 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
-static void
-unpack_30_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i InReg = _mm_load_si128(in);
-    __m128i OutReg;
-    const __m128i mask30 =  _mm_set1_epi32(1073741823U);
 
-    OutReg = _mm_and_si128( InReg , mask30);
-    _mm_store_si128(out++, OutReg);
+#ifdef WORDS_BIGENDIAN
+static void
+unpack_30 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    OutReg =   _mm_srli_epi32(InReg,30) ;
-    InReg = _mm_load_si128(++in);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
 
-    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask30));
-    /* total = _mm_add_epi32(total, OutReg); */
-    _mm_store_si128(out++, OutReg);
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   % (1U << 30 ) ;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  30  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 28 ))<<( 30 - 28 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  28  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 26 ))<<( 30 - 26 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  26  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 24 ))<<( 30 - 24 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  24  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 22 ))<<( 30 - 22 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  22  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 20 ))<<( 30 - 20 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  20  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 18 ))<<( 30 - 18 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  18  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 16 ))<<( 30 - 16 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  16  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 14 ))<<( 30 - 14 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  14  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 12 ))<<( 30 - 12 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  12  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 10 ))<<( 30 - 10 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  10  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 8 ))<<( 30 - 8 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  8  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 6 ))<<( 30 - 6 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  6  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 4 ))<<( 30 - 4 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  4  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= (Bigendian_convert_uint(*in) % (1U<< 2 ))<<( 30 - 2 );
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  2  )   % (1U << 30 ) ;
+    out++;
+  }
 
-    OutReg =   _mm_srli_epi32(InReg,28) ;
-    InReg = _mm_load_si128(++in);
+  return;
+}
+
+#elif !defined(HAVE_SSE2)
+static void
+unpack_30 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
+
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
+
+    *out = ( (*in) >>  0  )   % (1U << 30 ) ;
+    out++;
+    *out = ( (*in) >>  30  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 );
+    out++;
+    *out = ( (*in) >>  28  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 );
+    out++;
+    *out = ( (*in) >>  26  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 );
+    out++;
+    *out = ( (*in) >>  24  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 );
+    out++;
+    *out = ( (*in) >>  22  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 );
+    out++;
+    *out = ( (*in) >>  20  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 );
+    out++;
+    *out = ( (*in) >>  18  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 );
+    out++;
+    *out = ( (*in) >>  16  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 );
+    out++;
+    *out = ( (*in) >>  14  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 );
+    out++;
+    *out = ( (*in) >>  12  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 );
+    out++;
+    *out = ( (*in) >>  10  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 );
+    out++;
+    *out = ( (*in) >>  8  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 );
+    out++;
+    *out = ( (*in) >>  6  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 );
+    out++;
+    *out = ( (*in) >>  4  )   % (1U << 30 ) ;
+    in += 4;
+    *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 );
+    out++;
+    *out = ( (*in) >>  2  )   % (1U << 30 ) ;
+    out++;
+  }
+
+  return;
+}
+
+#else
+static void
+unpack_30_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i InReg = _mm_load_si128(in);
+    __m128i OutReg;
+    const __m128i mask30 =  _mm_set1_epi32(1073741823U);
+
+    OutReg = _mm_and_si128( InReg , mask30);
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,30) ;
+    InReg = _mm_load_si128(++in);
+
+    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask30));
+    /* total = _mm_add_epi32(total, OutReg); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg =   _mm_srli_epi32(InReg,28) ;
+    InReg = _mm_load_si128(++in);
 
     OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask30));
     /* total = _mm_add_epi32(total, OutReg); */
@@ -12280,81 +13145,6 @@ unpack_30_rev_8_2 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
     return;
 }
-
-
-#else
-static void
-unpack_30 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   % (1U << 30 ) ;
-    out++;
-    *out = ( (*in) >>  30  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 );
-    out++;
-    *out = ( (*in) >>  28  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 );
-    out++;
-    *out = ( (*in) >>  26  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 );
-    out++;
-    *out = ( (*in) >>  24  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 );
-    out++;
-    *out = ( (*in) >>  22  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 );
-    out++;
-    *out = ( (*in) >>  20  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 );
-    out++;
-    *out = ( (*in) >>  18  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 );
-    out++;
-    *out = ( (*in) >>  16  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 );
-    out++;
-    *out = ( (*in) >>  14  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 );
-    out++;
-    *out = ( (*in) >>  12  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 );
-    out++;
-    *out = ( (*in) >>  10  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 );
-    out++;
-    *out = ( (*in) >>  8  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 );
-    out++;
-    *out = ( (*in) >>  6  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 );
-    out++;
-    *out = ( (*in) >>  4  )   % (1U << 30 ) ;
-    in += 4;
-    *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 );
-    out++;
-    *out = ( (*in) >>  2  )   % (1U << 30 ) ;
-    out++;
-  }
-
-  return;
-}
 #endif
 
 
@@ -12479,59 +13269,181 @@ unpack_31 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
 
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
 static void
-unpack_32_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i OutReg;
-
-    OutReg = _mm_load_si128(in++);
-    _mm_store_si128(out++, OutReg);
-
-    OutReg = _mm_load_si128(in++);
-    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg = _mm_load_si128(in++);
-    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg = _mm_load_si128(in++);
-    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg = _mm_load_si128(in++);
-    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
-    _mm_store_si128(out++, OutReg);
-
-    OutReg = _mm_load_si128(in++);
-    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
-    _mm_store_si128(out++, OutReg);
+unpack_32 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    OutReg = _mm_load_si128(in++);
-    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
-    _mm_store_si128(out++, OutReg);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
 
-    OutReg = _mm_load_si128(in++);
-    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
-    _mm_store_si128(out++, OutReg);
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( Bigendian_convert_uint(*in) >>  0  )   ;
+    out++;
+  }
 
-    return;
+  return;
 }
 
+#elif !defined(HAVE_SSE2)
 static void
-unpack_32_fwd_1_3 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
-    __m128i total;
+unpack_32 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
+  unsigned int column;
+  const UINT4 *bitpack = in;
 
-    /* 1 */
-    total = _mm_load_si128(in);
-    _mm_store_si128(out++, total);
+  for (column = 0; column < 4; column++) {
+    in = &(bitpack[column]);
 
-    /* Skip row */
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
     out++;
-
-    /* 3 */
-    in += 2;
-    total = _mm_load_si128(in);
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    in += 4;
+    out++;
+    *out = ( (*in) >>  0  )   ;
+    out++;
+  }
+
+  return;
+}
+
+#else
+static void
+unpack_32_fwd (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i OutReg;
+
+    OutReg = _mm_load_si128(in++);
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_load_si128(in++);
+    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_load_si128(in++);
+    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_load_si128(in++);
+    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_load_si128(in++);
+    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_load_si128(in++);
+    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_load_si128(in++);
+    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
+    _mm_store_si128(out++, OutReg);
+
+    OutReg = _mm_load_si128(in++);
+    /* total = _mm_add_epi32(total, _mm_load_si128(in++)); */
+    _mm_store_si128(out++, OutReg);
+
+    return;
+}
+
+static void
+unpack_32_fwd_1_3 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
+    __m128i total;
+
+    /* 1 */
+    total = _mm_load_si128(in);
+    _mm_store_si128(out++, total);
+
+    /* Skip row */
+    out++;
+
+    /* 3 */
+    in += 2;
+    total = _mm_load_si128(in);
     _mm_store_si128(out++, total);
 
     return;
@@ -12972,69 +13884,6 @@ unpack_32_rev_8_2 (__m128i* __restrict__ out, const __m128i* __restrict__ in) {
 
     return;
 }
-
-
-
-#else
-static void
-unpack_32 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
-  unsigned int column;
-  const UINT4 *bitpack = in;
-
-  for (column = 0; column < 4; column++) {
-    in = &(bitpack[column]);
-
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    in += 4;
-    out++;
-    *out = ( (*in) >>  0  )   ;
-    out++;
-  }
-
-  return;
-}
 #endif
 
 
@@ -13129,345 +13978,10 @@ unpack_32 (UINT4* __restrict__ out, const UINT4* __restrict__ in) {
 #endif
 
 
-#ifdef HAVE_SSE2
-static void
-vertical_order_fwd (UINT4 *vertical, UINT4 *columnar) {
-
-  vertical[0] = columnar[0];		/* remainder 1 */
-  vertical[4] = columnar[1];		/* remainder 5 */
-  vertical[8] = columnar[2];		/* remainder 9 */
-  vertical[12] = columnar[3];		/* remainder 13 */
-  vertical[16] = columnar[4];		/* remainder 17 */
-  vertical[20] = columnar[5];		/* remainder 21 */
-  vertical[24] = columnar[6];		/* remainder 25 */
-  vertical[28] = columnar[7];		/* remainder 29 */
-
-  vertical[1] = columnar[8];		/* remainder 2 */
-  vertical[5] = columnar[9];		/* remainder 6 */
-  vertical[9] = columnar[10];		/* remainder 10 */
-  vertical[13] = columnar[11];		/* remainder 14 */
-  vertical[17] = columnar[12];		/* remainder 18 */
-  vertical[21] = columnar[13];		/* remainder 22 */
-  vertical[25] = columnar[14];		/* remainder 26 */
-  vertical[29] = columnar[15];		/* remainder 30 */
-
-  vertical[2] = columnar[16];		/* remainder 3 */
-  vertical[6] = columnar[17];		/* remainder 7 */
-  vertical[10] = columnar[18];		/* remainder 11 */
-  vertical[14] = columnar[19];		/* remainder 15 */
-  vertical[18] = columnar[20];		/* remainder 19 */
-  vertical[22] = columnar[21];		/* remainder 23 */
-  vertical[26] = columnar[22];		/* remainder 27 */
-  vertical[30] = columnar[23];		/* remainder 31 */
-
-  vertical[3] = columnar[24];		/* remainder 4 */
-  vertical[7] = columnar[25];		/* remainder 8 */
-  vertical[11] = columnar[26];		/* remainder 12 */
-  vertical[15] = columnar[27];		/* remainder 16 */
-  vertical[19] = columnar[28];		/* remainder 20 */
-  vertical[23] = columnar[29];		/* remainder 24 */
-  vertical[27] = columnar[30];		/* remainder 28 */
-  vertical[31] = columnar[31];		/* remainder 32 */
-
-  return;
-}
-
-static void
-vertical_order_rev (UINT4 *vertical, UINT4 *columnar) {
-
-  vertical[0] = columnar[0];		/* remainder 63 */
-  vertical[4] = columnar[1];		/* remainder 59 */
-  vertical[8] = columnar[2];		/* remainder 55 */
-  vertical[12] = columnar[3];		/* remainder 51 */
-  vertical[16] = columnar[4];		/* remainder 47 */
-  vertical[20] = columnar[5];		/* remainder 43 */
-  vertical[24] = columnar[6];		/* remainder 39 */
-  vertical[28] = columnar[7];		/* remainder 35 */
-
-  vertical[1] = columnar[8];		/* remainder 62 */
-  vertical[5] = columnar[9];		/* remainder 58 */
-  vertical[9] = columnar[10];		/* remainder 54 */
-  vertical[13] = columnar[11];		/* remainder 50 */
-  vertical[17] = columnar[12];		/* remainder 46 */
-  vertical[21] = columnar[13];		/* remainder 42 */
-  vertical[25] = columnar[14];		/* remainder 38 */
-  vertical[29] = columnar[15];		/* remainder 34 */
-
-  vertical[2] = columnar[16];		/* remainder 61 */
-  vertical[6] = columnar[17];		/* remainder 57 */
-  vertical[10] = columnar[18];		/* remainder 53 */
-  vertical[14] = columnar[19];		/* remainder 49 */
-  vertical[18] = columnar[20];		/* remainder 45 */
-  vertical[22] = columnar[21];		/* remainder 41 */
-  vertical[26] = columnar[22];		/* remainder 37 */
-  vertical[30] = columnar[23];		/* remainder 33 */
-
-  vertical[3] = columnar[24];		/* remainder 60 */
-  vertical[7] = columnar[25];		/* remainder 56 */
-  vertical[11] = columnar[26];		/* remainder 52 */
-  vertical[15] = columnar[27];		/* remainder 48 */
-  vertical[19] = columnar[28];		/* remainder 44 */
-  vertical[23] = columnar[29];		/* remainder 40 */
-  vertical[27] = columnar[30];		/* remainder 36 */
-  vertical[31] = columnar[31];		/* remainder 32 */
-
-  return;
-}
-
-static void
-vertical_order_huge_fwd (UINT8 *vertical, UINT4 *columnar) {
-
-  vertical[0] = (UINT8) columnar[0];		/* remainder 1 */
-  vertical[4] = (UINT8) columnar[1];		/* remainder 5 */
-  vertical[8] = (UINT8) columnar[2];		/* remainder 9 */
-  vertical[12] = (UINT8) columnar[3];		/* remainder 13 */
-  vertical[16] = (UINT8) columnar[4];		/* remainder 17 */
-  vertical[20] = (UINT8) columnar[5];		/* remainder 21 */
-  vertical[24] = (UINT8) columnar[6];		/* remainder 25 */
-  vertical[28] = (UINT8) columnar[7];		/* remainder 29 */
-
-  vertical[1] = (UINT8) columnar[8];		/* remainder 2 */
-  vertical[5] = (UINT8) columnar[9];		/* remainder 6 */
-  vertical[9] = (UINT8) columnar[10];		/* remainder 10 */
-  vertical[13] = (UINT8) columnar[11];		/* remainder 14 */
-  vertical[17] = (UINT8) columnar[12];		/* remainder 18 */
-  vertical[21] = (UINT8) columnar[13];		/* remainder 22 */
-  vertical[25] = (UINT8) columnar[14];		/* remainder 26 */
-  vertical[29] = (UINT8) columnar[15];		/* remainder 30 */
-
-  vertical[2] = (UINT8) columnar[16];		/* remainder 3 */
-  vertical[6] = (UINT8) columnar[17];		/* remainder 7 */
-  vertical[10] = (UINT8) columnar[18];		/* remainder 11 */
-  vertical[14] = (UINT8) columnar[19];		/* remainder 15 */
-  vertical[18] = (UINT8) columnar[20];		/* remainder 19 */
-  vertical[22] = (UINT8) columnar[21];		/* remainder 23 */
-  vertical[26] = (UINT8) columnar[22];		/* remainder 27 */
-  vertical[30] = (UINT8) columnar[23];		/* remainder 31 */
-
-  vertical[3] = (UINT8) columnar[24];		/* remainder 4 */
-  vertical[7] = (UINT8) columnar[25];		/* remainder 8 */
-  vertical[11] = (UINT8) columnar[26];		/* remainder 12 */
-  vertical[15] = (UINT8) columnar[27];		/* remainder 16 */
-  vertical[19] = (UINT8) columnar[28];		/* remainder 20 */
-  vertical[23] = (UINT8) columnar[29];		/* remainder 24 */
-  vertical[27] = (UINT8) columnar[30];		/* remainder 28 */
-  vertical[31] = (UINT8) columnar[31];		/* remainder 32 */
-
-  return;
-}
-
-static void
-vertical_order_huge_rev (UINT8 *vertical, UINT4 *columnar) {
-
-  vertical[0] = (UINT8) columnar[0];		/* remainder 63 */
-  vertical[4] = (UINT8) columnar[1];		/* remainder 59 */
-  vertical[8] = (UINT8) columnar[2];		/* remainder 55 */
-  vertical[12] = (UINT8) columnar[3];		/* remainder 51 */
-  vertical[16] = (UINT8) columnar[4];		/* remainder 47 */
-  vertical[20] = (UINT8) columnar[5];		/* remainder 43 */
-  vertical[24] = (UINT8) columnar[6];		/* remainder 39 */
-  vertical[28] = (UINT8) columnar[7];		/* remainder 35 */
-
-  vertical[1] = (UINT8) columnar[8];		/* remainder 62 */
-  vertical[5] = (UINT8) columnar[9];		/* remainder 58 */
-  vertical[9] = (UINT8) columnar[10];		/* remainder 54 */
-  vertical[13] = (UINT8) columnar[11];		/* remainder 50 */
-  vertical[17] = (UINT8) columnar[12];		/* remainder 46 */
-  vertical[21] = (UINT8) columnar[13];		/* remainder 42 */
-  vertical[25] = (UINT8) columnar[14];		/* remainder 38 */
-  vertical[29] = (UINT8) columnar[15];		/* remainder 34 */
-
-  vertical[2] = (UINT8) columnar[16];		/* remainder 61 */
-  vertical[6] = (UINT8) columnar[17];		/* remainder 57 */
-  vertical[10] = (UINT8) columnar[18];		/* remainder 53 */
-  vertical[14] = (UINT8) columnar[19];		/* remainder 49 */
-  vertical[18] = (UINT8) columnar[20];		/* remainder 45 */
-  vertical[22] = (UINT8) columnar[21];		/* remainder 41 */
-  vertical[26] = (UINT8) columnar[22];		/* remainder 37 */
-  vertical[30] = (UINT8) columnar[23];		/* remainder 33 */
-
-  vertical[3] = (UINT8) columnar[24];		/* remainder 60 */
-  vertical[7] = (UINT8) columnar[25];		/* remainder 56 */
-  vertical[11] = (UINT8) columnar[26];		/* remainder 52 */
-  vertical[15] = (UINT8) columnar[27];		/* remainder 48 */
-  vertical[19] = (UINT8) columnar[28];		/* remainder 44 */
-  vertical[23] = (UINT8) columnar[29];		/* remainder 40 */
-  vertical[27] = (UINT8) columnar[30];		/* remainder 36 */
-  vertical[31] = (UINT8) columnar[31];		/* remainder 32 */
-
-  return;
-}
-
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+typedef void (*Unpacker_T) (UINT4* __restrict__, const UINT4* __restrict__);
 #else
-
-#if 0
-static void
-vertical_order (UINT4 *vertical, UINT4 *columnar) {
-
-  vertical[0] = columnar[0];		/* remainder 1 */
-  vertical[4] = columnar[1];		/* remainder 5 */
-  vertical[8] = columnar[2];		/* remainder 9 */
-  vertical[12] = columnar[3];		/* remainder 13 */
-  vertical[16] = columnar[4];		/* remainder 17 */
-  vertical[20] = columnar[5];		/* remainder 21 */
-  vertical[24] = columnar[6];		/* remainder 25 */
-  vertical[28] = columnar[7];		/* remainder 29 */
-
-  vertical[1] = columnar[8];		/* remainder 2 */
-  vertical[5] = columnar[9];		/* remainder 6 */
-  vertical[9] = columnar[10];		/* remainder 10 */
-  vertical[13] = columnar[11];		/* remainder 14 */
-  vertical[17] = columnar[12];		/* remainder 18 */
-  vertical[21] = columnar[13];		/* remainder 22 */
-  vertical[25] = columnar[14];		/* remainder 26 */
-  vertical[29] = columnar[15];		/* remainder 30 */
-
-  vertical[2] = columnar[16];		/* remainder 3 */
-  vertical[6] = columnar[17];		/* remainder 7 */
-  vertical[10] = columnar[18];		/* remainder 11 */
-  vertical[14] = columnar[19];		/* remainder 15 */
-  vertical[18] = columnar[20];		/* remainder 19 */
-  vertical[22] = columnar[21];		/* remainder 23 */
-  vertical[26] = columnar[22];		/* remainder 27 */
-  vertical[30] = columnar[23];		/* remainder 31 */
-
-  vertical[3] = columnar[24];		/* remainder 4 */
-  vertical[7] = columnar[25];		/* remainder 8 */
-  vertical[11] = columnar[26];		/* remainder 12 */
-  vertical[15] = columnar[27];		/* remainder 16 */
-  vertical[19] = columnar[28];		/* remainder 20 */
-  vertical[23] = columnar[29];		/* remainder 24 */
-  vertical[27] = columnar[30];		/* remainder 28 */
-  vertical[31] = columnar[31];		/* remainder 32 */
-
-  vertical[32] = columnar[32];		/* remainder 63 */
-  vertical[36] = columnar[33];		/* remainder 59 */
-  vertical[40] = columnar[34];		/* remainder 55 */
-  vertical[44] = columnar[35];		/* remainder 51 */
-  vertical[48] = columnar[36];		/* remainder 47 */
-  vertical[52] = columnar[37];		/* remainder 43 */
-  vertical[56] = columnar[38];		/* remainder 39 */
-  vertical[60] = columnar[39];		/* remainder 35 */
-
-  vertical[33] = columnar[40];		/* remainder 62 */
-  vertical[37] = columnar[41];		/* remainder 58 */
-  vertical[41] = columnar[42];		/* remainder 54 */
-  vertical[45] = columnar[43];		/* remainder 50 */
-  vertical[49] = columnar[44];		/* remainder 46 */
-  vertical[53] = columnar[45];		/* remainder 42 */
-  vertical[57] = columnar[46];		/* remainder 38 */
-  vertical[61] = columnar[47];		/* remainder 34 */
-
-  vertical[34] = columnar[48];		/* remainder 61 */
-  vertical[38] = columnar[49];		/* remainder 57 */
-  vertical[42] = columnar[50];		/* remainder 53 */
-  vertical[46] = columnar[51];		/* remainder 49 */
-  vertical[50] = columnar[52];		/* remainder 45 */
-  vertical[54] = columnar[53];		/* remainder 41 */
-  vertical[58] = columnar[54];		/* remainder 37 */
-  vertical[62] = columnar[55];		/* remainder 33 */
-
-  vertical[35] = columnar[56];		/* remainder 60 */
-  vertical[39] = columnar[57];		/* remainder 56 */
-  vertical[43] = columnar[58];		/* remainder 52 */
-  vertical[47] = columnar[59];		/* remainder 48 */
-  vertical[51] = columnar[60];		/* remainder 44 */
-  vertical[55] = columnar[61];		/* remainder 40 */
-  vertical[59] = columnar[62];		/* remainder 36 */
-  vertical[63] = columnar[63];		/* remainder 32 */
-
-  return;
-}
-#endif
-
-#if 0
-static void
-vertical_order_huge (UINT8 *vertical, UINT4 *columnar) {
-
-  vertical[0] = (UINT8) columnar[0];		/* remainder 1 */
-  vertical[4] = (UINT8) columnar[1];		/* remainder 5 */
-  vertical[8] = (UINT8) columnar[2];		/* remainder 9 */
-  vertical[12] = (UINT8) columnar[3];		/* remainder 13 */
-  vertical[16] = (UINT8) columnar[4];		/* remainder 17 */
-  vertical[20] = (UINT8) columnar[5];		/* remainder 21 */
-  vertical[24] = (UINT8) columnar[6];		/* remainder 25 */
-  vertical[28] = (UINT8) columnar[7];		/* remainder 29 */
-
-  vertical[1] = (UINT8) columnar[8];		/* remainder 2 */
-  vertical[5] = (UINT8) columnar[9];		/* remainder 6 */
-  vertical[9] = (UINT8) columnar[10];		/* remainder 10 */
-  vertical[13] = (UINT8) columnar[11];		/* remainder 14 */
-  vertical[17] = (UINT8) columnar[12];		/* remainder 18 */
-  vertical[21] = (UINT8) columnar[13];		/* remainder 22 */
-  vertical[25] = (UINT8) columnar[14];		/* remainder 26 */
-  vertical[29] = (UINT8) columnar[15];		/* remainder 30 */
-
-  vertical[2] = (UINT8) columnar[16];		/* remainder 3 */
-  vertical[6] = (UINT8) columnar[17];		/* remainder 7 */
-  vertical[10] = (UINT8) columnar[18];		/* remainder 11 */
-  vertical[14] = (UINT8) columnar[19];		/* remainder 15 */
-  vertical[18] = (UINT8) columnar[20];		/* remainder 19 */
-  vertical[22] = (UINT8) columnar[21];		/* remainder 23 */
-  vertical[26] = (UINT8) columnar[22];		/* remainder 27 */
-  vertical[30] = (UINT8) columnar[23];		/* remainder 31 */
-
-  vertical[3] = (UINT8) columnar[24];		/* remainder 4 */
-  vertical[7] = (UINT8) columnar[25];		/* remainder 8 */
-  vertical[11] = (UINT8) columnar[26];		/* remainder 12 */
-  vertical[15] = (UINT8) columnar[27];		/* remainder 16 */
-  vertical[19] = (UINT8) columnar[28];		/* remainder 20 */
-  vertical[23] = (UINT8) columnar[29];		/* remainder 24 */
-  vertical[27] = (UINT8) columnar[30];		/* remainder 28 */
-  vertical[31] = (UINT8) columnar[31];		/* remainder 32 */
-
-  vertical[32] = (UINT8) columnar[32];		/* remainder 63 */
-  vertical[36] = (UINT8) columnar[33];		/* remainder 59 */
-  vertical[40] = (UINT8) columnar[34];		/* remainder 55 */
-  vertical[44] = (UINT8) columnar[35];		/* remainder 51 */
-  vertical[48] = (UINT8) columnar[36];		/* remainder 47 */
-  vertical[52] = (UINT8) columnar[37];		/* remainder 43 */
-  vertical[56] = (UINT8) columnar[38];		/* remainder 39 */
-  vertical[60] = (UINT8) columnar[39];		/* remainder 35 */
-
-  vertical[33] = (UINT8) columnar[40];		/* remainder 62 */
-  vertical[37] = (UINT8) columnar[41];		/* remainder 58 */
-  vertical[41] = (UINT8) columnar[42];		/* remainder 54 */
-  vertical[45] = (UINT8) columnar[43];		/* remainder 50 */
-  vertical[49] = (UINT8) columnar[44];		/* remainder 46 */
-  vertical[53] = (UINT8) columnar[45];		/* remainder 42 */
-  vertical[57] = (UINT8) columnar[46];		/* remainder 38 */
-  vertical[61] = (UINT8) columnar[47];		/* remainder 34 */
-
-  vertical[34] = (UINT8) columnar[48];		/* remainder 61 */
-  vertical[38] = (UINT8) columnar[49];		/* remainder 57 */
-  vertical[42] = (UINT8) columnar[50];		/* remainder 53 */
-  vertical[46] = (UINT8) columnar[51];		/* remainder 49 */
-  vertical[50] = (UINT8) columnar[52];		/* remainder 45 */
-  vertical[54] = (UINT8) columnar[53];		/* remainder 41 */
-  vertical[58] = (UINT8) columnar[54];		/* remainder 37 */
-  vertical[62] = (UINT8) columnar[55];		/* remainder 33 */
-
-  vertical[35] = (UINT8) columnar[56];		/* remainder 60 */
-  vertical[39] = (UINT8) columnar[57];		/* remainder 56 */
-  vertical[43] = (UINT8) columnar[58];		/* remainder 52 */
-  vertical[47] = (UINT8) columnar[59];		/* remainder 48 */
-  vertical[51] = (UINT8) columnar[60];		/* remainder 44 */
-  vertical[55] = (UINT8) columnar[61];		/* remainder 40 */
-  vertical[59] = (UINT8) columnar[62];		/* remainder 36 */
-  vertical[63] = (UINT8) columnar[63];		/* remainder 32 */
-
-  return;
-}
-#endif
-
-#endif
-
-
-
-#ifdef HAVE_SSE2
 typedef void (*Unpacker_T) (__m128i* __restrict__, const __m128i* __restrict__);
-#else
-typedef void (*Unpacker_T) (UINT4* __restrict__, const UINT4* __restrict__);
 #endif
 
 
@@ -13482,8 +13996,20 @@ static Unpacker_T unpacker_table[33] =
    unpack_21, unpack_22, unpack_23, unpack_24,
    unpack_25, unpack_26, unpack_27, unpack_28,
    unpack_29, unpack_30, unpack_31, unpack_32};
+
+#elif defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+static Unpacker_T unpacker_all_table[33] =
+  {unpack_00,
+   unpack_00, unpack_02, unpack_00, unpack_04,
+   unpack_00, unpack_06, unpack_00, unpack_08,
+   unpack_00, unpack_10, unpack_00, unpack_12,
+   unpack_00, unpack_14, unpack_00, unpack_16,
+   unpack_00, unpack_18, unpack_00, unpack_20,
+   unpack_00, unpack_22, unpack_00, unpack_24,
+   unpack_00, unpack_26, unpack_00, unpack_28,
+   unpack_00, unpack_30, unpack_00, unpack_32};
+
 #else
-#ifdef HAVE_SSE2
 static Unpacker_T unpacker_all_table[34] =
   {unpack_00, unpack_00,
    unpack_02_fwd, unpack_02_rev, unpack_04_fwd, unpack_04_rev,
@@ -13600,19 +14126,6 @@ static Unpacker_T unpacker_table[17][17] =
     unpack_00_0},
 
 };
-   
-#else
-static Unpacker_T unpacker_all_table[33] =
-  {unpack_00,
-   unpack_00, unpack_02, unpack_00, unpack_04,
-   unpack_00, unpack_06, unpack_00, unpack_08,
-   unpack_00, unpack_10, unpack_00, unpack_12,
-   unpack_00, unpack_14, unpack_00, unpack_16,
-   unpack_00, unpack_18, unpack_00, unpack_20,
-   unpack_00, unpack_22, unpack_00, unpack_24,
-   unpack_00, unpack_26, unpack_00, unpack_28,
-   unpack_00, unpack_30, unpack_00, unpack_32};
-#endif
 #endif
 
 
@@ -13629,7 +14142,12 @@ Bitpack64_read_two (UINT4 *end0, Storedoligomer_T oligo, UINT4 *bitpackptrs, UIN
   Storedoligomer_T bmer;
   UINT4 *info, nwritten, packsize_div2;
   int remainder0, remainder1, column;
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+  UINT4 offset0, offset1;
+  UINT4 ptr;
+  int remainder, row, k, i;
+  UINT4 diffs[BLOCKSIZE+1], *bitpack;
+#else
   __m128i diffs[4];  /* Need to provide space for 8 rows (or 2 128-bit registers) for ptr and for end0 */
   int delta, row0, row1;
 #ifdef BRANCH_FREE_QTR_BLOCK
@@ -13637,12 +14155,6 @@ Bitpack64_read_two (UINT4 *end0, Storedoligomer_T oligo, UINT4 *bitpackptrs, UIN
 #endif
   __m128i *bitpack;
   UINT4 *_diffs;
-
-#else
-  UINT4 offset0, offset1;
-  UINT4 ptr;
-  int remainder, row, k, i;
-  UINT4 diffs[BLOCKSIZE+1], *bitpack;
 #endif
 #ifdef DEBUG
   UINT4 offsets[BLOCKSIZE+1];
@@ -13654,119 +14166,52 @@ Bitpack64_read_two (UINT4 *end0, Storedoligomer_T oligo, UINT4 *bitpackptrs, UIN
 
   debug(printf("Entered Bitpack64_read_two with oligo %u => bmer %u\n",oligo,bmer));
 
+#ifdef WORDS_BIGENDIAN
+  nwritten = Bigendian_convert_uint(info[0]);		/* In 128-bit registers */
+  bitpack = (UINT4 *) &(bitpackcomp[nwritten*4]);
+  packsize_div2 = (Bigendian_convert_uint(info[METAINFO_SIZE]) - nwritten);
+
+#elif !defined(HAVE_SSE2)
   nwritten = info[0];		/* In 128-bit registers */
-#ifdef HAVE_SSE2  
-  bitpack = (__m128i *) &(bitpackcomp[nwritten*4]);
-#else
   bitpack = (UINT4 *) &(bitpackcomp[nwritten*4]);
-#endif
+  packsize_div2 = (info[METAINFO_SIZE] - nwritten);
 
+#else
+  nwritten = info[0];		/* In 128-bit registers */
+  bitpack = (__m128i *) &(bitpackcomp[nwritten*4]);
   /* packsize = (info[METAINFO_SIZE] - nwritten)*2; */
   packsize_div2 = (info[METAINFO_SIZE] - nwritten);
+#endif
 
   remainder0 = oligo % BLOCKSIZE;
   remainder1 = remainder0 + 1;
 
+  debug(printf("nwritten %u, packsize %d\n",nwritten,packsize_div2 * 2));
   debug(Bitpack64_block_offsets(offsets,oligo,bitpackptrs,bitpackcomp));
 
-#ifdef HAVE_SSE2
-  _diffs = (UINT4 *) diffs;	/* Assumes a dummy register in diffs[0] */
-
-#ifdef BRANCH_FREE_QTR_BLOCK
-  psums[0] = psums[1] = info[1];
-  psums[2] = psums[3] = psums[4] = info[METAINFO_SIZE+1];
-
-  delta = 31 - abs(remainder1 - 32);
-  column = get_column(delta);
-  row = get_row(delta);
-  debug(printf("quarter-block %d, delta %d, column %d, row %d\n",quarter_block_1,delta,column,row));
-  
-  (unpacker_table[packsize_div2][column*4 + quarter_block_1])(diffs,bitpack);
-  *end0 = psums[quarter_block_1] + _diffs[row+1] + _diffs[row+2] + _diffs[row+3] + _diffs[row+4];
-
-
-  delta = 31 - abs(remainder0 - 32);
-  column = get_column(delta);
-  row = get_row(delta);
-  debug(printf("quarter-block %d, delta %d, column %d, row %d\n",quarter_block_0,delta,column,row));
-
-  (unpacker_table[packsize_div2][column*4 + quarter_block_0])(diffs,bitpack);
-  return psums[quarter_block_0] + _diffs[row+1] + _diffs[row+2] + _diffs[row+3] + _diffs[row+4];
-
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#ifdef WORDS_BIGENDIAN
+  offset0 = Bigendian_convert_uint(info[1]);
+  offset1 = Bigendian_convert_uint(info[METAINFO_SIZE+1]);
 #else
-
-  if (remainder0 < 16) {
-    /* Quarter-block 0 */
-    delta = remainder0 - 1;
-    column = get_column(delta);
-    row0 = get_row(delta);
-    row1 = get_row(delta + 1);
-    (unpacker_table[packsize_div2][column*4 + 0])(diffs,bitpack);
-
-    _diffs = (UINT4 *) &(diffs[2]);
-    assign_sum_fwd(*end0,info[1],_diffs,row1);
-
-    _diffs = (UINT4 *) &(diffs[0]);
-    return_sum_fwd(info[1],_diffs,row0);
-
-  } else if (remainder0 < 32) {
-    /* Quarter-block 1 */
-    delta = remainder0 - 1;
-    column = get_column(delta);
-    row0 = get_row(delta);
-    row1 = get_row(delta + 1);
-    (unpacker_table[packsize_div2][column*4 + 1])(diffs,bitpack);
-
-    _diffs = (UINT4 *) &(diffs[2]);
-    assign_sum_fwd(*end0,info[1],_diffs,row1);
-
-    _diffs = (UINT4 *) &(diffs[0]);
-    return_sum_fwd(info[1],_diffs,row0);
-
-  } else if (remainder0 < 48) {
-    /* Quarter-block 2 */
-    delta = 63 - remainder1;
-    column = get_column(delta);
-    row1 = get_row(delta);
-    row0 = get_row(delta + 1);
-    (unpacker_table[packsize_div2][column*4 + 2])(diffs,bitpack);
-
-    _diffs = (UINT4 *) &(diffs[0]);
-    assign_sum_rev(*end0,info[METAINFO_SIZE+1],_diffs,row1);
-
-    _diffs = (UINT4 *) &(diffs[2]);
-    return_sum_rev(info[METAINFO_SIZE+1],_diffs,row0);
-
-  } else {
-    /* Quarter-block 3 */
-    delta = 63 - remainder1;
-    column = get_column(delta);
-    row1 = get_row(delta);
-    row0 = get_row(delta + 1);
-    (unpacker_table[packsize_div2][column*4 + 3])(diffs,bitpack);
-
-    _diffs = (UINT4 *) &(diffs[0]);
-    assign_sum_rev(*end0,info[METAINFO_SIZE+1],_diffs,row1);
-
-    _diffs = (UINT4 *) &(diffs[2]);
-    return_sum_rev(info[METAINFO_SIZE+1],_diffs,row0);
-  }
-
-#endif
-
-#else  /* HAVE_SSE2 */
-
   offset0 = info[1];
   offset1 = info[METAINFO_SIZE+1];
+#endif
 
   /* Unpack all 64 diffs for non-SIMD */
   (unpacker_all_table[packsize_div2*2])(&(diffs[1]),bitpack);
 
 #ifdef DEBUG
+#ifdef WORDS_BIGENDIAN
+  printf("oligo: %08X, remainder %d, offset0 %u, offset1 %u\n",
+         oligo,oligo % BLOCKSIZE,Bigendian_convert_uint(info[1]),Bigendian_convert_uint(info[METAINFO_SIZE+1]));
+#else
   printf("oligo: %08X, remainder %d, offset0 %u, offset1 %u\n",
 	 oligo,oligo % BLOCKSIZE,info[1],info[METAINFO_SIZE+1]);
+#endif
   printf("bitpack:\n");
 
+
   for (i = 1; i <= BLOCKSIZE; i++) {
     printf("%d ",diffs[i]);
     if (i % (BLOCKSIZE/4) == 0) {
@@ -13912,93 +14357,12 @@ Bitpack64_read_two (UINT4 *end0, Storedoligomer_T oligo, UINT4 *bitpackptrs, UIN
 
   return ptr;
 
-#endif	/* HAVE_SSE2 */
-
-}
-#endif
-
-
-#ifdef LARGE_GENOMES
-/* bitpackpages: A list of b-mers (12-mers by default), ending with -1U */
-UINT8
-Bitpack64_read_two_huge (UINT8 *end0, Storedoligomer_T oligo,
-			 UINT4 *bitpackpages, UINT4 *bitpackptrs, UINT4 *bitpackcomp) {
-  Storedoligomer_T bmer;
-  UINT4 *info, nwritten;
-  UINT8 offset0, offset1;
-  UINT4 packsize_div2;
-  int remainder0, remainder1, column;
-#ifdef HAVE_SSE2
-  int delta, row0, row1;
-#ifdef BRANCH_FREE_ROW_SUM
-  __m128i diffs[3];
-#else
-  __m128i diffs[4];  /* Need to provide space for 8 rows (or 2 128-bit registers) for ptr and for end0 */
-#endif
-#ifdef BRANCH_FREE_QTR_BLOCK
-  UINT8 psums[5];		/* Need 5 to handle case where remainder == 64 */
-#endif
-  __m128i *bitpack;
-  UINT4 *_diffs;
-
-#else
-  UINT4 ptr;
-  int remainder, row, k, i;
-  UINT4 diffs[BLOCKSIZE+1], *bitpack;
-#endif
-  UINT4 *pageptr;
-#ifdef DEBUG
-  UINT4 offsets[BLOCKSIZE+1];
-#endif
-
-
-  bmer = oligo/BLOCKSIZE;
-  info = &(bitpackptrs[bmer * METAINFO_SIZE]);
-
-  debug(printf("Entered Bitpack64_read_two_huge with oligo %u => bmer %u\n",oligo,bmer));
-
-  nwritten = info[0];
-#ifdef HAVE_SSE2  
-  bitpack = (__m128i *) &(bitpackcomp[nwritten*4]);
-#else
-  bitpack = (UINT4 *) &(bitpackcomp[nwritten*4]);
-#endif
-
-  offset0 = (UINT8) info[1];
-  offset1 = (UINT8) info[METAINFO_SIZE+1];
-  debug(printf("offsets are %llu, %llu\n",offset0,offset1));
-
-  if (bitpackpages != NULL) {
-    pageptr = bitpackpages;
-    debug(printf("  compare bmer %u with pageptr %u\n",bmer,*pageptr));
-    while (bmer >= *pageptr) {
-      offset0 += POSITIONS_PAGE;
-      offset1 += POSITIONS_PAGE;
-      pageptr++;
-    }
-
-    if (bmer + 1 >= *pageptr) {
-      offset1 += POSITIONS_PAGE;
-      /* pageptr++; */
-    }
-  }
-  debug(printf("offsets are %llu, %llu\n",offset0,offset1));
-
-
-  /* packsize = (info[METAINFO_SIZE] - nwritten)*2; */
-  packsize_div2 = (info[METAINFO_SIZE] - nwritten);
-
-  remainder0 = oligo % BLOCKSIZE;
-  remainder1 = remainder0 + 1;
-
-  /* debug(Bitpack64_block_offsets_huge(offsets,oligo,bitpackpages,bitpackptrs,bitpackcomp)); */
-
-#ifdef HAVE_SSE2
+#else			    /* littleendian and SSE2 */
   _diffs = (UINT4 *) diffs;	/* Assumes a dummy register in diffs[0] */
 
 #ifdef BRANCH_FREE_QTR_BLOCK
-  psums[0] = psums[1] = offset0;
-  psums[2] = psums[3] = psums[4] = offset1;
+  psums[0] = psums[1] = info[1];
+  psums[2] = psums[3] = psums[4] = info[METAINFO_SIZE+1];
 
   delta = 31 - abs(remainder1 - 32);
   column = get_column(delta);
@@ -14006,7 +14370,7 @@ Bitpack64_read_two_huge (UINT8 *end0, Storedoligomer_T oligo,
   debug(printf("quarter-block %d, delta %d, column %d, row %d\n",quarter_block_1,delta,column,row));
   
   (unpacker_table[packsize_div2][column*4 + quarter_block_1])(diffs,bitpack);
-  *end0 = psums[quarter_block_1] + (INT4) (_diffs[row+1] + _diffs[row+2] + _diffs[row+3] + _diffs[row+4]);
+  *end0 = psums[quarter_block_1] + _diffs[row+1] + _diffs[row+2] + _diffs[row+3] + _diffs[row+4];
 
 
   delta = 31 - abs(remainder0 - 32);
@@ -14015,7 +14379,7 @@ Bitpack64_read_two_huge (UINT8 *end0, Storedoligomer_T oligo,
   debug(printf("quarter-block %d, delta %d, column %d, row %d\n",quarter_block_0,delta,column,row));
 
   (unpacker_table[packsize_div2][column*4 + quarter_block_0])(diffs,bitpack);
-  return psums[quarter_block_0] + (INT4) (_diffs[row+1] + _diffs[row+2] + _diffs[row+3] + _diffs[row+4]);
+  return psums[quarter_block_0] + _diffs[row+1] + _diffs[row+2] + _diffs[row+3] + _diffs[row+4];
 
 #else
 
@@ -14025,14 +14389,13 @@ Bitpack64_read_two_huge (UINT8 *end0, Storedoligomer_T oligo,
     column = get_column(delta);
     row0 = get_row(delta);
     row1 = get_row(delta + 1);
-    debug(printf("quarter_block 0, remainder %d, delta %d, column %d, row %d\n",remainder0,delta,column,row0));
     (unpacker_table[packsize_div2][column*4 + 0])(diffs,bitpack);
 
     _diffs = (UINT4 *) &(diffs[2]);
-    assign_sum_fwd(*end0,offset0,_diffs,row1);
+    assign_sum_fwd(*end0,info[1],_diffs,row1);
 
     _diffs = (UINT4 *) &(diffs[0]);
-    return_sum_fwd(offset0,_diffs,row0);
+    return_sum_fwd(info[1],_diffs,row0);
 
   } else if (remainder0 < 32) {
     /* Quarter-block 1 */
@@ -14040,14 +14403,13 @@ Bitpack64_read_two_huge (UINT8 *end0, Storedoligomer_T oligo,
     column = get_column(delta);
     row0 = get_row(delta);
     row1 = get_row(delta + 1);
-    debug(printf("quarter_block 1, remainder %d, delta %d, column %d, row %d\n",remainder0,delta,column,row0));
     (unpacker_table[packsize_div2][column*4 + 1])(diffs,bitpack);
 
     _diffs = (UINT4 *) &(diffs[2]);
-    assign_sum_fwd(*end0,offset0,_diffs,row1);
+    assign_sum_fwd(*end0,info[1],_diffs,row1);
 
     _diffs = (UINT4 *) &(diffs[0]);
-    return_sum_fwd(offset0,_diffs,row0);
+    return_sum_fwd(info[1],_diffs,row0);
 
   } else if (remainder0 < 48) {
     /* Quarter-block 2 */
@@ -14055,14 +14417,13 @@ Bitpack64_read_two_huge (UINT8 *end0, Storedoligomer_T oligo,
     column = get_column(delta);
     row1 = get_row(delta);
     row0 = get_row(delta + 1);
-    debug(printf("quarter_block 2, remainder %d, delta %d, column %d, row %d\n",remainder0,delta,column,row0));
     (unpacker_table[packsize_div2][column*4 + 2])(diffs,bitpack);
 
     _diffs = (UINT4 *) &(diffs[0]);
-    assign_sum_rev(*end0,offset1,_diffs,row1);
+    assign_sum_rev(*end0,info[METAINFO_SIZE+1],_diffs,row1);
 
     _diffs = (UINT4 *) &(diffs[2]);
-    return_sum_rev(offset1,_diffs,row0);
+    return_sum_rev(info[METAINFO_SIZE+1],_diffs,row0);
 
   } else {
     /* Quarter-block 3 */
@@ -14070,18 +14431,124 @@ Bitpack64_read_two_huge (UINT8 *end0, Storedoligomer_T oligo,
     column = get_column(delta);
     row1 = get_row(delta);
     row0 = get_row(delta + 1);
-    debug(printf("quarter_block 3, remainder %d, delta %d, column %d, row %d\n",remainder0,delta,column,row0));
     (unpacker_table[packsize_div2][column*4 + 3])(diffs,bitpack);
 
     _diffs = (UINT4 *) &(diffs[0]);
-    assign_sum_rev(*end0,offset1,_diffs,row1);
+    assign_sum_rev(*end0,info[METAINFO_SIZE+1],_diffs,row1);
 
     _diffs = (UINT4 *) &(diffs[2]);
-    return_sum_rev(offset1,_diffs,row0);
+    return_sum_rev(info[METAINFO_SIZE+1],_diffs,row0);
   }
+
+#endif	/* BRANCH_FREE_QTR_BLOCK */
+#endif	/* HAVE_SSE2 */
+
+}
+#endif
+
+
+#ifdef LARGE_GENOMES
+/* bitpackpages: A list of b-mers (12-mers by default), ending with -1U */
+UINT8
+Bitpack64_read_two_huge (UINT8 *end0, Storedoligomer_T oligo,
+			 UINT4 *bitpackpages, UINT4 *bitpackptrs, UINT4 *bitpackcomp) {
+  Storedoligomer_T bmer;
+  UINT4 *info, nwritten;
+  UINT8 offset0, offset1;
+  UINT4 packsize_div2;
+  int remainder0, remainder1, column;
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+  UINT4 ptr;
+  int remainder, row, k, i;
+  UINT4 diffs[BLOCKSIZE+1], *bitpack;
+#else
+  int delta, row0, row1;
+#ifdef BRANCH_FREE_ROW_SUM
+  __m128i diffs[3];
+#else
+  __m128i diffs[4];  /* Need to provide space for 8 rows (or 2 128-bit registers) for ptr and for end0 */
+#endif
+#ifdef BRANCH_FREE_QTR_BLOCK
+  UINT8 psums[5];		/* Need 5 to handle case where remainder == 64 */
+#endif
+  __m128i *bitpack;
+  UINT4 *_diffs;
+#endif
+  UINT4 *pageptr;
+#ifdef DEBUG
+  UINT4 offsets[BLOCKSIZE+1];
+#endif
+
+
+  bmer = oligo/BLOCKSIZE;
+  info = &(bitpackptrs[bmer * METAINFO_SIZE]);
+
+  debug(printf("Entered Bitpack64_read_two_huge with oligo %u => bmer %u\n",oligo,bmer));
+
+#ifdef WORDS_BIGENDIAN
+  nwritten = Bigendian_convert_uint(info[0]);
+  bitpack = (UINT4 *) &(bitpackcomp[nwritten*4]);
+  offset0 = (UINT8) Bigendian_convert_uint(info[1]);
+  offset1 = (UINT8) Bigendian_convert_uint(info[METAINFO_SIZE+1]);
+
+#elif !defined(HAVE_SSE2)
+  nwritten = info[0];
+  bitpack = (UINT4 *) &(bitpackcomp[nwritten*4]);
+  offset0 = (UINT8) info[1];
+  offset1 = (UINT8) info[METAINFO_SIZE+1];
+
+#else
+  nwritten = info[0];
+  bitpack = (__m128i *) &(bitpackcomp[nwritten*4]);
+  offset0 = (UINT8) info[1];
+  offset1 = (UINT8) info[METAINFO_SIZE+1];
 #endif
 
-#else  /* HAVE_SSE2 */
+  debug(printf("offsets are %llu, %llu\n",offset0,offset1));
+
+#ifdef WORDS_BIGENDIAN
+  if (bitpackpages != NULL) {
+    pageptr = bitpackpages;
+    debug(printf("  compare bmer %u with pageptr %u\n",bmer,*pageptr));
+    while (bmer >= Bigendian_convert_uint(*pageptr)) {
+      offset0 += POSITIONS_PAGE;
+      offset1 += POSITIONS_PAGE;
+      pageptr++;
+    }
+
+    if (bmer + 1 >= Bigendian_convert_uint(*pageptr)) {
+      offset1 += POSITIONS_PAGE;
+      /* pageptr++; */
+    }
+  }
+  debug(printf("offsets are %llu, %llu\n",offset0,offset1));
+  packsize_div2 = (Bigendian_convert_uint(info[METAINFO_SIZE]) - nwritten);
+
+#else
+  if (bitpackpages != NULL) {
+    pageptr = bitpackpages;
+    debug(printf("  compare bmer %u with pageptr %u\n",bmer,*pageptr));
+    while (bmer >= *pageptr) {
+      offset0 += POSITIONS_PAGE;
+      offset1 += POSITIONS_PAGE;
+      pageptr++;
+    }
+
+    if (bmer + 1 >= *pageptr) {
+      offset1 += POSITIONS_PAGE;
+      /* pageptr++; */
+    }
+  }
+  debug(printf("offsets are %llu, %llu\n",offset0,offset1));
+  packsize_div2 = (info[METAINFO_SIZE] - nwritten);
+#endif
+
+  remainder0 = oligo % BLOCKSIZE;
+  remainder1 = remainder0 + 1;
+
+  /* debug(Bitpack64_block_offsets_huge(offsets,oligo,bitpackpages,bitpackptrs,bitpackcomp)); */
+
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
 
   /* Unpack all 64 diffs for non-SIMD */
   (unpacker_all_table[packsize_div2*2])(&(diffs[1]),bitpack);
@@ -14237,6 +14704,95 @@ Bitpack64_read_two_huge (UINT8 *end0, Storedoligomer_T oligo,
 
   return ptr;
 
-#endif	/* HAVE_SSE2 */
+
+#else			    /* littleendian and SSE2 */
+  _diffs = (UINT4 *) diffs;	/* Assumes a dummy register in diffs[0] */
+
+#ifdef BRANCH_FREE_QTR_BLOCK
+  psums[0] = psums[1] = offset0;
+  psums[2] = psums[3] = psums[4] = offset1;
+
+  delta = 31 - abs(remainder1 - 32);
+  column = get_column(delta);
+  row = get_row(delta);
+  debug(printf("quarter-block %d, delta %d, column %d, row %d\n",quarter_block_1,delta,column,row));
+  
+  (unpacker_table[packsize_div2][column*4 + quarter_block_1])(diffs,bitpack);
+  *end0 = psums[quarter_block_1] + (INT4) (_diffs[row+1] + _diffs[row+2] + _diffs[row+3] + _diffs[row+4]);
+
+
+  delta = 31 - abs(remainder0 - 32);
+  column = get_column(delta);
+  row = get_row(delta);
+  debug(printf("quarter-block %d, delta %d, column %d, row %d\n",quarter_block_0,delta,column,row));
+
+  (unpacker_table[packsize_div2][column*4 + quarter_block_0])(diffs,bitpack);
+  return psums[quarter_block_0] + (INT4) (_diffs[row+1] + _diffs[row+2] + _diffs[row+3] + _diffs[row+4]);
+
+#else
+
+  if (remainder0 < 16) {
+    /* Quarter-block 0 */
+    delta = remainder0 - 1;
+    column = get_column(delta);
+    row0 = get_row(delta);
+    row1 = get_row(delta + 1);
+    debug(printf("quarter_block 0, remainder %d, delta %d, column %d, row %d\n",remainder0,delta,column,row0));
+    (unpacker_table[packsize_div2][column*4 + 0])(diffs,bitpack);
+
+    _diffs = (UINT4 *) &(diffs[2]);
+    assign_sum_fwd(*end0,offset0,_diffs,row1);
+
+    _diffs = (UINT4 *) &(diffs[0]);
+    return_sum_fwd(offset0,_diffs,row0);
+
+  } else if (remainder0 < 32) {
+    /* Quarter-block 1 */
+    delta = remainder0 - 1;
+    column = get_column(delta);
+    row0 = get_row(delta);
+    row1 = get_row(delta + 1);
+    debug(printf("quarter_block 1, remainder %d, delta %d, column %d, row %d\n",remainder0,delta,column,row0));
+    (unpacker_table[packsize_div2][column*4 + 1])(diffs,bitpack);
+
+    _diffs = (UINT4 *) &(diffs[2]);
+    assign_sum_fwd(*end0,offset0,_diffs,row1);
+
+    _diffs = (UINT4 *) &(diffs[0]);
+    return_sum_fwd(offset0,_diffs,row0);
+
+  } else if (remainder0 < 48) {
+    /* Quarter-block 2 */
+    delta = 63 - remainder1;
+    column = get_column(delta);
+    row1 = get_row(delta);
+    row0 = get_row(delta + 1);
+    debug(printf("quarter_block 2, remainder %d, delta %d, column %d, row %d\n",remainder0,delta,column,row0));
+    (unpacker_table[packsize_div2][column*4 + 2])(diffs,bitpack);
+
+    _diffs = (UINT4 *) &(diffs[0]);
+    assign_sum_rev(*end0,offset1,_diffs,row1);
+
+    _diffs = (UINT4 *) &(diffs[2]);
+    return_sum_rev(offset1,_diffs,row0);
+
+  } else {
+    /* Quarter-block 3 */
+    delta = 63 - remainder1;
+    column = get_column(delta);
+    row1 = get_row(delta);
+    row0 = get_row(delta + 1);
+    debug(printf("quarter_block 3, remainder %d, delta %d, column %d, row %d\n",remainder0,delta,column,row0));
+    (unpacker_table[packsize_div2][column*4 + 3])(diffs,bitpack);
+
+    _diffs = (UINT4 *) &(diffs[0]);
+    assign_sum_rev(*end0,offset1,_diffs,row1);
+
+    _diffs = (UINT4 *) &(diffs[2]);
+    return_sum_rev(offset1,_diffs,row0);
+  }
+
+#endif	/* BRANCH_FREE_QTR_BLOCK */
+#endif  /* HAVE_SSE2 */
 }
 #endif
diff --git a/src/bytecoding.c b/src/bytecoding.c
index 5c7bbcf..a311650 100644
--- a/src/bytecoding.c
+++ b/src/bytecoding.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: bytecoding.c 153444 2014-11-18 01:24:55Z twu $";
+static char rcsid[] = "$Id: bytecoding.c 170515 2015-07-23 23:03:24Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -445,6 +445,19 @@ Bytecoding_read (UINT4 key, unsigned char *bytes, UINT4 *exceptions, int nexcept
     
     while (lowi < highi) {
       middlei = lowi + ((highi - lowi) / 2);
+#ifdef WORDS_BIGENDIAN
+      debug10(printf("  binary: %d:%u %d:%u %d:%u   vs. %u\n",
+		     lowi,exceptions[2*lowi],middlei,exceptions[2*middlei],
+		     highi,exceptions[2*highi],key));
+      if (key < Bigendian_convert_uint(exceptions[2*middlei])) {
+	highi = middlei;
+      } else if (key > Bigendian_convert_uint(exceptions[2*middlei])) {
+	lowi = middlei + 1;
+      } else {
+	debug10(printf("binary search returns %d => %u\n",middlei,exceptions[2*middlei+1]));
+	return Bigendian_convert_uint(exceptions[2*middlei+1]);
+      }
+#else
       debug10(printf("  binary: %d:%u %d:%u %d:%u   vs. %u\n",
 		     lowi,exceptions[2*lowi],middlei,exceptions[2*middlei],
 		     highi,exceptions[2*highi],key));
@@ -456,6 +469,7 @@ Bytecoding_read (UINT4 key, unsigned char *bytes, UINT4 *exceptions, int nexcept
 	debug10(printf("binary search returns %d => %u\n",middlei,exceptions[2*middlei+1]));
 	return exceptions[2*middlei+1];
       }
+#endif
     }
 
     /* debug10(printf("binary search returns %d => %u\n",highi,exceptions[highi+1])); */
@@ -478,13 +492,31 @@ Bytecoding_read_wguide (UINT4 key, unsigned char *bytes, UINT4 *guide, UINT4 *ex
 
   } else {
     guidei = key/guide_interval;
+#ifdef WORDS_BIGENDIAN
+    lowi = Bigendian_convert_uint(guide[guidei]);
+    highi = Bigendian_convert_uint(guide[guidei+1]);
+#else
     lowi = guide[guidei];
     highi = guide[guidei+1];
+#endif
 
     debug10(printf("entered binary search with lowi=%d, highi=%d, goal=%u\n",lowi,highi,key));
     
     while (lowi < highi) {
       middlei = lowi + ((highi - lowi) / 2);
+#ifdef WORDS_BIGENDIAN
+      debug10(printf("  binary: %d:%u %d:%u %d:%u   vs. %u\n",
+		     lowi,exceptions[2*lowi],middlei,exceptions[2*middlei],
+		     highi,exceptions[2*highi],key));
+      if (key < Bigendian_convert_uint(exceptions[2*middlei])) {
+	highi = middlei;
+      } else if (key > Bigendian_convert_uint(exceptions[2*middlei])) {
+	lowi = middlei + 1;
+      } else {
+	debug10(printf("binary search returns %d => %u\n",middlei,exceptions[2*middlei+1]));
+	return Bigendian_convert_uint(exceptions[2*middlei+1]);
+      }
+#else
       debug10(printf("  binary: %d:%u %d:%u %d:%u   vs. %u\n",
 		     lowi,exceptions[2*lowi],middlei,exceptions[2*middlei],
 		     highi,exceptions[2*highi],key));
@@ -496,6 +528,7 @@ Bytecoding_read_wguide (UINT4 key, unsigned char *bytes, UINT4 *guide, UINT4 *ex
 	debug10(printf("binary search returns %d => %u\n",middlei,exceptions[2*middlei+1]));
 	return exceptions[2*middlei+1];
       }
+#endif
     }
 
     /* debug10(printf("binary search returns %d => %u\n",highi,exceptions[highi+1])); */
@@ -525,6 +558,19 @@ Bytecoding_lcpchilddc_lcp (UINT4 key, unsigned char *bytes, UINT4 *exceptions, i
     
     while (lowi < highi) {
       middlei = lowi + ((highi - lowi) / 2);
+#ifdef WORDS_BIGENDIAN
+      debug10(printf("  binary: %d:%u %d:%u %d:%u   vs. %u\n",
+		     lowi,exceptions[2*lowi],middlei,exceptions[2*middlei],
+		     highi,exceptions[2*highi],key));
+      if (key < Bigendian_convert_uint(exceptions[2*middlei])) {
+	highi = middlei;
+      } else if (key > Bigendian_convert_uint(exceptions[2*middlei])) {
+	lowi = middlei + 1;
+      } else {
+	debug10(printf("binary search returns %d => %u\n",middlei,exceptions[2*middlei+1]));
+	return Bigendian_convert_uint(exceptions[2*middlei+1]);
+      }
+#else
       debug10(printf("  binary: %d:%u %d:%u %d:%u   vs. %u\n",
 		     lowi,exceptions[2*lowi],middlei,exceptions[2*middlei],
 		     highi,exceptions[2*highi],key));
@@ -536,6 +582,7 @@ Bytecoding_lcpchilddc_lcp (UINT4 key, unsigned char *bytes, UINT4 *exceptions, i
 	debug10(printf("binary search returns %d => %u\n",middlei,exceptions[2*middlei+1]));
 	return exceptions[2*middlei+1];
       }
+#endif
     }
 
     /* debug10(printf("binary search returns %d => %u\n",highi,exceptions[highi+1])); */
@@ -580,13 +627,31 @@ Bytecoding_lcpchilddc_child_up (UINT4 key, unsigned char *bytes, UINT4 *guide, U
 
   } else {
     guidei = key/guide_interval;
+#ifdef WORDS_BIGENDIAN
+    lowi = Bigendian_convert_uint(guide[guidei]);
+    highi = Bigendian_convert_uint(guide[guidei+1]);
+#else
     lowi = guide[guidei];
     highi = guide[guidei+1];
+#endif
 
     debug10(printf("entered binary search with lowi=%d, highi=%d, goal=%u\n",lowi,highi,key));
     
     while (lowi < highi) {
       middlei = lowi + ((highi - lowi) / 2);
+#ifdef WORDS_BIGENDIAN
+      debug10(printf("  binary: %d:%u %d:%u %d:%u   vs. %u\n",
+		     lowi,exceptions[2*lowi],middlei,exceptions[2*middlei],
+		     highi,exceptions[2*highi],key));
+      if (key < Bigendian_convert_uint(exceptions[2*middlei])) {
+	highi = middlei;
+      } else if (key > Bigendian_convert_uint(exceptions[2*middlei])) {
+	lowi = middlei + 1;
+      } else {
+	debug10(printf("binary search returns %d => %u\n",middlei,exceptions[2*middlei+1]));
+	return key - Bigendian_convert_uint(exceptions[2*middlei+1]);
+      }
+#else
       debug10(printf("  binary: %d:%u %d:%u %d:%u   vs. %u\n",
 		     lowi,exceptions[2*lowi],middlei,exceptions[2*middlei],
 		     highi,exceptions[2*highi],key));
@@ -598,6 +663,7 @@ Bytecoding_lcpchilddc_child_up (UINT4 key, unsigned char *bytes, UINT4 *guide, U
 	debug10(printf("binary search returns %d => %u\n",middlei,exceptions[2*middlei+1]));
 	return key - exceptions[2*middlei+1];
       }
+#endif
     }
 
     /* debug10(printf("binary search returns %d => %u\n",highi,exceptions[highi+1])); */
@@ -622,13 +688,31 @@ Bytecoding_lcpchilddc_child_next (UINT4 key, unsigned char *bytes, UINT4 *guide,
 
   } else {
     guidei = key/guide_interval;
+#ifdef WORDS_BIGENDIAN
+    lowi = Bigendian_convert_uint(guide[guidei]);
+    highi = Bigendian_convert_uint(guide[guidei+1]);
+#else
     lowi = guide[guidei];
     highi = guide[guidei+1];
+#endif
 
     debug10(printf("entered binary search with lowi=%d, highi=%d, goal=%u\n",lowi,highi,key));
     
     while (lowi < highi) {
       middlei = lowi + ((highi - lowi) / 2);
+#ifdef WORDS_BIGENDIAN
+      debug10(printf("  binary: %d:%u %d:%u %d:%u   vs. %u\n",
+		     lowi,exceptions[2*lowi],middlei,exceptions[2*middlei],
+		     highi,exceptions[2*highi],key));
+      if (key < Bigendian_convert_uint(exceptions[2*middlei])) {
+	highi = middlei;
+      } else if (key > Bigendian_convert_uint(exceptions[2*middlei])) {
+	lowi = middlei + 1;
+      } else {
+	debug10(printf("binary search returns %d => %u\n",middlei,exceptions[2*middlei+1]));
+	return Bigendian_convert_uint(exceptions[2*middlei+1]) + key + 1;
+      }
+#else
       debug10(printf("  binary: %d:%u %d:%u %d:%u   vs. %u\n",
 		     lowi,exceptions[2*lowi],middlei,exceptions[2*middlei],
 		     highi,exceptions[2*highi],key));
@@ -640,6 +724,7 @@ Bytecoding_lcpchilddc_child_next (UINT4 key, unsigned char *bytes, UINT4 *guide,
 	debug10(printf("binary search returns %d => %u\n",middlei,exceptions[2*middlei+1]));
 	return exceptions[2*middlei+1] + key + 1;
       }
+#endif
     }
 
     /* debug10(printf("binary search returns %d => %u\n",highi,exceptions[highi+1])); */
@@ -652,7 +737,7 @@ Bytecoding_lcpchilddc_child_next (UINT4 key, unsigned char *bytes, UINT4 *guide,
 
 
 UINT4
-Bytecoding_lcpchilddc_lcp_next (UINT4 key, unsigned char *bytes, UINT4 *child_guide,
+Bytecoding_lcpchilddc_lcp_next (UINT4 *child_next, UINT4 key, unsigned char *bytes, UINT4 *child_guide,
 				UINT4 *child_exceptions, int child_guide_interval,
 				UINT4 *lcp_exceptions, int n_lcp_exceptions) {
   UINT8 blocki = key/2;		/* Needs to be UINT8, because 5 * 2^32 will overflow UINT4 */
@@ -663,17 +748,37 @@ Bytecoding_lcpchilddc_lcp_next (UINT4 key, unsigned char *bytes, UINT4 *child_gu
 
   if ((byte = block[3 + (key % 2)]) < 255) {
     debug10(printf("value %d < 255\n",byte));
-    return Bytecoding_lcpchilddc_lcp((UINT4) byte + key + 1,bytes,lcp_exceptions,n_lcp_exceptions);
+    *child_next = (UINT4) byte + key + 1;
+    return Bytecoding_lcpchilddc_lcp(*child_next,bytes,lcp_exceptions,n_lcp_exceptions);
 
   } else {
     guidei = key/child_guide_interval;
+#ifdef WORDS_BIGENDIAN
+    lowi = Bigendian_convert_uint(child_guide[guidei]);
+    highi = Bigendian_convert_uint(child_guide[guidei+1]);
+#else
     lowi = child_guide[guidei];
     highi = child_guide[guidei+1];
+#endif
 
     debug10(printf("entered binary search with lowi=%d, highi=%d, goal=%u\n",lowi,highi,key));
     
     while (lowi < highi) {
       middlei = lowi + ((highi - lowi) / 2);
+#ifdef WORDS_BIGENDIAN
+      debug10(printf("  binary: %d:%u %d:%u %d:%u   vs. %u\n",
+		     lowi,child_exceptions[2*lowi],middlei,child_exceptions[2*middlei],
+		     highi,child_exceptions[2*highi],key));
+      if (key < Bigendian_convert_uint(child_exceptions[2*middlei])) {
+	highi = middlei;
+  } else if (key > Bigendian_convert_uint(child_exceptions[2*middlei])) {
+	lowi = middlei + 1;
+      } else {
+	debug10(printf("binary search returns %d => %u\n",middlei,child_exceptions[2*middlei+1]));
+	*child_next = Bigendian_convert_uint(child_exceptions[2*middlei+1]) + key + 1;
+	return Bytecoding_lcpchilddc_lcp(*child_next,bytes,lcp_exceptions,n_lcp_exceptions);
+      }
+#else
       debug10(printf("  binary: %d:%u %d:%u %d:%u   vs. %u\n",
 		     lowi,child_exceptions[2*lowi],middlei,child_exceptions[2*middlei],
 		     highi,child_exceptions[2*highi],key));
@@ -683,9 +788,10 @@ Bytecoding_lcpchilddc_lcp_next (UINT4 key, unsigned char *bytes, UINT4 *child_gu
 	lowi = middlei + 1;
       } else {
 	debug10(printf("binary search returns %d => %u\n",middlei,child_exceptions[2*middlei+1]));
-	return Bytecoding_lcpchilddc_lcp(child_exceptions[2*middlei+1] + key + 1,bytes,
-					 lcp_exceptions,n_lcp_exceptions);
+	*child_next = child_exceptions[2*middlei+1] + key + 1;
+	return Bytecoding_lcpchilddc_lcp(*child_next,bytes,lcp_exceptions,n_lcp_exceptions);
       }
+#endif
     }
 
     /* debug10(printf("binary search returns %d => %u\n",highi,exceptions[highi+1])); */
@@ -720,13 +826,31 @@ Bytecoding_lcpchilddcn_child_up (bool *nextp, UINT4 key, unsigned char *bytes, U
 
   } else {
     guidei = key/guide_interval;
+#ifdef WORDS_BIGENDIAN
+    lowi = Bigendian_convert_uint(guide[guidei]);
+    highi = Bigendian_convert_uint(guide[guidei+1]);
+#else
     lowi = guide[guidei];
     highi = guide[guidei+1];
+#endif
 
     debug10(printf("entered binary search with lowi=%d, highi=%d, goal=%u\n",lowi,highi,key));
     
     while (lowi < highi) {
       middlei = lowi + ((highi - lowi) / 2);
+#ifdef WORDS_BIGENDIAN
+      debug10(printf("  binary: %d:%u %d:%u %d:%u   vs. %u\n",
+		     lowi,exceptions[2*lowi],middlei,exceptions[2*middlei],
+		     highi,exceptions[2*highi],key));
+      if (key < Bigendian_convert_uint(exceptions[2*middlei])) {
+	highi = middlei;
+     } else if (key > Bigendian_convert_uint(exceptions[2*middlei])) {
+	lowi = middlei + 1;
+      } else {
+	debug10(printf("binary search returns %d => %u\n",middlei,exceptions[2*middlei+1]));
+	return key - Bigendian_convert_uint(exceptions[2*middlei+1]);
+      }
+#else
       debug10(printf("  binary: %d:%u %d:%u %d:%u   vs. %u\n",
 		     lowi,exceptions[2*lowi],middlei,exceptions[2*middlei],
 		     highi,exceptions[2*highi],key));
@@ -738,6 +862,7 @@ Bytecoding_lcpchilddcn_child_up (bool *nextp, UINT4 key, unsigned char *bytes, U
 	debug10(printf("binary search returns %d => %u\n",middlei,exceptions[2*middlei+1]));
 	return key - exceptions[2*middlei+1];
       }
+#endif
     }
 
     /* debug10(printf("binary search returns %d => %u\n",highi,exceptions[highi+1])); */
@@ -773,13 +898,31 @@ Bytecoding_lcpchilddcn_child_next (bool *nextp, UINT4 key, unsigned char *bytes,
 
   } else {
     guidei = key/guide_interval;
+#ifdef WORDS_BIGENDIAN
+    lowi = Bigendian_convert_uint(guide[guidei]);
+    highi = Bigendian_convert_uint(guide[guidei+1]);
+#else
     lowi = guide[guidei];
     highi = guide[guidei+1];
+#endif
 
     debug10(printf("entered binary search with lowi=%d, highi=%d, goal=%u\n",lowi,highi,key));
     
     while (lowi < highi) {
       middlei = lowi + ((highi - lowi) / 2);
+#ifdef WORDS_BIGENDIAN
+      debug10(printf("  binary: %d:%u %d:%u %d:%u   vs. %u\n",
+		     lowi,exceptions[2*lowi],middlei,exceptions[2*middlei],
+		     highi,exceptions[2*highi],key));
+      if (key < Bigendian_convert_uint(exceptions[2*middlei])) {
+	highi = middlei;
+      } else if (key > Bigendian_convert_uint(exceptions[2*middlei])) {
+	lowi = middlei + 1;
+      } else {
+	debug10(printf("binary search returns %d => %u\n",middlei,exceptions[2*middlei+1]));
+	return Bigendian_convert_uint(exceptions[2*middlei+1]) + key + 1;
+      }
+#else
       debug10(printf("  binary: %d:%u %d:%u %d:%u   vs. %u\n",
 		     lowi,exceptions[2*lowi],middlei,exceptions[2*middlei],
 		     highi,exceptions[2*highi],key));
@@ -791,6 +934,7 @@ Bytecoding_lcpchilddcn_child_next (bool *nextp, UINT4 key, unsigned char *bytes,
 	debug10(printf("binary search returns %d => %u\n",middlei,exceptions[2*middlei+1]));
 	return exceptions[2*middlei+1] + key + 1;
       }
+#endif
     }
 
     /* debug10(printf("binary search returns %d => %u\n",highi,exceptions[highi+1])); */
diff --git a/src/bytecoding.h b/src/bytecoding.h
index 089e93c..daf21ec 100644
--- a/src/bytecoding.h
+++ b/src/bytecoding.h
@@ -1,4 +1,4 @@
-/* $Id: bytecoding.h 157221 2015-01-22 18:38:57Z twu $ */
+/* $Id: bytecoding.h 170515 2015-07-23 23:03:24Z twu $ */
 #ifndef BYTECODING_INCLUDED
 #define BYTECODING_INCLUDED
 
@@ -37,7 +37,8 @@ Bytecoding_lcpchilddc_child_up (UINT4 key, unsigned char *bytes, UINT4 *guide, U
 extern UINT4
 Bytecoding_lcpchilddc_child_next (UINT4 key, unsigned char *bytes, UINT4 *guide, UINT4 *exceptions, int guide_interval);
 extern UINT4
-Bytecoding_lcpchilddc_lcp_next (UINT4 key, unsigned char *lcpchilddc, UINT4 *child_guide,
+Bytecoding_lcpchilddc_lcp_next (UINT4 *child_next, UINT4 key,
+				unsigned char *lcpchilddc, UINT4 *child_guide,
 				UINT4 *child_exceptions, int child_guide_interval,
 				UINT4 *lcp_exceptions, int n_lcp_exceptions);
 
diff --git a/src/compress.c b/src/compress.c
index e6e0b6e..8dc44ad 100644
--- a/src/compress.c
+++ b/src/compress.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: compress.c 157566 2015-01-28 00:02:04Z twu $";
+static char rcsid[] = "$Id: compress.c 168395 2015-06-26 17:13:13Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -31,10 +31,14 @@ static char rcsid[] = "$Id: compress.c 157566 2015-01-28 00:02:04Z twu $";
 #include "mem.h"		/* For Compress_new */
 #include "assert.h"
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+/* Skip */
+#else
 #include <emmintrin.h>
 #endif
-#ifdef HAVE_SSSE3
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSSE3)
+/* Skip */
+#else
 #include <tmmintrin.h>
 #endif
 #ifdef HAVE_SSE4_1
@@ -78,10 +82,10 @@ static char rcsid[] = "$Id: compress.c 157566 2015-01-28 00:02:04Z twu $";
 #endif
 
 
-#ifdef HAVE_SSE2
-#define STEP_SIZE 128
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
 #define STEP_SIZE 32
+#else
+#define STEP_SIZE 128
 #endif
 
 
@@ -100,17 +104,17 @@ struct T {
 void
 Compress_free (T *old) {
   if (*old) {
-#ifdef HAVE_SSE2
-    _mm_free((*old)->shift_array[0]);
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     FREE((*old)->shift_array[0]);
+#else
+    _mm_free((*old)->shift_array[0]);
 #endif
     FREE((*old)->shift_array);
 #if 0
-#ifdef HAVE_SSE2
-    _mm_free((*old)->blocks);
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     FREE((*old)->blocks);
+#else
+    _mm_free((*old)->blocks);
 #endif
 #endif
     FREE(*old);
@@ -171,7 +175,26 @@ write_chars (Genomecomp_T high, Genomecomp_T low, Genomecomp_T flags) {
 }
 
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+void
+Compress_print_blocks (Genomecomp_T *blocks, int nshift, int pos5, int pos3) {
+  int ptr, endptr;
+
+  endptr = (nshift + pos3)/32U*3;   /* /STEP_SIZE*COMPRESS_BLOCKSIZE */
+  ptr = (nshift + pos5)/32U*3;
+
+  while (ptr <= endptr) {
+    printf("high: %08X  low: %08X  flags: %08X\t",
+	   blocks[ptr],blocks[ptr+1],blocks[ptr+2]);
+    write_chars(blocks[ptr],blocks[ptr+1],blocks[ptr+2]);
+    printf("\n");
+    ptr += COMPRESS_BLOCKSIZE;
+  }
+  printf("\n");
+  return;
+}
+
+#else
 void
 Compress_print_blocks (Genomecomp_T *blocks, int nshift, int pos5, int pos3) {
   int ptr, endptr;
@@ -279,25 +302,6 @@ Compress_print_one_block (Genomecomp_T *blocks) {
   return;
 }
 
-#else
-
-/* Not implemented */
-void
-Compress_print_blocks (Genomecomp_T *blocks, int nshift, int pos5, int pos3) {
-  int ptr = 0;
-  int nblocks = 0;
-
-  while (ptr < nblocks*COMPRESS_BLOCKSIZE) {
-    printf("high: %08X  low: %08X  flags: %08X\t",
-	   blocks[ptr],blocks[ptr+1],blocks[ptr+2]);
-    write_chars(blocks[ptr],blocks[ptr+1],blocks[ptr+2]);
-    printf("\n");
-    ptr += COMPRESS_BLOCKSIZE;
-  }
-  printf("\n");
-  return;
-}
-
 #endif
 
 
@@ -316,14 +320,14 @@ Compress_new_fwd (char *gbuffer, Chrpos_T length) {
   int c, i;
   int in_counter = 0;
 
-#ifdef HAVE_SSE2
-  new->nblocks = (length+127)/128U;
-  new->shift_array = (Genomecomp_T **) MALLOC(STEP_SIZE * sizeof(Genomecomp_T *));
-  new->shift_array[0] = (Genomecomp_T *) _mm_malloc(STEP_SIZE*(new->nblocks+1)*COMPRESS_BLOCKSIZE * sizeof(Genomecomp_T),16);
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
   new->nblocks = (length+31)/32U;
   new->shift_array = (Genomecomp_T **) MALLOC(STEP_SIZE * sizeof(Genomecomp_T *));
   new->shift_array[0] = (Genomecomp_T *) MALLOC(STEP_SIZE*(new->nblocks+1)*COMPRESS_BLOCKSIZE * sizeof(Genomecomp_T));
+#else
+  new->nblocks = (length+127)/128U;
+  new->shift_array = (Genomecomp_T **) MALLOC(STEP_SIZE * sizeof(Genomecomp_T *));
+  new->shift_array[0] = (Genomecomp_T *) _mm_malloc(STEP_SIZE*(new->nblocks+1)*COMPRESS_BLOCKSIZE * sizeof(Genomecomp_T),16);
 #endif
 #ifdef DEBUG14
   new->querylength = length;
@@ -342,7 +346,39 @@ Compress_new_fwd (char *gbuffer, Chrpos_T length) {
   position = 0U;
   while (position < length) {
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    high = low = flags = 0U;
+    in_counter = 0;
+    while (position < length && in_counter < 32) {
+      c = gbuffer[position++];
+      high >>= 1;
+      low >>= 1;
+      flags >>= 1;
+
+      /* Assume that gbuffer is upper case */
+      switch /*(uppercaseCode[c])*/ (c) {
+      case 'A': /* high |= LEFT_CLEAR; */ /* low |= LEFT_CLEAR; */ /* flags |= LEFT_CLEAR; */ break;
+      case 'C': /* high |= LEFT_CLEAR; */    low |= LEFT_SET;      /* flags |= LEFT_CLEAR; */ break;
+      case 'G':    high |= LEFT_SET;      /* low |= LEFT_CLEAR; */ /* flags |= LEFT_CLEAR; */ break;
+      case 'T':    high |= LEFT_SET;         low |= LEFT_SET;      /* flags |= LEFT_CLEAR; */ break;
+      default:  /* high |= LEFT_CLEAR; */ /* low |= LEFT_CLEAR; */    flags |= LEFT_SET;
+      }
+      in_counter++;
+    }
+      
+    while (in_counter < 32) {
+      high >>= 1;
+      low >>= 1;
+      flags >>= 1;
+      in_counter++;
+    }
+
+    /* Use old storage method */
+    new->blocks[ptr] = high;
+    new->blocks[ptr+1] = low;
+    new->blocks[ptr+2] = flags;
+
+#else
     for (i = 0; i < 4; i++) {
       /* Word i */
       high = low = flags = 0U;
@@ -375,53 +411,21 @@ Compress_new_fwd (char *gbuffer, Chrpos_T length) {
       new->blocks[ptr + i + 4] = low;
       new->blocks[ptr + i + 8] = flags;
     }
-
-#else
-    high = low = flags = 0U;
-    in_counter = 0;
-    while (position < length && in_counter < 32) {
-      c = gbuffer[position++];
-      high >>= 1;
-      low >>= 1;
-      flags >>= 1;
-
-      /* Assume that gbuffer is upper case */
-      switch /*(uppercaseCode[c])*/ (c) {
-      case 'A': /* high |= LEFT_CLEAR; */ /* low |= LEFT_CLEAR; */ /* flags |= LEFT_CLEAR; */ break;
-      case 'C': /* high |= LEFT_CLEAR; */    low |= LEFT_SET;      /* flags |= LEFT_CLEAR; */ break;
-      case 'G':    high |= LEFT_SET;      /* low |= LEFT_CLEAR; */ /* flags |= LEFT_CLEAR; */ break;
-      case 'T':    high |= LEFT_SET;         low |= LEFT_SET;      /* flags |= LEFT_CLEAR; */ break;
-      default:  /* high |= LEFT_CLEAR; */ /* low |= LEFT_CLEAR; */    flags |= LEFT_SET;
-      }
-      in_counter++;
-    }
-      
-    while (in_counter < 32) {
-      high >>= 1;
-      low >>= 1;
-      flags >>= 1;
-      in_counter++;
-    }
-
-    /* Use old storage method */
-    new->blocks[ptr] = high;
-    new->blocks[ptr+1] = low;
-    new->blocks[ptr+2] = flags;
 #endif
 
     ptr += COMPRESS_BLOCKSIZE;
   }
 
-#ifdef HAVE_SSE2
-  /* Compress_shift will access these values */
-  new->blocks[ptr] = new->blocks[ptr+1] = new->blocks[ptr+2] = new->blocks[ptr+3] = 0U;
-  new->blocks[ptr+4] = new->blocks[ptr+5] = new->blocks[ptr+6] = new->blocks[ptr+7] = 0U;
-  new->blocks[ptr+8] = new->blocks[ptr+9] = new->blocks[ptr+10] = new->blocks[ptr+11] = 0U;
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
   /* Compress_shift will access these values */
   new->blocks[ptr] = 0U;
   new->blocks[ptr+1] = 0U;
   new->blocks[ptr+2] = 0U;
+#else
+  /* Compress_shift will access these values */
+  new->blocks[ptr] = new->blocks[ptr+1] = new->blocks[ptr+2] = new->blocks[ptr+3] = 0U;
+  new->blocks[ptr+4] = new->blocks[ptr+5] = new->blocks[ptr+6] = new->blocks[ptr+7] = 0U;
+  new->blocks[ptr+8] = new->blocks[ptr+9] = new->blocks[ptr+10] = new->blocks[ptr+11] = 0U;
 #endif
 
   debug0(printf("Compress_new_fwd\n"));
@@ -446,14 +450,14 @@ Compress_new_rev (char *gbuffer, Chrpos_T length) {
   int c, i;
   int in_counter = 0;
 
-#ifdef HAVE_SSE2
-  new->nblocks = (length+127)/128U;
-  new->shift_array = (Genomecomp_T **) MALLOC(STEP_SIZE * sizeof(Genomecomp_T *));
-  new->shift_array[0] = (Genomecomp_T *) _mm_malloc(STEP_SIZE*(new->nblocks+1)*COMPRESS_BLOCKSIZE * sizeof(Genomecomp_T),16);
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
   new->nblocks = (length+31)/32U;
   new->shift_array = (Genomecomp_T **) MALLOC(STEP_SIZE * sizeof(Genomecomp_T *));
   new->shift_array[0] = (Genomecomp_T *) MALLOC(STEP_SIZE*(new->nblocks+1)*COMPRESS_BLOCKSIZE * sizeof(Genomecomp_T));
+#else
+  new->nblocks = (length+127)/128U;
+  new->shift_array = (Genomecomp_T **) MALLOC(STEP_SIZE * sizeof(Genomecomp_T *));
+  new->shift_array[0] = (Genomecomp_T *) _mm_malloc(STEP_SIZE*(new->nblocks+1)*COMPRESS_BLOCKSIZE * sizeof(Genomecomp_T),16);
 #endif
 #ifdef DEBUG14
   new->querylength = length;
@@ -472,7 +476,38 @@ Compress_new_rev (char *gbuffer, Chrpos_T length) {
   position = length;
   while (position > 0) {
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    high = low = flags = 0U;
+    in_counter = 0;
+    while (position > 0 && in_counter < 32) {
+      c = gbuffer[--position];
+      high >>= 1;
+      low >>= 1;
+      flags >>= 1;
+
+      /* Assume that gbuffer is upper case */
+      switch /*(uppercaseCode[c])*/ (c) {
+      case 'T': /* high |= LEFT_CLEAR; */ /* low |= LEFT_CLEAR; */ /* flags |= LEFT_CLEAR; */ break;
+      case 'G': /* high |= LEFT_CLEAR; */    low |= LEFT_SET;      /* flags |= LEFT_CLEAR; */ break;
+      case 'C':    high |= LEFT_SET;      /* low |= LEFT_CLEAR; */ /* flags |= LEFT_CLEAR; */ break;
+      case 'A':    high |= LEFT_SET;         low |= LEFT_SET;      /* flags |= LEFT_CLEAR; */ break;
+      default:  /* high |= LEFT_CLEAR; */ /* low |= LEFT_CLEAR; */    flags |= LEFT_SET;
+      }
+      in_counter++;
+    }
+
+    while (in_counter < 32) {
+      high >>= 1;
+      low >>= 1;
+      flags >>= 1;
+      in_counter++;
+    }
+
+    new->blocks[ptr] = high;
+    new->blocks[ptr+1] = low;
+    new->blocks[ptr+2] = flags;
+
+#else
     for (i = 0; i < 4; i++) {
       /* Word i */
       high = low = flags = 0U;
@@ -505,52 +540,21 @@ Compress_new_rev (char *gbuffer, Chrpos_T length) {
       new->blocks[ptr + i + 4] = low;
       new->blocks[ptr + i + 8] = flags;
     }
-
-#else
-    high = low = flags = 0U;
-    in_counter = 0;
-    while (position > 0 && in_counter < 32) {
-      c = gbuffer[--position];
-      high >>= 1;
-      low >>= 1;
-      flags >>= 1;
-
-      /* Assume that gbuffer is upper case */
-      switch /*(uppercaseCode[c])*/ (c) {
-      case 'T': /* high |= LEFT_CLEAR; */ /* low |= LEFT_CLEAR; */ /* flags |= LEFT_CLEAR; */ break;
-      case 'G': /* high |= LEFT_CLEAR; */    low |= LEFT_SET;      /* flags |= LEFT_CLEAR; */ break;
-      case 'C':    high |= LEFT_SET;      /* low |= LEFT_CLEAR; */ /* flags |= LEFT_CLEAR; */ break;
-      case 'A':    high |= LEFT_SET;         low |= LEFT_SET;      /* flags |= LEFT_CLEAR; */ break;
-      default:  /* high |= LEFT_CLEAR; */ /* low |= LEFT_CLEAR; */    flags |= LEFT_SET;
-      }
-      in_counter++;
-    }
-
-    while (in_counter < 32) {
-      high >>= 1;
-      low >>= 1;
-      flags >>= 1;
-      in_counter++;
-    }
-
-    new->blocks[ptr] = high;
-    new->blocks[ptr+1] = low;
-    new->blocks[ptr+2] = flags;
 #endif
     
     ptr += COMPRESS_BLOCKSIZE;
   }
 
-#ifdef HAVE_SSE2
-  /* Compress_shift will access these values */
-  new->blocks[ptr] = new->blocks[ptr+1] = new->blocks[ptr+2] = new->blocks[ptr+3] = 0U;
-  new->blocks[ptr+4] = new->blocks[ptr+5] = new->blocks[ptr+6] = new->blocks[ptr+7] = 0U;
-  new->blocks[ptr+8] = new->blocks[ptr+9] = new->blocks[ptr+10] = new->blocks[ptr+11] = 0U;
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
   /* Compress_shift will access these values */
   new->blocks[ptr] = 0U;
   new->blocks[ptr+1] = 0U;
   new->blocks[ptr+2] = 0U;
+#else
+  /* Compress_shift will access these values */
+  new->blocks[ptr] = new->blocks[ptr+1] = new->blocks[ptr+2] = new->blocks[ptr+3] = 0U;
+  new->blocks[ptr+4] = new->blocks[ptr+5] = new->blocks[ptr+6] = new->blocks[ptr+7] = 0U;
+  new->blocks[ptr+8] = new->blocks[ptr+9] = new->blocks[ptr+10] = new->blocks[ptr+11] = 0U;
 #endif
 
   debug0(printf("Compress_new_rev\n"));
@@ -1170,7 +1174,7 @@ shift_sse2 (T this, int nshift) {
 
 
 
-#ifndef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
 Genomecomp_T *
 Compress_shift (T this, int nshift) {
   Genomecomp_T *shifted;
@@ -1606,7 +1610,8 @@ Compress32_shift (T this, int nshift) {
   Genomecomp_T *shifted;
   int rightshift;
   int ptr;
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   __m128i out, current, next;
 #endif
 #ifdef DEBUG9
@@ -1642,7 +1647,7 @@ Compress32_shift (T this, int nshift) {
       shifted[1] = this->blocks[1] << nshift;
       shifted[0] = this->blocks[0] << nshift;
 
-#elif defined(HAVE_SSE2)
+#elif defined(HAVE_SSE2) && !defined(WORDS_BIGENDIAN)
       next = _mm_load_si128((__m128i *) &(this->blocks[ptr]));
       while (ptr > 0) {
 	current = next;
@@ -1710,7 +1715,25 @@ Compress_get_16mer_left (UINT4 *high, UINT4 *low, UINT4 *flags, T this, int pos3
   int columni, blocki;
   Genomecomp_T *ptr, curr_high, curr_low, curr_flags, prev_high, prev_low, prev_flags;
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+  /* query is stored as 3 x 32-bit words */
+  blocki = pos3/32U*3;
+
+  ptr = &(this->blocks[blocki]);
+  curr_high = ptr[0];
+  curr_low = ptr[1];
+  curr_flags = ptr[2];
+
+  if (blocki == 0) {
+    prev_high = prev_low = prev_flags = 0U;
+  } else {
+    ptr -= 3;
+    prev_high = ptr[0];
+    prev_low = ptr[1];
+    prev_flags = ptr[2];
+  }
+
+#else
   /* query is stored as 3 x 128-bit words */
   columni = (pos3 % 128) / 32;
   blocki = pos3/128U*12 + columni;
@@ -1733,23 +1756,6 @@ Compress_get_16mer_left (UINT4 *high, UINT4 *low, UINT4 *flags, T this, int pos3
     prev_low = ptr[4];
     prev_flags = ptr[8];
   }
-#else
-  /* query is stored as 3 x 32-bit words */
-  blocki = pos3/32U*3;
-
-  ptr = &(this->blocks[blocki]);
-  curr_high = ptr[0];
-  curr_low = ptr[1];
-  curr_flags = ptr[2];
-
-  if (blocki == 0) {
-    prev_high = prev_low = prev_flags = 0U;
-  } else {
-    ptr -= 3;
-    prev_high = ptr[0];
-    prev_low = ptr[1];
-    prev_flags = ptr[2];
-  }
 #endif
 
 
@@ -1784,7 +1790,21 @@ Compress_get_16mer_right (UINT4 *high, UINT4 *low, UINT4 *flags, T this, int pos
   int columni, blocki;
   Genomecomp_T *ptr, curr_high, curr_low, curr_flags, next_high, next_low, next_flags;
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+  /* query is stored as 3 x 32-bit words */
+  blocki = pos5/32U*3;
+
+  ptr = &(this->blocks[blocki]);
+  curr_high = ptr[0];
+  curr_low = ptr[1];
+  curr_flags = ptr[2];
+
+  ptr += 3;
+  next_high = ptr[0];
+  next_low = ptr[1];
+  next_flags = ptr[2];
+
+#else
   /* query is stored as 3 x 128-bit words */
   columni = (pos5 % 128) / 32;
   blocki = pos5/128U*12 + columni;
@@ -1805,19 +1825,6 @@ Compress_get_16mer_right (UINT4 *high, UINT4 *low, UINT4 *flags, T this, int pos
     next_low = ptr[4];
     next_flags = ptr[8];
   }
-#else
-  /* query is stored as 3 x 32-bit words */
-  blocki = pos5/32U*3;
-
-  ptr = &(this->blocks[blocki]);
-  curr_high = ptr[0];
-  curr_low = ptr[1];
-  curr_flags = ptr[2];
-
-  ptr += 3;
-  next_high = ptr[0];
-  next_low = ptr[1];
-  next_flags = ptr[2];
 #endif
 
   debug2(printf("high:  %08X %08X\n",curr_high,next_high));
diff --git a/src/compress.h b/src/compress.h
index 41be273..8eb3483 100644
--- a/src/compress.h
+++ b/src/compress.h
@@ -1,4 +1,4 @@
-/* $Id: compress.h 157225 2015-01-22 18:47:23Z twu $ */
+/* $Id: compress.h 168395 2015-06-26 17:13:13Z twu $ */
 #ifndef COMPRESS_INCLUDED
 #define COMPRESS_INCLUDED
 #ifdef HAVE_CONFIG_H
@@ -19,10 +19,10 @@
    SIMD in Compress_shift, so COMPRESS_BLOCKSIZE can be 3.  */
 
 
-#ifdef HAVE_SSE2
-#define COMPRESS_BLOCKSIZE 12	/* 12 unsigned ints per block */
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
 #define COMPRESS_BLOCKSIZE 3	/* 3 unsigned ints per block */
+#else
+#define COMPRESS_BLOCKSIZE 12	/* 12 unsigned ints per block */
 #endif
 
 
diff --git a/src/config.h.in b/src/config.h.in
index ac9a8c4..07cbd67 100644
--- a/src/config.h.in
+++ b/src/config.h.in
@@ -181,6 +181,9 @@
 /* Define to 1 if you have the `shmget' function. */
 #undef HAVE_SHMGET
 
+/* Define to 1 if SHM_NORESERVE available for shmget. */
+#undef HAVE_SHM_NORESERVE
+
 /* Define to 1 if you have the `sigaction' function. */
 #undef HAVE_SIGACTION
 
diff --git a/src/dynprog_genome.c b/src/dynprog_genome.c
index 3958098..8d2c9dc 100644
--- a/src/dynprog_genome.c
+++ b/src/dynprog_genome.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: dynprog_genome.c 145990 2014-08-25 21:47:32Z twu $";
+static char rcsid[] = "$Id: dynprog_genome.c 170390 2015-07-23 01:29:31Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -879,7 +879,7 @@ bridge_intron_gap_8_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
 
     if (watsonp == true) {
       if (cdna_direction > 0) {
-	for (cL = 0; cL < glengthL; cL++) {
+	for (cL = 0; cL < glengthL - 1; cL++) {
 	  splicesitepos = chroffset + leftoffset + cL;
 	  if (left_known[cL]) {
 	    left_probabilities[cL] = 1.0;
@@ -888,7 +888,7 @@ bridge_intron_gap_8_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
 	  }
 	}
 
-	for (cR = 0; cR < glengthR; cR++) {
+	for (cR = 0; cR < glengthR - 1; cR++) {
 	  splicesitepos = chroffset + rightoffset - cR + 1;
 	  if (right_known[cR]) {
 	    right_probabilities[cR] = 1.0;
@@ -898,7 +898,7 @@ bridge_intron_gap_8_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
 	}
 
       } else {
-	for (cL = 0; cL < glengthL; cL++) {
+	for (cL = 0; cL < glengthL - 1; cL++) {
 	  splicesitepos = chroffset + leftoffset + cL;
 	  if (left_known[cL]) {
 	    left_probabilities[cL] = 1.0;
@@ -907,7 +907,7 @@ bridge_intron_gap_8_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
 	  }
 	}
 
-	for (cR = 0; cR < glengthR; cR++) {
+	for (cR = 0; cR < glengthR - 1; cR++) {
 	  splicesitepos = chroffset + rightoffset - cR + 1;
 	  if (right_known[cR]) {
 	    right_probabilities[cR] = 1.0;
@@ -919,7 +919,7 @@ bridge_intron_gap_8_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
 
     } else {
       if (cdna_direction > 0) {
-	for (cL = 0; cL < glengthL; cL++) {
+	for (cL = 0; cL < glengthL - 1; cL++) {
 	  splicesitepos = chrhigh - leftoffset - cL + 1;
 	  if (left_known[cL]) {
 	    left_probabilities[cL] = 1.0;
@@ -928,7 +928,7 @@ bridge_intron_gap_8_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
 	  }
 	}
 
-	for (cR = 0; cR < glengthR; cR++) {
+	for (cR = 0; cR < glengthR - 1; cR++) {
 	  splicesitepos = chrhigh - rightoffset + cR;
 	  if (right_known[cR]) {
 	    right_probabilities[cR] = 1.0;
@@ -938,7 +938,7 @@ bridge_intron_gap_8_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
 	}
 
       } else {
-	for (cL = 0; cL < glengthL; cL++) {
+	for (cL = 0; cL < glengthL - 1; cL++) {
 	  splicesitepos = chrhigh - leftoffset - cL + 1;
 	  if (left_known[cL]) {
 	    left_probabilities[cL] = 1.0;
@@ -947,7 +947,7 @@ bridge_intron_gap_8_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
 	  }
 	}
 
-	for (cR = 0; cR < glengthR; cR++) {
+	for (cR = 0; cR < glengthR - 1; cR++) {
 	  splicesitepos = chrhigh - rightoffset + cR;
 	  if (right_known[cR]) {
 	    right_probabilities[cR] = 1.0;
@@ -1826,7 +1826,7 @@ bridge_intron_gap_16_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
 
     if (watsonp == true) {
       if (cdna_direction > 0) {
-	for (cL = 0; cL < glengthL; cL++) {
+	for (cL = 0; cL < glengthL - 1; cL++) {
 	  splicesitepos = chroffset + leftoffset + cL;
 	  if (left_known[cL]) {
 	    left_probabilities[cL] = 1.0;
@@ -1835,7 +1835,7 @@ bridge_intron_gap_16_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
 	  }
 	}
 
-	for (cR = 0; cR < glengthR; cR++) {
+	for (cR = 0; cR < glengthR - 1; cR++) {
 	  splicesitepos = chroffset + rightoffset - cR + 1;
 	  if (right_known[cR]) {
 	    right_probabilities[cR] = 1.0;
@@ -1845,7 +1845,7 @@ bridge_intron_gap_16_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
 	}
 
       } else {
-	for (cL = 0; cL < glengthL; cL++) {
+	for (cL = 0; cL < glengthL - 1; cL++) {
 	  splicesitepos = chroffset + leftoffset + cL;
 	  if (left_known[cL]) {
 	    left_probabilities[cL] = 1.0;
@@ -1854,7 +1854,7 @@ bridge_intron_gap_16_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
 	  }
 	}
 
-	for (cR = 0; cR < glengthR; cR++) {
+	for (cR = 0; cR < glengthR - 1; cR++) {
 	  splicesitepos = chroffset + rightoffset - cR + 1;
 	  if (right_known[cR]) {
 	    right_probabilities[cR] = 1.0;
@@ -1866,7 +1866,7 @@ bridge_intron_gap_16_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
 
     } else {
       if (cdna_direction > 0) {
-	for (cL = 0; cL < glengthL; cL++) {
+	for (cL = 0; cL < glengthL - 1; cL++) {
 	  splicesitepos = chrhigh - leftoffset - cL + 1;
 	  if (left_known[cL]) {
 	    left_probabilities[cL] = 1.0;
@@ -1875,7 +1875,7 @@ bridge_intron_gap_16_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
 	  }
 	}
 
-	for (cR = 0; cR < glengthR; cR++) {
+	for (cR = 0; cR < glengthR - 1; cR++) {
 	  splicesitepos = chrhigh - rightoffset + cR;
 	  if (right_known[cR]) {
 	    right_probabilities[cR] = 1.0;
@@ -1885,7 +1885,7 @@ bridge_intron_gap_16_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
 	}
 
       } else {
-	for (cL = 0; cL < glengthL; cL++) {
+	for (cL = 0; cL < glengthL - 1; cL++) {
 	  splicesitepos = chrhigh - leftoffset - cL + 1;
 	  if (left_known[cL]) {
 	    left_probabilities[cL] = 1.0;
@@ -1894,7 +1894,7 @@ bridge_intron_gap_16_ud (int *finalscore, int *bestrL, int *bestrR, int *bestcL,
 	  }
 	}
 
-	for (cR = 0; cR < glengthR; cR++) {
+	for (cR = 0; cR < glengthR - 1; cR++) {
 	  splicesitepos = chrhigh - rightoffset + cR;
 	  if (right_known[cR]) {
 	    right_probabilities[cR] = 1.0;
@@ -2625,7 +2625,7 @@ bridge_intron_gap (int *finalscore, int *bestrL, int *bestrR, int *bestcL, int *
 
     if (watsonp == true) {
       if (cdna_direction > 0) {
-	for (cL = 0; cL < glengthL; cL++) {
+	for (cL = 0; cL < glengthL - 1; cL++) {
 	  splicesitepos = chroffset + leftoffset + cL;
 	  if (left_known[cL]) {
 	    left_probabilities[cL] = 1.0;
@@ -2634,7 +2634,7 @@ bridge_intron_gap (int *finalscore, int *bestrL, int *bestrR, int *bestcL, int *
 	  }
 	}
 
-	for (cR = 0; cR < glengthR; cR++) {
+	for (cR = 0; cR < glengthR - 1; cR++) {
 	  splicesitepos = chroffset + rightoffset - cR + 1;
 	  if (right_known[cR]) {
 	    right_probabilities[cR] = 1.0;
@@ -2644,7 +2644,7 @@ bridge_intron_gap (int *finalscore, int *bestrL, int *bestrR, int *bestcL, int *
 	}
 
       } else {
-	for (cL = 0; cL < glengthL; cL++) {
+	for (cL = 0; cL < glengthL - 1; cL++) {
 	  splicesitepos = chroffset + leftoffset + cL;
 	  if (left_known[cL]) {
 	    left_probabilities[cL] = 1.0;
@@ -2653,7 +2653,7 @@ bridge_intron_gap (int *finalscore, int *bestrL, int *bestrR, int *bestcL, int *
 	  }
 	}
 
-	for (cR = 0; cR < glengthR; cR++) {
+	for (cR = 0; cR < glengthR - 1; cR++) {
 	  splicesitepos = chroffset + rightoffset - cR + 1;
 	  if (right_known[cR]) {
 	    right_probabilities[cR] = 1.0;
@@ -2665,7 +2665,7 @@ bridge_intron_gap (int *finalscore, int *bestrL, int *bestrR, int *bestcL, int *
 
     } else {
       if (cdna_direction > 0) {
-	for (cL = 0; cL < glengthL; cL++) {
+	for (cL = 0; cL < glengthL - 1; cL++) {
 	  splicesitepos = chrhigh - leftoffset - cL + 1;
 	  if (left_known[cL]) {
 	    left_probabilities[cL] = 1.0;
@@ -2674,7 +2674,7 @@ bridge_intron_gap (int *finalscore, int *bestrL, int *bestrR, int *bestcL, int *
 	  }
 	}
 
-	for (cR = 0; cR < glengthR; cR++) {
+	for (cR = 0; cR < glengthR - 1; cR++) {
 	  splicesitepos = chrhigh - rightoffset + cR;
 	  if (right_known[cR]) {
 	    right_probabilities[cR] = 1.0;
@@ -2684,7 +2684,7 @@ bridge_intron_gap (int *finalscore, int *bestrL, int *bestrR, int *bestcL, int *
 	}
 
       } else {
-	for (cL = 0; cL < glengthL; cL++) {
+	for (cL = 0; cL < glengthL - 1; cL++) {
 	  splicesitepos = chrhigh - leftoffset - cL + 1;
 	  if (left_known[cL]) {
 	    left_probabilities[cL] = 1.0;
@@ -2693,7 +2693,7 @@ bridge_intron_gap (int *finalscore, int *bestrL, int *bestrR, int *bestcL, int *
 	  }
 	}
 
-	for (cR = 0; cR < glengthR; cR++) {
+	for (cR = 0; cR < glengthR - 1; cR++) {
 	  splicesitepos = chrhigh - rightoffset + cR;
 	  if (right_known[cR]) {
 	    right_probabilities[cR] = 1.0;
diff --git a/src/genome-write.c b/src/genome-write.c
index 6fae146..7cadc41 100644
--- a/src/genome-write.c
+++ b/src/genome-write.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: genome-write.c 153955 2014-11-24 17:54:45Z twu $";
+static char rcsid[] = "$Id: genome-write.c 168395 2015-06-26 17:13:13Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -28,7 +28,7 @@ static char rcsid[] = "$Id: genome-write.c 153955 2014-11-24 17:54:45Z twu $";
 #include "compress-write.h"
 #include "iit-write.h"
 #include "complement.h"
-#include "genome.h"		/* For Genome_uncompress_mmap */
+#include "genome.h"		/* For Genome_uncompress_memory */
 
 #define CONTROLM 13		/* From PC */
 
@@ -606,7 +606,7 @@ fill_circular_chromosomes (UINT4 *genomecomp, Univ_IIT_T chromosome_iit, int cir
 
       segment = (char *) CALLOC(seglength+1U,sizeof(char));
       /* Add 1U because procedures below are expecting exclusive coordinates */
-      Genome_uncompress_mmap(segment,genomecomp,orig_startpos,orig_endpos+1U);
+      Genome_uncompress_memory(segment,genomecomp,orig_startpos,orig_endpos+1U); /* not Genome_uncompress_mmap, which does bigendian conversion */
       Compress_update_memory(/*nbadchars*/0,genomecomp,segment,alias_startpos,alias_endpos+1U);
       FREE(segment);
     }
diff --git a/src/genome.c b/src/genome.c
index fec13f2..ffdd265 100644
--- a/src/genome.c
+++ b/src/genome.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: genome.c 161940 2015-03-25 20:36:59Z twu $";
+static char rcsid[] = "$Id: genome.c 168395 2015-06-26 17:13:13Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -9148,6 +9148,125 @@ Genome_uncompress_mmap (char *gbuffer1, Genomecomp_T *blocks, Univcoord_T startp
 }
 
 
+/* Same as Genome_uncompress_mmap, except does not perform bigendian conversion */
+void
+Genome_uncompress_memory (char *gbuffer1, Genomecomp_T *blocks, Univcoord_T startpos, 
+			  Univcoord_T endpos) {
+  /* Chrpos_T length = endpos - startpos; */
+  Univcoord_T startblock, endblock, ptr;
+  Genomecomp_T high, low, flags;
+  char Buffer[32];
+  int startdiscard, enddiscard;
+  Univcoord_T k = 0, i;
+
+  /* sequence = (char *) CALLOC(length+1,sizeof(char)); */
+
+  ptr = startblock = startpos/32U*3;
+  endblock = endpos/32U*3;
+  startdiscard = startpos % 32;
+  enddiscard = endpos % 32;
+  
+  if (endblock == startblock) {
+    /* Special case */
+#if 0
+    high = Bigendian_convert_uint(blocks[ptr]);
+    low = Bigendian_convert_uint(blocks[ptr+1]);
+    flags = Bigendian_convert_uint(blocks[ptr+2]);
+#else
+    high = blocks[ptr]; low = blocks[ptr+1]; flags = blocks[ptr+2];
+#endif
+
+    memcpy(Buffer,nucleotides[low & 0x0000FFFF],8);
+    memcpy(&(Buffer[8]),nucleotides[low >> 16],8);
+    memcpy(&(Buffer[16]),nucleotides[high & 0x0000FFFF],8);
+    memcpy(&(Buffer[24]),nucleotides[high >> 16],8);
+    if (flags) {
+      for (i = 0; i < 32; i++) {
+	if (flags & 1U) {
+	  Buffer[i] = 'N';
+	}
+	flags >>= 1;
+      }
+    }
+    memcpy(gbuffer1,&(Buffer[startdiscard]),(enddiscard - startdiscard));
+
+  } else {
+#if 0
+    high = Bigendian_convert_uint(blocks[ptr]);
+    low = Bigendian_convert_uint(blocks[ptr+1]);
+    flags = Bigendian_convert_uint(blocks[ptr+2]);
+#else
+    high = blocks[ptr]; low = blocks[ptr+1]; flags = blocks[ptr+2];
+#endif
+
+    memcpy(Buffer,nucleotides[low & 0x0000FFFF],8);
+    memcpy(&(Buffer[8]),nucleotides[low >> 16],8);
+    memcpy(&(Buffer[16]),nucleotides[high & 0x0000FFFF],8);
+    memcpy(&(Buffer[24]),nucleotides[high >> 16],8);
+    if (flags) {
+      for (i = 0; i < 32; i++) {
+	if (flags & 1U) {
+	  Buffer[i] = 'N';
+	}
+	flags >>= 1;
+      }
+    }
+    memcpy(gbuffer1,&(Buffer[startdiscard]),k = 32 - startdiscard);
+    ptr += 3;
+      
+    while (ptr < endblock) {
+#if 0
+      high = Bigendian_convert_uint(blocks[ptr]);
+      low = Bigendian_convert_uint(blocks[ptr+1]);
+      flags = Bigendian_convert_uint(blocks[ptr+2]);
+#else
+      high = blocks[ptr]; low = blocks[ptr+1]; flags = blocks[ptr+2];
+#endif
+
+      memcpy(&(gbuffer1[k]),nucleotides[low & 0x0000FFFF],8); k += 8;
+      memcpy(&(gbuffer1[k]),nucleotides[low >> 16],8); k += 8;
+      memcpy(&(gbuffer1[k]),nucleotides[high & 0x0000FFFF],8); k += 8;
+      memcpy(&(gbuffer1[k]),nucleotides[high >> 16],8); k += 8;
+      if (flags) {
+	for (i = k - 32; i < k; i++) {
+	  if (flags & 1U) {
+	    gbuffer1[i] = 'N';
+	  }
+	  flags >>= 1;
+	}
+      }
+      ptr += 3;
+    }
+
+    if (enddiscard > 0) {
+#if 0
+      high = Bigendian_convert_uint(blocks[ptr]);
+      low = Bigendian_convert_uint(blocks[ptr+1]);
+      flags = Bigendian_convert_uint(blocks[ptr+2]);
+#else
+      high = blocks[ptr]; low = blocks[ptr+1]; flags = blocks[ptr+2];
+#endif
+
+      memcpy(Buffer,nucleotides[low & 0x0000FFFF],8);
+      memcpy(&(Buffer[8]),nucleotides[low >> 16],8);
+      memcpy(&(Buffer[16]),nucleotides[high & 0x0000FFFF],8);
+      memcpy(&(Buffer[24]),nucleotides[high >> 16],8);
+      if (flags) {
+	for (i = 0; i < 32; i++) {
+	  if (flags & 1U) {
+	    Buffer[i] = 'N';
+	  }
+	  flags >>= 1;
+	}
+      }
+      memcpy(&(gbuffer1[k]),Buffer,enddiscard);
+    }
+  }
+
+  return;
+}
+
+
 
 /* Correct procedure should look at alt high/low and normal flags, and substitute N based on normal flags */
 /* May not handle wildcard positions correctly.  A wildcard occurs if ref == alt && ref_flag == 0 && alt_flag == 1 */
@@ -10405,6 +10524,9 @@ Genome_setup (T genome_in, T genomealt_in, Mode_T mode_in, int circular_typeint_
   } else if (mode == ATOI_STRANDED || mode == ATOI_NONSTRANDED) {
     fwd_conversion = "GCGT";
     rev_conversion = "ACGC";
+  } else if (mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
+    fwd_conversion = "ACGC";
+    rev_conversion = "GCGT";
   }
   circular_typeint = circular_typeint_in;
   return;
diff --git a/src/genome.h b/src/genome.h
index 41c7430..ec39b70 100644
--- a/src/genome.h
+++ b/src/genome.h
@@ -1,4 +1,4 @@
-/* $Id: genome.h 161940 2015-03-25 20:36:59Z twu $ */
+/* $Id: genome.h 168395 2015-06-26 17:13:13Z twu $ */
 #ifndef GENOME_INCLUDED
 #define GENOME_INCLUDED
 
@@ -39,6 +39,9 @@ Genome_user_setup (Genomecomp_T *genome_blocks_in);
 extern void
 Genome_uncompress_mmap (char *gbuffer1, Genomecomp_T *blocks, Univcoord_T startpos, 
 			Univcoord_T endpos);
+extern void
+Genome_uncompress_memory (char *gbuffer1, Genomecomp_T *blocks, Univcoord_T startpos, 
+			  Univcoord_T endpos);
 extern bool
 Genome_fill_buffer (Chrnum_T *chrnum, int *nunknowns, T this, Univcoord_T left, Chrpos_T length, char *gbuffer1,
 		    Univ_IIT_T chromosome_iit);
diff --git a/src/genome128_hr.c b/src/genome128_hr.c
index f79c657..825dd39 100644
--- a/src/genome128_hr.c
+++ b/src/genome128_hr.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: genome128_hr.c 166739 2015-06-02 01:23:18Z twu $";
+static char rcsid[] = "$Id: genome128_hr.c 168395 2015-06-26 17:13:13Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -24,7 +24,15 @@ static char rcsid[] = "$Id: genome128_hr.c 166739 2015-06-02 01:23:18Z twu $";
 #include "compress.h"
 #include "popcount.h"
 
-#ifdef HAVE_SSE2
+#ifdef WORDS_BIGENDIAN
+#include "bigendian.h"
+#else
+#include "littleendian.h"
+#endif
+
+#ifdef WORDS_BIGENDIAN
+/* Do not use SIMD */
+#elif defined(HAVE_SSE2)
 #include <emmintrin.h>
 #endif
 #ifdef HAVE_SSE4_1
@@ -40,13 +48,6 @@ static char rcsid[] = "$Id: genome128_hr.c 166739 2015-06-02 01:23:18Z twu $";
 #include <immintrin.h>
 #endif
 
-#ifdef WORDS_BIGENDIAN
-#include "bigendian.h"
-#else
-#include "littleendian.h"
-#endif
-
-
 #ifdef DEBUG
 #define debug(x) x
 #else
@@ -16603,18 +16604,18 @@ Genome_print_blocks (Genomecomp_T *blocks, Univcoord_T startpos, Univcoord_T end
 
   /* sequence = (char *) CALLOC(length+1,sizeof(char)); */
 
-#ifdef HAVE_SSE2
-  startcolumni = (startpos % 128) / 32;
-  endcolumni = (endpos % 128) / 32;
-
-  startblocki = startpos/128U*12;
-  endblocki = endpos/128U*12;
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
   startcolumni = (startpos % 128) / 32;
   startblocki = startpos/128U*12 + startcolumni;
 
   endcolumni = (endpos % 128) / 32;
   endblocki = endpos/128U*12 + endcolumni;
+#else
+  startcolumni = (startpos % 128) / 32;
+  endcolumni = (endpos % 128) / 32;
+
+  startblocki = startpos/128U*12;
+  endblocki = endpos/128U*12;
 #endif
 
   startdiscard32 = startpos % 32;
@@ -16623,7 +16624,24 @@ Genome_print_blocks (Genomecomp_T *blocks, Univcoord_T startpos, Univcoord_T end
 
   ptr = &(blocks[startblocki]);
   while (ptr <= &(blocks[endblocki])) {
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN)
+    high = Bigendian_convert_uint(ptr[0]);
+    low = Bigendian_convert_uint(ptr[4]);
+    flags = Bigendian_convert_uint(ptr[8]);
+    printf("high: %08X  low: %08X  flags: %08X\t",high,low,flags);
+    write_chars(high,low,flags);
+    printf("\n");
+
+    ptr += 1; if (++startcolumni == 4) {ptr += 8; startcolumni = 0;}
+#elif !defined(HAVE_SSE2)
+    high = ptr[0]; low = ptr[4]; flags = ptr[8];
+    printf("high: %08X  low: %08X  flags: %08X\t",high,low,flags);
+    write_chars(high,low,flags);
+    printf("\n");
+
+    ptr += 1; if (++startcolumni == 4) {ptr += 8; startcolumni = 0;}
+
+#else
     if (startcolumni == 0) {
       /*      high: 9F61B62A  low: 6D68A157  flags: 00000000 */
       printf("                                              \t");
@@ -16747,13 +16765,6 @@ Genome_print_blocks (Genomecomp_T *blocks, Univcoord_T startpos, Univcoord_T end
 
     printf("\n");
     ptr += 12;
-#else
-    high = ptr[0]; low = ptr[4]; flags = ptr[8];
-    printf("high: %08X  low: %08X  flags: %08X\t",high,low,flags);
-    write_chars(high,low,flags);
-    printf("\n");
-
-    ptr += 1; if (++startcolumni == 4) {ptr += 8; startcolumni = 0;}
 #endif
   }
 
@@ -16773,15 +16784,15 @@ Genome_print_blocks_snp (Genomecomp_T *blocks, Genomecomp_T *snp_blocks, Univcoo
 
   /* sequence = (char *) CALLOC(length+1,sizeof(char)); */
 
-#ifdef HAVE_SSE2
-  startblocki = startpos/128U*12;
-  endblocki = endpos/128U*12;
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
   startcolumni = (startpos % 128) / 32;
   startblocki = startpos/128U*12 + startcolumni;
 
   endcolumni = (endpos % 128) / 32;
   endblocki = endpos/128U*12 + endcolumni;
+#else
+  startblocki = startpos/128U*12;
+  endblocki = endpos/128U*12;
 #endif
 
   startdiscard32 = startpos % 32;
@@ -16790,7 +16801,11 @@ Genome_print_blocks_snp (Genomecomp_T *blocks, Genomecomp_T *snp_blocks, Univcoo
   ref_ptr = &(blocks[startblocki]);
   snp_ptr = &(snp_blocks[startblocki]);
   while (ref_ptr <= &(blocks[endblocki])) {
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    high = ref_ptr[0]; low = ref_ptr[4]; flags = ref_ptr[8]; snpmask = snp_ptr[8];
+    printf("high: %08X  low: %08X  flags: %08X  snpmask: %08X\n",high,low,flags,snpmask);
+    ref_ptr += 1; snp_ptr += 1; if (++startcolumni == 4) {ref_ptr += 8; snp_ptr += 8; startcolumni = 0;}
+#else
     high = ref_ptr[0]; low = ref_ptr[4]; flags = ref_ptr[8]; snpmask = snp_ptr[8];
     printf("high: %08X  low: %08X  flags: %08X  snpmask: %08X\n",high,low,flags,snpmask);
 
@@ -16804,11 +16819,6 @@ Genome_print_blocks_snp (Genomecomp_T *blocks, Genomecomp_T *snp_blocks, Univcoo
     printf("high: %08X  low: %08X  flags: %08X  snpmask: %08X\n",high,low,flags,snpmask);
 
     ref_ptr += 12; snp_ptr += 12;
-
-#else
-    high = ref_ptr[0]; low = ref_ptr[4]; flags = ref_ptr[8]; snpmask = snp_ptr[8];
-    printf("high: %08X  low: %08X  flags: %08X  snpmask: %08X\n",high,low,flags,snpmask);
-    ref_ptr += 1; snp_ptr += 1; if (++startcolumni == 4) {ref_ptr += 8; snp_ptr += 8; startcolumni = 0;}
 #endif
   }
 
@@ -16829,12 +16839,12 @@ static Genomecomp_T *snp_blocks;
 static bool query_unk_mismatch_p = false;
 static bool genome_unk_mismatch_p = true;
 
-#ifdef HAVE_SSE2
-typedef __m128i Genomediff_T;
-#define STEP_SIZE 128
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
 typedef UINT4 Genomediff_T;
 #define STEP_SIZE 32
+#else
+typedef __m128i Genomediff_T;
+#define STEP_SIZE 128
 #endif
 
 
@@ -16844,34 +16854,35 @@ block_diff_standard_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
   UINT4 diff;
 
   debug(printf("Comparing high: query %08X with genome %08X ",query_shifted[0],ref_ptr[0]));
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   debug(printf("Comparing low: query %08X with genome %08X ",query_shifted[4],ref_ptr[4]));
 #endif
 
-#ifdef HAVE_SSE2
-  diff = (query_shifted[0] ^ ref_ptr[0]) | (query_shifted[4] ^ ref_ptr[4]);
-#elif defined(WORDS_BIGENDIAN)
+#ifdef WORDS_BIGENDIAN
   diff = (query_shifted[0] ^ Bigendian_convert_uint(ref_ptr[0])) | (query_shifted[1] ^ Bigendian_convert_uint(ref_ptr[4]));
-#else
+#elif !defined(HAVE_SSE2)
   diff = (query_shifted[0] ^ ref_ptr[0]) | (query_shifted[1] ^ ref_ptr[4]);
+#else
+  diff = (query_shifted[0] ^ ref_ptr[0]) | (query_shifted[4] ^ ref_ptr[4]);
 #endif
 
   /* Query Ns */
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
   if (query_unk_mismatch_local_p) {
     /* Query: Considering N as a mismatch */
-    diff |= query_shifted[8];
+    diff |= query_shifted[2];
   } else {
     /* Query: Considering N as a wildcard */
-    diff &= ~(query_shifted[8]);
+    diff &= ~(query_shifted[2]);
   }
 #else
   if (query_unk_mismatch_local_p) {
     /* Query: Considering N as a mismatch */
-    diff |= query_shifted[2];
+    diff |= query_shifted[8];
   } else {
     /* Query: Considering N as a wildcard */
-    diff &= ~(query_shifted[2]);
+    diff &= ~(query_shifted[8]);
   }
 #endif
 
@@ -16901,33 +16912,7 @@ block_diff_standard_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
 static Genomediff_T
 block_diff_standard (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
 		     bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
-#ifdef HAVE_SSE2
-  __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
-
-  _query_high = _mm_load_si128((__m128i *) query_shifted);
-  _ref_high = _mm_load_si128((__m128i *) ref_ptr);
-  _query_low = _mm_load_si128((__m128i *) &(query_shifted[4]));
-  _ref_low = _mm_load_si128((__m128i *) &(ref_ptr[4]));
-
-  _diff = _mm_or_si128(_mm_xor_si128(_query_high, _ref_high), _mm_xor_si128(_query_low, _ref_low));
-
-  _query_flags = _mm_load_si128((__m128i *) &(query_shifted[8]));
-  if (query_unk_mismatch_local_p) {
-    _diff = _mm_or_si128(_query_flags, _diff);
-  } else {
-    _diff = _mm_andnot_si128(_query_flags, _diff);
-  }
-
-  _ref_flags = _mm_load_si128((__m128i *) &(ref_ptr[8]));
-  if (genome_unk_mismatch_p) {
-    _diff = _mm_or_si128(_ref_flags, _diff);
-  } else {
-    _diff = _mm_andnot_si128(_ref_flags, _diff);
-  }
-
-  return _diff;
-
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
   UINT4 diff;
 
   debug(printf("Comparing high: query %08X with genome %08X ",query_shifted[0],ref_ptr[0]));
@@ -16968,8 +16953,33 @@ block_diff_standard (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
   debug(printf(" => diff %08X\n",diff));
 
   return diff;
-#endif
 
+#else
+  __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
+
+  _query_high = _mm_load_si128((__m128i *) query_shifted);
+  _ref_high = _mm_load_si128((__m128i *) ref_ptr);
+  _query_low = _mm_load_si128((__m128i *) &(query_shifted[4]));
+  _ref_low = _mm_load_si128((__m128i *) &(ref_ptr[4]));
+
+  _diff = _mm_or_si128(_mm_xor_si128(_query_high, _ref_high), _mm_xor_si128(_query_low, _ref_low));
+
+  _query_flags = _mm_load_si128((__m128i *) &(query_shifted[8]));
+  if (query_unk_mismatch_local_p) {
+    _diff = _mm_or_si128(_query_flags, _diff);
+  } else {
+    _diff = _mm_andnot_si128(_query_flags, _diff);
+  }
+
+  _ref_flags = _mm_load_si128((__m128i *) &(ref_ptr[8]));
+  if (genome_unk_mismatch_p) {
+    _diff = _mm_or_si128(_ref_flags, _diff);
+  } else {
+    _diff = _mm_andnot_si128(_ref_flags, _diff);
+  }
+
+  return _diff;
+#endif
 }
 
 
@@ -16980,28 +16990,30 @@ block_diff_standard_wildcard_32 (Genomecomp_T *query_shifted, Genomecomp_T *snp_
   UINT4 diff, non_wildcard;
 
   /* Taken from block_diff_standard */
-#ifdef HAVE_SSE2
-  diff = (query_shifted[0] ^ ref_ptr[0]) | (query_shifted[4] ^ ref_ptr[4]);
-#else
+#ifdef WORDS_BIGENDIAN
+  diff = (query_shifted[0] ^ Bigendian_convert_uint(ref_ptr[0])) | (query_shifted[1] ^ Bigendian_convert_uint(ref_ptr[4]));
+#elif !defined(HAVE_SSE2)
   diff = (query_shifted[0] ^ ref_ptr[0]) | (query_shifted[1] ^ ref_ptr[4]);
+#else
+  diff = (query_shifted[0] ^ ref_ptr[0]) | (query_shifted[4] ^ ref_ptr[4]);
 #endif
 
   /* Query Ns */
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
   if (query_unk_mismatch_local_p) {
     /* Query: Considering N as a mismatch */
-    diff |= query_shifted[8];
+    diff |= query_shifted[2];
   } else {
     /* Query: Considering N as a wildcard */
-    diff &= ~(query_shifted[8]);
+    diff &= ~(query_shifted[2]);
   }
 #else
   if (query_unk_mismatch_local_p) {
     /* Query: Considering N as a mismatch */
-    diff |= query_shifted[2];
+    diff |= query_shifted[8];
   } else {
     /* Query: Considering N as a wildcard */
-    diff &= ~(query_shifted[2]);
+    diff &= ~(query_shifted[8]);
   }
 #endif
 
@@ -17023,10 +17035,12 @@ block_diff_standard_wildcard_32 (Genomecomp_T *query_shifted, Genomecomp_T *snp_
   }
 
   /* Add difference relative to SNP */
-#ifdef HAVE_SSE2
-  diff &= (query_shifted[0] ^ snp_ptr[0]) | (query_shifted[4] ^ snp_ptr[4]);
-#else
+#ifdef WORDS_BIGENDIAN
+  diff &= (query_shifted[0] ^ Bigendian_convert_uint(snp_ptr[0])) | (query_shifted[1] ^ Bigendian_convert_uint(snp_ptr[4]));
+#elif !defined(HAVE_SSE2)
   diff &= (query_shifted[0] ^ snp_ptr[0]) | (query_shifted[1] ^ snp_ptr[4]);
+#else
+  diff &= (query_shifted[0] ^ snp_ptr[0]) | (query_shifted[4] ^ snp_ptr[4]);
 #endif
 
   /* Test for equality of ref and alt */
@@ -17067,54 +17081,16 @@ block_diff_standard_wildcard_32 (Genomecomp_T *query_shifted, Genomecomp_T *snp_
 static Genomediff_T
 block_diff_standard_wildcard (Genomecomp_T *query_shifted, Genomecomp_T *snp_ptr, Genomecomp_T *ref_ptr,
 			      bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
-#ifdef HAVE_SSE2
-  __m128i _diff, _wildcard, _query_high, _query_low, _query_flags,
-    _ref_high, _ref_low, _ref_flags, _snp_high, _snp_low, _snp_flags;
-
-  _query_high = _mm_load_si128((__m128i *) query_shifted);
-  _query_low = _mm_load_si128((__m128i *) &(query_shifted[4]));
-  _ref_high = _mm_load_si128((__m128i *) ref_ptr);
-  _ref_low = _mm_load_si128((__m128i *) &(ref_ptr[4]));
-
-  _diff = _mm_or_si128(_mm_xor_si128(_query_high, _ref_high), _mm_xor_si128(_query_low, _ref_low));
-
-  _query_flags = _mm_load_si128((__m128i *) &(query_shifted[8]));
-  if (query_unk_mismatch_local_p) {
-    _diff = _mm_or_si128(_query_flags, _diff);
-  } else {
-    _diff = _mm_andnot_si128(_query_flags, _diff);
-  }
-
-  _ref_flags = _mm_load_si128((__m128i *) &(ref_ptr[8]));
-  if (genome_unk_mismatch_p) {
-    _diff = _mm_or_si128(_ref_flags, _diff);
-  } else {
-    _diff = _mm_andnot_si128(_ref_flags, _diff);
-  }
-  /* End of (query ^ ref) */
-
-
-  /* Add (query ^ snp).  Don't need to recompute query flags or use SNP flags. */
-  _snp_high = _mm_load_si128((__m128i *) snp_ptr);
-  _snp_low = _mm_load_si128((__m128i *) &(snp_ptr[4]));
-
-  _diff = _mm_and_si128(_diff, _mm_or_si128(_mm_xor_si128(_query_high, _snp_high), _mm_xor_si128(_query_low, _snp_low)));
-
-
-  /* Test for equality of ref and alt */
-  _snp_flags = _mm_load_si128((__m128i *) &(snp_ptr[8]));
-  _wildcard = _mm_andnot_si128(_ref_flags, _snp_flags);
-  _wildcard = _mm_andnot_si128(_mm_or_si128(_mm_xor_si128(_ref_high, _snp_high), _mm_xor_si128(_ref_low, _snp_low)), _wildcard);
-
-  _diff = _mm_andnot_si128(_wildcard, _diff);
 
-  return _diff;
-
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
   UINT4 diff, non_wildcard;
 
   /* Taken from block_diff_standard */
+#ifdef WORDS_BIGENDIAN
+  diff = (query_shifted[0] ^ Bigendian_convert_uint(ref_ptr[0])) | (query_shifted[1] ^ Bigendian_convert_uint(ref_ptr[4]));
+#else
   diff = (query_shifted[0] ^ ref_ptr[0]) | (query_shifted[1] ^ ref_ptr[4]);
+#endif
 
   /* Query Ns */
   if (query_unk_mismatch_local_p) {
@@ -17143,7 +17119,11 @@ block_diff_standard_wildcard (Genomecomp_T *query_shifted, Genomecomp_T *snp_ptr
   }
 
   /* Add difference relative to SNP */
+#ifdef WORDS_BIGENDIAN
+  diff &= (query_shifted[0] ^ Bigendian_convert_uint(snp_ptr[0])) | (query_shifted[1] ^ Bigendian_convert_uint(snp_ptr[4]));
+#else
   diff &= (query_shifted[0] ^ snp_ptr[0]) | (query_shifted[1] ^ snp_ptr[4]);
+#endif
 
   /* Test for equality of ref and alt */
   debug(printf("Equality high: ref genome %08X with alt genome %08X ",ref_ptr[0],snp_ptr[0]));
@@ -17173,6 +17153,49 @@ block_diff_standard_wildcard (Genomecomp_T *query_shifted, Genomecomp_T *snp_ptr
   debug(printf(" => non_wildcard %08X\n",non_wildcard));
 
   return diff & non_wildcard;
+
+#else
+  __m128i _diff, _wildcard, _query_high, _query_low, _query_flags,
+    _ref_high, _ref_low, _ref_flags, _snp_high, _snp_low, _snp_flags;
+
+  _query_high = _mm_load_si128((__m128i *) query_shifted);
+  _query_low = _mm_load_si128((__m128i *) &(query_shifted[4]));
+  _ref_high = _mm_load_si128((__m128i *) ref_ptr);
+  _ref_low = _mm_load_si128((__m128i *) &(ref_ptr[4]));
+
+  _diff = _mm_or_si128(_mm_xor_si128(_query_high, _ref_high), _mm_xor_si128(_query_low, _ref_low));
+
+  _query_flags = _mm_load_si128((__m128i *) &(query_shifted[8]));
+  if (query_unk_mismatch_local_p) {
+    _diff = _mm_or_si128(_query_flags, _diff);
+  } else {
+    _diff = _mm_andnot_si128(_query_flags, _diff);
+  }
+
+  _ref_flags = _mm_load_si128((__m128i *) &(ref_ptr[8]));
+  if (genome_unk_mismatch_p) {
+    _diff = _mm_or_si128(_ref_flags, _diff);
+  } else {
+    _diff = _mm_andnot_si128(_ref_flags, _diff);
+  }
+  /* End of (query ^ ref) */
+
+
+  /* Add (query ^ snp).  Don't need to recompute query flags or use SNP flags. */
+  _snp_high = _mm_load_si128((__m128i *) snp_ptr);
+  _snp_low = _mm_load_si128((__m128i *) &(snp_ptr[4]));
+
+  _diff = _mm_and_si128(_diff, _mm_or_si128(_mm_xor_si128(_query_high, _snp_high), _mm_xor_si128(_query_low, _snp_low)));
+
+
+  /* Test for equality of ref and alt */
+  _snp_flags = _mm_load_si128((__m128i *) &(snp_ptr[8]));
+  _wildcard = _mm_andnot_si128(_ref_flags, _snp_flags);
+  _wildcard = _mm_andnot_si128(_mm_or_si128(_mm_xor_si128(_ref_high, _snp_high), _mm_xor_si128(_ref_low, _snp_low)), _wildcard);
+
+  _diff = _mm_andnot_si128(_wildcard, _diff);
+
+  return _diff;
 #endif
 }
 
@@ -17192,31 +17215,31 @@ block_diff_metct_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
     diff = 0U;
   } else {
     /* Mark genome-T to query-C mismatches */
-#ifdef HAVE_SSE2
-    diff = (~(query_shifted[0]) & query_shifted[4]) & (ref_ptr[0] & ref_ptr[4]);
-#elif defined(WORDS_BIGENDIAN)
+#ifdef WORDS_BIGENDIAN
     diff = (~(query_shifted[0]) & query_shifted[1]) &
       (Bigendian_convert_uint(ref_ptr[0]) & Bigendian_convert_uint(ref_ptr[4]));
-#else
+#elif !defined(HAVE_SSE2)
     diff = (~(query_shifted[0]) & query_shifted[1]) & (ref_ptr[0] & ref_ptr[4]);
+#else
+    diff = (~(query_shifted[0]) & query_shifted[4]) & (ref_ptr[0] & ref_ptr[4]);
 #endif
     debug(printf(" => diff %08X\n",diff));
   }
 
   /* Compare reduced C->T nts  */
-#ifdef HAVE_SSE2
-  diff |= ((query_shifted[0] | query_shifted[4]) ^ (ref_ptr[0] | ref_ptr[4])) | (query_shifted[4] ^ ref_ptr[4]);
-#elif defined(WORDS_BIGENDIAN)
+#ifdef WORDS_BIGENDIAN
   diff |= ((query_shifted[0] | query_shifted[1]) ^ (Bigendian_convert_uint(ref_ptr[0]) | Bigendian_convert_uint(ref_ptr[4]))) |
     (query_shifted[1] ^ Bigendian_convert_uint(ref_ptr[4]));
-#else
+#elif !defined(HAVE_SSE2)
   diff |= ((query_shifted[0] | query_shifted[1]) ^ (ref_ptr[0] | ref_ptr[4])) | (query_shifted[1] ^ ref_ptr[4]);
+#else
+  diff |= ((query_shifted[0] | query_shifted[4]) ^ (ref_ptr[0] | ref_ptr[4])) | (query_shifted[4] ^ ref_ptr[4]);
 #endif
   debug(printf(" => diff %08X\n",diff));
 
 
   /* Flags: Considering N as a mismatch */
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
   if (query_unk_mismatch_local_p) {
     debug(printf("Marking query flags: query %08X ",query_shifted[8]));
     diff |= query_shifted[8];
@@ -17260,43 +17283,8 @@ block_diff_metct_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
 static Genomediff_T
 block_diff_metct (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
 		  bool query_unk_mismatch_local_p, bool sarrayp) {
-#ifdef HAVE_SSE2
-  __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
 
-  _query_high = _mm_load_si128((__m128i *) query_shifted);
-  _ref_high = _mm_load_si128((__m128i *) ref_ptr);
-  _query_low = _mm_load_si128((__m128i *) &(query_shifted[4]));
-  _ref_low = _mm_load_si128((__m128i *) &(ref_ptr[4]));
-
-  if (sarrayp == true) {
-    /* Ignore genome-T to query-C mismatches.  Convert everything to 3-nucleotide space */
-    _diff = _mm_setzero_si128();
-  } else {
-    /* Mark genome-T to query-C mismatches */
-    _diff = _mm_and_si128(_mm_andnot_si128(_query_high, _query_low), _mm_and_si128(_ref_high, _ref_low));
-  }
-
-  /* Compare reduced C->T nts  */
-  _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_or_si128(_query_high, _query_low), _mm_or_si128(_ref_high, _ref_low)));
-  _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
-
-  _query_flags = _mm_load_si128((__m128i *) &(query_shifted[8]));
-  if (query_unk_mismatch_local_p) {
-    _diff = _mm_or_si128(_query_flags, _diff);
-  } else {
-    _diff = _mm_andnot_si128(_query_flags, _diff);
-  }
-
-  _ref_flags = _mm_load_si128((__m128i *) &(ref_ptr[8]));
-  if (genome_unk_mismatch_p) {
-    _diff = _mm_or_si128(_ref_flags, _diff);
-  } else {
-    _diff = _mm_andnot_si128(_ref_flags, _diff);
-  }
-
-  return _diff;
-
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
   UINT4 diff;
 
   if (sarrayp == true) {
@@ -17350,6 +17338,42 @@ block_diff_metct (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
   debug(printf(" => diff %08X\n",diff));
 
   return diff;
+
+#else
+  __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
+
+  _query_high = _mm_load_si128((__m128i *) query_shifted);
+  _ref_high = _mm_load_si128((__m128i *) ref_ptr);
+  _query_low = _mm_load_si128((__m128i *) &(query_shifted[4]));
+  _ref_low = _mm_load_si128((__m128i *) &(ref_ptr[4]));
+
+  if (sarrayp == true) {
+    /* Ignore genome-T to query-C mismatches.  Convert everything to 3-nucleotide space */
+    _diff = _mm_setzero_si128();
+  } else {
+    /* Mark genome-T to query-C mismatches */
+    _diff = _mm_and_si128(_mm_andnot_si128(_query_high, _query_low), _mm_and_si128(_ref_high, _ref_low));
+  }
+
+  /* Compare reduced C->T nts  */
+  _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_or_si128(_query_high, _query_low), _mm_or_si128(_ref_high, _ref_low)));
+  _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
+
+  _query_flags = _mm_load_si128((__m128i *) &(query_shifted[8]));
+  if (query_unk_mismatch_local_p) {
+    _diff = _mm_or_si128(_query_flags, _diff);
+  } else {
+    _diff = _mm_andnot_si128(_query_flags, _diff);
+  }
+
+  _ref_flags = _mm_load_si128((__m128i *) &(ref_ptr[8]));
+  if (genome_unk_mismatch_p) {
+    _diff = _mm_or_si128(_ref_flags, _diff);
+  } else {
+    _diff = _mm_andnot_si128(_ref_flags, _diff);
+  }
+
+  return _diff;
 #endif
 }
 
@@ -17364,39 +17388,31 @@ block_diff_metga_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
     diff = 0U;
   } else {
     /* Mark genome-A to query-G mismatches */
-#ifdef HAVE_SSE2
-    diff = (query_shifted[0] & ~(query_shifted[4])) & ~(ref_ptr[0] | ref_ptr[4]);
-#elif defined(WORDS_BIGENDIAN)
+#ifdef WORDS_BIGENDIAN
     diff = (query_shifted[0] & ~(query_shifted[1])) &
       ~(Bigendian_convert_uint(ref_ptr[0]) | Bigendian_convert_uint(ref_ptr[4]));
-#else
+#elif !defined(HAVE_SSE2)
     diff = (query_shifted[0] & ~(query_shifted[1])) & ~(ref_ptr[0] | ref_ptr[4]);
+#else
+    diff = (query_shifted[0] & ~(query_shifted[4])) & ~(ref_ptr[0] | ref_ptr[4]);
 #endif
     debug(printf(" => diff %08X\n",diff));
   }
 
   /* Compare reduced G->A nts  */
-#ifdef HAVE_SSE2
-  diff |= ((query_shifted[0] & query_shifted[4]) ^ (ref_ptr[0] & ref_ptr[4])) | (query_shifted[4] ^ ref_ptr[4]);
-#elif defined(WORDS_BIGENDIAN)
+#ifdef WORDS_BIGENDIAN
   diff |= ((query_shifted[0] & query_shifted[1]) ^ (Bigendian_convert_uint(ref_ptr[0]) & Bigendian_convert_uint(ref_ptr[4]))) |
     (query_shifted[1] ^ Bigendian_convert_uint(ref_ptr[4]));
-#else
+#elif !defined(HAVE_SSE2)
   diff |= ((query_shifted[0] & query_shifted[1]) ^ (ref_ptr[0] & ref_ptr[4])) | (query_shifted[1] ^ ref_ptr[4]);
+#else
+  diff |= ((query_shifted[0] & query_shifted[4]) ^ (ref_ptr[0] & ref_ptr[4])) | (query_shifted[4] ^ ref_ptr[4]);
 #endif
   debug(printf(" => diff %08X\n",diff));
 
 
   /* Flags: Considering N as a mismatch */
-#ifdef HAVE_SSE2
-  if (query_unk_mismatch_local_p) {
-    debug(printf("Marking query flags: query %08X ",query_shifted[8]));
-    diff |= query_shifted[8];
-  } else {
-    debug(printf("Clearing query flags: query %08X ",query_shifted[8]));
-    diff &= ~(query_shifted[8]);
-  }
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
   if (query_unk_mismatch_local_p) {
     debug(printf("Marking query flags: query %08X ",query_shifted[2]));
     diff |= query_shifted[2];
@@ -17404,6 +17420,14 @@ block_diff_metga_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
     debug(printf("Clearing query flags: query %08X ",query_shifted[2]));
     diff &= ~(query_shifted[2]);
   }
+#else
+  if (query_unk_mismatch_local_p) {
+    debug(printf("Marking query flags: query %08X ",query_shifted[8]));
+    diff |= query_shifted[8];
+  } else {
+    debug(printf("Clearing query flags: query %08X ",query_shifted[8]));
+    diff &= ~(query_shifted[8]);
+  }
 #endif
 
   if (genome_unk_mismatch_p) {
@@ -17432,45 +17456,8 @@ block_diff_metga_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
 static Genomediff_T
 block_diff_metga (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
 		  bool query_unk_mismatch_local_p, bool sarrayp) {
-#ifdef HAVE_SSE2
-  __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
-
-  _query_high = _mm_load_si128((__m128i *) query_shifted);
-  _ref_high = _mm_load_si128((__m128i *) ref_ptr);
-  _query_low = _mm_load_si128((__m128i *) &(query_shifted[4]));
-  _ref_low = _mm_load_si128((__m128i *) &(ref_ptr[4]));
-
-  if (sarrayp == true) {
-    /* Ignore genome-A to query-G mismatches.  Convert everything to 3-nucleotide space. */
-    _diff = _mm_setzero_si128();
-  } else {
-    /* Mark genome-A to query-G mismatches */
-    _diff = _mm_andnot_si128(_query_low, _query_high);
-    _diff = _mm_andnot_si128(_ref_high, _diff);
-    _diff = _mm_andnot_si128(_ref_low, _diff);
-  }
-
-  /* Compare reduced G->A nts  */
-  _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_and_si128(_query_high, _query_low), _mm_and_si128(_ref_high, _ref_low)));
-  _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
 
-  _query_flags = _mm_load_si128((__m128i *) &(query_shifted[8]));
-  if (query_unk_mismatch_local_p) {
-    _diff = _mm_or_si128(_query_flags, _diff);
-  } else {
-    _diff = _mm_andnot_si128(_query_flags, _diff);
-  }
-
-  _ref_flags = _mm_load_si128((__m128i *) &(ref_ptr[8]));
-  if (genome_unk_mismatch_p) {
-    _diff = _mm_or_si128(_ref_flags, _diff);
-  } else {
-    _diff = _mm_andnot_si128(_ref_flags, _diff);
-  }
-
-  return _diff;
-
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
   UINT4 diff;
 
   if (sarrayp == true) {
@@ -17524,6 +17511,44 @@ block_diff_metga (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
   debug(printf(" => diff %08X\n",diff));
 
   return diff;
+
+#else
+  __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
+
+  _query_high = _mm_load_si128((__m128i *) query_shifted);
+  _ref_high = _mm_load_si128((__m128i *) ref_ptr);
+  _query_low = _mm_load_si128((__m128i *) &(query_shifted[4]));
+  _ref_low = _mm_load_si128((__m128i *) &(ref_ptr[4]));
+
+  if (sarrayp == true) {
+    /* Ignore genome-A to query-G mismatches.  Convert everything to 3-nucleotide space. */
+    _diff = _mm_setzero_si128();
+  } else {
+    /* Mark genome-A to query-G mismatches */
+    _diff = _mm_andnot_si128(_query_low, _query_high);
+    _diff = _mm_andnot_si128(_ref_high, _diff);
+    _diff = _mm_andnot_si128(_ref_low, _diff);
+  }
+
+  /* Compare reduced G->A nts  */
+  _diff = _mm_or_si128(_diff, _mm_xor_si128(_mm_and_si128(_query_high, _query_low), _mm_and_si128(_ref_high, _ref_low)));
+  _diff = _mm_or_si128(_diff, _mm_xor_si128(_query_low, _ref_low));
+
+  _query_flags = _mm_load_si128((__m128i *) &(query_shifted[8]));
+  if (query_unk_mismatch_local_p) {
+    _diff = _mm_or_si128(_query_flags, _diff);
+  } else {
+    _diff = _mm_andnot_si128(_query_flags, _diff);
+  }
+
+  _ref_flags = _mm_load_si128((__m128i *) &(ref_ptr[8]));
+  if (genome_unk_mismatch_p) {
+    _diff = _mm_or_si128(_ref_flags, _diff);
+  } else {
+    _diff = _mm_andnot_si128(_ref_flags, _diff);
+  }
+
+  return _diff;
 #endif
 }
 
@@ -17654,33 +17679,41 @@ block_diff_a2iag_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
     diff = 0U;
   } else {
     /* Mark genome-G to query-A mismatches */
-#ifdef HAVE_SSE2
-    diff = ~(query_shifted[0] | query_shifted[4]) & (ref_ptr[0] & ~(ref_ptr[4]));
-#elif defined(WORDS_BIGENDIAN)
+#ifdef WORDS_BIGENDIAN
     diff = ~(query_shifted[0] | query_shifted[1]) &
       (Bigendian_convert_uint(ref_ptr[0]) & ~Bigendian_convert_uint(ref_ptr[4]));
-#else
+#elif !defined(HAVE_SSE2)
     diff = ~(query_shifted[0] | query_shifted[1]) & (ref_ptr[0] & ~(ref_ptr[4]));
+#else
+    diff = ~(query_shifted[0] | query_shifted[4]) & (ref_ptr[0] & ~(ref_ptr[4]));
 #endif
     debug(printf(" => diff %08X\n",diff));
   }
 
   /* Compare reduced A->G nts  */
-#ifdef HAVE_SSE2
-  diff |= ((query_shifted[0] | ~(query_shifted[4])) ^ (ref_ptr[0] | ~(ref_ptr[4]))) | (query_shifted[4] ^ ref_ptr[4]);
-#elif defined(WORDS_BIGENDIAN)
+#ifdef WORDS_BIGENDIAN
   diff |= ((query_shifted[0] | ~(query_shifted[1])) ^ (Bigendian_convert_uint(ref_ptr[0]) | ~(Bigendian_convert_uint(ref_ptr[4])))) |
     (query_shifted[1] ^ Bigendian_convert_uint(ref_ptr[4]));
-#else
+#elif !defined(HAVE_SSE2)
   diff |= ((query_shifted[0] | ~(query_shifted[1])) ^ (ref_ptr[0] | ~(ref_ptr[4]))) | (query_shifted[1] ^ ref_ptr[4]);
   /* Because (a ^ b) = (~a ^ ~b), this is equivalent to 
   diff |= ((~query_shifted[0] & query_shifted[1]) ^ (~ref_ptr[0] & ref_ptr[4])) | (query_shifted[1] ^ ref_ptr[4]);
   */
+#else
+  diff |= ((query_shifted[0] | ~(query_shifted[4])) ^ (ref_ptr[0] | ~(ref_ptr[4]))) | (query_shifted[4] ^ ref_ptr[4]);
 #endif
   debug(printf(" => diff %08X\n",diff));
 
   /* Flags: Considering N as a mismatch */
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+  if (query_unk_mismatch_local_p) {
+    debug(printf("Marking query flags: query %08X ",query_shifted[2]));
+    diff |= query_shifted[2];
+  } else {
+    debug(printf("Clearing query flags: query %08X ",query_shifted[2]));
+    diff &= ~(query_shifted[2]);
+  }
+#else
   if (query_unk_mismatch_local_p) {
     debug(printf("Marking query flags: query %08X ",query_shifted[8]));
     diff |= query_shifted[8];
@@ -17688,7 +17721,64 @@ block_diff_a2iag_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
     debug(printf("Clearing query flags: query %08X ",query_shifted[8]));
     diff &= ~(query_shifted[8]);
   }
+#endif
+
+  if (genome_unk_mismatch_p) {
+    debug(printf("Marking genome flags: genome %08X ",ref_ptr[8]));
+#ifdef WORDS_BIGENDIAN
+    diff |= Bigendian_convert_uint(ref_ptr[8]);
+#else
+    diff |= (ref_ptr[8]);
+#endif
+  } else {
+    debug(printf("Clearing genome flags: genome %08X ",ref_ptr[8]));
+#ifdef WORDS_BIGENDIAN
+    diff &= ~(Bigendian_convert_uint(ref_ptr[8]));
+#else
+    diff &= ~(ref_ptr[8]);
+#endif
+  }
+  debug(printf(" => diff %08X\n",diff));
+
+  return diff;
+}
+
+
+/* Convert A->G: new high = high | ~low */
+static Genomediff_T
+block_diff_a2iag (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
+		  bool query_unk_mismatch_local_p, bool sarrayp) {
+
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+  UINT4 diff;
+
+  if (sarrayp == true) {
+    /* Ignore genome-G to query-A mismatches.  Convert everything to 3-nucleotide space. */
+    diff = 0U;
+  } else {
+    /* Mark genome-G to query-A mismatches */
+#ifdef WORDS_BIGENDIAN
+    diff = ~(query_shifted[0] | query_shifted[1]) &
+      (Bigendian_convert_uint(ref_ptr[0]) & ~Bigendian_convert_uint(ref_ptr[4]));
+#else
+    diff = ~(query_shifted[0] | query_shifted[1]) & (ref_ptr[0] & ~(ref_ptr[4]));
+#endif
+    debug(printf(" => diff %08X\n",diff));
+  }
+
+  /* Compare reduced A->G nts  */
+#ifdef WORDS_BIGENDIAN
+  diff |= ((query_shifted[0] | ~(query_shifted[1])) ^ (Bigendian_convert_uint(ref_ptr[0]) | ~(Bigendian_convert_uint(ref_ptr[4])))) |
+    (query_shifted[1] ^ Bigendian_convert_uint(ref_ptr[4]));
 #else
+  diff |= ((query_shifted[0] | ~(query_shifted[1])) ^ (ref_ptr[0] | ~(ref_ptr[4]))) | (query_shifted[1] ^ ref_ptr[4]);
+  /* Because (a ^ b) = (~a ^ ~b), this is equivalent to 
+  diff |= ((~query_shifted[0] & query_shifted[1]) ^ (~ref_ptr[0] & ref_ptr[4])) | (query_shifted[1] ^ ref_ptr[4]);
+  */
+#endif
+  debug(printf(" => diff %08X\n",diff));
+
+  /* Flags: Considering N as a mismatch */
   if (query_unk_mismatch_local_p) {
     debug(printf("Marking query flags: query %08X ",query_shifted[2]));
     diff |= query_shifted[2];
@@ -17696,7 +17786,6 @@ block_diff_a2iag_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
     debug(printf("Clearing query flags: query %08X ",query_shifted[2]));
     diff &= ~(query_shifted[2]);
   }
-#endif
 
   if (genome_unk_mismatch_p) {
     debug(printf("Marking genome flags: genome %08X ",ref_ptr[8]));
@@ -17716,14 +17805,8 @@ block_diff_a2iag_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
   debug(printf(" => diff %08X\n",diff));
 
   return diff;
-}
 
-
-/* Convert A->G: new high = high | ~low */
-static Genomediff_T
-block_diff_a2iag (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
-		  bool query_unk_mismatch_local_p, bool sarrayp) {
-#ifdef HAVE_SSE2
+#else
   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
 
   _query_high = _mm_load_si128((__m128i *) query_shifted);
@@ -17758,37 +17841,52 @@ block_diff_a2iag (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
   }
 
   return _diff;
+#endif
+}
 
-#else
+
+static UINT4
+block_diff_a2itc_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
+		     bool query_unk_mismatch_local_p, bool sarrayp) {
   UINT4 diff;
 
   if (sarrayp == true) {
-    /* Ignore genome-G to query-A mismatches.  Convert everything to 3-nucleotide space. */
+    /* Ignore genome-C to query-T mismatches */
     diff = 0U;
   } else {
-    /* Mark genome-G to query-A mismatches */
+    /* Mark genome-C to query-T mismatches */
 #ifdef WORDS_BIGENDIAN
-    diff = ~(query_shifted[0] | query_shifted[1]) &
-      (Bigendian_convert_uint(ref_ptr[0]) & ~Bigendian_convert_uint(ref_ptr[4]));
+    diff = (query_shifted[0] & query_shifted[1]) &
+      (~(Bigendian_convert_uint(ref_ptr[0])) & Bigendian_convert_uint(ref_ptr[4]));
+#elif !defined(HAVE_SSE2)
+    diff = (query_shifted[0] & query_shifted[1]) & (~(ref_ptr[0]) & ref_ptr[4]);
 #else
-    diff = ~(query_shifted[0] | query_shifted[1]) & (ref_ptr[0] & ~(ref_ptr[4]));
+    diff = (query_shifted[0] & query_shifted[4]) & (~(ref_ptr[0]) & ref_ptr[4]);
 #endif
     debug(printf(" => diff %08X\n",diff));
   }
 
-  /* Compare reduced A->G nts  */
+  /* Compare reduced T->C nts  */
 #ifdef WORDS_BIGENDIAN
-  diff |= ((query_shifted[0] | ~(query_shifted[1])) ^ (Bigendian_convert_uint(ref_ptr[0]) | ~(Bigendian_convert_uint(ref_ptr[4])))) |
+  diff |= ((query_shifted[0] & ~(query_shifted[1])) ^ (Bigendian_convert_uint(ref_ptr[0]) & ~(Bigendian_convert_uint(ref_ptr[4])))) |
     (query_shifted[1] ^ Bigendian_convert_uint(ref_ptr[4]));
+#elif !defined(HAVE_SSE2)
+  diff |= ((query_shifted[0] & ~(query_shifted[1])) ^ (ref_ptr[0] & ~(ref_ptr[4]))) | (query_shifted[1] ^ ref_ptr[4]);
 #else
-  diff |= ((query_shifted[0] | ~(query_shifted[1])) ^ (ref_ptr[0] | ~(ref_ptr[4]))) | (query_shifted[1] ^ ref_ptr[4]);
-  /* Because (a ^ b) = (~a ^ ~b), this is equivalent to 
-  diff |= ((~query_shifted[0] & query_shifted[1]) ^ (~ref_ptr[0] & ref_ptr[4])) | (query_shifted[1] ^ ref_ptr[4]);
-  */
+  diff |= ((query_shifted[0] & ~(query_shifted[4])) ^ (ref_ptr[0] & ~(ref_ptr[4]))) | (query_shifted[4] ^ ref_ptr[4]);
 #endif
   debug(printf(" => diff %08X\n",diff));
 
   /* Flags: Considering N as a mismatch */
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+  if (query_unk_mismatch_local_p) {
+    debug(printf("Marking query flags: query %08X ",query_shifted[8]));
+    diff |= query_shifted[8];
+  } else {
+    debug(printf("Clearing query flags: query %08X ",query_shifted[8]));
+    diff &= ~(query_shifted[8]);
+  }
+#else
   if (query_unk_mismatch_local_p) {
     debug(printf("Marking query flags: query %08X ",query_shifted[2]));
     diff |= query_shifted[2];
@@ -17796,6 +17894,7 @@ block_diff_a2iag (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
     debug(printf("Clearing query flags: query %08X ",query_shifted[2]));
     diff &= ~(query_shifted[2]);
   }
+#endif
 
   if (genome_unk_mismatch_p) {
     debug(printf("Marking genome flags: genome %08X ",ref_ptr[8]));
@@ -17815,13 +17914,15 @@ block_diff_a2iag (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
   debug(printf(" => diff %08X\n",diff));
 
   return diff;
-#endif
 }
 
 
-static UINT4
-block_diff_a2itc_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
-		     bool query_unk_mismatch_local_p, bool sarrayp) {
+/* Convert T->C: new high = high & ~low */
+static Genomediff_T
+block_diff_a2itc (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
+                  bool query_unk_mismatch_local_p, bool sarrayp) {
+
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
   UINT4 diff;
 
   if (sarrayp == true) {
@@ -17829,9 +17930,7 @@ block_diff_a2itc_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
     diff = 0U;
   } else {
     /* Mark genome-C to query-T mismatches */
-#ifdef HAVE_SSE2
-    diff = (query_shifted[0] & query_shifted[4]) & (~(ref_ptr[0]) & ref_ptr[4]);
-#elif defined(WORDS_BIGENDIAN)
+#ifdef WORDS_BIGENDIAN
     diff = (query_shifted[0] & query_shifted[1]) &
       (~(Bigendian_convert_uint(ref_ptr[0])) & Bigendian_convert_uint(ref_ptr[4]));
 #else
@@ -17841,9 +17940,7 @@ block_diff_a2itc_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
   }
 
   /* Compare reduced T->C nts  */
-#ifdef HAVE_SSE2
-  diff |= ((query_shifted[0] & ~(query_shifted[4])) ^ (ref_ptr[0] & ~(ref_ptr[4]))) | (query_shifted[4] ^ ref_ptr[4]);
-#elif defined(WORDS_BIGENDIAN)
+#ifdef WORDS_BIGENDIAN
   diff |= ((query_shifted[0] & ~(query_shifted[1])) ^ (Bigendian_convert_uint(ref_ptr[0]) & ~(Bigendian_convert_uint(ref_ptr[4])))) |
     (query_shifted[1] ^ Bigendian_convert_uint(ref_ptr[4]));
 #else
@@ -17852,7 +17949,6 @@ block_diff_a2itc_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
   debug(printf(" => diff %08X\n",diff));
 
   /* Flags: Considering N as a mismatch */
-#ifdef HAVE_SSE2
   if (query_unk_mismatch_local_p) {
     debug(printf("Marking query flags: query %08X ",query_shifted[2]));
     diff |= query_shifted[2];
@@ -17860,15 +17956,6 @@ block_diff_a2itc_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
     debug(printf("Clearing query flags: query %08X ",query_shifted[2]));
     diff &= ~(query_shifted[2]);
   }
-#else
-  if (query_unk_mismatch_local_p) {
-    debug(printf("Marking query flags: query %08X ",query_shifted[8]));
-    diff |= query_shifted[8];
-  } else {
-    debug(printf("Clearing query flags: query %08X ",query_shifted[8]));
-    diff &= ~(query_shifted[8]);
-  }
-#endif
 
   if (genome_unk_mismatch_p) {
     debug(printf("Marking genome flags: genome %08X ",ref_ptr[8]));
@@ -17888,14 +17975,8 @@ block_diff_a2itc_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
   debug(printf(" => diff %08X\n",diff));
 
   return diff;
-}
-
 
-/* Convert T->C: new high = high & ~low */
-static Genomediff_T
-block_diff_a2itc (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
-                  bool query_unk_mismatch_local_p, bool sarrayp) {
-#ifdef HAVE_SSE2
+#else
   __m128i _diff, _query_high, _query_low, _query_flags, _ref_high, _ref_low, _ref_flags;
 
   _query_high = _mm_load_si128((__m128i *) query_shifted);
@@ -17930,171 +18011,233 @@ block_diff_a2itc (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
   }
 
   return _diff;
+#endif
+}
 
-#else
-  UINT4 diff;
 
-  if (sarrayp == true) {
-    /* Ignore genome-C to query-T mismatches */
-    diff = 0U;
+static UINT4
+block_diff_atoi_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
+		    bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
+  if (genestrand == +2) {
+    if (plusp) {
+      return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    } else {
+      return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    }
   } else {
-    /* Mark genome-C to query-T mismatches */
-#ifdef WORDS_BIGENDIAN
-    diff = (query_shifted[0] & query_shifted[1]) &
-      (~(Bigendian_convert_uint(ref_ptr[0])) & Bigendian_convert_uint(ref_ptr[4]));
-#else
-    diff = (query_shifted[0] & query_shifted[1]) & (~(ref_ptr[0]) & ref_ptr[4]);
-#endif
-    debug(printf(" => diff %08X\n",diff));
+    if (plusp) {
+      return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    } else {
+      return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    }
   }
+}
 
-  /* Compare reduced T->C nts  */
-#ifdef WORDS_BIGENDIAN
-  diff |= ((query_shifted[0] & ~(query_shifted[1])) ^ (Bigendian_convert_uint(ref_ptr[0]) & ~(Bigendian_convert_uint(ref_ptr[4])))) |
-    (query_shifted[1] ^ Bigendian_convert_uint(ref_ptr[4]));
-#else
-  diff |= ((query_shifted[0] & ~(query_shifted[1])) ^ (ref_ptr[0] & ~(ref_ptr[4]))) | (query_shifted[1] ^ ref_ptr[4]);
-#endif
-  debug(printf(" => diff %08X\n",diff));
 
-  /* Flags: Considering N as a mismatch */
-  if (query_unk_mismatch_local_p) {
-    debug(printf("Marking query flags: query %08X ",query_shifted[2]));
-    diff |= query_shifted[2];
+static Genomediff_T
+block_diff_atoi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
+		 bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
+  if (genestrand == +2) {
+    if (plusp) {
+      return block_diff_a2itc(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    } else {
+      return block_diff_a2iag(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    }
   } else {
-    debug(printf("Clearing query flags: query %08X ",query_shifted[2]));
-    diff &= ~(query_shifted[2]);
+    if (plusp) {
+      return block_diff_a2iag(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    } else {
+      return block_diff_a2itc(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    }
   }
+}
 
-  if (genome_unk_mismatch_p) {
-    debug(printf("Marking genome flags: genome %08X ",ref_ptr[8]));
-#ifdef WORDS_BIGENDIAN
-    diff |= Bigendian_convert_uint(ref_ptr[8]);
-#else
-    diff |= (ref_ptr[8]);
-#endif
+static UINT4
+block_diff_atoi_sarray_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
+			   bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
+  if (genestrand == +2) {
+    if (plusp) {
+      return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/true);
+    } else {
+      return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/true);
+    }
   } else {
-    debug(printf("Clearing genome flags: genome %08X ",ref_ptr[8]));
-#ifdef WORDS_BIGENDIAN
-    diff &= ~(Bigendian_convert_uint(ref_ptr[8]));
-#else
-    diff &= ~(ref_ptr[8]);
-#endif
+    if (plusp) {
+      return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/true);
+    } else {
+      return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/true);
+    }
   }
-  debug(printf(" => diff %08X\n",diff));
+}
 
-  return diff;
-#endif
+static Genomediff_T
+block_diff_atoi_sarray (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
+			bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
+  if (genestrand == +2) {
+    if (plusp) {
+      return block_diff_a2itc(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/true);
+    } else {
+      return block_diff_a2iag(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/true);
+    }
+  } else {
+    if (plusp) {
+      return block_diff_a2iag(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/true);
+    } else {
+      return block_diff_a2itc(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/true);
+    }
+  }
+}
+
+/* Ignores snp_ptr */
+static UINT4
+block_diff_atoi_snp_32 (Genomecomp_T *query_shifted, Genomecomp_T *snp_ptr, Genomecomp_T *ref_ptr,
+			bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
+  if (genestrand == +2) {
+    if (plusp) {
+      return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    } else {
+      return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    }
+  } else {
+    if (plusp) {
+      return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    } else {
+      return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    }
+  }
 }
 
+/* Ignores snp_ptr */
+static Genomediff_T
+block_diff_atoi_snp (Genomecomp_T *query_shifted, Genomecomp_T *snp_ptr, Genomecomp_T *ref_ptr,
+		     bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
+  if (genestrand == +2) {
+    if (plusp) {
+      return block_diff_a2itc(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    } else {
+      return block_diff_a2iag(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    }
+  } else {
+    if (plusp) {
+      return block_diff_a2iag(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    } else {
+      return block_diff_a2itc(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    }
+  }
+}
+
+
+/************************************************************************
+ *  TTOC
+ ************************************************************************/
 
 static UINT4
-block_diff_atoi_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
+block_diff_ttoc_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
 		    bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
   if (genestrand == +2) {
     if (plusp) {
-      return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
-    } else {
       return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    } else {
+      return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
     }
   } else {
     if (plusp) {
-      return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
-    } else {
       return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    } else {
+      return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
     }
   }
 }
 
 
 static Genomediff_T
-block_diff_atoi (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
+block_diff_ttoc (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
 		 bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
   if (genestrand == +2) {
     if (plusp) {
-      return block_diff_a2itc(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
-    } else {
       return block_diff_a2iag(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    } else {
+      return block_diff_a2itc(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
     }
   } else {
     if (plusp) {
-      return block_diff_a2iag(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
-    } else {
       return block_diff_a2itc(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    } else {
+      return block_diff_a2iag(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
     }
   }
 }
 
 static UINT4
-block_diff_atoi_sarray_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
+block_diff_ttoc_sarray_32 (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
 			   bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
   if (genestrand == +2) {
     if (plusp) {
-      return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/true);
-    } else {
       return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/true);
+    } else {
+      return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/true);
     }
   } else {
     if (plusp) {
-      return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/true);
-    } else {
       return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/true);
+    } else {
+      return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/true);
     }
   }
 }
 
 static Genomediff_T
-block_diff_atoi_sarray (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
+block_diff_ttoc_sarray (Genomecomp_T *query_shifted, Genomecomp_T *ref_ptr,
 			bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
   if (genestrand == +2) {
     if (plusp) {
-      return block_diff_a2itc(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/true);
-    } else {
       return block_diff_a2iag(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/true);
+    } else {
+      return block_diff_a2itc(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/true);
     }
   } else {
     if (plusp) {
-      return block_diff_a2iag(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/true);
-    } else {
       return block_diff_a2itc(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/true);
+    } else {
+      return block_diff_a2iag(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/true);
     }
   }
 }
 
 /* Ignores snp_ptr */
 static UINT4
-block_diff_atoi_snp_32 (Genomecomp_T *query_shifted, Genomecomp_T *snp_ptr, Genomecomp_T *ref_ptr,
+block_diff_ttoc_snp_32 (Genomecomp_T *query_shifted, Genomecomp_T *snp_ptr, Genomecomp_T *ref_ptr,
 			bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
   if (genestrand == +2) {
     if (plusp) {
-      return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
-    } else {
       return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    } else {
+      return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
     }
   } else {
     if (plusp) {
-      return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
-    } else {
       return block_diff_a2itc_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    } else {
+      return block_diff_a2iag_32(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
     }
   }
 }
 
 /* Ignores snp_ptr */
 static Genomediff_T
-block_diff_atoi_snp (Genomecomp_T *query_shifted, Genomecomp_T *snp_ptr, Genomecomp_T *ref_ptr,
+block_diff_ttoc_snp (Genomecomp_T *query_shifted, Genomecomp_T *snp_ptr, Genomecomp_T *ref_ptr,
 		     bool plusp, int genestrand, bool query_unk_mismatch_local_p) {
   if (genestrand == +2) {
     if (plusp) {
-      return block_diff_a2itc(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
-    } else {
       return block_diff_a2iag(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    } else {
+      return block_diff_a2itc(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
     }
   } else {
     if (plusp) {
-      return block_diff_a2iag(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
-    } else {
       return block_diff_a2itc(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
+    } else {
+      return block_diff_a2iag(query_shifted,ref_ptr,query_unk_mismatch_local_p,/*sarrayp*/false);
     }
   }
 }
@@ -18117,7 +18260,9 @@ static Diffproc_snp_32_T block_diff_snp_32;
 static Diffproc_T block_diff_sarray; 
 static Diffproc_32_T block_diff_sarray_32; 
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+/* Skip */
+#else
 static __m128i _BOUND_HIGH;
 static __m128i _BOUND_LOW;
 #endif
@@ -18126,7 +18271,9 @@ void
 Genome_hr_setup (Genomecomp_T *ref_blocks_in, Genomecomp_T *snp_blocks_in,
 		 bool query_unk_mismatch_p_in, bool genome_unk_mismatch_p_in,
 		 Mode_T mode) {
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+/* Skip */
+#else
   _BOUND_HIGH = _mm_set_epi32(128,96,64,32);
   _BOUND_LOW = _mm_set_epi32(96,64,32,0);
 #endif
@@ -18155,6 +18302,12 @@ Genome_hr_setup (Genomecomp_T *ref_blocks_in, Genomecomp_T *snp_blocks_in,
     block_diff_32 = block_diff_atoi_32;
     block_diff_sarray_32 = block_diff_atoi_sarray_32;
     break;
+  case TTOC_STRANDED: case TTOC_NONSTRANDED:
+    block_diff = block_diff_ttoc;
+    block_diff_sarray = block_diff_ttoc_sarray;
+    block_diff_32 = block_diff_ttoc_32;
+    block_diff_sarray_32 = block_diff_ttoc_sarray_32;
+    break;
   default: fprintf(stderr,"Mode %d not recognized\n",mode); abort();
   }
 
@@ -18175,6 +18328,10 @@ Genome_hr_setup (Genomecomp_T *ref_blocks_in, Genomecomp_T *snp_blocks_in,
     block_diff_snp = block_diff_atoi_snp;
     block_diff_snp_32 = block_diff_atoi_snp_32;
     break;
+  case TTOC_STRANDED: case TTOC_NONSTRANDED:
+    block_diff_snp = block_diff_ttoc_snp;
+    block_diff_snp_32 = block_diff_ttoc_snp_32;
+    break;
   default: fprintf(stderr,"Mode %d not recognized\n",mode); abort();
   }
 #endif
@@ -18205,6 +18362,10 @@ Genome_hr_user_setup (UINT4 *ref_blocks_in,
     block_diff = block_diff_atoi;
     block_diff_32 = block_diff_atoi_32;
     break;
+  case TTOC_STRANDED: case TTOC_NONSTRANDED:
+    block_diff = block_diff_ttoc;
+    block_diff_32 = block_diff_ttoc_32;
+    break;
   default: fprintf(stderr,"Mode %d not recognized\n",mode); abort();
   }
 
@@ -18225,6 +18386,10 @@ Genome_hr_user_setup (UINT4 *ref_blocks_in,
     block_diff_snp = block_diff_atoi_snp;
     block_diff_snp_32 = block_diff_atoi_snp_32;
     break;
+  case TTOC_STRANDED: case TTOC_NONSTRANDED:
+    block_diff_snp = block_diff_ttoc_snp;
+    block_diff_snp_32 = block_diff_ttoc_snp_32;
+    break;
   default: fprintf(stderr,"Mode %d not recognized\n",mode); abort();
   }
 #endif
@@ -18251,7 +18416,72 @@ Genome_hr_user_setup (UINT4 *ref_blocks_in,
 #define set_end_mask(enddiscard) (~0U << enddiscard)
 
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+
+#define nonzero_p(diff) diff
+
+#define clear_start(diff,startdiscard) (diff & (~0U << (startdiscard)))
+#define clear_end(diff,enddiscard) (diff & ~(~0U << (enddiscard)))
+
+/* Same speed: clear_highbit(diff,relpos) (diff - (HIGH_BIT >> relpos)) */
+/* Note: xor assumes that bit at relpos was on */
+#define clear_highbit(diff,relpos) (diff ^ (HIGH_BIT >> relpos))
+
+/* Slower: clear_lowbit(diff,relpos) diff -= (1 << relpos) */
+#define clear_lowbit(diff,relpos) (diff & (diff - 1));
+
+
+#ifdef HAVE_POPCNT
+#define popcount_ones(diff) (_popcnt32(diff))
+#elif defined(HAVE_MM_POPCNT)
+#define popcount_ones(diff) (_mm_popcnt_u32(diff))
+#elif defined(HAVE_BUILTIN_POPCOUNT)
+#define popcount_ones(diff) (__builtin_popcount(diff))
+#else
+#define popcount_ones(diff) (count_bits[diff & 0x0000FFFF] + count_bits[diff >> 16])
+#endif
+
+
+#ifdef HAVE_LZCNT
+#define count_leading_zeroes(diff) _lzcnt_u32(diff)
+#elif defined(HAVE_BUILTIN_CLZ)
+#define count_leading_zeroes(diff) __builtin_clz(diff)
+#else
+#define count_leading_zeroes(diff) ((diff >> 16) ? clz_table[diff >> 16] : 16 + clz_table[diff])
+#endif
+
+#ifdef HAVE_TZCNT
+#define count_trailing_zeroes(diff) _tzcnt_u32(diff)
+#elif defined(HAVE_BUILTIN_CTZ)
+#define count_trailing_zeroes(diff) __builtin_ctz(diff)
+#else
+/* lowbit = -diff & diff */
+#define count_trailing_zeroes(diff) mod_37_bit_position[(-diff & diff) % 37]
+#endif
+
+/* For trimming */
+#define set_start(diff,startdiscard) (diff | ~(~0U << startdiscard))
+#define set_end(diff,enddiscard) (diff | (~0U << enddiscard))
+
+static void
+print_diff_popcount (UINT4 diff) {
+  printf("diff: %08X => nmismatches %d\n",diff,popcount_ones(diff));
+  return;
+}
+
+static void
+print_diff_trailing_zeroes (UINT4 diff, int offset) {
+  printf("diff: %08X => offset %d + trailing zeroes %d\n",diff,offset,count_trailing_zeroes(diff));
+  return;
+}
+
+static void
+print_diff_leading_zeroes (UINT4 diff, int offset) {
+  printf("diff: %08X => offset %d - leading zeroes %d\n",diff,offset,count_leading_zeroes(diff));
+  return;
+}
+
+#else  /* littleendian and SSE2 */
 
 #ifdef HAVE_SSE4_1
 #define nonzero_p(diff) !_mm_testz_si128(diff,diff)
@@ -18625,72 +18855,7 @@ print_diff_leading_zeroes (__m128i _diff, int offset) {
 }
 #endif
 
-#else
-
-#define nonzero_p(diff) diff
-
-#define clear_start(diff,startdiscard) (diff & (~0U << (startdiscard)))
-#define clear_end(diff,enddiscard) (diff & ~(~0U << (enddiscard)))
-
-/* Same speed: clear_highbit(diff,relpos) (diff - (HIGH_BIT >> relpos)) */
-/* Note: xor assumes that bit at relpos was on */
-#define clear_highbit(diff,relpos) (diff ^ (HIGH_BIT >> relpos))
-
-/* Slower: clear_lowbit(diff,relpos) diff -= (1 << relpos) */
-#define clear_lowbit(diff,relpos) (diff & (diff - 1));
-
-
-#ifdef HAVE_POPCNT
-#define popcount_ones(diff) (_popcnt32(diff))
-#elif defined(HAVE_MM_POPCNT)
-#define popcount_ones(diff) (_mm_popcnt_u32(diff))
-#elif defined(HAVE_BUILTIN_POPCOUNT)
-#define popcount_ones(diff) (__builtin_popcount(diff))
-#else
-#define popcount_ones(diff) (count_bits[diff & 0x0000FFFF] + count_bits[diff >> 16])
-#endif
-
-
-#ifdef HAVE_LZCNT
-#define count_leading_zeroes(diff) _lzcnt_u32(diff)
-#elif defined(HAVE_BUILTIN_CLZ)
-#define count_leading_zeroes(diff) __builtin_clz(diff)
-#else
-#define count_leading_zeroes(diff) ((diff >> 16) ? clz_table[diff >> 16] : 16 + clz_table[diff])
-#endif
-
-#ifdef HAVE_TZCNT
-#define count_trailing_zeroes(diff) _tzcnt_u32(diff)
-#elif defined(HAVE_BUILTIN_CTZ)
-#define count_trailing_zeroes(diff) __builtin_ctz(diff)
-#else
-/* lowbit = -diff & diff */
-#define count_trailing_zeroes(diff) mod_37_bit_position[(-diff & diff) % 37]
-#endif
-
-/* For trimming */
-#define set_start(diff,startdiscard) (diff | ~(~0U << startdiscard))
-#define set_end(diff,enddiscard) (diff | (~0U << enddiscard))
-
-static void
-print_diff_popcount (UINT4 diff) {
-  printf("diff: %08X => nmismatches %d\n",diff,popcount_ones(diff));
-  return;
-}
-
-static void
-print_diff_trailing_zeroes (UINT4 diff, int offset) {
-  printf("diff: %08X => offset %d + trailing zeroes %d\n",diff,offset,count_trailing_zeroes(diff));
-  return;
-}
-
-static void
-print_diff_leading_zeroes (UINT4 diff, int offset) {
-  printf("diff: %08X => offset %d - leading zeroes %d\n",diff,offset,count_leading_zeroes(diff));
-  return;
-}
-
-#endif
+#endif	/* littleendian and SSE2 */
 
 
 #define nonzero_p_32(diff) diff
@@ -18787,12 +18952,13 @@ Genome_consecutive_matches_rightward (Compress_T query_compress, Univcoord_T lef
     debug(printf("nshift = %d, startdiscard = %u, enddiscard = %u\n",nshift,startdiscard,enddiscard));
 
 
-    diff_32 = (block_diff_sarray_32)(query_shifted
-#ifdef HAVE_SSE2
-				     + startcolumni
-#endif
-				     ,&(ref_blocks[startblocki_32]),
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    diff_32 = (block_diff_sarray_32)(query_shifted,&(ref_blocks[startblocki_32]),
+                                     plusp,genestrand,/*query_unk_mismatch_local_p*/true);
+#else
+    diff_32 = (block_diff_sarray_32)(query_shifted + startcolumni,&(ref_blocks[startblocki_32]),
                                      plusp,genestrand,/*query_unk_mismatch_local_p*/true);
+#endif
     diff_32 = clear_start_32(diff_32,startdiscard);
     diff_32 = clear_end_32(diff_32,enddiscard);
 
@@ -18818,8 +18984,7 @@ Genome_consecutive_matches_rightward (Compress_T query_compress, Univcoord_T lef
   else {
 #endif
 
-#ifdef HAVE_SSE2
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     startblocki = startblocki_32;
     endblocki = endblocki_32;
 #endif
@@ -18835,7 +19000,8 @@ Genome_consecutive_matches_rightward (Compress_T query_compress, Univcoord_T lef
 #endif
 
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   if (endblocki == startblocki) {
     diff = (block_diff_sarray)(query_shifted,&(ref_blocks[startblocki]),
 			       plusp,genestrand,/*query_unk_mismatch_local_p*/true);
@@ -18869,11 +19035,11 @@ Genome_consecutive_matches_rightward (Compress_T query_compress, Univcoord_T lef
     }
 
     query_shifted += COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-    ptr = &(ref_blocks[startblocki+12]);
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     ptr = &(ref_blocks[startblocki]);
     ptr += 1; if (++startcolumni == 4) {ptr += 8; startcolumni = 0;}
+#else
+    ptr = &(ref_blocks[startblocki+12]);
 #endif
     end = &(ref_blocks[endblocki]);
     offset += STEP_SIZE; /* 128 or 32 */
@@ -18888,10 +19054,10 @@ Genome_consecutive_matches_rightward (Compress_T query_compress, Univcoord_T lef
       }
 
       query_shifted += COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-      ptr += 12;
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
       ptr += 1; if (++startcolumni == 4) {ptr += 8; startcolumni = 0;}
+#else
+      ptr += 12;
 #endif
       offset += STEP_SIZE; /* 128 or 32 */
     }
@@ -18911,7 +19077,8 @@ Genome_consecutive_matches_rightward (Compress_T query_compress, Univcoord_T lef
       return (pos3 - pos5);
     }
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   }
 #endif
 }
@@ -18966,13 +19133,13 @@ Genome_consecutive_matches_leftward (Compress_T query_compress, Univcoord_T left
     offset = (pos3 - 1) - enddiscard + 32;
     debug(printf("nshift = %d, startdiscard = %u, enddiscard = %u\n",nshift,startdiscard,enddiscard));
 
-
-    diff_32 = (block_diff_sarray_32)(query_shifted
-#ifdef HAVE_SSE2
-				     + endcolumni
-#endif
-				     ,&(ref_blocks[endblocki_32]),
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    diff_32 = (block_diff_sarray_32)(query_shifted,&(ref_blocks[endblocki_32]),
+				     plusp,genestrand,/*query_unk_mismatch_local_p*/true);
+#else
+    diff_32 = (block_diff_sarray_32)(query_shifted + endcolumni,&(ref_blocks[endblocki_32]),
 				     plusp,genestrand,/*query_unk_mismatch_local_p*/true);
+#endif
     diff_32 = clear_start_32(diff_32,startdiscard);
     diff_32 = clear_end_32(diff_32,enddiscard);
 
@@ -18998,8 +19165,7 @@ Genome_consecutive_matches_leftward (Compress_T query_compress, Univcoord_T left
   else {
 #endif
 
-#ifdef HAVE_SSE2
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     startblocki = startblocki_32;
     endblocki = endblocki_32;
 #endif
@@ -19015,7 +19181,8 @@ Genome_consecutive_matches_leftward (Compress_T query_compress, Univcoord_T left
 #endif
 
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   if (startblocki == endblocki) {
     diff = (block_diff_sarray)(query_shifted,&(ref_blocks[endblocki]),
 			       plusp,genestrand,/*query_unk_mismatch_local_p*/true);
@@ -19049,11 +19216,11 @@ Genome_consecutive_matches_leftward (Compress_T query_compress, Univcoord_T left
     }
 
     query_shifted -= COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-    ptr = &(ref_blocks[endblocki-12]);
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     ptr = &(ref_blocks[endblocki]);
     ptr -= 1; if (endcolumni-- == 0) {ptr -= 8; endcolumni = 3;}
+#else
+    ptr = &(ref_blocks[endblocki-12]);
 #endif
     start = &(ref_blocks[startblocki]);
     offset -= STEP_SIZE; /* 128 or 32 */
@@ -19068,10 +19235,10 @@ Genome_consecutive_matches_leftward (Compress_T query_compress, Univcoord_T left
       }
 
       query_shifted -= COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-      ptr -= 12;
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
       ptr -= 1; if (endcolumni-- == 0) {ptr -= 8; endcolumni = 3;}
+#else
+      ptr -= 12;
 #endif
       offset -= STEP_SIZE; /* 128 or 32 */
     }
@@ -19091,7 +19258,8 @@ Genome_consecutive_matches_leftward (Compress_T query_compress, Univcoord_T left
       return (pos3 - pos5);
     }
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   }
 #endif
 }
@@ -19162,14 +19330,24 @@ Genome_consecutive_matches_pair (UINT4 lefta, UINT4 leftb, UINT4 genomelength) {
 
       ptr1 = &(ref_blocks[startblocki_1]);
       ptr2 = &(ref_blocks[startblocki_2]);
+#ifdef WORDS_BIGENDIAN
+      shifted1[0] = Bigendian_convert_uint(ptr1[0]) << nshift;
+      shifted1[1] = Bigendian_convert_uint(ptr1[4]) << nshift;
+      shifted1[2] = Bigendian_convert_uint(ptr1[8]) << nshift;
+#else
       shifted1[0] = ptr1[0] << nshift;
       shifted1[1] = ptr1[4] << nshift;
       shifted1[2] = ptr1[8] << nshift;
+#endif
       debug2(Compress_print_one_block(ptr1));
       debug2(Compress_print_one_block(ptr2));
       debug2(Compress_print_one_block(shifted1));
 
+#ifdef WORDS_BIGENDIAN
+      diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
+#else
       diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
+#endif
       diff = clear_start_32(diff,startdiscard);
       diff = clear_end_32(diff,enddiscard);
 
@@ -19195,14 +19373,24 @@ Genome_consecutive_matches_pair (UINT4 lefta, UINT4 leftb, UINT4 genomelength) {
       /* Block 1 */
       ptr1 = &(ref_blocks[startblocki_1]);
       ptr2 = &(ref_blocks[startblocki_2]);
+#ifdef WORDS_BIGENDIAN
+      shifted1[0] = Bigendian_convert_uint(ptr1[0]) << nshift;
+      shifted1[1] = Bigendian_convert_uint(ptr1[4]) << nshift;
+      shifted1[2] = Bigendian_convert_uint(ptr1[8]) << nshift;
+#else
       shifted1[0] = ptr1[0] << nshift;
       shifted1[1] = ptr1[4] << nshift;
       shifted1[2] = ptr1[8] << nshift;
+#endif
       debug2(Compress_print_one_block(ptr1));
       debug2(Compress_print_one_block(ptr2));
       debug2(Compress_print_one_block(shifted1));
 
+#ifdef WORDS_BIGENDIAN
+      diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
+#else
       diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
+#endif
       diff = clear_start_32(diff,startdiscard);
 
       if (diff /* != 0U */) {
@@ -19223,17 +19411,31 @@ Genome_consecutive_matches_pair (UINT4 lefta, UINT4 leftb, UINT4 genomelength) {
       /* Block 2 */
       if (nshift == 0) {
 	/* rightshift of 32 is a no-op */
+#ifdef WORDS_BIGENDIAN
+	shifted1[0] = Bigendian_convert_uint(ptr1[0]); shifted1[1] = Bigendian_convert_uint(ptr1[4]); shifted1[2] = Bigendian_convert_uint(ptr1[8]);
+#else
 	shifted1[0] = ptr1[0]; shifted1[1] = ptr1[4]; shifted1[2] = ptr1[8];
+#endif
       } else {
+#ifdef WORDS_BIGENDIAN
+	shifted1[0] = (Bigendian_convert_uint(ptr1[0]) << nshift) | (Bigendian_convert_uint(ptr1_prev[0]) >> rightshift);
+	shifted1[1] = (Bigendian_convert_uint(ptr1[4]) << nshift) | (Bigendian_convert_uint(ptr1_prev[4]) >> rightshift);
+	shifted1[2] = (Bigendian_convert_uint(ptr1[8]) << nshift) | (Bigendian_convert_uint(ptr1_prev[8]) >> rightshift);
+#else
 	shifted1[0] = (ptr1[0] << nshift) | (ptr1_prev[0] >> rightshift);
 	shifted1[1] = (ptr1[4] << nshift) | (ptr1_prev[4] >> rightshift);
 	shifted1[2] = (ptr1[8] << nshift) | (ptr1_prev[8] >> rightshift);
+#endif
       }
       debug2(Compress_print_one_block(ptr1));
       debug2(Compress_print_one_block(ptr2));
       debug2(Compress_print_one_block(shifted1));
 
+#ifdef WORDS_BIGENDIAN
+      diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
+#else
       diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
+#endif
       diff = clear_end_32(diff,enddiscard);
 
       if (diff /* != 0U */) {
@@ -19256,14 +19458,24 @@ Genome_consecutive_matches_pair (UINT4 lefta, UINT4 leftb, UINT4 genomelength) {
 
     ptr1 = &(ref_blocks[startblocki_1]);
     ptr2 = &(ref_blocks[startblocki_2]);
+#ifdef WORDS_BIGENDIAN
+    shifted1[0] = Bigendian_convert_uint(ptr1[0]) << nshift;
+    shifted1[1] = Bigendian_convert_uint(ptr1[4]) << nshift;
+    shifted1[2] = Bigendian_convert_uint(ptr1[8]) << nshift;
+#else
     shifted1[0] = ptr1[0] << nshift;
     shifted1[1] = ptr1[4] << nshift;
     shifted1[2] = ptr1[8] << nshift;
+#endif
     debug2(Compress_print_one_block(ptr1));
     debug2(Compress_print_one_block(ptr2));
     debug2(Compress_print_one_block(shifted1));
 
+#ifdef WORDS_BIGENDIAN
+    diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
+#else
     diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
+#endif
     diff = clear_start_32(diff,startdiscard);
     diff = clear_end_32(diff,enddiscard);
 
@@ -19285,14 +19497,24 @@ Genome_consecutive_matches_pair (UINT4 lefta, UINT4 leftb, UINT4 genomelength) {
     /* Startblock */
     ptr1 = &(ref_blocks[startblocki_1]);
     ptr2 = &(ref_blocks[startblocki_2]);
+#ifdef WORDS_BIGENDIAN
+    shifted1[0] = (Bigendian_convert_uint(ptr1[0]) << nshift);
+    shifted1[1] = (Bigendian_convert_uint(ptr1[4]) << nshift);
+    shifted1[2] = (Bigendian_convert_uint(ptr1[8]) << nshift);
+#else
     shifted1[0] = (ptr1[0] << nshift);
     shifted1[1] = (ptr1[4] << nshift);
     shifted1[2] = (ptr1[8] << nshift);
+#endif
     debug2(Compress_print_one_block(ptr1));
     debug2(Compress_print_one_block(ptr2));
     debug2(Compress_print_one_block(shifted1));
 
+#ifdef WORDS_BIGENDIAN
+    diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
+#else
     diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
+#endif
     diff = clear_start_32(diff,startdiscard);
 
     if (diff /* != 0U */) {
@@ -19313,17 +19535,31 @@ Genome_consecutive_matches_pair (UINT4 lefta, UINT4 leftb, UINT4 genomelength) {
     while (ptr1 < end && ptr2 < end) {
       if (nshift == 0) {
 	/* rightshift of 32 is a no-op */
+#ifdef WORDS_BIGENDIAN
+	shifted1[0] = Bigendian_convert_uint(ptr1[0]); shifted1[1] = Bigendian_convert_uint(ptr1[4]); shifted1[2] = Bigendian_convert_uint(ptr1[8]);
+#else
 	shifted1[0] = ptr1[0]; shifted1[1] = ptr1[4]; shifted1[2] = ptr1[8];
+#endif
       } else {
+#ifdef WORDS_BIGENDIAN
+	shifted1[0] = (Bigendian_convert_uint(ptr1[0]) << nshift) | (Bigendian_convert_uint(ptr1_prev[0]) >> rightshift);
+	shifted1[1] = (Bigendian_convert_uint(ptr1[4]) << nshift) | (Bigendian_convert_uint(ptr1_prev[4]) >> rightshift);
+	shifted1[2] = (Bigendian_convert_uint(ptr1[8]) << nshift) | (Bigendian_convert_uint(ptr1_prev[8]) >> rightshift);
+#else
 	shifted1[0] = (ptr1[0] << nshift) | (ptr1_prev[0] >> rightshift);
 	shifted1[1] = (ptr1[4] << nshift) | (ptr1_prev[4] >> rightshift);
 	shifted1[2] = (ptr1[8] << nshift) | (ptr1_prev[8] >> rightshift);
+#endif
       }
       debug2(Compress_print_one_block(ptr1));
       debug2(Compress_print_one_block(ptr2));
       debug2(Compress_print_one_block(shifted1));
 
+#ifdef WORDS_BIGENDIAN
+      diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
+#else
       diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
+#endif
       if (diff /* != 0U */) {
 #ifdef HAVE_BUILTIN_CTZ
 	mismatch_position = offset + (relpos = __builtin_ctz(diff));
@@ -19363,17 +19599,31 @@ Genome_consecutive_matches_pair (UINT4 lefta, UINT4 leftb, UINT4 genomelength) {
     /* Block 1 */
     if (nshift == 0) {
       /* rightshift of 32 is a no-op */
+#ifdef WORDS_BIGENDIAN
+      shifted1[0] = Bigendian_convert_uint(ptr1[0]); shifted1[1] = Bigendian_convert_uint(ptr1[4]); shifted1[2] = Bigendian_convert_uint(ptr1[8]);
+#else
       shifted1[0] = ptr1[0]; shifted1[1] = ptr1[4]; shifted1[2] = ptr1[8];
+#endif
     } else {
+#ifdef WORDS_BIGENDIAN
+      shifted1[0] = (Bigendian_convert_uint(ptr1[0]) << nshift) | (Bigendian_convert_uint(ptr1_prev[0]) >> rightshift);
+      shifted1[1] = (Bigendian_convert_uint(ptr1[4]) << nshift) | (Bigendian_convert_uint(ptr1_prev[4]) >> rightshift);
+      shifted1[2] = (Bigendian_convert_uint(ptr1[8]) << nshift) | (Bigendian_convert_uint(ptr1_prev[8]) >> rightshift);
+#else
       shifted1[0] = (ptr1[0] << nshift) | (ptr1_prev[0] >> rightshift);
       shifted1[1] = (ptr1[4] << nshift) | (ptr1_prev[4] >> rightshift);
       shifted1[2] = (ptr1[8] << nshift) | (ptr1_prev[8] >> rightshift);
+#endif
     }
     debug2(Compress_print_one_block(ptr1));
     debug2(Compress_print_one_block(ptr2));
     debug2(Compress_print_one_block(shifted1));
 
+#ifdef WORDS_BIGENDIAN
+    diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
+#else
     diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
+#endif
     if (nblocks == 1) {
       diff = clear_end_32(diff,enddiscard);
     }
@@ -19397,14 +19647,24 @@ Genome_consecutive_matches_pair (UINT4 lefta, UINT4 leftb, UINT4 genomelength) {
     }
 
     /* Block 2 */
+#ifdef WORDS_BIGENDIAN
+    shifted1[0] = (Bigendian_convert_uint(ptr1_prev[0]) >> rightshift);
+    shifted1[1] = (Bigendian_convert_uint(ptr1_prev[4]) >> rightshift);
+    shifted1[2] = (Bigendian_convert_uint(ptr1_prev[8]) >> rightshift);
+#else
     shifted1[0] = (ptr1_prev[0] >> rightshift);
     shifted1[1] = (ptr1_prev[4] >> rightshift);
     shifted1[2] = (ptr1_prev[8] >> rightshift);
+#endif
     debug2(Compress_print_one_block(ptr1));
     debug2(Compress_print_one_block(ptr2));
     debug2(Compress_print_one_block(shifted1));
 
+#ifdef WORDS_BIGENDIAN
+    diff = (shifted1[0] ^ Bigendian_convert_uint(ptr2[0])) | (shifted1[1] ^ Bigendian_convert_uint(ptr2[4])) | (shifted1[2] ^ Bigendian_convert_uint(ptr2[8]));
+#else
     diff = (shifted1[0] ^ ptr2[0]) | (shifted1[1] ^ ptr2[4]) | (shifted1[2] ^ ptr2[8]);
+#endif
     diff = clear_end_32(diff,enddiscard);
 
     if (diff /* != 0U */) {
@@ -19474,12 +19734,13 @@ count_mismatches_limit (Compress_T query_compress, Univcoord_T left,
     debug(printf("nshift = %d, startdiscard = %u, enddiscard = %u\n",nshift,startdiscard,enddiscard));
 
 
-    diff_32 = (block_diff_32)(query_shifted
-#ifdef HAVE_SSE2
-			      + startcolumni
-#endif
-			      ,&(ref_blocks[startblocki_32]),
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    diff_32 = (block_diff_32)(query_shifted,&(ref_blocks[startblocki_32]),
+			      plusp,genestrand,query_unk_mismatch_p);
+#else
+    diff_32 = (block_diff_32)(query_shifted + startcolumni,&(ref_blocks[startblocki_32]),
 			      plusp,genestrand,query_unk_mismatch_p);
+#endif
     diff_32 = clear_start_32(diff_32,startdiscard);
     diff_32 = clear_end_32(diff_32,enddiscard);
 
@@ -19494,8 +19755,7 @@ count_mismatches_limit (Compress_T query_compress, Univcoord_T left,
   else {
 #endif
 
-#ifdef HAVE_SSE2
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     startblocki = startblocki_32;
     endblocki = endblocki_32;
 #endif
@@ -19509,7 +19769,8 @@ count_mismatches_limit (Compress_T query_compress, Univcoord_T left,
   }
 #endif
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   if (endblocki == startblocki) {
     debug(printf("** Single block **\n"));
     diff = (block_diff)(query_shifted,&(ref_blocks[startblocki]),
@@ -19585,11 +19846,11 @@ count_mismatches_limit (Compress_T query_compress, Univcoord_T left,
 
     /* 2..(n-1) / n: Check all middle blocks first */
     query_shifted += COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-    ptr = &(ref_blocks[startblocki+12]);
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     ptr = &(ref_blocks[startblocki]);
     ptr += 1; if (++startcolumni == 4) {ptr += 8; startcolumni = 0;}
+#else
+    ptr = &(ref_blocks[startblocki+12]);
 #endif
     endblock = &(ref_blocks[endblocki]);
     nmismatches = 0;
@@ -19604,10 +19865,10 @@ count_mismatches_limit (Compress_T query_compress, Univcoord_T left,
       }
 
       query_shifted += COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-      ptr += 12;
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
       ptr += 1; if (++startcolumni == 4) {ptr += 8; startcolumni = 0;}
+#else
+      ptr += 12;
 #endif
     }
 
@@ -19658,7 +19919,8 @@ count_mismatches_limit (Compress_T query_compress, Univcoord_T left,
       return nmismatches + popcount_ones(diff);
     }
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   }
 #endif
 }
@@ -19714,12 +19976,13 @@ count_mismatches_limit_snps (Compress_T query_compress, Univcoord_T left, int po
     debug(printf("nshift = %d, startdiscard = %u, enddiscard = %u\n",nshift,startdiscard,enddiscard));
 
 
-    diff_32 = (block_diff_snp_32)(query_shifted
-#ifdef HAVE_SSE2
-				  + startcolumni
-#endif
-				  ,&(snp_blocks[startblocki_32]),&(ref_blocks[startblocki_32]),
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    diff_32 = (block_diff_snp_32)(query_shifted,&(snp_blocks[startblocki_32]),&(ref_blocks[startblocki_32]),
 				  plusp,genestrand,query_unk_mismatch_p);
+#else
+    diff_32 = (block_diff_snp_32)(query_shifted + startcolumni,&(snp_blocks[startblocki_32]),&(ref_blocks[startblocki_32]),
+				  plusp,genestrand,query_unk_mismatch_p);
+#endif
     diff_32 = clear_start_32(diff_32,startdiscard);
     diff_32 = clear_end_32(diff_32,enddiscard);
 
@@ -19734,8 +19997,7 @@ count_mismatches_limit_snps (Compress_T query_compress, Univcoord_T left, int po
   else {
 #endif
 
-#ifdef HAVE_SSE2
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     startblocki = startblocki_32;
     endblocki = endblocki_32;
 #endif
@@ -19749,7 +20011,8 @@ count_mismatches_limit_snps (Compress_T query_compress, Univcoord_T left, int po
   }
 #endif
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   if (endblocki == startblocki) {
     debug(printf("** Single block **\n"));
     diff = (block_diff_snp)(query_shifted,&(snp_blocks[startblocki]),&(ref_blocks[startblocki]),
@@ -19827,13 +20090,13 @@ count_mismatches_limit_snps (Compress_T query_compress, Univcoord_T left, int po
 
     /* 2..(n-1) / n: Check all middle blocks first */
     query_shifted += COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-    ref_ptr = &(ref_blocks[startblocki+12]);
-    alt_ptr = &(snp_blocks[startblocki+12]);
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     ref_ptr = &(ref_blocks[startblocki]);
     alt_ptr = &(snp_blocks[startblocki]);
     ref_ptr += 1; alt_ptr += 1; if (++startcolumni == 4) {ref_ptr += 8; alt_ptr += 8; startcolumni = 0;}
+#else
+    ref_ptr = &(ref_blocks[startblocki+12]);
+    alt_ptr = &(snp_blocks[startblocki+12]);
 #endif
     endblock = &(ref_blocks[endblocki]);
     nmismatches = 0;
@@ -19848,10 +20111,10 @@ count_mismatches_limit_snps (Compress_T query_compress, Univcoord_T left, int po
       }
 
       query_shifted += COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-      ref_ptr += 12; alt_ptr += 12;
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
       ref_ptr += 1; alt_ptr += 1; if (++startcolumni == 4) {ref_ptr += 8; alt_ptr += 8; startcolumni = 0;}
+#else
+      ref_ptr += 12; alt_ptr += 12;
 #endif
     }
 
@@ -19902,7 +20165,8 @@ count_mismatches_limit_snps (Compress_T query_compress, Univcoord_T left, int po
       return nmismatches + popcount_ones(diff);
     }
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   }
 #endif
 }
@@ -19978,13 +20242,13 @@ Genome_count_mismatches_substring_ref (Compress_T query_compress, Univcoord_T le
     enddiscard = (left+pos3) % 32;
     debug(printf("nshift = %d, startdiscard = %u, enddiscard = %u\n",nshift,startdiscard,enddiscard));
 
-
-    diff_32 = (block_diff_32)(query_shifted
-#ifdef HAVE_SSE2
-			      + startcolumni
-#endif
-			      ,&(ref_blocks[startblocki_32]),
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    diff_32 = (block_diff_32)(query_shifted,&(ref_blocks[startblocki_32]),
+			      plusp,genestrand,query_unk_mismatch_p);
+#else
+    diff_32 = (block_diff_32)(query_shifted + startcolumni,&(ref_blocks[startblocki_32]),
 			      plusp,genestrand,query_unk_mismatch_p);
+#endif
     diff_32 = clear_start_32(diff_32,startdiscard);
     diff_32 = clear_end_32(diff_32,enddiscard);
 
@@ -19999,8 +20263,7 @@ Genome_count_mismatches_substring_ref (Compress_T query_compress, Univcoord_T le
   else {
 #endif
 
-#ifdef HAVE_SSE2
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     startblocki = startblocki_32;
     endblocki = endblocki_32;
 #endif
@@ -20015,7 +20278,8 @@ Genome_count_mismatches_substring_ref (Compress_T query_compress, Univcoord_T le
 #endif
 
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   if (endblocki == startblocki) {
     diff = (block_diff)(query_shifted,&(ref_blocks[startblocki]),
 			plusp,genestrand,query_unk_mismatch_p);
@@ -20038,11 +20302,11 @@ Genome_count_mismatches_substring_ref (Compress_T query_compress, Univcoord_T le
     nmismatches = popcount_ones(diff);
 
     query_shifted += COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-    ptr = &(ref_blocks[startblocki+12]);
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     ptr = &(ref_blocks[startblocki]);
     ptr += 1; if (++startcolumni == 4) {ptr += 8; startcolumni = 0;}
+#else
+    ptr = &(ref_blocks[startblocki+12]);
 #endif
     end = &(ref_blocks[endblocki]);
     while (ptr < end) {
@@ -20052,10 +20316,10 @@ Genome_count_mismatches_substring_ref (Compress_T query_compress, Univcoord_T le
       nmismatches += popcount_ones(diff);
 
       query_shifted += COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-      ptr += 12;
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
       ptr += 1; if (++startcolumni == 4) {ptr += 8; startcolumni = 0;}
+#else
+      ptr += 12;
 #endif
     }
 
@@ -20067,7 +20331,8 @@ Genome_count_mismatches_substring_ref (Compress_T query_compress, Univcoord_T le
     debug14(if (endblocki_32 == startblocki_32) assert(answer == nmismatches + popcount_ones(diff)));
     return nmismatches + popcount_ones(diff);
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   }
 #endif
 }
@@ -20119,13 +20384,13 @@ count_mismatches_substring_snps (Compress_T query_compress, Univcoord_T left, in
     enddiscard = (left+pos3) % 32;
     debug(printf("nshift = %d, startdiscard = %u, enddiscard = %u\n",nshift,startdiscard,enddiscard));
 
-
-    diff_32 = (block_diff_snp_32)(query_shifted
-#ifdef HAVE_SSE2
-				  + startcolumni
-#endif
-				  ,&(snp_blocks[startblocki_32]),&(ref_blocks[startblocki_32]),
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    diff_32 = (block_diff_snp_32)(query_shifted,&(snp_blocks[startblocki_32]),&(ref_blocks[startblocki_32]),
+				  plusp,genestrand,query_unk_mismatch_p);
+#else
+    diff_32 = (block_diff_snp_32)(query_shifted + startcolumni,&(snp_blocks[startblocki_32]),&(ref_blocks[startblocki_32]),
 				  plusp,genestrand,query_unk_mismatch_p);
+#endif
     diff_32 = clear_start_32(diff_32,startdiscard);
     diff_32 = clear_end_32(diff_32,enddiscard);
 
@@ -20140,8 +20405,7 @@ count_mismatches_substring_snps (Compress_T query_compress, Univcoord_T left, in
   else {
 #endif
 
-#ifdef HAVE_SSE2
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     startblocki = startblocki_32;
     endblocki = endblocki_32;
 #endif
@@ -20156,7 +20420,8 @@ count_mismatches_substring_snps (Compress_T query_compress, Univcoord_T left, in
 #endif
 
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   if (endblocki == startblocki) {
     diff = (block_diff_snp)(query_shifted,&(snp_blocks[startblocki]),&(ref_blocks[startblocki]),
 			    plusp,genestrand,query_unk_mismatch_p);
@@ -20179,13 +20444,13 @@ count_mismatches_substring_snps (Compress_T query_compress, Univcoord_T left, in
     nmismatches = popcount_ones(diff);
 
     query_shifted += COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-    ref_ptr = &(ref_blocks[startblocki+12]);
-    alt_ptr = &(snp_blocks[startblocki+12]);
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     ref_ptr = &(ref_blocks[startblocki]);
     alt_ptr = &(snp_blocks[startblocki]);
     ref_ptr += 1; alt_ptr += 1; if (++startcolumni == 4) {ref_ptr += 8; alt_ptr += 8; startcolumni = 0;}
+#else
+    ref_ptr = &(ref_blocks[startblocki+12]);
+    alt_ptr = &(snp_blocks[startblocki+12]);
 #endif
     end = &(ref_blocks[endblocki]);
     while (ref_ptr < end) {
@@ -20195,10 +20460,10 @@ count_mismatches_substring_snps (Compress_T query_compress, Univcoord_T left, in
       nmismatches += popcount_ones(diff);
 
       query_shifted += COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-      ref_ptr += 12; alt_ptr += 12;
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
       ref_ptr += 1; alt_ptr += 1; if (++startcolumni == 4) {ref_ptr += 8; alt_ptr += 8; startcolumni = 0;}
+#else
+      ref_ptr += 12; alt_ptr += 12;
 #endif
     }
 
@@ -20210,7 +20475,8 @@ count_mismatches_substring_snps (Compress_T query_compress, Univcoord_T left, in
     debug14(if (endblocki_32 == startblocki_32) assert(answer == nmismatches + popcount_ones(diff)));
     return nmismatches + popcount_ones(diff);
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   }
 #endif
 }
@@ -20409,13 +20675,13 @@ mismatches_left (int *mismatch_positions, int max_mismatches, Compress_T query_c
     offset = -startdiscard + pos5;
     debug(printf("nshift = %d, startdiscard = %u, enddiscard = %u\n",nshift,startdiscard,enddiscard));
 
-
-    diff_32 = (block_diff_32)(query_shifted
-#ifdef HAVE_SSE2
-			      + startcolumni
-#endif
-			      ,&(ref_blocks[startblocki_32]),
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    diff_32 = (block_diff_32)(query_shifted,&(ref_blocks[startblocki_32]),
 			      plusp,genestrand,query_unk_mismatch_local_p);
+#else
+    diff_32 = (block_diff_32)(query_shifted + startcolumni,&(ref_blocks[startblocki_32]),
+			      plusp,genestrand,query_unk_mismatch_local_p);
+#endif
     diff_32 = clear_start_32(diff_32,startdiscard);
     diff_32 = clear_end_32(diff_32,enddiscard);
 
@@ -20436,8 +20702,7 @@ mismatches_left (int *mismatch_positions, int max_mismatches, Compress_T query_c
   else {
 #endif
 
-#ifdef HAVE_SSE2
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     startblocki = startblocki_32;
     endblocki = endblocki_32;
 #endif
@@ -20453,7 +20718,8 @@ mismatches_left (int *mismatch_positions, int max_mismatches, Compress_T query_c
 #endif
 
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   if (endblocki == startblocki) {
     diff = (block_diff)(query_shifted,&(ref_blocks[startblocki]),
 			plusp,genestrand,query_unk_mismatch_local_p);
@@ -20487,11 +20753,11 @@ mismatches_left (int *mismatch_positions, int max_mismatches, Compress_T query_c
     }
 
     query_shifted += COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-    ptr = &(ref_blocks[startblocki+12]);
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     ptr = &(ref_blocks[startblocki]);
     ptr += 1; if (++startcolumni == 4) {ptr += 8; startcolumni = 0;}
+#else
+    ptr = &(ref_blocks[startblocki+12]);
 #endif
     end = &(ref_blocks[endblocki]);
     offset += STEP_SIZE; /* 128 or 32 */
@@ -20509,10 +20775,10 @@ mismatches_left (int *mismatch_positions, int max_mismatches, Compress_T query_c
       }
 
       query_shifted += COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-      ptr += 12;
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
       ptr += 1; if (++startcolumni == 4) {ptr += 8; startcolumni = 0;}
+#else
+      ptr += 12;
 #endif
       offset += STEP_SIZE; /* 128 or 32 */
     }
@@ -20529,7 +20795,8 @@ mismatches_left (int *mismatch_positions, int max_mismatches, Compress_T query_c
     debug14(if (endblocki_32 == startblocki_32) assert(answer == nmismatches));
     return nmismatches;
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   }
 #endif
 
@@ -20584,13 +20851,13 @@ mismatches_left_snps (int *mismatch_positions, int max_mismatches, Compress_T qu
     offset = -startdiscard + pos5;
     debug(printf("nshift = %d, startdiscard = %u, enddiscard = %u\n",nshift,startdiscard,enddiscard));
 
-
-    diff_32 = (block_diff_snp_32)(query_shifted
-#ifdef HAVE_SSE2
-				  + startcolumni
-#endif
-				  ,&(snp_blocks[startblocki_32]),&(ref_blocks[startblocki_32]),
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    diff_32 = (block_diff_snp_32)(query_shifted,&(snp_blocks[startblocki_32]),&(ref_blocks[startblocki_32]),
 				  plusp,genestrand,query_unk_mismatch_local_p);
+#else
+    diff_32 = (block_diff_snp_32)(query_shifted + startcolumni,&(snp_blocks[startblocki_32]),&(ref_blocks[startblocki_32]),
+				  plusp,genestrand,query_unk_mismatch_local_p);
+#endif
     diff_32 = clear_start_32(diff_32,startdiscard);
     diff_32 = clear_end_32(diff_32,enddiscard);
 
@@ -20610,8 +20877,7 @@ mismatches_left_snps (int *mismatch_positions, int max_mismatches, Compress_T qu
   else {
 #endif
 
-#ifdef HAVE_SSE2
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     startblocki = startblocki_32;
     endblocki = endblocki_32;
 #endif
@@ -20627,7 +20893,8 @@ mismatches_left_snps (int *mismatch_positions, int max_mismatches, Compress_T qu
 #endif
 
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   if (endblocki == startblocki) {
     diff = (block_diff_snp)(query_shifted,&(snp_blocks[startblocki]),&(ref_blocks[startblocki]),
 			    plusp,genestrand,query_unk_mismatch_local_p);
@@ -20661,13 +20928,13 @@ mismatches_left_snps (int *mismatch_positions, int max_mismatches, Compress_T qu
     }
 
     query_shifted += COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-    ref_ptr = &(ref_blocks[startblocki+12]);
-    alt_ptr = &(snp_blocks[startblocki+12]);
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     ref_ptr = &(ref_blocks[startblocki]);
     alt_ptr = &(snp_blocks[startblocki]);
     ref_ptr += 1; alt_ptr += 1; if (++startcolumni == 4) {ref_ptr += 8; alt_ptr += 8; startcolumni = 0;}
+#else
+    ref_ptr = &(ref_blocks[startblocki+12]);
+    alt_ptr = &(snp_blocks[startblocki+12]);
 #endif
     end = &(ref_blocks[endblocki]);
     offset += STEP_SIZE; /* 128 or 32 */
@@ -20685,10 +20952,10 @@ mismatches_left_snps (int *mismatch_positions, int max_mismatches, Compress_T qu
       }
 
       query_shifted += COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-      ref_ptr += 12; alt_ptr += 12;
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
       ref_ptr += 1; alt_ptr += 1; if (++startcolumni == 4) {ref_ptr += 8; alt_ptr += 8; startcolumni = 0;}
+#else
+      ref_ptr += 12; alt_ptr += 12;
 #endif
       offset += STEP_SIZE; /* 128 or 32 */
     }
@@ -20705,7 +20972,8 @@ mismatches_left_snps (int *mismatch_positions, int max_mismatches, Compress_T qu
     debug14(if (endblocki_32 == startblocki_32) assert(answer == nmismatches_both));
     return nmismatches_both;
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   }
 #endif
 }
@@ -20847,13 +21115,13 @@ mismatches_right (int *mismatch_positions, int max_mismatches, Compress_T query_
     offset = (pos3 - 1) - enddiscard + 32;
     debug(printf("nshift = %d, startdiscard = %u, enddiscard = %u\n",nshift,startdiscard,enddiscard));
 
-
-    diff_32 = (block_diff_32)(query_shifted
-#ifdef HAVE_SSE2
-			      + endcolumni
-#endif
-			      ,&(ref_blocks[endblocki_32]),
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    diff_32 = (block_diff_32)(query_shifted,&(ref_blocks[endblocki_32]),
+			      plusp,genestrand,query_unk_mismatch_local_p);
+#else
+    diff_32 = (block_diff_32)(query_shifted + endcolumni,&(ref_blocks[endblocki_32]),
 			      plusp,genestrand,query_unk_mismatch_local_p);
+#endif
     diff_32 = clear_start_32(diff_32,startdiscard);
     diff_32 = clear_end_32(diff_32,enddiscard);
 
@@ -20873,8 +21141,7 @@ mismatches_right (int *mismatch_positions, int max_mismatches, Compress_T query_
   else {
 #endif
 
-#ifdef HAVE_SSE2
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     startblocki = startblocki_32;
     endblocki = endblocki_32;
 #endif
@@ -20891,7 +21158,8 @@ mismatches_right (int *mismatch_positions, int max_mismatches, Compress_T query_
 #endif
 
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   if (startblocki == endblocki) {
     diff = (block_diff)(query_shifted,&(ref_blocks[endblocki]),
 			plusp,genestrand,query_unk_mismatch_local_p);
@@ -20925,11 +21193,11 @@ mismatches_right (int *mismatch_positions, int max_mismatches, Compress_T query_
     }
 
     query_shifted -= COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-    ptr = &(ref_blocks[endblocki-12]);
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     ptr = &(ref_blocks[endblocki]);
     ptr -= 1; if (endcolumni-- == 0) {ptr -= 8; endcolumni = 3;}
+#else
+    ptr = &(ref_blocks[endblocki-12]);
 #endif
     start = &(ref_blocks[startblocki]);
     offset -= STEP_SIZE; /* 128 or 32 */
@@ -20947,10 +21215,10 @@ mismatches_right (int *mismatch_positions, int max_mismatches, Compress_T query_
       }
 
       query_shifted -= COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-      ptr -= 12;
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
       ptr -= 1; if (endcolumni-- == 0) {ptr -= 8; endcolumni = 3;}
+#else
+      ptr -= 12;
 #endif
       offset -= STEP_SIZE; /* 128 or 32 */
     }
@@ -20968,7 +21236,8 @@ mismatches_right (int *mismatch_positions, int max_mismatches, Compress_T query_
     debug14(if (startblocki_32 == endblocki_32) assert(answer == nmismatches));
     return nmismatches;
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   }
 #endif
 }
@@ -21024,13 +21293,13 @@ mismatches_right_snps (int *mismatch_positions, int max_mismatches, Compress_T q
     offset = (pos3 - 1) - enddiscard + 32;
     debug(printf("nshift = %d, startdiscard = %u, enddiscard = %u\n",nshift,startdiscard,enddiscard));
 
-
-    diff_32 = (block_diff_snp_32)(query_shifted
-#ifdef HAVE_SSE2
-				  + endcolumni
-#endif
-				  ,&(snp_blocks[endblocki_32]),&(ref_blocks[endblocki_32]),
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    diff_32 = (block_diff_snp_32)(query_shifted,&(snp_blocks[endblocki_32]),&(ref_blocks[endblocki_32]),
+				  plusp,genestrand,query_unk_mismatch_local_p);
+#else
+    diff_32 = (block_diff_snp_32)(query_shifted + endcolumni,&(snp_blocks[endblocki_32]),&(ref_blocks[endblocki_32]),
 				  plusp,genestrand,query_unk_mismatch_local_p);
+#endif
     diff_32 = clear_start_32(diff_32,startdiscard);
     diff_32 = clear_end_32(diff_32,enddiscard);
 
@@ -21050,8 +21319,7 @@ mismatches_right_snps (int *mismatch_positions, int max_mismatches, Compress_T q
   else {
 #endif
 
-#ifdef HAVE_SSE2
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     startblocki = startblocki_32;
     endblocki = endblocki_32;
 #endif
@@ -21067,7 +21335,8 @@ mismatches_right_snps (int *mismatch_positions, int max_mismatches, Compress_T q
 #endif
 
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   if (startblocki == endblocki) {
     diff = (block_diff_snp)(query_shifted,&(snp_blocks[endblocki]),&(ref_blocks[endblocki]),
 			    plusp,genestrand,query_unk_mismatch_local_p);
@@ -21101,13 +21370,13 @@ mismatches_right_snps (int *mismatch_positions, int max_mismatches, Compress_T q
     }
 
     query_shifted -= COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-    ref_ptr = &(ref_blocks[endblocki-12]);
-    alt_ptr = &(snp_blocks[endblocki-12]);
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     ref_ptr = &(ref_blocks[endblocki]);
     alt_ptr = &(snp_blocks[endblocki]);
     ref_ptr -= 1; alt_ptr -= 1; if (endcolumni-- == 0) {ref_ptr -= 8; alt_ptr -= 8; endcolumni = 3;}
+#else
+    ref_ptr = &(ref_blocks[endblocki-12]);
+    alt_ptr = &(snp_blocks[endblocki-12]);
 #endif
     start = &(ref_blocks[startblocki]);
     offset -= STEP_SIZE; /* 128 or 32 */
@@ -21125,10 +21394,10 @@ mismatches_right_snps (int *mismatch_positions, int max_mismatches, Compress_T q
       }
 
       query_shifted -= COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-      ref_ptr -= 12; alt_ptr -= 12;
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
       ref_ptr -= 1; alt_ptr -= 1; if (endcolumni-- == 0) {ref_ptr -= 8; alt_ptr -= 8; endcolumni = 3;}
+#else
+      ref_ptr -= 12; alt_ptr -= 12;
 #endif
       offset -= STEP_SIZE; /* 128 or 32 */
     }
@@ -21146,7 +21415,8 @@ mismatches_right_snps (int *mismatch_positions, int max_mismatches, Compress_T q
     debug14(if (startblocki_32 == endblocki_32) assert(answer == nmismatches_both));
     return nmismatches_both;
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   }
 #endif
 }
@@ -21267,8 +21537,8 @@ Genome_mark_mismatches_ref (char *genomic, int querylength, Compress_T query_com
   endcolumni = ((left+pos3) % 128) / 32;
   endblocki_32 = endblocki + endcolumni;
 
-  debug5(printf("left = %u, pos5 = %d, pos3 = %d, startblocki = %u, endblocki = %u, plusp %d\n",
-		left,pos5,pos3,startblocki,endblocki,plusp));
+  debug5(printf("left = %u, pos5 = %d, pos3 = %d, startblocki = %u, endblocki = %u, plusp %d, step_size %d\n",
+		left,pos5,pos3,startblocki,endblocki,plusp,STEP_SIZE));
 
   nshift = left % STEP_SIZE;
   query_shifted = Compress_shift(query_compress,nshift);
@@ -21291,12 +21561,13 @@ Genome_mark_mismatches_ref (char *genomic, int querylength, Compress_T query_com
     debug5(printf("nshift = %d, startdiscard = %u, enddiscard = %u\n",nshift,startdiscard,enddiscard));
 
 
-    diff_32 = (block_diff_32)(query_shifted
-#ifdef HAVE_SSE2
-			      + startcolumni
-#endif
-			      ,&(ref_blocks[startblocki_32]),
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    diff_32 = (block_diff_32)(query_shifted,&(ref_blocks[startblocki_32]),
+			      plusp,genestrand,query_unk_mismatch_p);
+#else
+    diff_32 = (block_diff_32)(query_shifted + startcolumni,&(ref_blocks[startblocki_32]),
 			      plusp,genestrand,query_unk_mismatch_p);
+#endif
     diff_32 = clear_start_32(diff_32,startdiscard);
     diff_32 = clear_end_32(diff_32,enddiscard);
 
@@ -21322,8 +21593,7 @@ Genome_mark_mismatches_ref (char *genomic, int querylength, Compress_T query_com
   else {
 #endif
 
-#ifdef HAVE_SSE2
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     startblocki = startblocki_32;
     endblocki = endblocki_32;
 #endif
@@ -21347,7 +21617,8 @@ Genome_mark_mismatches_ref (char *genomic, int querylength, Compress_T query_com
 #endif
 
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   if (endblocki == startblocki) {
     diff = (block_diff)(query_shifted,&(ref_blocks[startblocki]),
 			plusp,genestrand,query_unk_mismatch_p);
@@ -21388,11 +21659,11 @@ Genome_mark_mismatches_ref (char *genomic, int querylength, Compress_T query_com
     }
 
     query_shifted += COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-    ptr = &(ref_blocks[startblocki+12]);
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     ptr = &(ref_blocks[startblocki]);
     ptr += 1; if (++startcolumni == 4) {ptr += 8; startcolumni = 0;}
+#else
+    ptr = &(ref_blocks[startblocki+12]);
 #endif
     end = &(ref_blocks[endblocki]);
     offset += STEP_SIZE; /* 128 or 32 */
@@ -21411,10 +21682,10 @@ Genome_mark_mismatches_ref (char *genomic, int querylength, Compress_T query_com
       }
 
       query_shifted += COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-      ptr += 12;
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
       ptr += 1; if (++startcolumni == 4) {ptr += 8; startcolumni = 0;}
+#else
+      ptr += 12;
 #endif
       offset += STEP_SIZE; /* 128 or 32 */
     }
@@ -21437,7 +21708,8 @@ Genome_mark_mismatches_ref (char *genomic, int querylength, Compress_T query_com
     debug14(if (endblocki_32 == startblocki) assert(answer == nmismatches));
     return nmismatches;
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   }
 #endif
 }
@@ -21502,12 +21774,13 @@ mark_mismatches_snps (char *genomic, int querylength, Compress_T query_compress,
     debug5(printf("nshift = %d, startdiscard = %u, enddiscard = %u\n",nshift,startdiscard,enddiscard));
 
 
-    diff_32 = (block_diff_snp_32)(query_shifted
-#ifdef HAVE_SSE2
-				  + startcolumni
-#endif
-				  ,&(snp_blocks[startblocki_32]),&(ref_blocks[startblocki_32]),
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    diff_32 = (block_diff_snp_32)(query_shifted,&(snp_blocks[startblocki_32]),&(ref_blocks[startblocki_32]),
+				  plusp,genestrand,query_unk_mismatch_p);
+#else
+    diff_32 = (block_diff_snp_32)(query_shifted + startcolumni,&(snp_blocks[startblocki_32]),&(ref_blocks[startblocki_32]),
 				  plusp,genestrand,query_unk_mismatch_p);
+#endif
     diff_32 = clear_start_32(diff_32,startdiscard);
     diff_32 = clear_end_32(diff_32,enddiscard);
 
@@ -21533,8 +21806,7 @@ mark_mismatches_snps (char *genomic, int querylength, Compress_T query_compress,
   else {
 #endif
 
-#ifdef HAVE_SSE2
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     startblocki = startblocki_32;
     endblocki = endblocki_32;
 #endif
@@ -21558,7 +21830,8 @@ mark_mismatches_snps (char *genomic, int querylength, Compress_T query_compress,
 #endif
 
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN)|| !defined(HAVE_SSE2)
+#else
   if (endblocki == startblocki) {
     diff = (block_diff_snp)(query_shifted,&(snp_blocks[startblocki]),&(ref_blocks[startblocki]),
 			    plusp,genestrand,query_unk_mismatch_p);
@@ -21599,13 +21872,13 @@ mark_mismatches_snps (char *genomic, int querylength, Compress_T query_compress,
     }
 
     query_shifted += COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-    ref_ptr = &(ref_blocks[startblocki+12]);
-    alt_ptr = &(snp_blocks[startblocki+12]);
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     ref_ptr = &(ref_blocks[startblocki]);
     alt_ptr = &(snp_blocks[startblocki]);
     ref_ptr += 1; alt_ptr += 1; if (++startcolumni == 4) {ref_ptr += 8; alt_ptr += 8; startcolumni = 0;}
+#else
+    ref_ptr = &(ref_blocks[startblocki+12]);
+    alt_ptr = &(snp_blocks[startblocki+12]);
 #endif
     end = &(ref_blocks[endblocki]);
     offset += STEP_SIZE; /* 128 or 32 */
@@ -21624,10 +21897,10 @@ mark_mismatches_snps (char *genomic, int querylength, Compress_T query_compress,
       }
 
       query_shifted += COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-      ref_ptr += 12; alt_ptr += 12;
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
       ref_ptr += 1; alt_ptr += 1; if (++startcolumni == 4) {ref_ptr += 8; alt_ptr += 8; startcolumni = 0;}
+#else
+      ref_ptr += 12; alt_ptr += 12;
 #endif
       offset += STEP_SIZE; /* 128 or 32 */
     }
@@ -21650,7 +21923,8 @@ mark_mismatches_snps (char *genomic, int querylength, Compress_T query_compress,
     debug14(if (endblocki_32 == startblocki_32) assert(answer == nmismatches_both));
     return nmismatches_both;
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   }
 #endif
 }
@@ -21739,13 +22013,13 @@ trim_left_substring (Compress_T query_compress, Univcoord_T left, int pos5, int
     offset = (pos3 - 1) - enddiscard + 32;
     debug(printf("nshift = %d, startdiscard = %u, enddiscard = %u\n",nshift,startdiscard,enddiscard));
 
-
-    diff_32 = (block_diff_32)(query_shifted
-#ifdef HAVE_SSE2
-			      + endcolumni
-#endif
-			      ,&(ref_blocks[endblocki_32]),
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    diff_32 = (block_diff_32)(query_shifted,&(ref_blocks[endblocki_32]),
 			      plusp,genestrand,query_unk_mismatch_p);
+#else
+    diff_32 = (block_diff_32)(query_shifted + endcolumni,&(ref_blocks[endblocki_32]),
+			      plusp,genestrand,query_unk_mismatch_p);
+#endif
     diff_32 = clear_end_32(diff_32,enddiscard); /* puts 0 (matches) at end */
     diff_32 = set_start_32(diff_32,startdiscard);  /* puts 1 (mismatches) at start */
 
@@ -21779,8 +22053,7 @@ trim_left_substring (Compress_T query_compress, Univcoord_T left, int pos5, int
     else {
 #endif
 
-#ifdef HAVE_SSE2
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     startblocki = startblocki_32;
     endblocki = endblocki_32;
 #endif
@@ -21797,7 +22070,8 @@ trim_left_substring (Compress_T query_compress, Univcoord_T left, int pos5, int
 #endif
 
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   if (startblocki == endblocki) {
     diff = (block_diff)(query_shifted,&(ref_blocks[endblocki]),
 			plusp,genestrand,query_unk_mismatch_p);
@@ -21894,7 +22168,27 @@ trim_left_substring (Compress_T query_compress, Univcoord_T left, int pos5, int
 			plusp,genestrand,query_unk_mismatch_p);
     diff = clear_end(diff,enddiscard); /* puts 0 (matches) at end */
 
-#ifdef HAVE_SSE2
+
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    p = 3*(diff >> 16);
+    bestscore = score_high[p];
+    trimpos = offset - score_high[p+1];
+    totalscore = score_high[p+2];
+    debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		 diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+    offset -= 16;
+
+    p = 3*(diff & 0x0000FFFF);
+    if ((score = score_high[p] + totalscore) > bestscore) {
+      bestscore = score;
+      trimpos = offset - score_high[p+1];
+    }
+    totalscore += score_high[p+2];
+    debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		 diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+    offset -= 16;
+
+#else
     p = 3*((unsigned short) _mm_extract_epi16(diff,7));
     bestscore = score_high[p];
     trimpos = offset - score_high[p+1];
@@ -21972,39 +22266,42 @@ trim_left_substring (Compress_T query_compress, Univcoord_T left, int pos5, int
     debug(printf("diff piece %d %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
 		 0,(unsigned short) _mm_extract_epi16(diff,0),score_high[p],score_high[p+1],offset,trimpos,totalscore));
     offset -= 16;
-
-#else
-    p = 3*(diff >> 16);
-    bestscore = score_high[p];
-    trimpos = offset - score_high[p+1];
-    totalscore = score_high[p+2];
-    debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		 diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-    offset -= 16;
-
-    p = 3*(diff & 0x0000FFFF);
-    if ((score = score_high[p] + totalscore) > bestscore) {
-      bestscore = score;
-      trimpos = offset - score_high[p+1];
-    }
-    totalscore += score_high[p+2];
-    debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		 diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-    offset -= 16;
 #endif
 
     query_shifted -= COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-    ptr = &(ref_blocks[endblocki-12]);
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     ptr = &(ref_blocks[endblocki]);
     ptr -= 1; if (endcolumni-- == 0) {ptr -= 8; endcolumni = 3;}
+#else
+    ptr = &(ref_blocks[endblocki-12]);
 #endif
     start = &(ref_blocks[startblocki]);
     while (ptr > start) {
       diff = (block_diff)(query_shifted,ptr,plusp,genestrand,query_unk_mismatch_p);
 
-#ifdef HAVE_SSE2
+
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+      p = 3*(diff >> 16);
+      if ((score = score_high[p] + totalscore) > bestscore) {
+	bestscore = score;
+	trimpos = offset - score_high[p+1];
+      }
+      totalscore += score_high[p+2];
+      debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		   diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+      offset -= 16;
+
+      p = 3*(diff & 0x0000FFFF);
+      if ((score = score_high[p] + totalscore) > bestscore) {
+	bestscore = score;
+	trimpos = offset - score_high[p+1];
+      }
+      totalscore += score_high[p+2];
+      debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		   diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+      offset -= 16;
+
+#else
       p = 3*((unsigned short) _mm_extract_epi16(diff,7));
       if ((score = score_high[p] + totalscore) > bestscore) {
 	bestscore = score;
@@ -22084,34 +22381,13 @@ trim_left_substring (Compress_T query_compress, Univcoord_T left, int pos5, int
       debug(printf("diff piece %d %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
 		   0,(unsigned short) _mm_extract_epi16(diff,0),score_high[p],score_high[p+1],offset,trimpos,totalscore));
       offset -= 16;
-
-#else
-      p = 3*(diff >> 16);
-      if ((score = score_high[p] + totalscore) > bestscore) {
-	bestscore = score;
-	trimpos = offset - score_high[p+1];
-      }
-      totalscore += score_high[p+2];
-      debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		   diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-      offset -= 16;
-
-      p = 3*(diff & 0x0000FFFF);
-      if ((score = score_high[p] + totalscore) > bestscore) {
-	bestscore = score;
-	trimpos = offset - score_high[p+1];
-      }
-      totalscore += score_high[p+2];
-      debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		   diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-      offset -= 16;
 #endif
 
       query_shifted -= COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-      ptr -= 12;
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
       ptr -= 1; if (endcolumni-- == 0) {ptr -= 8; endcolumni = 3;}
+#else
+      ptr -= 12;
 #endif
     }
 
@@ -22119,7 +22395,29 @@ trim_left_substring (Compress_T query_compress, Univcoord_T left, int pos5, int
     diff = (block_diff)(query_shifted,ptr,plusp,genestrand,query_unk_mismatch_p);
     diff = set_start(diff,startdiscard); /* puts 1 (mismatches) at start */
 
-#ifdef HAVE_SSE2
+
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    p = 3*(diff >> 16);
+    if ((score = score_high[p] + totalscore) > bestscore) {
+      bestscore = score;
+      trimpos = offset - score_high[p+1];
+    }
+    totalscore += score_high[p+2];
+    debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		 diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+    offset -= 16;
+
+    p = 3*(diff & 0x0000FFFF);
+    if ((score = score_high[p] + totalscore) > bestscore) {
+      bestscore = score;
+      trimpos = offset - score_high[p+1];
+    }
+    /* totalscore += score_high[p+2]; */
+    debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		 diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+    /* offset -= 16; */
+
+#else
     p = 3*((unsigned short) _mm_extract_epi16(diff,7));
     if ((score = score_high[p] + totalscore) > bestscore) {
       bestscore = score;
@@ -22199,33 +22497,13 @@ trim_left_substring (Compress_T query_compress, Univcoord_T left, int pos5, int
     debug(printf("diff piece %d %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
 		 0,(unsigned short) _mm_extract_epi16(diff,0),score_high[p],score_high[p+1],offset,trimpos,totalscore));
     offset -= 16;
-
-#else
-    p = 3*(diff >> 16);
-    if ((score = score_high[p] + totalscore) > bestscore) {
-      bestscore = score;
-      trimpos = offset - score_high[p+1];
-    }
-    totalscore += score_high[p+2];
-    debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		 diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-    offset -= 16;
-
-    p = 3*(diff & 0x0000FFFF);
-    if ((score = score_high[p] + totalscore) > bestscore) {
-      bestscore = score;
-      trimpos = offset - score_high[p+1];
-    }
-    /* totalscore += score_high[p+2]; */
-    debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		 diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-    /* offset -= 16; */
 #endif
     
     debug14(if (startblocki_32 == endblocki_32) assert(answer == trimpos - 1));
     return (trimpos - 1);	/* trimpos-1 is on side of mismatch */
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   }
 #endif
 }
@@ -22281,13 +22559,13 @@ trim_left_substring_snps (Compress_T query_compress, Univcoord_T left, int pos5,
     offset = (pos3 - 1) - enddiscard + 32;
     debug(printf("nshift = %d, startdiscard = %u, enddiscard = %u\n",nshift,startdiscard,enddiscard));
 
-
-    diff_32 = (block_diff_snp_32)(query_shifted
-#ifdef HAVE_SSE2
-				  + endcolumni
-#endif
-				  ,&(snp_blocks[endblocki_32]),&(ref_blocks[endblocki_32]),
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    diff_32 = (block_diff_snp_32)(query_shifted,&(snp_blocks[endblocki_32]),&(ref_blocks[endblocki_32]),
+				  plusp,genestrand,query_unk_mismatch_p);
+#else
+    diff_32 = (block_diff_snp_32)(query_shifted + endcolumni,&(snp_blocks[endblocki_32]),&(ref_blocks[endblocki_32]),
 				  plusp,genestrand,query_unk_mismatch_p);
+#endif
 
     diff_32 = clear_end_32(diff_32,enddiscard); /* puts 0 (matches) at end */
     diff_32 = set_start_32(diff_32,startdiscard);  /* puts 1 (mismatches) at start */
@@ -22322,8 +22600,7 @@ trim_left_substring_snps (Compress_T query_compress, Univcoord_T left, int pos5,
   else {
 #endif
 
-#ifdef HAVE_SSE2
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     startblocki = startblocki_32;
     endblocki = endblocki_32;
 #endif
@@ -22339,7 +22616,8 @@ trim_left_substring_snps (Compress_T query_compress, Univcoord_T left, int pos5,
 #endif
 
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   if (startblocki == endblocki) {
     diff = (block_diff_snp)(query_shifted,&(snp_blocks[endblocki]),&(ref_blocks[endblocki]),
 			plusp,genestrand,query_unk_mismatch_p);
@@ -22437,7 +22715,27 @@ trim_left_substring_snps (Compress_T query_compress, Univcoord_T left, int pos5,
 			    plusp,genestrand,query_unk_mismatch_p);
     diff = clear_end(diff,enddiscard); /* puts 0 (matches) at end */
 
-#ifdef HAVE_SSE2
+
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    p = 3*(diff >> 16);
+    bestscore = score_high[p];
+    trimpos = offset - score_high[p+1];
+    totalscore = score_high[p+2];
+    debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		 diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+    offset -= 16;
+
+    p = 3*(diff & 0x0000FFFF);
+    if ((score = score_high[p] + totalscore) > bestscore) {
+      bestscore = score;
+      trimpos = offset - score_high[p+1];
+    }
+    totalscore += score_high[p+2];
+    debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		 diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+    offset -= 16;
+
+#else
     p = 3*((unsigned short) _mm_extract_epi16(diff,7));
     bestscore = score_high[p];
     trimpos = offset - score_high[p+1];
@@ -22515,41 +22813,44 @@ trim_left_substring_snps (Compress_T query_compress, Univcoord_T left, int pos5,
     debug(printf("diff piece %d %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
 		 0,(unsigned short) _mm_extract_epi16(diff,0),score_high[p],score_high[p+1],offset,trimpos,totalscore));
     offset -= 16;
-
-#else
-    p = 3*(diff >> 16);
-    bestscore = score_high[p];
-    trimpos = offset - score_high[p+1];
-    totalscore = score_high[p+2];
-    debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		 diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-    offset -= 16;
-
-    p = 3*(diff & 0x0000FFFF);
-    if ((score = score_high[p] + totalscore) > bestscore) {
-      bestscore = score;
-      trimpos = offset - score_high[p+1];
-    }
-    totalscore += score_high[p+2];
-    debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		 diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-    offset -= 16;
 #endif
 
     query_shifted -= COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-    ref_ptr = &(ref_blocks[endblocki-12]);
-    alt_ptr = &(snp_blocks[endblocki-12]);
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     ref_ptr = &(ref_blocks[endblocki]);
     alt_ptr = &(snp_blocks[endblocki]);
     ref_ptr -= 1; alt_ptr -= 1; if (endcolumni-- == 0) {ref_ptr -= 8; alt_ptr -= 8; endcolumni = 3;}
+#else
+    ref_ptr = &(ref_blocks[endblocki-12]);
+    alt_ptr = &(snp_blocks[endblocki-12]);
 #endif
     start = &(ref_blocks[startblocki]);
     while (ref_ptr > start) {
       diff = (block_diff_snp)(query_shifted,alt_ptr,ref_ptr,plusp,genestrand,query_unk_mismatch_p);
 
-#ifdef HAVE_SSE2
+
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+      p = 3*(diff >> 16);
+      if ((score = score_high[p] + totalscore) > bestscore) {
+	bestscore = score;
+	trimpos = offset - score_high[p+1];
+      }
+      totalscore += score_high[p+2];
+      debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		   diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+      offset -= 16;
+
+      p = 3*(diff & 0x0000FFFF);
+      if ((score = score_high[p] + totalscore) > bestscore) {
+	bestscore = score;
+	trimpos = offset - score_high[p+1];
+      }
+      totalscore += score_high[p+2];
+      debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		   diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+      offset -= 16;
+
+#else
       p = 3*((unsigned short) _mm_extract_epi16(diff,7));
       if ((score = score_high[p] + totalscore) > bestscore) {
 	bestscore = score;
@@ -22629,34 +22930,13 @@ trim_left_substring_snps (Compress_T query_compress, Univcoord_T left, int pos5,
       debug(printf("diff piece %d %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
 		   0,(unsigned short) _mm_extract_epi16(diff,0),score_high[p],score_high[p+1],offset,trimpos,totalscore));
       offset -= 16;
-
-#else
-      p = 3*(diff >> 16);
-      if ((score = score_high[p] + totalscore) > bestscore) {
-	bestscore = score;
-	trimpos = offset - score_high[p+1];
-      }
-      totalscore += score_high[p+2];
-      debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		   diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-      offset -= 16;
-
-      p = 3*(diff & 0x0000FFFF);
-      if ((score = score_high[p] + totalscore) > bestscore) {
-	bestscore = score;
-	trimpos = offset - score_high[p+1];
-      }
-      totalscore += score_high[p+2];
-      debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		   diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-      offset -= 16;
 #endif
 
       query_shifted -= COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-      ref_ptr -= 12; alt_ptr -= 12;
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
       ref_ptr -= 1; alt_ptr -= 1; if (endcolumni-- == 0) {ref_ptr -= 8; alt_ptr -= 8; endcolumni = 3;}
+#else
+      ref_ptr -= 12; alt_ptr -= 12;
 #endif
     }
 
@@ -22665,7 +22945,29 @@ trim_left_substring_snps (Compress_T query_compress, Univcoord_T left, int pos5,
 
     diff = set_start(diff,startdiscard); /* puts 1 (mismatches) at start */
 
-#ifdef HAVE_SSE2
+
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    p = 3*(diff >> 16);
+    if ((score = score_high[p] + totalscore) > bestscore) {
+      bestscore = score;
+      trimpos = offset - score_high[p+1];
+    }
+    totalscore += score_high[p+2];
+    debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		 diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+    offset -= 16;
+
+    p = 3*(diff & 0x0000FFFF);
+    if ((score = score_high[p] + totalscore) > bestscore) {
+      bestscore = score;
+      trimpos = offset - score_high[p+1];
+    }
+    /* totalscore += score_high[p+2]; */
+    debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		 diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+    /* offset -= 16; */
+
+#else
     p = 3*((unsigned short) _mm_extract_epi16(diff,7));
     if ((score = score_high[p] + totalscore) > bestscore) {
       bestscore = score;
@@ -22745,33 +23047,13 @@ trim_left_substring_snps (Compress_T query_compress, Univcoord_T left, int pos5,
     debug(printf("diff piece %d %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
 		 0,(unsigned short) _mm_extract_epi16(diff,0),score_high[p],score_high[p+1],offset,trimpos,totalscore));
     offset -= 16;
-
-#else
-    p = 3*(diff >> 16);
-    if ((score = score_high[p] + totalscore) > bestscore) {
-      bestscore = score;
-      trimpos = offset - score_high[p+1];
-    }
-    totalscore += score_high[p+2];
-    debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		 diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-    offset -= 16;
-
-    p = 3*(diff & 0x0000FFFF);
-    if ((score = score_high[p] + totalscore) > bestscore) {
-      bestscore = score;
-      trimpos = offset - score_high[p+1];
-    }
-    /* totalscore += score_high[p+2]; */
-    debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		 diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-    /* offset -= 16; */
 #endif
 
     debug14(if (startblocki_32 == endblocki_32) assert(answer == trimpos - 1));
     return (trimpos - 1);	/* trimpos-1 is on side of mismatch */
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   }
 #endif
 }
@@ -22828,14 +23110,13 @@ trim_right_substring (Compress_T query_compress, Univcoord_T left, int pos5, int
     offset = -startdiscard + pos5;
     debug(printf("nshift = %d, startdiscard = %u, enddiscard = %u\n",nshift,startdiscard,enddiscard));
 
-
-    diff_32 = (block_diff_32)(query_shifted
-#ifdef HAVE_SSE2
-			      + startcolumni
-#endif
-			      ,&(ref_blocks[startblocki_32]),
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    diff_32 = (block_diff_32)(query_shifted,&(ref_blocks[startblocki_32]),
 			      plusp,genestrand,query_unk_mismatch_p);
-
+#else
+    diff_32 = (block_diff_32)(query_shifted + startcolumni,&(ref_blocks[startblocki_32]),
+			      plusp,genestrand,query_unk_mismatch_p);
+#endif
     diff_32 = clear_start_32(diff_32,startdiscard); /* puts 0 (matches) at start */
     diff_32 = set_end_32(diff_32,enddiscard);  /* puts 1 (mismatches) at end */
 
@@ -22869,8 +23150,7 @@ trim_right_substring (Compress_T query_compress, Univcoord_T left, int pos5, int
   else {
 #endif
 
-#ifdef HAVE_SSE2
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     startblocki = startblocki_32;
     endblocki = endblocki_32;
 #endif
@@ -22886,7 +23166,8 @@ trim_right_substring (Compress_T query_compress, Univcoord_T left, int pos5, int
 #endif
 
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   if (endblocki == startblocki) {
     diff = (block_diff)(query_shifted,&(ref_blocks[startblocki]),
 			plusp,genestrand,query_unk_mismatch_p);
@@ -22984,7 +23265,27 @@ trim_right_substring (Compress_T query_compress, Univcoord_T left, int pos5, int
     diff = clear_start(diff,startdiscard); /* puts 0 (matches) at start */
     debug(printf("clearing start %08X\n",clear_start_mask(startdiscard)));
 
-#ifdef HAVE_SSE2
+      
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    p = 3*(diff & 0x0000FFFF);
+    bestscore = score_low[p];
+    trimpos = offset + score_low[p+1];
+    totalscore = score_low[p+2];
+    debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		 diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+    offset += 16;
+
+    p = 3*(diff >> 16);
+    if ((score = score_low[p] + totalscore) > bestscore) {
+      bestscore = score;
+      trimpos = offset + score_low[p+1];
+    }
+    totalscore += score_low[p+2];
+    debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		 diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+    offset += 16;
+
+#else
     p = 3*((unsigned short) _mm_extract_epi16(diff,0));
     bestscore = score_low[p];
     trimpos = offset + score_low[p+1];
@@ -23050,51 +23351,54 @@ trim_right_substring (Compress_T query_compress, Univcoord_T left, int pos5, int
     }
     totalscore += score_low[p+2];
     debug(printf("diff piece %d %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		 6,(unsigned short) _mm_extract_epi16(diff,6),score_high[p],score_high[p+1],offset,trimpos,totalscore));
-    offset += 16;
-      
-    p = 3*((unsigned short) _mm_extract_epi16(diff,7));
-    if ((score = score_low[p] + totalscore) > bestscore) {
-      bestscore = score;
-      trimpos = offset + score_low[p+1];
-    }
-    totalscore += score_low[p+2];
-    debug(printf("diff piece %d %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-	      7,(unsigned short) _mm_extract_epi16(diff,7),score_high[p],score_high[p+1],offset,trimpos,totalscore));
+		 6,(unsigned short) _mm_extract_epi16(diff,6),score_high[p],score_high[p+1],offset,trimpos,totalscore));
     offset += 16;
       
-#else
-    p = 3*(diff & 0x0000FFFF);
-    bestscore = score_low[p];
-    trimpos = offset + score_low[p+1];
-    totalscore = score_low[p+2];
-    debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		 diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-    offset += 16;
-
-    p = 3*(diff >> 16);
+    p = 3*((unsigned short) _mm_extract_epi16(diff,7));
     if ((score = score_low[p] + totalscore) > bestscore) {
       bestscore = score;
       trimpos = offset + score_low[p+1];
     }
     totalscore += score_low[p+2];
-    debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		 diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+    debug(printf("diff piece %d %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+	      7,(unsigned short) _mm_extract_epi16(diff,7),score_high[p],score_high[p+1],offset,trimpos,totalscore));
     offset += 16;
 #endif
 
     query_shifted += COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-    ptr = &(ref_blocks[startblocki+12]);
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     ptr = &(ref_blocks[startblocki]);
     ptr += 1; if (++startcolumni == 4) {ptr += 8; startcolumni = 0;}
+#else
+    ptr = &(ref_blocks[startblocki+12]);
 #endif
     end = &(ref_blocks[endblocki]);
     while (ptr < end) {
       diff = (block_diff)(query_shifted,ptr,plusp,genestrand,query_unk_mismatch_p);
 
-#ifdef HAVE_SSE2
+
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+      p = 3*(diff & 0x0000FFFF);
+      if ((score = score_low[p] + totalscore) > bestscore) {
+	bestscore = score;
+	trimpos = offset + score_low[p+1];
+      }
+      totalscore += score_low[p+2];
+      debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		   diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+      offset += 16;
+
+      p = 3*(diff >> 16);
+      if ((score = score_low[p] + totalscore) > bestscore) {
+	bestscore = score;
+	trimpos = offset + score_low[p+1];
+      }
+      totalscore += score_low[p+2];
+      debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		   diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+      offset += 16;
+
+#else
       p = 3*((unsigned short) _mm_extract_epi16(diff,0));
       if ((score = score_low[p] + totalscore) > bestscore) {
 	bestscore = score;
@@ -23174,34 +23478,13 @@ trim_right_substring (Compress_T query_compress, Univcoord_T left, int pos5, int
       debug(printf("diff piece %d %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
 		   7,(unsigned short) _mm_extract_epi16(diff,7),score_high[p],score_high[p+1],offset,trimpos,totalscore));
       offset += 16;
-
-#else
-      p = 3*(diff & 0x0000FFFF);
-      if ((score = score_low[p] + totalscore) > bestscore) {
-	bestscore = score;
-	trimpos = offset + score_low[p+1];
-      }
-      totalscore += score_low[p+2];
-      debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		   diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-      offset += 16;
-
-      p = 3*(diff >> 16);
-      if ((score = score_low[p] + totalscore) > bestscore) {
-	bestscore = score;
-	trimpos = offset + score_low[p+1];
-      }
-      totalscore += score_low[p+2];
-      debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		   diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-      offset += 16;
 #endif
 
       query_shifted += COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-      ptr += 12;
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
       ptr += 1; if (++startcolumni == 4) {ptr += 8; startcolumni = 0;}
+#else
+      ptr += 12;
 #endif
     }
 
@@ -23209,7 +23492,29 @@ trim_right_substring (Compress_T query_compress, Univcoord_T left, int pos5, int
     diff = (block_diff)(query_shifted,ptr,plusp,genestrand,query_unk_mismatch_p);
     diff = set_end(diff,enddiscard); /* puts 1 (mismatches) at end */
 
-#ifdef HAVE_SSE2
+
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    p = 3*(diff & 0x0000FFFF);
+    if ((score = score_low[p] + totalscore) > bestscore) {
+      bestscore = score;
+      trimpos = offset + score_low[p+1];
+    }
+    totalscore += score_low[p+2];
+    debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		 diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+    offset += 16;
+
+    p = 3*(diff >> 16);
+    if ((score = score_low[p] + totalscore) > bestscore) {
+      bestscore = score;
+      trimpos = offset + score_low[p+1];
+    }
+    /* totalscore += score_low[p+2]; */
+    debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		 diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+    /* offset += 16; */
+
+#else
     p = 3*((unsigned short) _mm_extract_epi16(diff,0));
     if ((score = score_low[p] + totalscore) > bestscore) {
       bestscore = score;
@@ -23289,33 +23594,13 @@ trim_right_substring (Compress_T query_compress, Univcoord_T left, int pos5, int
     debug(printf("diff piece %d %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
 		 7,(unsigned short) _mm_extract_epi16(diff,7),score_high[p],score_high[p+1],offset,trimpos,totalscore));
     offset += 16;
-
-#else
-    p = 3*(diff & 0x0000FFFF);
-    if ((score = score_low[p] + totalscore) > bestscore) {
-      bestscore = score;
-      trimpos = offset + score_low[p+1];
-    }
-    totalscore += score_low[p+2];
-    debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		 diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-    offset += 16;
-
-    p = 3*(diff >> 16);
-    if ((score = score_low[p] + totalscore) > bestscore) {
-      bestscore = score;
-      trimpos = offset + score_low[p+1];
-    }
-    /* totalscore += score_low[p+2]; */
-    debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		 diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-    /* offset += 16; */
 #endif
     
     debug14(if (startblocki_32 == endblocki_32) assert(answer == trimpos + 1));
     return (trimpos + 1);	/* trimpos+1 is on side of mismatch */
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   }
 #endif
 }
@@ -23372,14 +23657,13 @@ trim_right_substring_snps (Compress_T query_compress, Univcoord_T left, int pos5
     offset = -startdiscard + pos5;
     debug(printf("nshift = %d, startdiscard = %u, enddiscard = %u\n",nshift,startdiscard,enddiscard));
 
-
-    diff_32 = (block_diff_snp_32)(query_shifted
-#ifdef HAVE_SSE2
-				  + startcolumni
-#endif
-				  ,&(snp_blocks[startblocki_32]),&(ref_blocks[startblocki_32]),
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    diff_32 = (block_diff_snp_32)(query_shifted,&(snp_blocks[startblocki_32]),&(ref_blocks[startblocki_32]),
 				  plusp,genestrand,query_unk_mismatch_p);
-
+#else
+    diff_32 = (block_diff_snp_32)(query_shifted + startcolumni,&(snp_blocks[startblocki_32]),&(ref_blocks[startblocki_32]),
+				  plusp,genestrand,query_unk_mismatch_p);
+#endif
     diff_32 = clear_start_32(diff_32,startdiscard); /* puts 0 (matches) at start */
     diff_32 = set_end_32(diff_32,enddiscard);  /* puts 1 (mismatches) at end */
 
@@ -23413,8 +23697,7 @@ trim_right_substring_snps (Compress_T query_compress, Univcoord_T left, int pos5
   else {
 #endif
 
-#ifdef HAVE_SSE2
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     startblocki = startblocki_32;
     endblocki = endblocki_32;
 #endif
@@ -23429,7 +23712,8 @@ trim_right_substring_snps (Compress_T query_compress, Univcoord_T left, int pos5
 #endif  
 
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   if (endblocki == startblocki) {
     diff = (block_diff_snp)(query_shifted,&(snp_blocks[startblocki]),&(ref_blocks[startblocki]),
 			    plusp,genestrand,query_unk_mismatch_p);
@@ -23527,7 +23811,27 @@ trim_right_substring_snps (Compress_T query_compress, Univcoord_T left, int pos5
 
     diff = clear_start(diff,startdiscard); /* puts 0 (matches) at start */
 
-#ifdef HAVE_SSE2
+
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    p = 3*(diff & 0x0000FFFF);
+    bestscore = score_low[p];
+    trimpos = offset + score_low[p+1];
+    totalscore = score_low[p+2];
+    debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		 diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+    offset += 16;
+
+    p = 3*(diff >> 16);
+    if ((score = score_low[p] + totalscore) > bestscore) {
+      bestscore = score;
+      trimpos = offset + score_low[p+1];
+    }
+    totalscore += score_low[p+2];
+    debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		 diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+    offset += 16;
+
+#else
     p = 3*((unsigned short) _mm_extract_epi16(diff,0));
     bestscore = score_low[p];
     trimpos = offset + score_low[p+1];
@@ -23605,41 +23909,44 @@ trim_right_substring_snps (Compress_T query_compress, Univcoord_T left, int pos5
     debug(printf("diff piece %d %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
 		 7,(unsigned short) _mm_extract_epi16(diff,7),score_high[p],score_high[p+1],offset,trimpos,totalscore));
     offset += 16;
-
-#else
-    p = 3*(diff & 0x0000FFFF);
-    bestscore = score_low[p];
-    trimpos = offset + score_low[p+1];
-    totalscore = score_low[p+2];
-    debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		 diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-    offset += 16;
-
-    p = 3*(diff >> 16);
-    if ((score = score_low[p] + totalscore) > bestscore) {
-      bestscore = score;
-      trimpos = offset + score_low[p+1];
-    }
-    totalscore += score_low[p+2];
-    debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		 diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-    offset += 16;
 #endif
 
     query_shifted += COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-    ref_ptr = &(ref_blocks[startblocki+12]);
-    alt_ptr = &(snp_blocks[startblocki+12]);
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
     ref_ptr = &(ref_blocks[startblocki]);
     alt_ptr = &(snp_blocks[startblocki]);
     ref_ptr += 1; alt_ptr += 1; if (++startcolumni == 4) {ref_ptr += 8; alt_ptr += 8; startcolumni = 0;}
+#else
+    ref_ptr = &(ref_blocks[startblocki+12]);
+    alt_ptr = &(snp_blocks[startblocki+12]);
 #endif
     end = &(ref_blocks[endblocki]);
     while (ref_ptr < end) {
       diff = (block_diff_snp)(query_shifted,alt_ptr,ref_ptr,plusp,genestrand,query_unk_mismatch_p);
 
-#ifdef HAVE_SSE2
+
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+      p = 3*(diff & 0x0000FFFF);
+      if ((score = score_low[p] + totalscore) > bestscore) {
+	bestscore = score;
+	trimpos = offset + score_low[p+1];
+      }
+      totalscore += score_low[p+2];
+      debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		   diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+      offset += 16;
+
+      p = 3*(diff >> 16);
+      if ((score = score_low[p] + totalscore) > bestscore) {
+	bestscore = score;
+	trimpos = offset + score_low[p+1];
+      }
+      totalscore += score_low[p+2];
+      debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		   diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+      offset += 16;
+
+#else
       p = 3*((unsigned short) _mm_extract_epi16(diff,0));
       if ((score = score_low[p] + totalscore) > bestscore) {
 	bestscore = score;
@@ -23719,34 +24026,13 @@ trim_right_substring_snps (Compress_T query_compress, Univcoord_T left, int pos5
       debug(printf("diff piece %d %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
 		   7,(unsigned short) _mm_extract_epi16(diff,7),score_high[p],score_high[p+1],offset,trimpos,totalscore));
       offset += 16;
-
-#else
-      p = 3*(diff & 0x0000FFFF);
-      if ((score = score_low[p] + totalscore) > bestscore) {
-	bestscore = score;
-	trimpos = offset + score_low[p+1];
-      }
-      totalscore += score_low[p+2];
-      debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		   diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-      offset += 16;
-
-      p = 3*(diff >> 16);
-      if ((score = score_low[p] + totalscore) > bestscore) {
-	bestscore = score;
-	trimpos = offset + score_low[p+1];
-      }
-      totalscore += score_low[p+2];
-      debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		   diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-      offset += 16;
 #endif
 
       query_shifted += COMPRESS_BLOCKSIZE;
-#ifdef HAVE_SSE2
-      ref_ptr += 12; alt_ptr += 12;
-#else
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
       ref_ptr += 1; alt_ptr += 1; if (++startcolumni == 4) {ref_ptr += 8; alt_ptr += 8; startcolumni = 0;}
+#else
+      ref_ptr += 12; alt_ptr += 12;
 #endif
     }
 
@@ -23755,7 +24041,29 @@ trim_right_substring_snps (Compress_T query_compress, Univcoord_T left, int pos5
 
     diff = set_end(diff,enddiscard); /* puts 1 (mismatches) at end */
 
-#ifdef HAVE_SSE2
+
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+    p = 3*(diff & 0x0000FFFF);
+    if ((score = score_low[p] + totalscore) > bestscore) {
+      bestscore = score;
+      trimpos = offset + score_low[p+1];
+    }
+    totalscore += score_low[p+2];
+    debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		 diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+    offset += 16;
+
+    p = 3*(diff >> 16);
+    if ((score = score_low[p] + totalscore) > bestscore) {
+      bestscore = score;
+      trimpos = offset + score_low[p+1];
+    }
+    /* totalscore += score_low[p+2]; */
+    debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
+		 diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
+    /* offset += 16; */
+
+#else
     p = 3*((unsigned short) _mm_extract_epi16(diff,0));
     if ((score = score_low[p] + totalscore) > bestscore) {
       bestscore = score;
@@ -23835,33 +24143,13 @@ trim_right_substring_snps (Compress_T query_compress, Univcoord_T left, int pos5
     debug(printf("diff piece %d %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
 		 7,(unsigned short) _mm_extract_epi16(diff,7),score_high[p],score_high[p+1],offset,trimpos,totalscore));
     offset += 16;
-
-#else
-    p = 3*(diff & 0x0000FFFF);
-    if ((score = score_low[p] + totalscore) > bestscore) {
-      bestscore = score;
-      trimpos = offset + score_low[p+1];
-    }
-    totalscore += score_low[p+2];
-    debug(printf("diff low %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		 diff & 0x0000FFFF,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-    offset += 16;
-
-    p = 3*(diff >> 16);
-    if ((score = score_low[p] + totalscore) > bestscore) {
-      bestscore = score;
-      trimpos = offset + score_low[p+1];
-    }
-    /* totalscore += score_low[p+2]; */
-    debug(printf("diff high %04X => bestscore %d at pos %d, offset %d, trimpos %d, totalscore %d\n",
-		 diff >> 16,score_high[p],score_high[p+1],offset,trimpos,totalscore));
-    /* offset += 16; */
 #endif
 
     debug14(if (startblocki_32 == endblocki_32) assert(answer == trimpos + 1));
     return (trimpos + 1);	/* trimpos+1 is on side of mismatch */
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
   }
 #endif
 }
diff --git a/src/get-genome.c b/src/get-genome.c
index e2e6e7a..10c3288 100644
--- a/src/get-genome.c
+++ b/src/get-genome.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: get-genome.c 161940 2015-03-25 20:36:59Z twu $";
+static char rcsid[] = "$Id: get-genome.c 170023 2015-07-17 16:47:21Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -72,6 +72,8 @@ static bool vareffect_p = false;
 
 /* Dump options */
 static bool dumpallp = false;
+static bool stream_chars_p = false;
+static bool stream_ints_p = false;
 static bool dumpchrp = false;
 static bool dumpchr_forsam_p = false;
 static bool dumpsegsp = false;
@@ -105,6 +107,7 @@ static struct option long_options[] = {
 
   /* Dump options */
   {"dump", no_argument, 0, 'A'},	/* dumpallp */
+  {"stream-chars", no_argument, 0, 0},	/* stream_chars_p */
   {"chromosomes", no_argument, 0, 'L'},	/* dumpchrp */
   {"forsam", no_argument, 0, 0},	/* dumpchr_forsam_p */
   {"contigs", no_argument, 0, 'I'}, /* dumpsegsp */
@@ -176,6 +179,7 @@ External map file options\n\
 \n\
 Dump options\n\
   -A, --dump              Dump entire genome in FASTA format\n\
+  --stream-chars          Dump entire genome as a single stream of ACGTX bytes\n\
   -L, --chromosomes       List all chromosomes with universal coordinates\n\
   --forsam                List all chromosomes for use in a SAM file\n\
   -I, --contigs           List all contigs with universal coordinates\n\
@@ -393,7 +397,9 @@ print_sequence (Genome_T genome, Genome_T genomealt, Univcoord_T genomicstart, C
   Chrpos_T chrpos;
 
   /* Handle reference strain */
-  if (vareffect_p == true) {
+  if (stream_chars_p == true || stream_ints_p == true) {
+    /* Don't print a header */
+  } else if (vareffect_p == true) {
     /* Don't print a header */
   } else if (user_typestring != NULL) {
     /* Don't print a header */
@@ -475,6 +481,16 @@ print_sequence (Genome_T genome, Genome_T genomealt, Univcoord_T genomicstart, C
     Sequence_free(&genomicseg);
     FREE(chromosome1);
 
+  } else if (stream_chars_p == true) {
+    genomicseg = Genome_get_segment(genome,genomicstart,genomiclength,chromosome_iit,revcomp);
+    Sequence_stdout_stream_chars(genomicseg);
+    Sequence_free(&genomicseg);
+
+  } else if (stream_ints_p == true) {
+    genomicseg = Genome_get_segment(genome,genomicstart,genomiclength,chromosome_iit,revcomp);
+    Sequence_stdout_stream_ints(genomicseg);
+    Sequence_free(&genomicseg);
+
   } else if (snps_root == NULL || print_snps_mode == 0 || print_snps_mode == 2) {
     genomicseg = Genome_get_segment(genome,genomicstart,genomiclength,chromosome_iit,revcomp);
     if (user_typestring == NULL) {
@@ -1047,7 +1063,6 @@ int
 main (int argc, char *argv[]) {
   char *snpsdir = NULL;
   char *iitfile;
-  FILE *fp;
   Genome_T genome = NULL, genomealt = NULL;
   Univcoord_T genomicstart, chroffset;
   Chrpos_T genomiclength, chrlength, chrstart, chrend;
@@ -1063,6 +1078,10 @@ main (int argc, char *argv[]) {
   int *matches, nmatches, ndivs, i, *leftflanks, *rightflanks, nleftflanks = 0, nrightflanks = 0;
   int sign;
 
+  int circular_typeint;
+  bool *circularp = NULL;
+  bool any_circular_p;
+
   char *chr, *with_colon;
   int indx;
   bool allocp;
@@ -1098,6 +1117,12 @@ main (int argc, char *argv[]) {
       } else if (!strcmp(long_name,"vareffect")) {
 	vareffect_p = true;
 
+      } else if (!strcmp(long_name,"stream-chars")) {
+	stream_chars_p = true;
+
+      } else if (!strcmp(long_name,"stream-ints")) {
+	stream_ints_p = true;
+
       } else {
 	/* Shouldn't reach here */
 	fprintf(stderr,"Don't recognize option %s.  For usage, run 'get-genome --help'",long_name);
@@ -1157,7 +1182,47 @@ main (int argc, char *argv[]) {
     snpsdir = user_snpsdir;
   }
 
-  if (dumpallp == true) {
+  if (stream_chars_p == true || stream_ints_p == true) {
+    iitfile = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+
+			      strlen(fileroot)+strlen(".chromosome.iit")+1,sizeof(char));
+    sprintf(iitfile,"%s/%s.chromosome.iit",genomesubdir,fileroot);
+    chromosome_iit = Univ_IIT_read(iitfile,/*readonlyp*/true,/*add_iit_p*/false);
+    FREE(iitfile);
+
+    circular_typeint = Univ_IIT_typeint(chromosome_iit,"circular");
+    circularp = Univ_IIT_circularp(&any_circular_p,chromosome_iit);
+
+    genome = Genome_new(genomesubdir,fileroot,/*snps_root*/NULL,/*genometype*/GENOME_OLIGOS,
+			uncompressedp,/*access*/USE_MMAP_ONLY,/*sharedp*/false);
+
+    for (indx = 1; indx <= Univ_IIT_total_nintervals(chromosome_iit); indx++) {
+      chr = Univ_IIT_label(chromosome_iit,indx,&allocp);
+      with_colon = (char *) CALLOC(strlen(chr)+strlen(":")+1,sizeof(char));
+      sprintf(with_colon,"%s:",chr);
+      if (allocp == true) {
+	FREE(chr);
+      }
+      if (Parserange_universal(&segment,&revcomp,&genomicstart,&genomiclength,&chrstart,&chrend,
+			       &chroffset,&chrlength,with_colon,genomesubdir,fileroot) == true) {
+	print_sequence(genome,/*genomealt*/NULL,genomicstart,genomiclength,chromosome_iit,
+		       /*whole_chromosome_p*/true);
+	if (circularp[indx] == true) {
+	  /* Print again, since internal genome represents circular chromosomes twice */
+	  print_sequence(genome,/*genomealt*/NULL,genomicstart,genomiclength,chromosome_iit,
+			 /*whole_chromosome_p*/true);
+	}
+      }
+      FREE(with_colon);
+    }
+
+    Genome_free(&genome);
+
+    Univ_IIT_free(&chromosome_iit);
+
+    return 0;
+
+
+  } else if (dumpallp == true) {
     iitfile = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+
 			      strlen(fileroot)+strlen(".chromosome.iit")+1,sizeof(char));
     sprintf(iitfile,"%s/%s.chromosome.iit",genomesubdir,fileroot);
diff --git a/src/gmap.c b/src/gmap.c
index 60f8f29..74a9046 100644
--- a/src/gmap.c
+++ b/src/gmap.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: gmap.c 166641 2015-05-29 21:13:04Z twu $";
+static char rcsid[] = "$Id: gmap.c 168166 2015-06-24 03:57:10Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -5150,8 +5150,12 @@ parse_command_line (int argc, char *argv[], int optind) {
 	  mode = ATOI_STRANDED;
 	} else if (!strcmp(optarg,"atoi-nonstranded")) {
 	  mode = ATOI_NONSTRANDED;
+	} else if (!strcmp(optarg,"ttoc-stranded")) {
+	  mode = TTOC_STRANDED;
+	} else if (!strcmp(optarg,"ttoc-nonstranded")) {
+	  mode = TTOC_NONSTRANDED;
 	} else {
-	  fprintf(stderr,"--mode must be standard, cmet-stranded, cmet-nonstranded, atoi-stranded, or atoi\n");
+	  fprintf(stderr,"--mode must be standard, cmet-stranded, cmet-nonstranded, atoi-stranded, atoi-nonstranded, ttoc-stranded, or ttoc-nonstranded\n");
 	  return 9;
 	}
 
@@ -5831,6 +5835,7 @@ main (int argc, char *argv[]) {
 
   if (nread > 1) {
     multiple_sequences_p = true;
+#if 0
 #ifdef HAVE_MMAP
     if (offsetsstrm_access != USE_ALLOCATE || genome_access != USE_ALLOCATE) {
       fprintf(stderr,"Note: >1 sequence detected, so index files are being memory mapped.\n");
@@ -5841,6 +5846,7 @@ main (int argc, char *argv[]) {
       fprintf(stderr,"  For more speed, also try multiple threads (-t <int>), if you have multiple processors or cores.");
 #endif
       fprintf(stderr,"\n");
+#endif
     }
 #endif
 
@@ -6126,6 +6132,31 @@ main (int argc, char *argv[]) {
 	exit(9);
       }
 
+    } else if (mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
+      if (user_atoidir == NULL) {
+	modedir = genomesubdir;
+      } else {
+	modedir = user_atoidir;
+      }
+
+      if ((indexdb_fwd = Indexdb_new_genome(&index1part,&index1interval,
+					    modedir,fileroot,/*idx_filesuffix*/"a2itc",/*snps_root*/NULL,
+					    required_index1part,required_index1interval,
+					    expand_offsets_p,offsetsstrm_access,positions_access,
+					    sharedp)) == NULL) {
+	fprintf(stderr,"Cannot find a2itc index file.  Need to run atoiindex first\n");
+	exit(9);
+      }
+
+      if ((indexdb_rev = Indexdb_new_genome(&index1part,&index1interval,
+					    modedir,fileroot,/*idx_filesuffix*/"a2iag",/*snps_root*/NULL,
+					    required_index1part,required_index1interval,
+					    expand_offsets_p,offsetsstrm_access,positions_access,
+					    sharedp)) == NULL) {
+	fprintf(stderr,"Cannot find a2iag index file.  Need to run atoiindex first\n");
+	exit(9);
+      }
+
     } else {
       /* Standard behavior */
       if ((indexdb_fwd = Indexdb_new_genome(&index1part,&index1interval,
@@ -6234,6 +6265,30 @@ main (int argc, char *argv[]) {
 	exit(9);
       }
 
+    } else if (mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
+      if (user_atoidir == NULL) {
+	modedir = snpsdir;
+      } else {
+	modedir = user_atoidir;
+      }
+
+      if ((indexdb_fwd = Indexdb_new_genome(&index1part,&index1interval,
+					    modedir,fileroot,/*idx_filesuffix*/"a2itc",snps_root,
+					    required_index1part,required_index1interval,
+					    expand_offsets_p,offsetsstrm_access,positions_access,
+					    sharedp)) == NULL) {
+	fprintf(stderr,"Cannot find a2itc index file.  Need to run atoiindex first\n");
+	exit(9);
+      }
+      if ((indexdb_rev = Indexdb_new_genome(&index1part,&index1interval,
+					    modedir,fileroot,/*idx_filesuffix*/"a2iag",snps_root,
+					    required_index1part,required_index1interval,
+					    expand_offsets_p,offsetsstrm_access,positions_access,
+					    sharedp)) == NULL) {
+	fprintf(stderr,"Cannot find a2iag index file.  Need to run atoiindex first\n");
+	exit(9);
+      }
+
     } else {
       indexdb_fwd = Indexdb_new_genome(&index1part,&index1interval,
 				       snpsdir,fileroot,/*idx_filesuffix*/"ref",snps_root,
@@ -6840,8 +6895,9 @@ Usage: gmap [OPTIONS...] <FASTA files...>, or\n\
   --atoidir=STRING               Directory for A-to-I RNA editing index files (created using atoiindex)\n\
                                    (default is location of genome index files specified using -D, -V, and -d)\n\
   --mode=STRING                  Alignment mode: standard (default), cmet-stranded, cmet-nonstranded,\n\
-                                    atoi-stranded, or atoi-nonstranded.  Non-standard modes requires you\n\
-                                    to have previously run the cmetindex or atoiindex programs on the genome\n\
+                                    atoi-stranded, atoi-nonstranded, ttoc-stranded, or ttoc-nonstranded.\n\
+                                    Non-standard modes require you to have previously run the cmetindex\n\
+                                    or atoiindex programs (which also cover the ttoc modes) on the genome\n\
 ");
 #endif
 
diff --git a/src/gsnap.c b/src/gsnap.c
index f6931d6..faf7281 100644
--- a/src/gsnap.c
+++ b/src/gsnap.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: gsnap.c 166787 2015-06-02 18:00:56Z twu $";
+static char rcsid[] = "$Id: gsnap.c 168165 2015-06-24 03:56:57Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -279,6 +279,7 @@ static int exclude_ranks[1];
 static MPI_Comm workers_comm;
 static MPI_Group world_group, workers_group;
 static int nthreads0;
+static bool master_is_worker_p = false; /* default behavior */
 #endif
 
 #ifdef HAVE_PTHREAD
@@ -779,6 +780,7 @@ check_compiler_assumptions () {
   fprintf(stderr,"\n");
 
 #ifdef HAVE_SSE2
+  /* With -mavx, compiler may use assembly instructions for _mm_set1_epi32 that don't work on non-AVX machines */
   fprintf(stderr,"Checking compiler assumptions for SSE2: ");
   fprintf(stderr,"%08X %08X",x,y);
   a = _mm_xor_si128(_mm_set1_epi32(x),_mm_set1_epi32(y));
@@ -1680,8 +1682,12 @@ parse_command_line (int argc, char *argv[], int optind) {
 	  mode = ATOI_STRANDED;
 	} else if (!strcmp(optarg,"atoi-nonstranded")) {
 	  mode = ATOI_NONSTRANDED;
+	} else if (!strcmp(optarg,"ttoc-stranded")) {
+	  mode = TTOC_STRANDED;
+	} else if (!strcmp(optarg,"ttoc-nonstranded")) {
+	  mode = TTOC_NONSTRANDED;
 	} else {
-	  fprintf(stderr,"--mode must be standard, cmet-stranded, cmet-nonstranded, atoi-stranded, or atoi-nonstranded\n");
+	  fprintf(stderr,"--mode must be standard, cmet-stranded, cmet-nonstranded, atoi-stranded, atoi-nonstranded, ttoc-stranded, or ttoc-nonstranded\n");
 	  return 9;
 	}
 
@@ -1974,6 +1980,18 @@ parse_command_line (int argc, char *argv[], int optind) {
       } else if (!strcmp(long_name,"read-group-platform")) {
 	sam_read_group_platform = optarg;
 
+#ifdef USE_MPI
+      } else if (!strcmp(long_name,"master-is-worker")) {
+	if (!strcmp(optarg,"1")) {
+	  master_is_worker_p = true;
+	} else if (!strcmp(optarg,"0")) {
+	  master_is_worker_p = false; /* Default */
+	} else {
+	  fprintf(stderr,"--master-is-worker flag must be 0 or 1\n");
+	  return 9;
+	}
+#endif
+
       } else if (!strcmp(long_name,"print-snps")) {
 	print_snplabels_p = true;
 
@@ -2708,6 +2726,29 @@ worker_setup (char *genomesubdir, char *fileroot) {
 	exit(9);
       }
 
+    } else if (mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
+      if (user_atoidir == NULL) {
+	modedir = genomesubdir;
+      } else {
+	modedir = user_atoidir;
+      }
+
+      if ((indexdb = Indexdb_new_genome(&index1part,&index1interval,
+					modedir,fileroot,/*idx_filesuffix*/"a2itc",/*snps_root*/NULL,
+					required_index1part,required_index1interval,
+					expand_offsets_p,offsetsstrm_access,positions_access,sharedp)) == NULL) {
+	fprintf(stderr,"Cannot find a2itc index file.  Need to run atoiindex first\n");
+	exit(9);
+      }
+
+      if ((indexdb2 = Indexdb_new_genome(&index1part,&index1interval,
+					 modedir,fileroot,/*idx_filesuffix*/"a2iag",/*snps_root*/NULL,
+					 required_index1part,required_index1interval,
+					 expand_offsets_p,offsetsstrm_access,positions_access,sharedp)) == NULL) {
+	fprintf(stderr,"Cannot find a2iag index file.  Need to run atoiindex first\n");
+	exit(9);
+      }
+
 
     } else {
       /* Standard behavior */
@@ -2810,6 +2851,28 @@ worker_setup (char *genomesubdir, char *fileroot) {
 	exit(9);
       }
 
+    } else if (mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
+      if (user_atoidir == NULL) {
+	modedir = snpsdir;
+      } else {
+	modedir = user_atoidir;
+      }
+
+      if ((indexdb = Indexdb_new_genome(&index1part,&index1interval,
+					modedir,fileroot,/*idx_filesuffix*/"a2itc",snps_root,
+					required_index1part,required_index1interval,
+					expand_offsets_p,offsetsstrm_access,positions_access,sharedp)) == NULL) {
+	fprintf(stderr,"Cannot find a2itc index file.  Need to run atoiindex first\n");
+	exit(9);
+      }
+      if ((indexdb2 = Indexdb_new_genome(&index1part,&index1interval,
+					 modedir,fileroot,/*idx_filesuffix*/"a2iag",snps_root,
+					 required_index1part,required_index1interval,
+					 expand_offsets_p,offsetsstrm_access,positions_access,sharedp)) == NULL) {
+	fprintf(stderr,"Cannot find a2iag index file.  Need to run atoiindex first\n");
+	exit(9);
+      }
+
     } else {
       indexdb = Indexdb_new_genome(&index1part,&index1interval,
 				   snpsdir,fileroot,/*idx_filesuffix*/"ref",snps_root,
@@ -3307,7 +3370,6 @@ main (int argc, char *argv[]) {
 
 #ifdef USE_MPI
   Master_T master;
-  bool master_is_worker_p;
   char **files_master;
   int nfiles_master;
   FILE *input_parser, *input2_parser;
@@ -3403,8 +3465,18 @@ main (int argc, char *argv[]) {
   MPI_Comm_size(MPI_COMM_WORLD,&nranks);
   MPI_Debug_setup(myid);
 
-  if ((nthreads0 = nthreads - 1) <= 0) {
-    /* Exclude master rank 0 from workers_group */
+  nthreads0 = nthreads - 1;
+  if (master_is_worker_p == false) {
+    /* Default is to exclude master node from working */
+    exclude_ranks[0] = 0;
+    MPI_Comm_group(MPI_COMM_WORLD,&world_group);
+    MPI_Group_excl(world_group,1,exclude_ranks,&workers_group);
+    MPI_Comm_create(MPI_COMM_WORLD,workers_group,&workers_comm);
+    MPI_Group_free(&workers_group);
+    MPI_Group_free(&world_group);
+
+  } else if (nthreads0 <= 0) {
+    /* If insufficient threads, then also exclude master node from working */
     exclude_ranks[0] = 0;
     MPI_Comm_group(MPI_COMM_WORLD,&world_group);
     MPI_Group_excl(world_group,1,exclude_ranks,&workers_group);
@@ -3418,7 +3490,7 @@ main (int argc, char *argv[]) {
     MPI_Comm_group(MPI_COMM_WORLD,&world_group);
     MPI_Comm_create(MPI_COMM_WORLD,world_group,&workers_comm);
     MPI_Group_free(&world_group);
-    master_is_worker_p = true;
+    /* master_is_worker_p = true; */
   }
   n_slave_ranks = nranks - 1;	/* Don't include master, even if it's a worker */
 
@@ -3526,6 +3598,7 @@ main (int argc, char *argv[]) {
 #endif
 
   if (multiple_sequences_p == true) {
+#if 0
     if (offsetsstrm_access != USE_ALLOCATE || genome_access != USE_ALLOCATE ||
 	sarray_access != USE_ALLOCATE || lcp_access != USE_ALLOCATE) {
       fprintf(stderr,"Note: >1 sequence detected, so index files are being memory mapped.\n");
@@ -3537,6 +3610,7 @@ main (int argc, char *argv[]) {
 #endif
       fprintf(stderr,"\n");
     }
+#endif
 
   } else {
     /* fprintf(stderr,"Note: only 1 sequence detected.  Ignoring batch (-B) command\n"); */
@@ -4010,8 +4084,9 @@ is still designed to be fast.\n\
   --atoidir=STRING               Directory for A-to-I RNA editing index files (created using atoiindex)\n\
                                    (default is location of genome index files specified using -D, -V, and -d)\n\
   --mode=STRING                  Alignment mode: standard (default), cmet-stranded, cmet-nonstranded,\n\
-                                    atoi-stranded, or atoi-nonstranded.  Non-standard modes requires you\n\
-                                    to have previously run the cmetindex or atoiindex programs on the genome\n\
+                                    atoi-stranded, atoi-nonstranded, ttoc-stranded, or ttoc-nonstranded.\n\
+                                    Non-standard modes require you to have previously run the cmetindex\n\
+                                    or atoiindex programs (which also cover the ttoc modes) on the genome\n\
 ");
 
 
@@ -4277,6 +4352,17 @@ is still designed to be fast.\n\
 ");
   fprintf(stdout,"\n");
 
+#ifdef USE_MPI
+  fprintf(stdout,"Options for MPI\n");
+  fprintf(stdout,"\
+  --master-is-worker=INT         Determines whether master node allocates threads for performing computation\n\
+                                   in addition to coordinating input and output.  Number of worker threads\n\
+                                   will be --nthreads minus 2\n\
+                                   Values: 0 (no, default), 1 (yes if enough worker threads available)\n\
+");
+  fprintf(stdout,"\n");
+#endif
+
   /* Help options */
   fprintf(stdout,"Help options\n");
   fprintf(stdout,"\
diff --git a/src/iit-read-univ.c b/src/iit-read-univ.c
index 3cef0d9..f76e46c 100644
--- a/src/iit-read-univ.c
+++ b/src/iit-read-univ.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: iit-read-univ.c 161940 2015-03-25 20:36:59Z twu $";
+static char rcsid[] = "$Id: iit-read-univ.c 168395 2015-06-26 17:13:13Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -95,7 +95,7 @@ static char rcsid[] = "$Id: iit-read-univ.c 161940 2015-03-25 20:36:59Z twu $";
    available). */
 typedef struct Univ_FNode_T *Univ_FNode_T;
 struct Univ_FNode_T {
-  Univcoord_T value;
+  Univ_IIT_coord_T value;
   int a;
   int b;
   int leftindex;
@@ -986,6 +986,7 @@ read_tree_univ (off_t offset, off_t filesize, FILE *fp, char *filename, T new) {
     new->nodes = (struct Univ_FNode_T *) CALLOC(new->nnodes,sizeof(struct Univ_FNode_T));
 #ifdef WORDS_BIGENDIAN
     if (new->coord_values_8p == true) {
+#ifdef HAVE_64_BIT
       for (i = 0; i < new->nnodes; i++) {
 	Bigendian_fread_uint8(&(new->nodes[i].value),fp);
 	Bigendian_fread_int(&(new->nodes[i].a),fp);
@@ -994,6 +995,10 @@ read_tree_univ (off_t offset, off_t filesize, FILE *fp, char *filename, T new) {
 	Bigendian_fread_int(&(new->nodes[i].rightindex),fp);
       }
       offset += (sizeof(UINT8)+sizeof(int)+sizeof(int)+sizeof(int)+sizeof(int))*new->nnodes;
+#else
+      fprintf(stderr,"IIT file contains 64-bit coordinates, but this computer is only 32-bit.  Cannot continue.\n");
+      exit(9);
+#endif
     } else {
       for (i = 0; i < new->nnodes; i++) {
 	Bigendian_fread_uint(&uint4,fp);
@@ -1007,6 +1012,7 @@ read_tree_univ (off_t offset, off_t filesize, FILE *fp, char *filename, T new) {
     }
 #else
     if (new->coord_values_8p == true) {
+#ifdef HAVE_64_BIT
 #if 1
       offset += sizeof(struct Univ_FNode_T)*fread(new->nodes,sizeof(struct Univ_FNode_T),new->nnodes,fp);
 #else
@@ -1020,6 +1026,10 @@ read_tree_univ (off_t offset, off_t filesize, FILE *fp, char *filename, T new) {
       }
       offset += (sizeof(UINT8)+sizeof(int)+sizeof(int)+sizeof(int)+sizeof(int))*new->nnodes;
 #endif
+#else
+      fprintf(stderr,"IIT file contains 64-bit coordinates, but this computer is only 32-bit.  Cannot continue.\n");
+      exit(9);
+#endif
     } else {
       for (i = 0; i < new->nnodes; i++) {
 	FREAD_UINT(&uint4,fp);
@@ -1051,16 +1061,21 @@ read_intervals_univ (off_t offset, off_t filesize, FILE *fp, char *filename, T n
 
 #ifdef WORDS_BIGENDIAN
   if (new->coord_values_8p == true) {
+#ifdef HAVE_64_BIT
     for (i = 0; i < new->total_nintervals; i++) {
       Bigendian_fread_uint8(&(new->intervals[i].low),fp);
       Bigendian_fread_uint8(&(new->intervals[i].high),fp);
       Bigendian_fread_int(&(new->intervals[i].type),fp);
     }
+#else
+    fprintf(stderr,"IIT file contains 64-bit coordinates, but this computer is only 32-bit.  Cannot continue.\n");
+    exit(9);
+#endif
   } else {
     for (i = 0; i < new->total_nintervals; i++) {
-      Bigendian_fread_uint(&unit4,fp);
+      Bigendian_fread_uint(&uint4,fp);
       new->intervals[i].low = (Univcoord_T) uint4;
-      Bigendian_fread_uint(&unit4,fp);
+      Bigendian_fread_uint(&uint4,fp);
       new->intervals[i].high = (Univcoord_T) uint4;
       Bigendian_fread_int(&(new->intervals[i].type),fp);
     }
@@ -1068,12 +1083,17 @@ read_intervals_univ (off_t offset, off_t filesize, FILE *fp, char *filename, T n
   }
 #else
   if (new->coord_values_8p == true) {
+#ifdef HAVE_64_BIT
     for (i = 0; i < new->total_nintervals; i++) {
       FREAD_UINT8(&(new->intervals[i].low),fp);
       FREAD_UINT8(&(new->intervals[i].high),fp);
       FREAD_INT(&(new->intervals[i].type),fp);
     }
     offset += (sizeof(UINT8)+sizeof(UINT8)+sizeof(int))*new->total_nintervals;
+#else
+    fprintf(stderr,"IIT file contains 64-bit coordinates, but this computer is only 32-bit.  Cannot continue.\n");
+    exit(9);
+#endif
   } else {
     for (i = 0; i < new->total_nintervals; i++) {
       FREAD_UINT(&uint4,fp);
diff --git a/src/indexdb.c b/src/indexdb.c
index fa9232b..ec4b674 100644
--- a/src/indexdb.c
+++ b/src/indexdb.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: indexdb.c 161940 2015-03-25 20:36:59Z twu $";
+static char rcsid[] = "$Id: indexdb.c 168395 2015-06-26 17:13:13Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -254,17 +254,14 @@ Indexdb_mean_size (T this, Mode_T mode, Width_T index1part) {
   n = oligospace = power(this->alphabet_size,index1part);
 #else
   n = oligospace = power(4,index1part);
-  if (mode == CMET_STRANDED || mode == CMET_NONSTRANDED || mode == ATOI_STRANDED || mode == ATOI_NONSTRANDED) {
+  if (mode != STANDARD) {
     n = power(3,index1part);
   }
 #endif
 
 #ifdef WORDS_BIGENDIAN
-  if (this->offsetsstrm_access == ALLOCATED_PRIVATE || this->offsetsstrm_access == ALLOCATED_SHARED) {
-    return (double) this->offsetsstrm[this->offsetsmeta[oligospace/this->blocksize]]/(double) n;
-  } else {
-    return (double) Bigendian_convert_uint(this->offsetsstrm[this->offsetsmeta[oligospace/this->blocksize]])/(double) n;
-  }
+  /* Also holds for ALLOCATED_PRIVATE and ALLOCATED_SHARED */
+  return (double) Bigendian_convert_uint(this->offsetsstrm[Bigendian_convert_uint(this->offsetsmeta[oligospace/this->blocksize])])/(double) n;
 #else
   return (double) this->offsetsstrm[this->offsetsmeta[oligospace/this->blocksize]]/(double) n;
 #endif
@@ -2013,13 +2010,9 @@ Indexdb_read (int *nentries, T this, Storedoligomer_T aaindex) {
 
   if (this->compression_type == NO_COMPRESSION) {
 #ifdef WORDS_BIGENDIAN
-    if (this->offsetsstrm_access == ALLOCATED_PRIVATE || this->offsetsstrm_access == ALLOCATED_SHARED) {
-      ptr0 = this->offsetsstrm[aaindex];
-      end0 = this->offsetsstrm[aaindex+1];
-    } else {
-      ptr0 = Bigendian_convert_uint(this->offsetsstrm[aaindex]);
-      end0 = Bigendian_convert_uint(this->offsetsstrm[aaindex+1]);
-    }
+    /* Also holds for ALLOCATED_PRIVATE and ALLOCATED_SHARED */
+    ptr0 = Bigendian_convert_uint(this->offsetsstrm[aaindex]);
+    end0 = Bigendian_convert_uint(this->offsetsstrm[aaindex+1]);
 #else
     ptr0 = this->offsetsstrm[aaindex];
     end0 = this->offsetsstrm[aaindex+1];
@@ -2168,13 +2161,9 @@ Indexdb_read (int *nentries, T this, Storedoligomer_T oligo) {
 
   if (this->compression_type == NO_COMPRESSION) {
 #ifdef WORDS_BIGENDIAN
-    if (this->offsetsstrm_access == ALLOCATED_PRIVATE || this->offsetsstrm_access == ALLOCATED_SHARED) {
-      ptr0 = this->offsetsstrm[part0];
-      end0 = this->offsetsstrm[part0+1];
-    } else {
-      ptr0 = Bigendian_convert_uint(this->offsetsstrm[part0]);
-      end0 = Bigendian_convert_uint(this->offsetsstrm[part0+1]);
-    }
+    /* Also holds for ALLOCATED_PRIVATE and ALLOCATED_SHARED */
+    ptr0 = Bigendian_convert_uint(this->offsetsstrm[part0]);
+    end0 = Bigendian_convert_uint(this->offsetsstrm[part0+1]);
 #else
     ptr0 = this->offsetsstrm[part0];
     end0 = this->offsetsstrm[part0+1];
@@ -2321,13 +2310,9 @@ Indexdb_read_inplace (int *nentries,
 
   if (this->compression_type == NO_COMPRESSION) {
 #ifdef WORDS_BIGENDIAN
-    if (this->offsetsstrm_access == ALLOCATED_PRIVATE || this->offsetsstrm_access == ALLOCATED_SHARED) {
-      ptr0 = this->offsetsstrm[part0];
-      end0 = this->offsetsstrm[part0+1];
-    } else {
-      ptr0 = Bigendian_convert_uint(this->offsetsstrm[part0]);
-      end0 = Bigendian_convert_uint(this->offsetsstrm[part0+1]);
-    }
+    /* Also holds for ALLOCATED_PRIVATE and ALLOCATED_SHARED */
+    ptr0 = Bigendian_convert_uint(this->offsetsstrm[part0]);
+    end0 = Bigendian_convert_uint(this->offsetsstrm[part0+1]);
 #else
     ptr0 = this->offsetsstrm[part0];
     end0 = this->offsetsstrm[part0+1];
@@ -2394,13 +2379,9 @@ Indexdb_read_with_diagterm (int *nentries, T this, Storedoligomer_T oligo, int d
 
   if (this->compression_type == NO_COMPRESSION) {
 #ifdef WORDS_BIGENDIAN
-    if (this->offsetsstrm_access == ALLOCATED_PRIVATE || this->offsetsstrm_access == ALLOCATED_SHARED) {
-      ptr0 = this->offsetsstrm[oligo];
-      end0 = this->offsetsstrm[oligo+1];
-    } else {
-      ptr0 = Bigendian_convert_uint(this->offsetsstrm[oligo]);
-      end0 = Bigendian_convert_uint(this->offsetsstrm[oligo+1]);
-    }
+    /* Also holds for ALLOCATED_PRIVATE and ALLOCATED_SHARED */
+    ptr0 = Bigendian_convert_uint(this->offsetsstrm[oligo]);
+    end0 = Bigendian_convert_uint(this->offsetsstrm[oligo+1]);
 #else
     ptr0 = this->offsetsstrm[oligo];
     end0 = this->offsetsstrm[oligo+1];
@@ -2498,13 +2479,9 @@ Indexdb_read_with_diagterm_sizelimit (int *nentries, T this, Storedoligomer_T ol
 
   if (this->compression_type == NO_COMPRESSION) {
 #ifdef WORDS_BIGENDIAN
-    if (this->offsetsstrm_access == ALLOCATED_PRIVATE || this->offsetsstrm_access == ALLOCATED_SHARED) {
-      ptr0 = this->offsetsstrm[oligo];
-      end0 = this->offsetsstrm[oligo+1];
-    } else {
-      ptr0 = Bigendian_convert_uint(this->offsetsstrm[oligo]);
-      end0 = Bigendian_convert_uint(this->offsetsstrm[oligo+1]);
-    }
+    /* Also holds for ALLOCATED_PRIVATE and ALLOCATED_SHARED */
+    ptr0 = Bigendian_convert_uint(this->offsetsstrm[oligo]);
+    end0 = Bigendian_convert_uint(this->offsetsstrm[oligo+1]);
 #else
     ptr0 = this->offsetsstrm[oligo];
     end0 = this->offsetsstrm[oligo+1];
diff --git a/src/indexdb_hr.c b/src/indexdb_hr.c
index 1731c2f..0ba9fb2 100644
--- a/src/indexdb_hr.c
+++ b/src/indexdb_hr.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: indexdb_hr.c 153955 2014-11-24 17:54:45Z twu $";
+static char rcsid[] = "$Id: indexdb_hr.c 168395 2015-06-26 17:13:13Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -702,6 +702,7 @@ point_one_shift (int *nentries, T this, Storedoligomer_T subst) {
 
   if (this->compression_type == NO_COMPRESSION) {
 #ifdef WORDS_BIGENDIAN
+#if 0
     if (this->offsetsstrm_access == ALLOCATED) {
       ptr0 = this->offsetsstrm[subst];
       end0 = this->offsetsstrm[subst+1];
@@ -710,6 +711,9 @@ point_one_shift (int *nentries, T this, Storedoligomer_T subst) {
       end0 = Bigendian_convert_uint(this->offsetsstrm[subst+1]);
     }
 #else
+    abort();
+#endif
+#else
     ptr0 = this->offsetsstrm[subst];
     end0 = this->offsetsstrm[subst+1];
 #endif
@@ -808,6 +812,7 @@ count_one_shift (T this, Storedoligomer_T subst, int nadjacent) {
 
   if (this->compression_type == NO_COMPRESSION) {
 #ifdef WORDS_BIGENDIAN
+#if 0
     if (this->offsetsstrm_access == ALLOCATED) {
       ptr0 = this->offsetsstrm[subst];
       end0 = this->offsetsstrm[subst+nadjacent];
@@ -816,6 +821,9 @@ count_one_shift (T this, Storedoligomer_T subst, int nadjacent) {
       end0 = Bigendian_convert_uint(this->offsetsstrm[subst+nadjacent]);
     }
 #else
+    abort();
+#endif
+#else
     ptr0 = this->offsetsstrm[subst];
     end0 = this->offsetsstrm[subst+nadjacent];
 #endif
@@ -841,6 +849,7 @@ count_one_shift (T this, Storedoligomer_T subst, int nadjacent) {
 
   if (this->compression_type == NO_COMPRESSION) {
 #ifdef WORDS_BIGENDIAN
+#if 0
     if (this->offsetsstrm_access == ALLOCATED) {
       ptr0 = this->offsetsstrm[subst];
       end0 = this->offsetsstrm[subst+nadjacent];
@@ -849,6 +858,9 @@ count_one_shift (T this, Storedoligomer_T subst, int nadjacent) {
       end0 = Bigendian_convert_uint(this->offsetsstrm[subst+nadjacent]);
     }
 #else
+    abort();
+#endif
+#else
     ptr0 = this->offsetsstrm[subst];
     end0 = this->offsetsstrm[subst+nadjacent];
 #endif
@@ -1836,6 +1848,7 @@ Indexdb_count_no_subst (T this, Storedoligomer_T oligo) {
 
   if (this->compression_type == NO_COMPRESSION) {
 #ifdef WORDS_BIGENDIAN
+#if 0
     if (this->offsetsstrm_access == ALLOCATED) {
       ptr0 = this->offsetsstrm[oligo];
       end0 = this->offsetsstrm[oligo+1];
@@ -1844,6 +1857,9 @@ Indexdb_count_no_subst (T this, Storedoligomer_T oligo) {
       end0 = Bigendian_convert_uint(this->offsetsstrm[oligo+1]);
     }
 #else
+    abort();
+#endif
+#else
     ptr0 = this->offsetsstrm[oligo];
     end0 = this->offsetsstrm[oligo+1];
 #endif
diff --git a/src/mode.h b/src/mode.h
index 1592c99..2fbc598 100644
--- a/src/mode.h
+++ b/src/mode.h
@@ -1,8 +1,8 @@
-/* $Id: mode.h 48805 2011-09-30 20:20:26Z twu $ */
+/* $Id: mode.h 167592 2015-06-15 18:56:59Z twu $ */
 #ifndef MODE_INCLUDED
 #define MODE_INCLUDED
 
-typedef enum {STANDARD, CMET_STRANDED, CMET_NONSTRANDED, ATOI_STRANDED, ATOI_NONSTRANDED} Mode_T;
+typedef enum {STANDARD, CMET_STRANDED, CMET_NONSTRANDED, ATOI_STRANDED, ATOI_NONSTRANDED, TTOC_STRANDED, TTOC_NONSTRANDED} Mode_T;
 
 #endif
 
diff --git a/src/oligoindex_hr.c b/src/oligoindex_hr.c
index f4ebc19..c541ce9 100644
--- a/src/oligoindex_hr.c
+++ b/src/oligoindex_hr.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: oligoindex_hr.c 166641 2015-05-29 21:13:04Z twu $";
+static char rcsid[] = "$Id: oligoindex_hr.c 167575 2015-06-15 17:26:24Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -40,10 +40,18 @@ static char rcsid[] = "$Id: oligoindex_hr.c 166641 2015-05-29 21:13:04Z twu $";
 
 #ifdef HAVE_SSE2
 #define USE_SIMD_FOR_COUNTS 1
-#else
+#endif
+
+#if !defined(HAVE_SSE2)
 #define INDIVIDUAL_SHIFTS 1
+#elif !defined(HAVE_SSE4_1)
+#define SIMD_MASK_THEN_STORE
+#define EXTRACT(x,i) x[i]
+#else
+#define EXTRACT(x,i) _mm_extract_epi32(x,i)
 #endif
 
+
 #define THETADIFF1 20.0
 #define THETADIFF2 20.0
 #define REPOLIGOCOUNT 8
@@ -196,7 +204,7 @@ print_counts (__m128i x, char *label) {
 #endif
 
 
-#if !defined(HAVE_SSE2) || defined(CHECK_ASSERTIONS)
+#if !defined(HAVE_SSE2) || !defined(HAVE_SSE4_1) || defined(CHECK_ASSERTIONS)
 static const Genomecomp_T reverse_nt[] = 
 {0x0000,0x4000,0x8000,0xC000,0x1000,0x5000,0x9000,0xD000,
  0x2000,0x6000,0xA000,0xE000,0x3000,0x7000,0xB000,0xF000,
@@ -10210,7 +10218,11 @@ store_fwdrev_simd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, C
 static void
 count_9mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, Genomecomp_T nexthigh_rev) {
   Genomecomp_T masked, oligo;
-#ifndef INDIVIDUAL_SHIFTS
+#ifdef INDIVIDUAL_SHIFTS
+#elif defined(SIMD_MASK_THEN_STORE)
+  UINT4 _masked[4] __attribute__ ((aligned (16)));
+  __m128i _oligo;
+#else
   __m128i _oligo, _masked;
 #endif
 
@@ -10253,41 +10265,49 @@ count_9mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, G
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("31 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("30 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("29 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("28 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("27 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("26 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("25 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("24 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -10328,41 +10348,49 @@ count_9mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, G
 
 #else
   _oligo = _mm_setr_epi32(low_rev, low_rev >> 2, low_rev >> 4, low_rev >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("23 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("22 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("21 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("20 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("19 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("18 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("17 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("16 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -10406,41 +10434,49 @@ count_9mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, G
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("15 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("14 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("13 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("12 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("11 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("10 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("9 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("8 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -10481,41 +10517,49 @@ count_9mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, G
 
 #else
   _oligo = _mm_setr_epi32(high_rev, high_rev >> 2, high_rev >> 4, high_rev >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("7 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("6 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("5 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("4 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("3 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("2 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("1 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("0 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -10665,7 +10709,11 @@ static int
 store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Count_T *counts,
 		 Genomecomp_T high_rev, Genomecomp_T low_rev, Genomecomp_T nexthigh_rev) {
   Genomecomp_T masked, oligo;
-#ifndef INDIVIDUAL_SHIFTS
+#ifdef INDIVIDUAL_SHIFTS
+#elif defined(SIMD_MASK_THEN_STORE)
+  UINT4 _masked[4] __attribute__ ((aligned (16)));
+  __m128i _oligo;
+#else
   __m128i _oligo, _masked;
 #endif
 
@@ -10748,9 +10796,13 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -10759,7 +10811,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -10768,7 +10820,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -10777,7 +10829,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -10788,9 +10840,13 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -10799,7 +10855,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -10808,7 +10864,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -10817,7 +10873,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -10903,9 +10959,13 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(low_rev, low_rev >> 2, low_rev >> 4, low_rev >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -10914,7 +10974,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -10923,7 +10983,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -10932,7 +10992,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -10943,9 +11003,13 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -10954,7 +11018,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -10963,7 +11027,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -10972,7 +11036,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11061,9 +11125,13 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11072,7 +11140,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11081,7 +11149,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11090,7 +11158,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11101,9 +11169,13 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11112,7 +11184,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11121,7 +11193,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11130,7 +11202,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11216,9 +11288,13 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(high_rev, high_rev >> 2, high_rev >> 4, high_rev >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11227,7 +11303,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11236,7 +11312,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11245,7 +11321,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11256,9 +11332,13 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11267,7 +11347,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11276,7 +11356,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11285,7 +11365,7 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11303,7 +11383,11 @@ store_9mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 static void
 count_8mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, Genomecomp_T nexthigh_rev) {
   Genomecomp_T masked, oligo;
-#ifndef INDIVIDUAL_SHIFTS
+#ifdef INDIVIDUAL_SHIFTS
+#elif defined(SIMD_MASK_THEN_STORE)
+  UINT4 _masked[4] __attribute__ ((aligned (16)));
+  __m128i _oligo;
+#else
   __m128i _oligo, _masked;
 #endif
 
@@ -11342,37 +11426,45 @@ count_8mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, G
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("31 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("30 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("29 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("28 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("27 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("26 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("25 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -11417,41 +11509,49 @@ count_8mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, G
   
 #else
   _oligo = _mm_setr_epi32(low_rev, low_rev >> 2, low_rev >> 4, low_rev >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("24 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("23 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("22 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("21 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("20 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("19 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("18 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("17 %04X => %d\n",masked,counts[masked]));
 
@@ -11496,37 +11596,45 @@ count_8mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, G
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("15 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("14 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("13 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("12 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("11 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("10 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("9 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -11571,41 +11679,49 @@ count_8mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, G
   
 #else
   _oligo = _mm_setr_epi32(high_rev, high_rev >> 2, high_rev >> 4, high_rev >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("8 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("7 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("6 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("5 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("4 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("3 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("2 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("1 %04X => %d\n",masked,counts[masked]));
 
@@ -11760,7 +11876,11 @@ static int
 store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Count_T *counts,
 		 Genomecomp_T high_rev, Genomecomp_T low_rev, Genomecomp_T nexthigh_rev) {
   Genomecomp_T masked, oligo;
-#ifndef INDIVIDUAL_SHIFTS
+#ifdef INDIVIDUAL_SHIFTS
+#elif defined(SIMD_MASK_THEN_STORE)
+  UINT4 _masked[4] __attribute__ ((aligned (16)));
+  __m128i _oligo;
+#else
   __m128i _oligo, _masked;
 #endif
 
@@ -11841,9 +11961,13 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11853,7 +11977,7 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11863,7 +11987,7 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11873,7 +11997,7 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11885,9 +12009,13 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11897,7 +12025,7 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -11907,7 +12035,7 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12012,9 +12140,13 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(low_rev, low_rev >> 2, low_rev >> 4, low_rev >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12024,7 +12156,7 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12034,7 +12166,7 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12044,7 +12176,7 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12056,9 +12188,13 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12068,7 +12204,7 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12078,7 +12214,7 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12088,7 +12224,7 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12187,9 +12323,13 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12199,7 +12339,7 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12209,7 +12349,7 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12219,7 +12359,7 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12231,9 +12371,13 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12243,7 +12387,7 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12253,7 +12397,7 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12358,9 +12502,13 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(high_rev, high_rev >> 2, high_rev >> 4, high_rev >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12370,7 +12518,7 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12380,7 +12528,7 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12390,7 +12538,7 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12402,9 +12550,13 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12414,7 +12566,7 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12424,7 +12576,7 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12434,7 +12586,7 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -12463,7 +12615,11 @@ store_8mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 static void
 count_7mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, Genomecomp_T nexthigh_rev) {
   Genomecomp_T masked, oligo;
-#ifndef INDIVIDUAL_SHIFTS
+#ifdef INDIVIDUAL_SHIFTS
+#elif defined(SIMD_MASK_THEN_STORE)
+  UINT4 _masked[4] __attribute__ ((aligned (16)));
+  __m128i _oligo;
+#else
   __m128i _oligo, _masked;
 #endif
 
@@ -12498,33 +12654,41 @@ count_7mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, G
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("31 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("30 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("29 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("28 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("27 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("26 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -12573,53 +12737,65 @@ count_7mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, G
 
 #else
   _oligo = _mm_setr_epi32(low_rev, low_rev >> 2, low_rev >> 4, low_rev >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("25 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("24 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("23 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("22 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("21 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("20 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("19 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("18 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("17 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("16 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -12655,33 +12831,41 @@ count_7mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, G
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("15 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("14 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("13 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("12 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("11 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("10 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -12730,53 +12914,65 @@ count_7mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, G
 
 #else
   _oligo = _mm_setr_epi32(high_rev, high_rev >> 2, high_rev >> 4, high_rev >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("9 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("8 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("7 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("6 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("5 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("4 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("3 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("2 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("1 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("0 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -12927,7 +13123,11 @@ static int
 store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Count_T *counts,
 		 Genomecomp_T high_rev, Genomecomp_T low_rev, Genomecomp_T nexthigh_rev) {
   Genomecomp_T masked, oligo;
-#ifndef INDIVIDUAL_SHIFTS
+#ifdef INDIVIDUAL_SHIFTS
+#elif defined(SIMD_MASK_THEN_STORE)
+  UINT4 _masked[4] __attribute__ ((aligned (16)));
+  __m128i _oligo;
+#else
   __m128i _oligo, _masked;
 #endif
 
@@ -12992,9 +13192,13 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13003,7 +13207,7 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13012,7 +13216,7 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13021,7 +13225,7 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13032,9 +13236,13 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13043,7 +13251,7 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13147,9 +13355,13 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(low_rev, low_rev >> 2, low_rev >> 4, low_rev >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13158,7 +13370,7 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13167,7 +13379,7 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13176,7 +13388,7 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13187,9 +13399,13 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13198,7 +13414,7 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13207,7 +13423,7 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13216,7 +13432,7 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13227,9 +13443,13 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13238,7 +13458,7 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13309,9 +13529,13 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13320,7 +13544,7 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13329,7 +13553,7 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13338,7 +13562,7 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13349,9 +13573,13 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13360,7 +13588,7 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13464,9 +13692,13 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
   
 #else
   _oligo = _mm_setr_epi32(high_rev, high_rev >> 2, high_rev >> 4, high_rev >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13475,7 +13707,7 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13484,7 +13716,7 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13493,7 +13725,7 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13504,9 +13736,13 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13515,7 +13751,7 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13524,7 +13760,7 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13533,7 +13769,7 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13544,9 +13780,13 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13555,7 +13795,7 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -13572,7 +13812,11 @@ store_7mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 static void
 count_6mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, Genomecomp_T nexthigh_rev) {
   Genomecomp_T masked, oligo;
-#ifndef INDIVIDUAL_SHIFTS
+#ifdef INDIVIDUAL_SHIFTS
+#elif defined(SIMD_MASK_THEN_STORE)
+  UINT4 _masked[4] __attribute__ ((aligned (16)));
+  __m128i _oligo;
+#else
   __m128i _oligo, _masked;
 #endif
 
@@ -13603,21 +13847,25 @@ count_6mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, G
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("31 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("30 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("29 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("28 %04X => %d\n",masked,counts[masked]));
 
@@ -13675,57 +13923,69 @@ count_6mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, G
 
 #else
   _oligo = _mm_setr_epi32(low_rev, low_rev >> 2, low_rev >> 4, low_rev >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("26 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("25 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("24 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("23 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("22 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("21 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("20 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("19 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("18 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("17 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("16 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -13757,21 +14017,25 @@ count_6mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, G
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("15 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("14 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("13 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("12 %04X => %d\n",masked,counts[masked]));
 
@@ -13829,57 +14093,69 @@ count_6mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, G
 
 #else
   _oligo = _mm_setr_epi32(high_rev, high_rev >> 2, high_rev >> 4, high_rev >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("10 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("9 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("8 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("7 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("6 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("5 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("4 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("3 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("2 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("1 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("0 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -14031,7 +14307,11 @@ static int
 store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Count_T *counts,
 		 Genomecomp_T high_rev, Genomecomp_T low_rev, Genomecomp_T nexthigh_rev) {
   Genomecomp_T masked, oligo;
-#ifndef INDIVIDUAL_SHIFTS
+#ifdef INDIVIDUAL_SHIFTS
+#elif defined(SIMD_MASK_THEN_STORE)
+  UINT4 _masked[4] __attribute__ ((aligned (16)));
+  __m128i _oligo;
+#else
   __m128i _oligo, _masked;
 #endif
 
@@ -14087,9 +14367,13 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14098,7 +14382,7 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14107,7 +14391,7 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14116,7 +14400,7 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14239,9 +14523,13 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
   
 #else
   _oligo = _mm_setr_epi32(low_rev, low_rev >> 2, low_rev >> 4, low_rev >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14250,7 +14538,7 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14259,7 +14547,7 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14268,7 +14556,7 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14279,9 +14567,13 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14290,7 +14582,7 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14299,7 +14591,7 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14308,7 +14600,7 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14319,9 +14611,13 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14330,7 +14626,7 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14339,7 +14635,7 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14401,9 +14697,13 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14412,7 +14712,7 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14421,7 +14721,7 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14430,7 +14730,7 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14553,9 +14853,13 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
   
 #else
   _oligo = _mm_setr_epi32(high_rev, high_rev >> 2, high_rev >> 4, high_rev >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14564,7 +14868,7 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14573,7 +14877,7 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14582,7 +14886,7 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14593,9 +14897,13 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14604,7 +14912,7 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14613,7 +14921,7 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14622,7 +14930,7 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14633,9 +14941,13 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14644,7 +14956,7 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14653,7 +14965,7 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -14671,7 +14983,11 @@ store_6mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 static void
 count_5mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, Genomecomp_T nexthigh_rev) {
   Genomecomp_T masked, oligo;
-#ifndef INDIVIDUAL_SHIFTS
+#ifdef INDIVIDUAL_SHIFTS
+#elif defined(SIMD_MASK_THEN_STORE)
+  UINT4 _masked[4] __attribute__ ((aligned (16)));
+  __m128i _oligo;
+#else
   __m128i _oligo, _masked;
 #endif
 
@@ -14698,21 +15014,25 @@ count_5mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, G
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("31 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("30 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("29 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("28 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -14769,61 +15089,73 @@ count_5mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, G
 
 #else
   _oligo = _mm_setr_epi32(low_rev, low_rev >> 2, low_rev >> 4, low_rev >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("27 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("26 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("25 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("24 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("23 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("22 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("21 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("20 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("19 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("18 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("17 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("16 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -14851,21 +15183,25 @@ count_5mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, G
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("15 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("14 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("13 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("12 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -14922,61 +15258,73 @@ count_5mers_fwd (Count_T *counts, Genomecomp_T high_rev, Genomecomp_T low_rev, G
   
 #else
   _oligo = _mm_setr_epi32(high_rev, high_rev >> 2, high_rev >> 4, high_rev >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("11 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("10 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("9 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("8 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("7 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("6 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("5 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("4 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("3 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("2 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("1 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("0 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -15125,7 +15473,11 @@ static int
 store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Count_T *counts,
 		 Genomecomp_T high_rev, Genomecomp_T low_rev, Genomecomp_T nexthigh_rev) {
   Genomecomp_T masked, oligo;
-#ifndef INDIVIDUAL_SHIFTS
+#ifdef INDIVIDUAL_SHIFTS
+#elif defined(SIMD_MASK_THEN_STORE)
+  UINT4 _masked[4] __attribute__ ((aligned (16)));
+  __m128i _oligo;
+#else
   __m128i _oligo, _masked;
 #endif
 
@@ -15172,9 +15524,13 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15183,7 +15539,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15192,7 +15548,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15201,7 +15557,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15323,9 +15679,13 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
   
 #else
   _oligo = _mm_setr_epi32(low_rev, low_rev >> 2, low_rev >> 4, low_rev >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15334,7 +15694,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15343,7 +15703,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15352,7 +15712,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15363,9 +15723,13 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15374,7 +15738,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15383,7 +15747,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15392,7 +15756,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15403,9 +15767,13 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15414,7 +15782,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15423,7 +15791,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15432,7 +15800,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15485,9 +15853,13 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15496,7 +15868,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15505,7 +15877,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15514,7 +15886,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15636,9 +16008,13 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
   
 #else
   _oligo = _mm_setr_epi32(high_rev, high_rev >> 2, high_rev >> 4, high_rev >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15647,7 +16023,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15656,7 +16032,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15665,7 +16041,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15676,9 +16052,13 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15687,7 +16067,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15696,7 +16076,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15705,7 +16085,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15716,9 +16096,13 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15727,7 +16111,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15736,7 +16120,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -15745,7 +16129,7 @@ store_5mers_fwd (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -16160,12 +16544,15 @@ count_positions_fwd_simd (Count_T *counts, int indexsize,
     current = _mm_or_si128(_mm_srli_epi32(current,16),_mm_slli_epi32(current,16)); /* Swap 16-bit quantities */
 #endif
 
+#ifdef HAVE_SSE4_1
     high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-    assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
     low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-    assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
     nexthigh_rev = (unsigned int) _mm_extract_epi32(current,2);
-    assert(nexthigh_rev == (reverse_nt[nextlow >> 16] | reverse_nt[nextlow & 0x0000FFFF] << 16));
+#else
+    high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+    low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+    nexthigh_rev = (reverse_nt[nextlow >> 16] | reverse_nt[nextlow & 0x0000FFFF] << 16);
+#endif
 
     if (indexsize == 9) {
       count_9mers_fwd_partial(counts,high0_rev,low0_rev,nexthigh_rev,startdiscard,enddiscard);
@@ -16215,12 +16602,15 @@ count_positions_fwd_simd (Count_T *counts, int indexsize,
     current = _mm_or_si128(_mm_srli_epi32(current,16),_mm_slli_epi32(current,16)); /* Swap 16-bit quantities */
 #endif
 
+#ifdef HAVE_SSE4_1
     high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-    assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
     low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-    assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
     nexthigh_rev = (unsigned int) _mm_extract_epi32(current,2);
-    assert(nexthigh_rev == (reverse_nt[nextlow >> 16] | reverse_nt[nextlow & 0x0000FFFF] << 16));
+#else
+    high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+    low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+    nexthigh_rev = (reverse_nt[nextlow >> 16] | reverse_nt[nextlow & 0x0000FFFF] << 16);
+#endif
 
     if (indexsize == 9) {
       count_9mers_fwd_partial(counts,high0_rev,low0_rev,nexthigh_rev,/*startdiscard*/0,enddiscard);
@@ -16281,19 +16671,20 @@ count_positions_fwd_simd (Count_T *counts, int indexsize,
 #endif
 
 	nexthigh_rev = high0_rev;
-	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-	assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
 
 #ifdef HAVE_SSE4_1
+	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
+	/* low0_rev = (unsigned int) _mm_extract_epi32(current,1); */
+	/* high1_rev = (unsigned int) _mm_extract_epi32(current,2); */
+	/* low1_rev = (unsigned int) _mm_extract_epi32(current,3); */
+
 	temp = _mm_insert_epi32(current,nexthigh_rev,0x00);
 	next = _mm_shuffle_epi32(temp,0x39);
 #else
-	low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-	assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
-	high1_rev = (unsigned int) _mm_extract_epi32(current,2);
-	assert(high1_rev == (reverse_nt[low1 >> 16] | reverse_nt[low1 & 0x0000FFFF] << 16));
-	low1_rev = (unsigned int) _mm_extract_epi32(current,3);
-	assert(low1_rev == (reverse_nt[high1 >> 16] | reverse_nt[high1 & 0x0000FFFF] << 16));
+	high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+	low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+	high1_rev = (reverse_nt[low1 >> 16] | reverse_nt[low1 & 0x0000FFFF] << 16);
+	low1_rev = (reverse_nt[high1 >> 16] | reverse_nt[high1 & 0x0000FFFF] << 16);
 
 	next = _mm_setr_epi32(low0_rev,high1_rev,low1_rev,nexthigh_rev);
 #endif
@@ -16339,10 +16730,13 @@ count_positions_fwd_simd (Count_T *counts, int indexsize,
 #endif
 
 	nexthigh_rev = high0_rev;
+#ifdef HAVE_SSE4_1
 	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-	assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
 	low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-	assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
+#else
+	high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+	low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+#endif
 
 	count_9mers_fwd(counts,high0_rev,low0_rev,nexthigh_rev);
       }
@@ -16391,19 +16785,20 @@ count_positions_fwd_simd (Count_T *counts, int indexsize,
 #endif
 
 	nexthigh_rev = high0_rev;
-	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-	assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
 
 #ifdef HAVE_SSE4_1
+	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
+	/* low0_rev = (unsigned int) _mm_extract_epi32(current,1); */
+	/* high1_rev = (unsigned int) _mm_extract_epi32(current,2);*/
+	/* low1_rev = (unsigned int) _mm_extract_epi32(current,3); */
+
 	temp = _mm_insert_epi32(current,nexthigh_rev,0x00);
 	next = _mm_shuffle_epi32(temp,0x39);
 #else
-	low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-	assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
-	high1_rev = (unsigned int) _mm_extract_epi32(current,2);
-	assert(high1_rev == (reverse_nt[low1 >> 16] | reverse_nt[low1 & 0x0000FFFF] << 16));
-	low1_rev = (unsigned int) _mm_extract_epi32(current,3);
-	assert(low1_rev == (reverse_nt[high1 >> 16] | reverse_nt[high1 & 0x0000FFFF] << 16));
+	high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+	low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+	high1_rev = (reverse_nt[low1 >> 16] | reverse_nt[low1 & 0x0000FFFF] << 16);
+	low1_rev = (reverse_nt[high1 >> 16] | reverse_nt[high1 & 0x0000FFFF] << 16);
 
 	next = _mm_setr_epi32(low0_rev,high1_rev,low1_rev,nexthigh_rev);
 #endif
@@ -16449,10 +16844,13 @@ count_positions_fwd_simd (Count_T *counts, int indexsize,
 #endif
 
 	nexthigh_rev = high0_rev;
+#ifdef HAVE_SSE4_1
 	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-	assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
 	low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-	assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
+#else
+	high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+	low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+#endif
 
 	count_8mers_fwd(counts,high0_rev,low0_rev,nexthigh_rev);
       }
@@ -16501,19 +16899,20 @@ count_positions_fwd_simd (Count_T *counts, int indexsize,
 #endif
 
 	nexthigh_rev = high0_rev;
-	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-	assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
 
 #ifdef HAVE_SSE4_1
+	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
+	/* low0_rev = (unsigned int) _mm_extract_epi32(current,1); */
+	/* high1_rev = (unsigned int) _mm_extract_epi32(current,2); */
+	/* low1_rev = (unsigned int) _mm_extract_epi32(current,3); */
+
 	temp = _mm_insert_epi32(current,nexthigh_rev,0x00);
 	next = _mm_shuffle_epi32(temp,0x39);
 #else
-	low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-	assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
-	high1_rev = (unsigned int) _mm_extract_epi32(current,2);
-	assert(high1_rev == (reverse_nt[low1 >> 16] | reverse_nt[low1 & 0x0000FFFF] << 16));
-	low1_rev = (unsigned int) _mm_extract_epi32(current,3);
-	assert(low1_rev == (reverse_nt[high1 >> 16] | reverse_nt[high1 & 0x0000FFFF] << 16));
+	high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+	low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+	high1_rev = (reverse_nt[low1 >> 16] | reverse_nt[low1 & 0x0000FFFF] << 16);
+	low1_rev = (reverse_nt[high1 >> 16] | reverse_nt[high1 & 0x0000FFFF] << 16);
 
 	next = _mm_setr_epi32(low0_rev,high1_rev,low1_rev,nexthigh_rev);
 #endif
@@ -16559,10 +16958,13 @@ count_positions_fwd_simd (Count_T *counts, int indexsize,
 #endif
 
 	nexthigh_rev = high0_rev;
+#ifdef HAVE_SSE4_1
 	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-	assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
 	low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-	assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
+#else
+	high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+	low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+#endif
 
 	count_7mers_fwd(counts,high0_rev,low0_rev,nexthigh_rev);
       }
@@ -16611,19 +17013,20 @@ count_positions_fwd_simd (Count_T *counts, int indexsize,
 #endif
 
 	nexthigh_rev = high0_rev;
-	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-	assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
 
 #ifdef HAVE_SSE4_1
+	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
+	/* low0_rev = (unsigned int) _mm_extract_epi32(current,1); */
+	/* high1_rev = (unsigned int) _mm_extract_epi32(current,2); */
+	/* low1_rev = (unsigned int) _mm_extract_epi32(current,3); */
+
 	temp = _mm_insert_epi32(current,nexthigh_rev,0x00);
 	next = _mm_shuffle_epi32(temp,0x39);
 #else
-	low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-	assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
-	high1_rev = (unsigned int) _mm_extract_epi32(current,2);
-	assert(high1_rev == (reverse_nt[low1 >> 16] | reverse_nt[low1 & 0x0000FFFF] << 16));
-	low1_rev = (unsigned int) _mm_extract_epi32(current,3);
-	assert(low1_rev == (reverse_nt[high1 >> 16] | reverse_nt[high1 & 0x0000FFFF] << 16));
+	high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+	low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+	high1_rev = (reverse_nt[low1 >> 16] | reverse_nt[low1 & 0x0000FFFF] << 16);
+	low1_rev = (reverse_nt[high1 >> 16] | reverse_nt[high1 & 0x0000FFFF] << 16);
 
 	next = _mm_setr_epi32(low0_rev,high1_rev,low1_rev,nexthigh_rev);
 #endif
@@ -16669,10 +17072,13 @@ count_positions_fwd_simd (Count_T *counts, int indexsize,
 #endif
 
 	nexthigh_rev = high0_rev;
+#ifdef HAVE_SSE4_1
 	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-	assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
 	low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-	assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
+#else
+	high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+	low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+#endif
 
 	count_6mers_fwd(counts,high0_rev,low0_rev,nexthigh_rev);
       }
@@ -16721,19 +17127,19 @@ count_positions_fwd_simd (Count_T *counts, int indexsize,
 #endif
 
 	nexthigh_rev = high0_rev;
+#ifdef HAVE_SSE4_1
 	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-	assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
+	/* low0_rev = (unsigned int) _mm_extract_epi32(current,1); */
+	/* high1_rev = (unsigned int) _mm_extract_epi32(current,2); */
+	/* low1_rev = (unsigned int) _mm_extract_epi32(current,3); */
 
-#ifdef HAVE_SSE4_1
 	temp = _mm_insert_epi32(current,nexthigh_rev,0x00);
 	next = _mm_shuffle_epi32(temp,0x39);
 #else
-	low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-	assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
-	high1_rev = (unsigned int) _mm_extract_epi32(current,2);
-	assert(high1_rev == (reverse_nt[low1 >> 16] | reverse_nt[low1 & 0x0000FFFF] << 16));
-	low1_rev = (unsigned int) _mm_extract_epi32(current,3);
-	assert(low1_rev == (reverse_nt[high1 >> 16] | reverse_nt[high1 & 0x0000FFFF] << 16));
+	high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+	low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+	high1_rev = (reverse_nt[low1 >> 16] | reverse_nt[low1 & 0x0000FFFF] << 16);
+	low1_rev = (reverse_nt[high1 >> 16] | reverse_nt[high1 & 0x0000FFFF] << 16);
 
 	next = _mm_setr_epi32(low0_rev,high1_rev,low1_rev,nexthigh_rev);
 #endif
@@ -16779,10 +17185,13 @@ count_positions_fwd_simd (Count_T *counts, int indexsize,
 #endif
 
 	nexthigh_rev = high0_rev;
+#ifdef HAVE_SSE4_1
 	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-	assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
 	low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-	assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
+#else
+	high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+	low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+#endif
 
 	count_5mers_fwd(counts,high0_rev,low0_rev,nexthigh_rev);
       }
@@ -16826,10 +17235,13 @@ count_positions_fwd_simd (Count_T *counts, int indexsize,
 #endif
 
     nexthigh_rev = high0_rev;
+#ifdef HAVE_SSE4_1
     high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-    assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
     low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-    assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
+#else
+    high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+    low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+#endif
 
     if (indexsize == 9) {
       count_9mers_fwd_partial(counts,high0_rev,low0_rev,nexthigh_rev,startdiscard,/*enddiscard*/31);
@@ -17253,12 +17665,15 @@ store_positions_fwd_simd (Chrpos_T **pointers, Chrpos_T **positions, Count_T *co
     current = _mm_or_si128(_mm_srli_epi32(current,16),_mm_slli_epi32(current,16)); /* Swap 16-bit quantities */
 #endif
 
+#ifdef HAVE_SSE4_1
     high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-    assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
     low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-    assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
     nexthigh_rev = (unsigned int) _mm_extract_epi32(current,2);
-    assert(nexthigh_rev == (reverse_nt[nextlow >> 16] | reverse_nt[nextlow & 0x0000FFFF] << 16));
+#else
+    high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+    low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+    nexthigh_rev = (reverse_nt[nextlow >> 16] | reverse_nt[nextlow & 0x0000FFFF] << 16);
+#endif
 
     if (indexsize == 9) {
       chrpos = store_9mers_fwd_partial(chrpos,pointers,positions,counts,high0_rev,low0_rev,nexthigh_rev,startdiscard,enddiscard);
@@ -17308,12 +17723,15 @@ store_positions_fwd_simd (Chrpos_T **pointers, Chrpos_T **positions, Count_T *co
     current = _mm_or_si128(_mm_srli_epi32(current,16),_mm_slli_epi32(current,16)); /* Swap 16-bit quantities */
 #endif
 
+#ifdef HAVE_SSE4_1
     high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-    assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
     low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-    assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
     nexthigh_rev = (unsigned int) _mm_extract_epi32(current,2);
-    assert(nexthigh_rev == (reverse_nt[nextlow >> 16] | reverse_nt[nextlow & 0x0000FFFF] << 16));
+#else
+    high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+    low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+    nexthigh_rev = (reverse_nt[nextlow >> 16] | reverse_nt[nextlow & 0x0000FFFF] << 16);
+#endif
 
     if (indexsize == 9) {
       chrpos = store_9mers_fwd_partial(chrpos,pointers,positions,counts,high0_rev,low0_rev,nexthigh_rev,/*startdiscard*/0,enddiscard);
@@ -17374,19 +17792,19 @@ store_positions_fwd_simd (Chrpos_T **pointers, Chrpos_T **positions, Count_T *co
 #endif
 
 	nexthigh_rev = high0_rev;
+#ifdef HAVE_SSE4_1
 	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-	assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
+	/* low0_rev = (unsigned int) _mm_extract_epi32(current,1); */
+	/* high1_rev = (unsigned int) _mm_extract_epi32(current,2); */
+	/* low1_rev = (unsigned int) _mm_extract_epi32(current,3); */
 
-#ifdef HAVE_SSE4_1
 	temp = _mm_insert_epi32(current,nexthigh_rev,0x00);
 	next = _mm_shuffle_epi32(temp,0x39);
 #else
-	low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-	assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
-	high1_rev = (unsigned int) _mm_extract_epi32(current,2);
-	assert(high1_rev == (reverse_nt[low1 >> 16] | reverse_nt[low1 & 0x0000FFFF] << 16));
-	low1_rev = (unsigned int) _mm_extract_epi32(current,3);
-	assert(low1_rev == (reverse_nt[high1 >> 16] | reverse_nt[high1 & 0x0000FFFF] << 16));
+	high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+	low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+	high1_rev = (reverse_nt[low1 >> 16] | reverse_nt[low1 & 0x0000FFFF] << 16);
+	low1_rev = (reverse_nt[high1 >> 16] | reverse_nt[high1 & 0x0000FFFF] << 16);
 
 	next = _mm_setr_epi32(low0_rev,high1_rev,low1_rev,nexthigh_rev);
 #endif
@@ -17428,10 +17846,13 @@ store_positions_fwd_simd (Chrpos_T **pointers, Chrpos_T **positions, Count_T *co
 #endif
 
 	nexthigh_rev = high0_rev;
+#ifdef HAVE_SSE4_1
 	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-	assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
 	low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-	assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
+#else
+	high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+	low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+#endif
 
 	chrpos = store_9mers_fwd(chrpos,pointers,positions,counts,high0_rev,low0_rev,nexthigh_rev);
       }
@@ -17480,19 +17901,19 @@ store_positions_fwd_simd (Chrpos_T **pointers, Chrpos_T **positions, Count_T *co
 #endif
 
 	nexthigh_rev = high0_rev;
+#ifdef HAVE_SSE4_1
 	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-	assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
+	/* low0_rev = (unsigned int) _mm_extract_epi32(current,1); */
+	/* high1_rev = (unsigned int) _mm_extract_epi32(current,2); */
+	/* low1_rev = (unsigned int) _mm_extract_epi32(current,3); */
 
-#ifdef HAVE_SSE4_1
 	temp = _mm_insert_epi32(current,nexthigh_rev,0x00);
 	next = _mm_shuffle_epi32(temp,0x39);
 #else
-	low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-	assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
-	high1_rev = (unsigned int) _mm_extract_epi32(current,2);
-	assert(high1_rev == (reverse_nt[low1 >> 16] | reverse_nt[low1 & 0x0000FFFF] << 16));
-	low1_rev = (unsigned int) _mm_extract_epi32(current,3);
-	assert(low1_rev == (reverse_nt[high1 >> 16] | reverse_nt[high1 & 0x0000FFFF] << 16));
+	high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+	low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+	high1_rev = (reverse_nt[low1 >> 16] | reverse_nt[low1 & 0x0000FFFF] << 16);
+	low1_rev = (reverse_nt[high1 >> 16] | reverse_nt[high1 & 0x0000FFFF] << 16);
 
 	next = _mm_setr_epi32(low0_rev,high1_rev,low1_rev,nexthigh_rev);
 #endif
@@ -17534,10 +17955,13 @@ store_positions_fwd_simd (Chrpos_T **pointers, Chrpos_T **positions, Count_T *co
 #endif
 
 	nexthigh_rev = high0_rev;
+#ifdef HAVE_SSE4_1
 	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-	assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
 	low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-	assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
+#else
+	high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+	low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+#endif
 
 	chrpos = store_8mers_fwd(chrpos,pointers,positions,counts,high0_rev,low0_rev,nexthigh_rev);
       }
@@ -17586,19 +18010,19 @@ store_positions_fwd_simd (Chrpos_T **pointers, Chrpos_T **positions, Count_T *co
 #endif
 
 	nexthigh_rev = high0_rev;
+#ifdef HAVE_SSE4_1
 	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-	assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
+	/* low0_rev = (unsigned int) _mm_extract_epi32(current,1); */
+	/* high1_rev = (unsigned int) _mm_extract_epi32(current,2); */
+	/* low1_rev = (unsigned int) _mm_extract_epi32(current,3); */
 
-#ifdef HAVE_SSE4_1
 	temp = _mm_insert_epi32(current,nexthigh_rev,0x00);
 	next = _mm_shuffle_epi32(temp,0x39);
 #else
-	low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-	assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
-	high1_rev = (unsigned int) _mm_extract_epi32(current,2);
-	assert(high1_rev == (reverse_nt[low1 >> 16] | reverse_nt[low1 & 0x0000FFFF] << 16));
-	low1_rev = (unsigned int) _mm_extract_epi32(current,3);
-	assert(low1_rev == (reverse_nt[high1 >> 16] | reverse_nt[high1 & 0x0000FFFF] << 16));
+	high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+	low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+	high1_rev = (reverse_nt[low1 >> 16] | reverse_nt[low1 & 0x0000FFFF] << 16);
+	low1_rev = (reverse_nt[high1 >> 16] | reverse_nt[high1 & 0x0000FFFF] << 16);
 
 	next = _mm_setr_epi32(low0_rev,high1_rev,low1_rev,nexthigh_rev);
 #endif
@@ -17640,10 +18064,13 @@ store_positions_fwd_simd (Chrpos_T **pointers, Chrpos_T **positions, Count_T *co
 #endif
 
 	nexthigh_rev = high0_rev;
+#ifdef HAVE_SSE4_1
 	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-	assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
 	low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-	assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
+#else
+	high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+	low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+#endif
 
 	chrpos = store_7mers_fwd(chrpos,pointers,positions,counts,high0_rev,low0_rev,nexthigh_rev);
       }
@@ -17692,19 +18119,19 @@ store_positions_fwd_simd (Chrpos_T **pointers, Chrpos_T **positions, Count_T *co
 #endif
 
 	nexthigh_rev = high0_rev;
+#ifdef HAVE_SSE4_1
 	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-	assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
+	/* low0_rev = (unsigned int) _mm_extract_epi32(current,1); */
+	/* high1_rev = (unsigned int) _mm_extract_epi32(current,2); */
+	/* low1_rev = (unsigned int) _mm_extract_epi32(current,3); */
 
-#ifdef HAVE_SSE4_1
 	temp = _mm_insert_epi32(current,nexthigh_rev,0x00);
 	next = _mm_shuffle_epi32(temp,0x39);
 #else
-	low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-	assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
-	high1_rev = (unsigned int) _mm_extract_epi32(current,2);
-	assert(high1_rev == (reverse_nt[low1 >> 16] | reverse_nt[low1 & 0x0000FFFF] << 16));
-	low1_rev = (unsigned int) _mm_extract_epi32(current,3);
-	assert(low1_rev == (reverse_nt[high1 >> 16] | reverse_nt[high1 & 0x0000FFFF] << 16));
+	high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+	low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+	high1_rev = (reverse_nt[low1 >> 16] | reverse_nt[low1 & 0x0000FFFF] << 16);
+	low1_rev = (reverse_nt[high1 >> 16] | reverse_nt[high1 & 0x0000FFFF] << 16);
 
 	next = _mm_setr_epi32(low0_rev,high1_rev,low1_rev,nexthigh_rev);
 #endif
@@ -17746,10 +18173,13 @@ store_positions_fwd_simd (Chrpos_T **pointers, Chrpos_T **positions, Count_T *co
 #endif
 
 	nexthigh_rev = high0_rev;
+#ifdef HAVE_SSE4_1
 	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-	assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
 	low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-	assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
+#else
+	high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+	low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+#endif
 
 	chrpos = store_6mers_fwd(chrpos,pointers,positions,counts,high0_rev,low0_rev,nexthigh_rev);
       }
@@ -17798,19 +18228,19 @@ store_positions_fwd_simd (Chrpos_T **pointers, Chrpos_T **positions, Count_T *co
 #endif
 
 	nexthigh_rev = high0_rev;
+#ifdef HAVE_SSE4_1
 	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-	assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
+	/* low0_rev = (unsigned int) _mm_extract_epi32(current,1); */
+	/* high1_rev = (unsigned int) _mm_extract_epi32(current,2); */
+	/* low1_rev = (unsigned int) _mm_extract_epi32(current,3); */
 
-#ifdef HAVE_SSE4_1
 	temp = _mm_insert_epi32(current,nexthigh_rev,0x00);
 	next = _mm_shuffle_epi32(temp,0x39);
 #else
-	low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-	assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
-	high1_rev = (unsigned int) _mm_extract_epi32(current,2);
-	assert(high1_rev == (reverse_nt[low1 >> 16] | reverse_nt[low1 & 0x0000FFFF] << 16));
-	low1_rev = (unsigned int) _mm_extract_epi32(current,3);
-	assert(low1_rev == (reverse_nt[high1 >> 16] | reverse_nt[high1 & 0x0000FFFF] << 16));
+	high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+	low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+	high1_rev = (reverse_nt[low1 >> 16] | reverse_nt[low1 & 0x0000FFFF] << 16);
+	low1_rev = (reverse_nt[high1 >> 16] | reverse_nt[high1 & 0x0000FFFF] << 16);
 
 	next = _mm_setr_epi32(low0_rev,high1_rev,low1_rev,nexthigh_rev);
 #endif
@@ -17852,10 +18282,13 @@ store_positions_fwd_simd (Chrpos_T **pointers, Chrpos_T **positions, Count_T *co
 #endif
 
 	nexthigh_rev = high0_rev;
+#ifdef HAVE_SSE4_1
 	high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-	assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
 	low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-	assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
+#else
+	high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+	low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+#endif
 
 	chrpos = store_5mers_fwd(chrpos,pointers,positions,counts,high0_rev,low0_rev,nexthigh_rev);
       }
@@ -17899,10 +18332,13 @@ store_positions_fwd_simd (Chrpos_T **pointers, Chrpos_T **positions, Count_T *co
 #endif
 
     nexthigh_rev = high0_rev;
+#ifdef HAVE_SSE4_1
     high0_rev = (unsigned int) _mm_extract_epi32(current,0);
-    assert(high0_rev == (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16));
     low0_rev = (unsigned int) _mm_extract_epi32(current,1);
-    assert(low0_rev == (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16));
+#else
+    high0_rev = (reverse_nt[low0 >> 16] | reverse_nt[low0 & 0x0000FFFF] << 16);
+    low0_rev = (reverse_nt[high0 >> 16] | reverse_nt[high0 & 0x0000FFFF] << 16);
+#endif
 
     if (indexsize == 9) {
       chrpos = store_9mers_fwd_partial(chrpos,pointers,positions,counts,high0_rev,low0_rev,nexthigh_rev,startdiscard,/*enddiscard*/31);
@@ -18516,7 +18952,11 @@ store_5mers_rev_partial (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positi
 static void
 count_9mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Genomecomp_T nextlow_rc) {
   Genomecomp_T masked, oligo;
-#ifndef INDIVIDUAL_SHIFTS
+#ifdef INDIVIDUAL_SHIFTS
+#elif defined(SIMD_MASK_THEN_STORE)
+  UINT4 _masked[4] __attribute__ ((aligned (16)));
+  __m128i _oligo;
+#else
   __m128i _oligo, _masked;
 #endif
 
@@ -18556,41 +18996,49 @@ count_9mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Gen
 
 #else
   _oligo = _mm_setr_epi32(low_rc, low_rc >> 2, low_rc >> 4, low_rc >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("0 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("1 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("2 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("3 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("4 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("5 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("6 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("7 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -18634,41 +19082,49 @@ count_9mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Gen
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("8 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("9 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("10 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("11 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("12 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("13 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("14 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("15 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -18709,41 +19165,49 @@ count_9mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Gen
 
 #else
   _oligo = _mm_setr_epi32(high_rc, high_rc >> 2, high_rc >> 4, high_rc >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("16 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("17 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("18 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("19 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("20 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("21 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("22 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("23 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -18787,41 +19251,49 @@ count_9mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Gen
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("24 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("25 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("26 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("27 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("28 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("29 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("30 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("31 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -18970,7 +19442,11 @@ static int
 store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Count_T *counts,
 		 Genomecomp_T low_rc, Genomecomp_T high_rc, Genomecomp_T nextlow_rc) {
   Genomecomp_T masked, oligo;
-#ifndef INDIVIDUAL_SHIFTS
+#ifdef INDIVIDUAL_SHIFTS
+#elif defined(SIMD_MASK_THEN_STORE)
+  UINT4 _masked[4] __attribute__ ((aligned (16)));
+  __m128i _oligo;
+#else
   __m128i _oligo, _masked;
 #endif
 
@@ -19050,9 +19526,13 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(low_rc, low_rc >> 2, low_rc >> 4, low_rc >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19061,7 +19541,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19070,7 +19550,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19079,7 +19559,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19090,9 +19570,13 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19101,7 +19585,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19110,7 +19594,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19119,7 +19603,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19208,9 +19692,13 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19219,7 +19707,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19228,7 +19716,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19237,7 +19725,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19248,9 +19736,13 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19259,7 +19751,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19268,7 +19760,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19277,7 +19769,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19363,9 +19855,13 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(high_rc, high_rc >> 2, high_rc >> 4, high_rc >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19374,7 +19870,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19383,7 +19879,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19392,7 +19888,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19403,9 +19899,13 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19414,7 +19914,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19423,7 +19923,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19432,7 +19932,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19521,9 +20021,13 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19532,7 +20036,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19541,7 +20045,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19550,7 +20054,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19561,9 +20065,13 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask9));
+#else
   _masked = _mm_and_si128(_oligo, mask9);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19572,7 +20080,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19581,7 +20089,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19590,7 +20098,7 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -19607,7 +20115,11 @@ store_9mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 static void
 count_8mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Genomecomp_T nextlow_rc) {
   Genomecomp_T masked, oligo;
-#ifndef INDIVIDUAL_SHIFTS
+#ifdef INDIVIDUAL_SHIFTS
+#elif defined(SIMD_MASK_THEN_STORE)
+  UINT4 _masked[4] __attribute__ ((aligned (16)));
+  __m128i _oligo;
+#else
   __m128i _oligo, _masked;
 #endif
 
@@ -19651,48 +20163,56 @@ count_8mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Gen
 
 #else
   _oligo = _mm_setr_epi32(low_rc, low_rc >> 2, low_rc >> 4, low_rc >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   assert(masked == (low_rc & MASK8));
   counts[masked] += 1;
   debug(printf("0 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   assert(masked == ((low_rc >> 2) & MASK8));
   counts[masked] += 1;
   debug(printf("1 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   assert(masked == ((low_rc >> 4) & MASK8));
   counts[masked] += 1;
   debug(printf("2 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   assert(masked == ((low_rc >> 6) & MASK8));
   counts[masked] += 1;
   debug(printf("3 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   assert(masked == ((low_rc >> 8) & MASK8));
   counts[masked] += 1;
   debug(printf("4 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   assert(masked == ((low_rc >> 10) & MASK8));
   counts[masked] += 1;
   debug(printf("5 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   assert(masked == ((low_rc >> 12) & MASK8));
   counts[masked] += 1;
   debug(printf("6 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   assert(masked == ((low_rc >> 14) & MASK8));
   counts[masked] += 1;
   debug(printf("7 %04X => %d\n",masked,counts[masked]));
@@ -19738,43 +20258,51 @@ count_8mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Gen
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   assert(masked == (oligo & MASK8));
   counts[masked] += 1;
   debug(printf("9 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   assert(masked == ((oligo >> 2) & MASK8));
   counts[masked] += 1;
   debug(printf("10 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   assert(masked == ((oligo >> 4) & MASK8));
   counts[masked] += 1;
   debug(printf("11 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   assert(masked == ((oligo >> 6) & MASK8));
   counts[masked] += 1;
   debug(printf("12 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   assert(masked == ((oligo >> 8) & MASK8));
   counts[masked] += 1;
   debug(printf("13 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   assert(masked == ((oligo >> 10) & MASK8));
   counts[masked] += 1;
   debug(printf("14 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   assert(masked == ((oligo >> 12) & MASK8));
   counts[masked] += 1;
   debug(printf("15 %04X => %d\n",masked,counts[masked]));
@@ -19820,48 +20348,56 @@ count_8mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Gen
 
 #else
   _oligo = _mm_setr_epi32(high_rc, high_rc >> 2, high_rc >> 4, high_rc >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   assert(masked == (high_rc & MASK8));
   counts[masked] += 1;
   debug(printf("16 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   assert(masked == ((high_rc >> 2) & MASK8));
   counts[masked] += 1;
   debug(printf("17 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   assert(masked == ((high_rc >> 4) & MASK8));
   counts[masked] += 1;
   debug(printf("18 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   assert(masked == ((high_rc >> 6) & MASK8));
   counts[masked] += 1;
   debug(printf("19 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   assert(masked == ((high_rc >> 8) & MASK8));
   counts[masked] += 1;
   debug(printf("20 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   assert(masked == ((high_rc >> 10) & MASK8));
   counts[masked] += 1;
   debug(printf("21 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   assert(masked == ((high_rc >> 12) & MASK8));
   counts[masked] += 1;
   debug(printf("22 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   assert(masked == ((high_rc >> 14) & MASK8));
   counts[masked] += 1;
   debug(printf("23 %04X => %d\n",masked,counts[masked]));
@@ -19907,43 +20443,51 @@ count_8mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Gen
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   assert(masked == (oligo & MASK8));
   counts[masked] += 1;
   debug(printf("25 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   assert(masked == ((oligo >> 2) & MASK8));
   counts[masked] += 1;
   debug(printf("26 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   assert(masked == ((oligo >> 4) & MASK8));
   counts[masked] += 1;
   debug(printf("27 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   assert(masked == ((oligo >> 6) & MASK8));
   counts[masked] += 1;
   debug(printf("28 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   assert(masked == ((oligo >> 8) & MASK8));
   counts[masked] += 1;
   debug(printf("29 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   assert(masked == ((oligo >> 10) & MASK8));
   counts[masked] += 1;
   debug(printf("30 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   assert(masked == ((oligo >> 12) & MASK8));
   counts[masked] += 1;
   debug(printf("31 %04X => %d\n",masked,counts[masked]));
@@ -20096,7 +20640,11 @@ static int
 store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Count_T *counts,
 		 Genomecomp_T low_rc, Genomecomp_T high_rc, Genomecomp_T nextlow_rc) {
   Genomecomp_T masked, oligo;
-#ifndef INDIVIDUAL_SHIFTS
+#ifdef INDIVIDUAL_SHIFTS
+#elif defined(SIMD_MASK_THEN_STORE)
+  UINT4 _masked[4] __attribute__ ((aligned (16)));
+  __m128i _oligo;
+#else
   __m128i _oligo, _masked;
 #endif
 
@@ -20185,9 +20733,13 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(low_rc, low_rc >> 2, low_rc >> 4, low_rc >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   assert(masked == (low_rc & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20197,7 +20749,7 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   assert(masked == ((low_rc >> 2) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20207,7 +20759,7 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   assert(masked == ((low_rc >> 4) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20217,7 +20769,7 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   assert(masked == ((low_rc >> 6) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20229,9 +20781,13 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   assert(masked == ((low_rc >> 8) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20241,7 +20797,7 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   assert(masked == ((low_rc >> 10) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20251,7 +20807,7 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   assert(masked == ((low_rc >> 12) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20261,7 +20817,7 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   assert(masked == ((low_rc >> 14) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20352,9 +20908,13 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   assert(masked == (oligo & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20364,7 +20924,7 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   assert(masked == ((oligo >> 2) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20374,7 +20934,7 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   assert(masked == ((oligo >> 4) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20384,7 +20944,7 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   assert(masked == ((oligo >> 6) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20396,9 +20956,13 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   assert(masked == ((oligo >> 8) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20408,7 +20972,7 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   assert(masked == ((oligo >> 10) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20418,7 +20982,7 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   assert(masked == ((oligo >> 12) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20514,9 +21078,13 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(high_rc, high_rc >> 2, high_rc >> 4, high_rc >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   assert(masked == (high_rc & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20526,7 +21094,7 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   assert(masked == ((high_rc >> 2) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20536,7 +21104,7 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   assert(masked == ((high_rc >> 4) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20546,7 +21114,7 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   assert(masked == ((high_rc >> 6) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20558,9 +21126,13 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   assert(masked == ((high_rc >> 8) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20570,7 +21142,7 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   assert(masked == ((high_rc >> 10) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20580,7 +21152,7 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   assert(masked == ((high_rc >> 12) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20590,7 +21162,7 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   assert(masked == ((high_rc >> 14) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20681,9 +21253,13 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   assert(masked == (oligo & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20693,7 +21269,7 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   assert(masked == ((oligo >> 2) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20703,7 +21279,7 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   assert(masked == ((oligo >> 4) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20713,7 +21289,7 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   assert(masked == ((oligo >> 6) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20725,9 +21301,13 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask8));
+#else
   _masked = _mm_and_si128(_oligo, mask8);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   assert(masked == ((oligo >> 8) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20737,7 +21317,7 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   assert(masked == ((oligo >> 10) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20747,7 +21327,7 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   assert(masked == ((oligo >> 12) & MASK8));
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
@@ -20766,7 +21346,11 @@ store_8mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 static void
 count_7mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Genomecomp_T nextlow_rc) {
   Genomecomp_T masked, oligo;
-#ifndef INDIVIDUAL_SHIFTS
+#ifdef INDIVIDUAL_SHIFTS
+#elif defined(SIMD_MASK_THEN_STORE)
+  UINT4 _masked[4] __attribute__ ((aligned (16)));
+  __m128i _oligo;
+#else
   __m128i _oligo, _masked;
 #endif
 
@@ -20814,53 +21398,65 @@ count_7mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Gen
 
 #else
   _oligo = _mm_setr_epi32(low_rc, low_rc >> 2, low_rc >> 4, low_rc >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("0 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("1 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("2 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("3 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("4 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("5 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("6 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("7 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("8 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("9 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -20896,33 +21492,41 @@ count_7mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Gen
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("10 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("11 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("12 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("13 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("14 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("15 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -20971,53 +21575,65 @@ count_7mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Gen
 
 #else
   _oligo = _mm_setr_epi32(high_rc, high_rc >> 2, high_rc >> 4, high_rc >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("16 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("17 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("18 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("19 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("20 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("21 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("22 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("23 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("24 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("25 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -21053,33 +21669,41 @@ count_7mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Gen
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("26 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("27 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("28 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("29 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("30 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("31 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -21229,7 +21853,11 @@ static Chrpos_T
 store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Count_T *counts,
 		 Genomecomp_T low_rc, Genomecomp_T high_rc, Genomecomp_T nextlow_rc) {
   Genomecomp_T masked, oligo;
-#ifndef INDIVIDUAL_SHIFTS
+#ifdef INDIVIDUAL_SHIFTS
+#elif defined(SIMD_MASK_THEN_STORE)
+  UINT4 _masked[4] __attribute__ ((aligned (16)));
+  __m128i _oligo;
+#else
   __m128i _oligo, _masked;
 #endif
 
@@ -21327,9 +21955,13 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(low_rc, low_rc >> 2, low_rc >> 4, low_rc >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21338,7 +21970,7 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21347,7 +21979,7 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21356,7 +21988,7 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21367,9 +21999,13 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21378,7 +22014,7 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21387,7 +22023,7 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21396,7 +22032,7 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21407,9 +22043,13 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21418,7 +22058,7 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21489,9 +22129,13 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21500,7 +22144,7 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21509,7 +22153,7 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21518,7 +22162,7 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21529,9 +22173,13 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21540,7 +22188,7 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21645,9 +22293,13 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(high_rc, high_rc >> 2, high_rc >> 4, high_rc >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21656,7 +22308,7 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21665,7 +22317,7 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21674,7 +22326,7 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21685,9 +22337,13 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21696,7 +22352,7 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21705,7 +22361,7 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21714,7 +22370,7 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21725,9 +22381,13 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21736,7 +22396,7 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21807,9 +22467,13 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21818,7 +22482,7 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21827,7 +22491,7 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21836,7 +22500,7 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21847,9 +22511,13 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask7));
+#else
   _masked = _mm_and_si128(_oligo, mask7);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21858,7 +22526,7 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -21875,7 +22543,11 @@ store_7mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 static void
 count_6mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Genomecomp_T nextlow_rc) {
   Genomecomp_T masked, oligo;
-#ifndef INDIVIDUAL_SHIFTS
+#ifdef INDIVIDUAL_SHIFTS
+#elif defined(SIMD_MASK_THEN_STORE)
+  UINT4 _masked[4] __attribute__ ((aligned (16)));
+  __m128i _oligo;
+#else
   __m128i _oligo, _masked;
 #endif
 
@@ -21927,57 +22599,69 @@ count_6mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Gen
 
 #else
   _oligo = _mm_setr_epi32(low_rc, low_rc >> 2, low_rc >> 4, low_rc >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("0 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("1 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("2 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("3 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("4 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("5 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("6 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("7 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("8 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("9 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("10 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -22009,21 +22693,25 @@ count_6mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Gen
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("11 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("12 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("13 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("14 %04X => %d\n",masked,counts[masked]));
 
@@ -22081,57 +22769,69 @@ count_6mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Gen
 
 #else
   _oligo = _mm_setr_epi32(high_rc, high_rc >> 2, high_rc >> 4, high_rc >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("16 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("17 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("18 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("19 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("20 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("21 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("22 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("23 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("24 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("25 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("26 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -22163,21 +22863,25 @@ count_6mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Gen
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("27 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("28 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("29 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("30 %04X => %d\n",masked,counts[masked]));
 
@@ -22332,7 +23036,11 @@ static int
 store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Count_T *counts,
 		 Genomecomp_T low_rc, Genomecomp_T high_rc, Genomecomp_T nextlow_rc) {
   Genomecomp_T masked, oligo;
-#ifndef INDIVIDUAL_SHIFTS
+#ifdef INDIVIDUAL_SHIFTS
+#elif defined(SIMD_MASK_THEN_STORE)
+  UINT4 _masked[4] __attribute__ ((aligned (16)));
+  __m128i _oligo;
+#else
   __m128i _oligo, _masked;
 #endif
 
@@ -22439,9 +23147,13 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(low_rc, low_rc >> 2, low_rc >> 4, low_rc >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22450,7 +23162,7 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22459,7 +23171,7 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22468,7 +23180,7 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22479,9 +23191,13 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22490,7 +23206,7 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22499,7 +23215,7 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22508,7 +23224,7 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22519,9 +23235,13 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22530,7 +23250,7 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22539,7 +23259,7 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22601,9 +23321,13 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22612,7 +23336,7 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22621,7 +23345,7 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22630,7 +23354,7 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22753,9 +23477,13 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(high_rc, high_rc >> 2, high_rc >> 4, high_rc >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22764,7 +23492,7 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22773,7 +23501,7 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22782,7 +23510,7 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22793,9 +23521,13 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22804,7 +23536,7 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22813,7 +23545,7 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22822,7 +23554,7 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22833,9 +23565,13 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22844,7 +23580,7 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22853,7 +23589,7 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22915,9 +23651,13 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask6));
+#else
   _masked = _mm_and_si128(_oligo, mask6);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22926,7 +23666,7 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22935,7 +23675,7 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22944,7 +23684,7 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -22971,7 +23711,11 @@ store_6mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 static void
 count_5mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Genomecomp_T nextlow_rc) {
   Genomecomp_T masked, oligo;
-#ifndef INDIVIDUAL_SHIFTS
+#ifdef INDIVIDUAL_SHIFTS
+#elif defined(SIMD_MASK_THEN_STORE)
+  UINT4 _masked[4] __attribute__ ((aligned (16)));
+  __m128i _oligo;
+#else
   __m128i _oligo, _masked;
 #endif
 
@@ -23027,61 +23771,73 @@ count_5mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Gen
 
 #else
   _oligo = _mm_setr_epi32(low_rc, low_rc >> 2, low_rc >> 4, low_rc >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("0 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("1 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("2 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("3 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("4 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("5 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("6 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("7 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("8 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("9 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("10 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("11 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -23109,21 +23865,25 @@ count_5mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Gen
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("12 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("13 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("14 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("15 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -23180,61 +23940,73 @@ count_5mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Gen
 
 #else
   _oligo = _mm_setr_epi32(high_rc, high_rc >> 2, high_rc >> 4, high_rc >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("16 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("17 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("18 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("19 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("20 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("21 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("22 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("23 %04X => %d\n",masked,counts[masked]));
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("24 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("25 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("26 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("27 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -23262,21 +24034,25 @@ count_5mers_rev (Count_T *counts, Genomecomp_T low_rc, Genomecomp_T high_rc, Gen
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   counts[masked] += 1;
   debug(printf("28 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   counts[masked] += 1;
   debug(printf("29 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   counts[masked] += 1;
   debug(printf("30 %04X => %d\n",masked,counts[masked]));
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   counts[masked] += 1;
   debug(printf("31 %04X => %d\n",masked,counts[masked]));
 #endif
@@ -23426,7 +24202,11 @@ static int
 store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Count_T *counts,
 		 Genomecomp_T low_rc, Genomecomp_T high_rc, Genomecomp_T nextlow_rc) {
   Genomecomp_T masked, oligo;
-#ifndef INDIVIDUAL_SHIFTS
+#ifdef INDIVIDUAL_SHIFTS
+#elif defined(SIMD_MASK_THEN_STORE)
+  UINT4 _masked[4] __attribute__ ((aligned (16)));
+  __m128i _oligo;
+#else
   __m128i _oligo, _masked;
 #endif
 
@@ -23542,9 +24322,13 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(low_rc, low_rc >> 2, low_rc >> 4, low_rc >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23553,7 +24337,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23562,7 +24346,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23571,7 +24355,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23582,9 +24366,13 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23593,7 +24381,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23602,7 +24390,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23611,7 +24399,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23622,9 +24410,13 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23633,7 +24425,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23642,7 +24434,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23651,7 +24443,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23704,9 +24496,13 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23715,7 +24511,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23724,7 +24520,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23733,7 +24529,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23855,9 +24651,13 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(high_rc, high_rc >> 2, high_rc >> 4, high_rc >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23866,7 +24666,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23875,7 +24675,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23884,7 +24684,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23895,9 +24695,13 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23906,7 +24710,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23915,7 +24719,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23924,7 +24728,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23935,9 +24739,13 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 
   _oligo = _mm_srli_epi32(_oligo, 8);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23946,7 +24754,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23955,7 +24763,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -23964,7 +24772,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -24017,9 +24825,13 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
 
 #else
   _oligo = _mm_setr_epi32(oligo, oligo >> 2, oligo >> 4, oligo >> 6);
+#ifdef SIMD_MASK_THEN_STORE
+  _mm_store_si128((__m128i *) _masked,_mm_and_si128(_oligo, mask5));
+#else
   _masked = _mm_and_si128(_oligo, mask5);
+#endif
 
-  masked = _mm_extract_epi32(_masked,0);
+  masked = EXTRACT(_masked,0);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -24028,7 +24840,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,1);
+  masked = EXTRACT(_masked,1);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -24037,7 +24849,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,2);
+  masked = EXTRACT(_masked,2);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -24046,7 +24858,7 @@ store_5mers_rev (Chrpos_T chrpos, Chrpos_T **pointers, Chrpos_T **positions, Cou
     }
   }
 
-  masked = _mm_extract_epi32(_masked,3);
+  masked = EXTRACT(_masked,3);
   if (counts[masked]) {
     if (pointers[masked] == positions[masked]) {
       counts[masked] = 0;
@@ -24509,12 +25321,17 @@ count_positions_rev_simd (Count_T *counts, int indexsize,
 	current = _mm_xor_si128(current,invert3);
 	nextlow_rc = ~nextlow;
 #ifdef HAVE_SSE4_1
+	/* high0_rc = _mm_extract_epi32(current,2); */
+	/* low1_rc = _mm_extract_epi32(current,1); */
+	/* high1_rc = _mm_extract_epi32(current,0); */
+
 	temp = _mm_insert_epi32(current,nextlow_rc,0x03);
 	next = _mm_shuffle_epi32(temp,0x93);
 #else
-	high0_rc = _mm_extract_epi32(current,2);
-	low1_rc = _mm_extract_epi32(current,1);
-	high1_rc = _mm_extract_epi32(current,0);
+	high0_rc = ~high0;
+	low1_rc = ~low1;
+	high1_rc = ~high1;
+
 	next = _mm_set_epi32(high0_rc,low1_rc,high1_rc,nextlow_rc);
 #endif
 
@@ -24592,12 +25409,17 @@ count_positions_rev_simd (Count_T *counts, int indexsize,
 	current = _mm_xor_si128(current,invert3);
 	nextlow_rc = ~nextlow;
 #ifdef HAVE_SSE4_1
+	/* high0_rc = _mm_extract_epi32(current,2); */
+	/* low1_rc = _mm_extract_epi32(current,1); */
+	/* high1_rc = _mm_extract_epi32(current,0); */
+
 	temp = _mm_insert_epi32(current,nextlow_rc,0x03);
 	next = _mm_shuffle_epi32(temp,0x93);
 #else
-	high0_rc = _mm_extract_epi32(current,2);
-	low1_rc = _mm_extract_epi32(current,1);
-	high1_rc = _mm_extract_epi32(current,0);
+	high0_rc = ~high0;
+	low1_rc = ~low1;
+	high1_rc = ~high1;
+
 	next = _mm_set_epi32(high0_rc,low1_rc,high1_rc,nextlow_rc);
 #endif
 
@@ -24675,12 +25497,17 @@ count_positions_rev_simd (Count_T *counts, int indexsize,
 	current = _mm_xor_si128(current,invert3);
 	nextlow_rc = ~nextlow;
 #ifdef HAVE_SSE4_1
+	/* high0_rc = _mm_extract_epi32(current,2); */
+	/* low1_rc = _mm_extract_epi32(current,1); */
+	/* high1_rc = _mm_extract_epi32(current,0); */
+
 	temp = _mm_insert_epi32(current,nextlow_rc,0x03);
 	next = _mm_shuffle_epi32(temp,0x93);
 #else
-	high0_rc = _mm_extract_epi32(current,2);
-	low1_rc = _mm_extract_epi32(current,1);
-	high1_rc = _mm_extract_epi32(current,0);
+	high0_rc = ~high0;
+	low1_rc = ~low1;
+	high1_rc = ~high1;
+
 	next = _mm_set_epi32(high0_rc,low1_rc,high1_rc,nextlow_rc);
 #endif
 
@@ -24758,12 +25585,17 @@ count_positions_rev_simd (Count_T *counts, int indexsize,
 	current = _mm_xor_si128(current,invert3);
 	nextlow_rc = ~nextlow;
 #ifdef HAVE_SSE4_1
+	/* high0_rc = _mm_extract_epi32(current,2); */
+	/* low1_rc = _mm_extract_epi32(current,1); */
+	/* high1_rc = _mm_extract_epi32(current,0); */
+
 	temp = _mm_insert_epi32(current,nextlow_rc,0x03);
 	next = _mm_shuffle_epi32(temp,0x93);
 #else
-	high0_rc = _mm_extract_epi32(current,2);
-	low1_rc = _mm_extract_epi32(current,1);
-	high1_rc = _mm_extract_epi32(current,0);
+	high0_rc = ~high0;
+	low1_rc = ~low1;
+	high1_rc = ~high1;
+
 	next = _mm_set_epi32(high0_rc,low1_rc,high1_rc,nextlow_rc);
 #endif
 
@@ -24841,12 +25673,17 @@ count_positions_rev_simd (Count_T *counts, int indexsize,
 	current = _mm_xor_si128(current,invert3);
 	nextlow_rc = ~nextlow;
 #ifdef HAVE_SSE4_1
+	/* high0_rc = _mm_extract_epi32(current,2); */
+	/* low1_rc = _mm_extract_epi32(current,1); */
+	/* high1_rc = _mm_extract_epi32(current,0); */
+
 	temp = _mm_insert_epi32(current,nextlow_rc,0x03);
 	next = _mm_shuffle_epi32(temp,0x93);
 #else
-	high0_rc = _mm_extract_epi32(current,2);
-	low1_rc = _mm_extract_epi32(current,1);
-	high1_rc = _mm_extract_epi32(current,0);
+	high0_rc = ~high0;
+	low1_rc = ~low1;
+	high1_rc = ~high1;
+
 	next = _mm_set_epi32(high0_rc,low1_rc,high1_rc,nextlow_rc);
 #endif
 
@@ -25391,12 +26228,17 @@ store_positions_rev_simd (Chrpos_T **pointers, Chrpos_T **positions, Count_T *co
 	current = _mm_xor_si128(current,invert3);
 	nextlow_rc = ~nextlow;
 #ifdef HAVE_SSE4_1
+	/* high0_rc = _mm_extract_epi32(current,2); */
+	/* low1_rc = _mm_extract_epi32(current,1); */
+	/* high1_rc = _mm_extract_epi32(current,0); */
+
 	temp = _mm_insert_epi32(current,nextlow_rc,0x03);
 	next = _mm_shuffle_epi32(temp,0x93);
 #else
-	high0_rc = _mm_extract_epi32(current,2);
-	low1_rc = _mm_extract_epi32(current,1);
-	high1_rc = _mm_extract_epi32(current,0);
+	high0_rc = ~high0;
+	low1_rc = ~low1;
+	high1_rc = ~high1;
+
 	next = _mm_set_epi32(high0_rc,low1_rc,high1_rc,nextlow_rc);
 #endif
 
@@ -25470,12 +26312,17 @@ store_positions_rev_simd (Chrpos_T **pointers, Chrpos_T **positions, Count_T *co
 	current = _mm_xor_si128(current,invert3);
 	nextlow_rc = ~nextlow;
 #ifdef HAVE_SSE4_1
+	/* high0_rc = _mm_extract_epi32(current,2); */
+	/* low1_rc = _mm_extract_epi32(current,1); */
+	/* high1_rc = _mm_extract_epi32(current,0); */
+
 	temp = _mm_insert_epi32(current,nextlow_rc,0x03);
 	next = _mm_shuffle_epi32(temp,0x93);
 #else
-	high0_rc = _mm_extract_epi32(current,2);
-	low1_rc = _mm_extract_epi32(current,1);
-	high1_rc = _mm_extract_epi32(current,0);
+	high0_rc = ~high0;
+	low1_rc = ~low1;
+	high1_rc = ~high1;
+
 	next = _mm_set_epi32(high0_rc,low1_rc,high1_rc,nextlow_rc);
 #endif
 
@@ -25549,12 +26396,17 @@ store_positions_rev_simd (Chrpos_T **pointers, Chrpos_T **positions, Count_T *co
 	current = _mm_xor_si128(current,invert3);
 	nextlow_rc = ~nextlow;
 #ifdef HAVE_SSE4_1
+	/* high0_rc = _mm_extract_epi32(current,2); */
+	/* low1_rc = _mm_extract_epi32(current,1); */
+	/* high1_rc = _mm_extract_epi32(current,0); */
+
 	temp = _mm_insert_epi32(current,nextlow_rc,0x03);
 	next = _mm_shuffle_epi32(temp,0x93);
 #else
-	high0_rc = _mm_extract_epi32(current,2);
-	low1_rc = _mm_extract_epi32(current,1);
-	high1_rc = _mm_extract_epi32(current,0);
+	high0_rc = ~high0;
+	low1_rc = ~low1;
+	high1_rc = ~high1;
+
 	next = _mm_set_epi32(high0_rc,low1_rc,high1_rc,nextlow_rc);
 #endif
 
@@ -25628,12 +26480,17 @@ store_positions_rev_simd (Chrpos_T **pointers, Chrpos_T **positions, Count_T *co
 	current = _mm_xor_si128(current,invert3);
 	nextlow_rc = ~nextlow;
 #ifdef HAVE_SSE4_1
+	/* high0_rc = _mm_extract_epi32(current,2); */
+	/* low1_rc = _mm_extract_epi32(current,1); */
+	/* high1_rc = _mm_extract_epi32(current,0); */
+
 	temp = _mm_insert_epi32(current,nextlow_rc,0x03);
 	next = _mm_shuffle_epi32(temp,0x93);
 #else
-	high0_rc = _mm_extract_epi32(current,2);
-	low1_rc = _mm_extract_epi32(current,1);
-	high1_rc = _mm_extract_epi32(current,0);
+	high0_rc = ~high0;
+	low1_rc = ~low1;
+	high1_rc = ~high1;
+
 	next = _mm_set_epi32(high0_rc,low1_rc,high1_rc,nextlow_rc);
 #endif
 
@@ -25707,12 +26564,17 @@ store_positions_rev_simd (Chrpos_T **pointers, Chrpos_T **positions, Count_T *co
 	current = _mm_xor_si128(current,invert3);
 	nextlow_rc = ~nextlow;
 #ifdef HAVE_SSE4_1
+	/* high0_rc = _mm_extract_epi32(current,2); */
+	/* low1_rc = _mm_extract_epi32(current,1); */
+	/* high1_rc = _mm_extract_epi32(current,0); */
+
 	temp = _mm_insert_epi32(current,nextlow_rc,0x03);
 	next = _mm_shuffle_epi32(temp,0x93);
 #else
-	high0_rc = _mm_extract_epi32(current,2);
-	low1_rc = _mm_extract_epi32(current,1);
-	high1_rc = _mm_extract_epi32(current,0);
+	high0_rc = ~high0;
+	low1_rc = ~low1;
+	high1_rc = ~high1;
+
 	next = _mm_set_epi32(high0_rc,low1_rc,high1_rc,nextlow_rc);
 #endif
 
diff --git a/src/sarray-read.c b/src/sarray-read.c
index 5582b1a..46c0cdc 100644
--- a/src/sarray-read.c
+++ b/src/sarray-read.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: sarray-read.c 166828 2015-06-03 06:56:12Z twu $";
+static char rcsid[] = "$Id: sarray-read.c 170516 2015-07-23 23:15:12Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -8,6 +8,14 @@ static char rcsid[] = "$Id: sarray-read.c 166828 2015-06-03 06:56:12Z twu $";
 
 #include "sarray-read.h"
 
+#ifdef WORDS_BIGENDIAN
+#define CONVERT(x) Bigendian_convert_uint(x)
+#include "bigendian.h"
+#else
+#define CONVERT(x) (x)
+#include "littleendian.h"
+#endif
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -38,20 +46,13 @@ static char rcsid[] = "$Id: sarray-read.c 166828 2015-06-03 06:56:12Z twu $";
 #include "junction.h"
 #include "stage3hr.h"
 
-#ifdef USE_CSA
-/* For FREAD_UINT */
-#ifdef WORDS_BIGENDIAN
-#include "bigendian.h"
-#else
-#include "littleendian.h"
-#endif
-#endif
-
 
-#ifdef HAVE_SSE2
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSE2)
+#else
 #include <emmintrin.h>
 #endif
-#ifdef HAVE_SSSE3
+#if defined(WORDS_BIGENDIAN) || !defined(HAVE_SSSE3)
+#else
 #include <tmmintrin.h>
 #endif
 #ifdef HAVE_POPCNT
@@ -274,7 +275,7 @@ struct T {
   Sarrayptr_T indexG;
   Sarrayptr_T indexT;
   Sarrayptr_T indexX;
-#ifdef HAVE_SSE2
+#if defined(HAVE_SSE2) && !defined(WORDS_BIGENDIAN)
   __m128i indices0;
   UINT4 index0[16];
 #endif
@@ -360,11 +361,11 @@ static Chrpos_T *splicedists;
 static int nsplicesites;
 
 
-#ifdef HAVE_SSE2
+#if defined(HAVE_SSE2) && !defined(WORDS_BIGENDIAN)
 static __m128i epi32_convert;	/* For converting unsigned ints to signed ints */
 #endif
 
-#if defined(HAVE_SSE2) && defined(USE_SHUFFLE_MASK)
+#if defined(HAVE_SSE2) && !defined(WORDS_BIGENDIAN) && defined(USE_SHUFFLE_MASK)
 static __m128i shuffle_mask16[16];
 #endif
 
@@ -391,7 +392,11 @@ sarray_search_char (Sarrayptr_T *initptr, Sarrayptr_T *finalptr, char desired_ch
 #else
     mid = low + ((high - low) / 2);
 #endif
+#ifdef WORDS_BIGENDIAN
+    pos = Bigendian_convert_uint(SA[mid]);
+#else
     pos = SA[mid];
+#endif
     c = Genome_get_char_lex(genome,pos,n,chartable);
     if (desired_char > c) {
       low = mid + 1;
@@ -415,7 +420,11 @@ sarray_search_char (Sarrayptr_T *initptr, Sarrayptr_T *finalptr, char desired_ch
     /* This does not work for ceiling */
     mid = low + ((high - low) / 2);
 #endif
+#ifdef WORDS_BIGENDIAN
+    pos = Bigendian_convert_uint(SA[mid]);
+#else
     pos = SA[mid];
+#endif
     c = Genome_get_char_lex(genome,pos,n,chartable);
     if (desired_char >= c) {
       low = mid;
@@ -456,6 +465,9 @@ Sarray_setup (T sarray_fwd_in, T sarray_rev_in, Genome_T genome_in, Mode_T mode,
   } else if (mode == ATOI_STRANDED || mode == ATOI_NONSTRANDED) {
     conversion_fwd['A'] = 'G';	/* AG */
     conversion_rev['T'] = 'C';	/* TC */
+  } else if (mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
+    conversion_fwd['T'] = 'C';	/* TC */
+    conversion_rev['A'] = 'G';	/* AG */
   }
 
   chromosome_iit = chromosome_iit_in;
@@ -496,11 +508,11 @@ Sarray_setup (T sarray_fwd_in, T sarray_rev_in, Genome_T genome_in, Mode_T mode,
   printf("T => %u %u\n",sarray->initindexi[3],sarray->initindexj[3]);
 #endif
 
-#ifdef HAVE_SSE2
+#if defined(HAVE_SSE2) && !defined(WORDS_BIGENDIAN)
   epi32_convert = _mm_set1_epi32(2147483648); /* 2^31 */
 #endif
 
-#if defined(HAVE_SSE2) && defined(USE_SHUFFLE_MASK)
+#if defined(HAVE_SSE2) && !defined(WORDS_BIGENDIAN) && defined(USE_SHUFFLE_MASK)
   /* Used by fill_positions_filtered_first */
   shuffle_mask16[0] =  _mm_set_epi8(-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1);
   shuffle_mask16[1] =  _mm_set_epi8(-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,  3, 2, 1, 0);
@@ -572,6 +584,12 @@ Sarray_shmem_remove (char *dir, char *fileroot, char *snps_root, Mode_T mode, bo
     } else {
       mode_prefix = ".a2itc.";
     }
+  } else if (mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
+    if (fwdp == true) {
+      mode_prefix = ".a2itc.";
+    } else {
+      mode_prefix = ".a2iag.";
+    }
   }
 
   sarrayfile = (char *) CALLOC(strlen(dir)+strlen("/")+strlen(fileroot)+strlen(mode_prefix)+strlen("sarray")+1,sizeof(char));
@@ -648,7 +666,7 @@ csa_lookup (T sarray, Sarrayptr_T i) {
       expected_i = sarray->csa[i];
 #endif
 
-#ifdef HAVE_SSE2
+#if defined(HAVE_SSE2) && !defined(WORDS_BIGENDIAN)
       converted = _mm_sub_epi32(_mm_set1_epi32(i),epi32_convert);
       cmp = _mm_cmpgt_epi32(converted,sarray->indices0); /* To use cmpgt, sarray->indices0 is shifted down by 1 */
       matchbits = _mm_movemask_ps(_mm_castsi128_ps(cmp));
@@ -694,6 +712,10 @@ csa_lookup (T sarray, Sarrayptr_T i) {
   }
 }
 
+#elif defined(WORDS_BIGENDIAN)
+
+#define csa_lookup(sarray,i) Bigendian_convert_uint(sarray->array[i])
+
 #else
 
 #define csa_lookup(sarray,i) sarray->array[i]
@@ -757,6 +779,12 @@ Sarray_new (char *dir, char *fileroot, char *snps_root, Access_mode_T sarray_acc
     } else {
       mode_prefix = ".a2itc.";
     }
+  } else if (mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
+    if (fwdp == true) {
+      mode_prefix = ".a2itc.";
+    } else {
+      mode_prefix = ".a2iag.";
+    }
   }
 
   /* Old format */
@@ -894,7 +922,7 @@ Sarray_new (char *dir, char *fileroot, char *snps_root, Access_mode_T sarray_acc
       fclose(fp);
       FREE(filename);
 
-#ifdef HAVE_SSE2
+#if defined(HAVE_SSE2) && !defined(WORDS_BIGENDIAN)
       new->indices0 = _mm_sub_epi32(_mm_set_epi32(new->indexX,new->indexT,new->indexG,new->indexC),
 				    _mm_set1_epi32(2147483648) /* 2^31, same as epi_convert */);
       /* because (a >= indices) is equivalent to (a > indices - 1) */
@@ -1406,17 +1434,29 @@ sarray_search_init (char *query, int querylength, int queryoffset, Compress_T qu
     debug1(printf("low %u, high %u => mid %u\n",low,high,mid));
     nmatches_mid =  (nmatches_low < nmatches_high) ? nmatches_low : nmatches_high;
 
+#ifdef WORDS_BIGENDIAN
+    fasti = nmatches_mid +
+      (Univcoord_T) Genome_consecutive_matches_rightward(query_compress,/*left*/Bigendian_convert_uint(sarray->array[mid])-queryoffset,
+							 /*pos5*/queryoffset+nmatches_mid,
+							 /*pos3*/queryoffset+querylength,plusp,genestrand,first_read_p);
+    pos = Bigendian_convert_uint(sarray->array[mid]) + fasti;
+#else
     fasti = nmatches_mid +
       (Univcoord_T) Genome_consecutive_matches_rightward(query_compress,/*left*/sarray->array[mid]-queryoffset,
 							 /*pos5*/queryoffset+nmatches_mid,
 							 /*pos3*/queryoffset+querylength,plusp,genestrand,first_read_p);
     pos = sarray->array[mid] + fasti;
+#endif
     c = Genome_get_char_lex(genome,pos,sarray->n,chartable);
 
     if (fasti == (Univcoord_T) querylength || c > query[fasti]) {
       high = mid;
       /* nmatches_high = (sarray->lcp[mid] < nmatches_mid) ? sarray->lcp[mid] : nmatches_mid; */
+#ifdef WORDS_BIGENDIAN
+      sa_mid = Bigendian_convert_uint(sarray->array[mid]);
+#else
       sa_mid = sarray->array[mid];
+#endif
       lcp_mid = Bitpack64_read_one(sa_mid,sarray->plcp_ptrs,sarray->plcp_comp) - sa_mid;
 #ifdef USE_LCP
       if (lcp_mid != sarray->lcp[mid]) {
@@ -1427,7 +1467,11 @@ sarray_search_init (char *query, int querylength, int queryoffset, Compress_T qu
     } else {
       low = mid;
       /* nmatches_low = (sarray->lcp[low] < nmatches_mid) ? sarray->lcp[low] : nmatches_mid; */
+#ifdef WORDS_BIGENDIAN
+      sa_low = Bigendian_convert_uint(sarray->array[low]);
+#else
       sa_low = sarray->array[low];
+#endif
       lcp_low = Bitpack64_read_one(sa_low,sarray->plcp_ptrs,sarray->plcp_comp) - sa_low;
 #ifdef USE_LCP
       if (lcp_low != sarray->lcp[low]) {
@@ -1475,17 +1519,29 @@ sarray_search_final (char *query, int querylength, int queryoffset, Compress_T q
     debug1(printf("low %u, high %u => mid %u\n",low,high,mid));
     nmatches_mid =  (nmatches_low < nmatches_high) ? nmatches_low : nmatches_high;
 
+#ifdef WORDS_BIGENDIAN
+    fasti = nmatches_mid +
+      (Univcoord_T) Genome_consecutive_matches_rightward(query_compress,/*left*/Bigendian_convert_uint(sarray->array[mid])-queryoffset,
+							 /*pos5*/queryoffset+nmatches_mid,
+							 /*pos3*/queryoffset+querylength,plusp,genestrand,first_read_p);
+    pos = Bigendian_convert_uint(sarray->array[mid]) + fasti;
+#else
     fasti = nmatches_mid +
       (Univcoord_T) Genome_consecutive_matches_rightward(query_compress,/*left*/sarray->array[mid]-queryoffset,
 							 /*pos5*/queryoffset+nmatches_mid,
 							 /*pos3*/queryoffset+querylength,plusp,genestrand,first_read_p);
     pos = sarray->array[mid] + fasti;
+#endif
     c = Genome_get_char_lex(genome,pos,sarray->n,chartable);
 
     if (fasti == (Univcoord_T) querylength || c < query[fasti]) {
       low = mid;
       /* nmatches_low = (sarray->lcp[low] < nmatches_mid) ? sarray->lcp[low] : nmatches_mid; */
+#ifdef WORDS_BIGENDIAN
+      sa_low = Bigendian_convert_uint(sarray->array[low]);
+#else
       sa_low = sarray->array[low];
+#endif
       lcp_low = Bitpack64_read_one(sa_low,sarray->plcp_ptrs,sarray->plcp_comp) - sa_low;
 #ifdef USE_LCP
       if (lcp_low != sarray->lcp[low]) {
@@ -1496,7 +1552,11 @@ sarray_search_final (char *query, int querylength, int queryoffset, Compress_T q
     } else {
       high = mid;
       /* nmatches_high = (sarray->lcp[mid] < nmatches_mid) ? sarray->lcp[mid] : nmatches_mid; */
+#ifdef WORDS_BIGENDIAN
+      sa_mid = Bigendian_convert_uint(sarray->array[mid]);
+#else
       sa_mid = sarray->array[mid];
+#endif
       lcp_mid = Bitpack64_read_one(sa_mid,sarray->plcp_ptrs,sarray->plcp_comp) - sa_mid;
 #ifdef USE_LCP
       if (lcp_mid != sarray->lcp[mid]) {
@@ -1593,6 +1653,7 @@ static bool
 get_child_given_first (Sarrayptr_T *l, Sarrayptr_T *r, Sarrayptr_T i, Sarrayptr_T j, char desired_char,
 		       T sarray, unsigned char *lcpchilddc, UINT4 lcp_whole, UINT4 nextl) {
   char c1, c2;
+  UINT4 child_next;
 
   debug2(printf("Getting children for l-interval from %u to %u, char %c\n",i,j,desired_char));
 
@@ -1632,20 +1693,28 @@ get_child_given_first (Sarrayptr_T *l, Sarrayptr_T *r, Sarrayptr_T i, Sarrayptr_
   /* Test for child[i] being down: lcp[child[i]] > lcp[i] */
   /* Test for child[i] being next_lindex: lcp[child[i]] == lcp[i] */
   /* Test middle children */
-  while (nextl < j && Bytecoding_lcpchilddc_lcp_next(nextl,/*bytes*/lcpchilddc,sarray->child_guide,sarray->child_exceptions,
+  while (nextl < j && Bytecoding_lcpchilddc_lcp_next(&child_next,nextl,/*bytes*/lcpchilddc,sarray->child_guide,sarray->child_exceptions,
 						     sarray->child_guide_interval,sarray->lcp_exceptions,sarray->n_lcp_exceptions) == lcp_whole) {
     /* Already tested for desired_char < c2 */
     if (desired_char == c2) {
       *l = nextl;
+#if 0
       *r = Bytecoding_lcpchilddc_child_next(nextl,lcpchilddc,sarray->child_guide,sarray->child_exceptions,
 					    sarray->child_guide_interval) - 1; /* child[nextl] - 1 */
+#else
+      *r = child_next - 1;
+#endif
       debug2(printf("Child: %u to %u, c2 %c\n",nextl,*r,c2));
       debug2(printf("Returning true\n\n"));
       return true;
     } else {
       debug2(printf("Child: %u",nextl));
+#if 0
       nextl = Bytecoding_lcpchilddc_child_next(nextl,lcpchilddc,sarray->child_guide,sarray->child_exceptions,
 					       sarray->child_guide_interval); /* child[nextl] */
+#else
+      nextl = child_next;
+#endif
       c2 = Bytecoding_lcpchilddc_dc(&c1,nextl,lcpchilddc);
       debug2(printf(" to %u, discrim chars %c and %c\n",nextl-1,c1,c2));
 
@@ -1772,10 +1841,10 @@ sarray_search (Sarrayptr_T *initptr, Sarrayptr_T *finalptr, bool *successp,
   UINT4 l, r;
 
 #ifdef DEBUG1
-  Univcoord_T SA_i;
+  Univcoord_T SA_i, hit, child_next;
   int k = 0;
-  UINT4 recount;
-  char Buffer[1000];
+  UINT4 recount, lcp_prev, lcp_next, lcp_i, max_lcp;
+  char Buffer[1000+1], c1, c2;
   bool failp;
 #endif
 
@@ -1866,6 +1935,13 @@ sarray_search (Sarrayptr_T *initptr, Sarrayptr_T *finalptr, bool *successp,
 						   /*pos5*/queryoffset,/*pos3*/queryoffset+querylength,
 						   plusp,genestrand,first_read_p);
     printf("%d\t%u\t%u\t",recount,(*initptr)-1,SA_i/*+ 1U*/);
+    c2 = Bytecoding_lcpchilddc_dc(&c1,(*initptr)-1,sarray->lcpchilddc);
+    printf("%c%c\t",c1,c2);
+    lcp_i = Bytecoding_lcpchilddc_lcp((*initptr)-1,/*bytes*/sarray->lcpchilddc,sarray->lcp_exceptions,sarray->n_lcp_exceptions);
+    printf("%u\t",lcp_i);
+    lcp_next = Bytecoding_lcpchilddc_lcp((*initptr),/*bytes*/sarray->lcpchilddc,sarray->lcp_exceptions,sarray->n_lcp_exceptions);
+    printf("%u\t",Bytecoding_lcpchilddc_lcp_next(&child_next,(*initptr)-1,/*bytes*/sarray->lcpchilddc,sarray->child_guide,sarray->child_exceptions,
+						 sarray->child_guide_interval,sarray->lcp_exceptions,sarray->n_lcp_exceptions));
     if (genestrand == +2) {
       if (plusp) {
 	Genome_fill_buffer_convert_rev(SA_i,recount+1,Buffer);
@@ -1891,23 +1967,42 @@ sarray_search (Sarrayptr_T *initptr, Sarrayptr_T *finalptr, bool *successp,
 
 
   /* Hits */
+  lcp_prev = lcp_i;
   for (k = 0; k < (int) (*finalptr - *initptr + 1) && k < MAX_DEBUG1_HITS; k++) {
     SA_i = csa_lookup(sarray,(*initptr)+k);
     recount = Genome_consecutive_matches_rightward(query_compress,/*left*/SA_i-queryoffset,
 						   /*pos5*/queryoffset,/*pos3*/queryoffset+querylength,
 						   plusp,genestrand,first_read_p);
     printf("%d\t%u\t%u\t",recount,(*initptr)+k,SA_i/*+ 1U*/);
+    c2 = Bytecoding_lcpchilddc_dc(&c1,(*initptr)+k,sarray->lcpchilddc);
+    printf("%c%c\t",c1,c2);
+    lcp_i = Bytecoding_lcpchilddc_lcp((*initptr)+k,/*bytes*/sarray->lcpchilddc,sarray->lcp_exceptions,sarray->n_lcp_exceptions);
+    lcp_next = Bytecoding_lcpchilddc_lcp((*initptr)+k+1,/*bytes*/sarray->lcpchilddc,sarray->lcp_exceptions,sarray->n_lcp_exceptions);
+    printf("%u\t",lcp_i);
+    printf("%u\t",Bytecoding_lcpchilddc_lcp_next(&child_next,(*initptr)+k,/*bytes*/sarray->lcpchilddc,sarray->child_guide,sarray->child_exceptions,
+						 sarray->child_guide_interval,sarray->lcp_exceptions,sarray->n_lcp_exceptions));
+    max_lcp = lcp_i;
+    if (lcp_prev > max_lcp) {
+      max_lcp = lcp_prev;
+    }
+    if (lcp_next > max_lcp) {
+      max_lcp = lcp_next;
+    }
+    if (max_lcp > 1000) {
+      max_lcp = 1000;
+    }
+
     if (genestrand == +2) {
       if (plusp) {
-	Genome_fill_buffer_convert_rev(SA_i,recount+1,Buffer);
+	Genome_fill_buffer_convert_rev(SA_i,max_lcp+1,Buffer);
       } else {
-	Genome_fill_buffer_convert_fwd(SA_i,recount+1,Buffer);
+	Genome_fill_buffer_convert_fwd(SA_i,max_lcp+1,Buffer);
       }
     } else {
       if (plusp) {
-	Genome_fill_buffer_convert_fwd(SA_i,recount+1,Buffer);
+	Genome_fill_buffer_convert_fwd(SA_i,max_lcp+1,Buffer);
       } else {
-	Genome_fill_buffer_convert_rev(SA_i,recount+1,Buffer);
+	Genome_fill_buffer_convert_rev(SA_i,max_lcp+1,Buffer);
       }
     }
     printf("%s\n",Buffer);
@@ -1917,35 +2012,44 @@ sarray_search (Sarrayptr_T *initptr, Sarrayptr_T *finalptr, bool *successp,
 	     recount,csa_lookup(sarray,(*initptr)),*nmatches);
       failp = true;
     }
+
+    lcp_prev = lcp_i;
   }
 
   if (k < (int) (*finalptr - *initptr + 1)) {
     /* Overflow */
     printf("...\n");
     k = (int) (*finalptr - *initptr);
-    hit = sarray->array[(*initptr)+k];
+    hit = csa_lookup(sarray,(*initptr)+k);
     recount = Genome_consecutive_matches_rightward(query_compress,/*left*/hit-queryoffset,
 						   /*pos5*/queryoffset,/*pos3*/queryoffset+querylength,
 						   plusp,genestrand,first_read_p);
     printf("%d\t%u\t%u\t",recount,(*initptr)+k,hit /*+ 1U*/);
+    c2 = Bytecoding_lcpchilddc_dc(&c1,(*initptr)+k,sarray->lcpchilddc);
+    printf("%c%c\t",c1,c2);
+    lcp_i = Bytecoding_lcpchilddc_lcp((*initptr)+k,/*bytes*/sarray->lcpchilddc,sarray->lcp_exceptions,sarray->n_lcp_exceptions);
+    lcp_next = Bytecoding_lcpchilddc_lcp((*initptr)+k+1,/*bytes*/sarray->lcpchilddc,sarray->lcp_exceptions,sarray->n_lcp_exceptions);
+    printf("%u\t",lcp_i);
+    printf("%u\t",Bytecoding_lcpchilddc_lcp_next(&child_next,(*initptr)+k,/*bytes*/sarray->lcpchilddc,sarray->child_guide,sarray->child_exceptions,
+						 sarray->child_guide_interval,sarray->lcp_exceptions,sarray->n_lcp_exceptions));
     if (genestrand == +2) {
       if (plusp) {
-	Genome_fill_buffer_convert_rev(sarray->array[(*initptr)+k],recount+1,Buffer);
+	Genome_fill_buffer_convert_rev(hit,recount+1,Buffer);
       } else {
-	Genome_fill_buffer_convert_fwd(sarray->array[(*initptr)+k],recount+1,Buffer);
+	Genome_fill_buffer_convert_fwd(hit,recount+1,Buffer);
       }
     } else {
       if (plusp) {
-	Genome_fill_buffer_convert_fwd(sarray->array[(*initptr)+k],recount+1,Buffer);
+	Genome_fill_buffer_convert_fwd(hit,recount+1,Buffer);
       } else {
-	Genome_fill_buffer_convert_rev(sarray->array[(*initptr)+k],recount+1,Buffer);
+	Genome_fill_buffer_convert_rev(hit,recount+1,Buffer);
       }
     }
     printf("%s\n",Buffer);
     if (recount != *nmatches) {
       printf("querylength is %d\n",querylength);
       printf("false positive: recount %d at %u does not equal expected nmatches %d\n",
-	     recount,sarray->array[(*initptr)],*nmatches);
+	     recount,csa_lookup(sarray,*initptr),*nmatches);
       failp = true;
     }
     /* hits[k] = sarray->array[(*initptr)++]; */
@@ -1959,6 +2063,11 @@ sarray_search (Sarrayptr_T *initptr, Sarrayptr_T *finalptr, bool *successp,
 						   /*pos5*/queryoffset,/*pos3*/queryoffset+querylength,
 						   plusp,genestrand,first_read_p);
     printf("%d\t%u\t%u\t",recount,(*finalptr)+1,SA_i/*+ 1U*/);
+    c2 = Bytecoding_lcpchilddc_dc(&c1,(*finalptr)+1,sarray->lcpchilddc);
+    printf("%c%c\t",c1,c2);
+    printf("%u\t",Bytecoding_lcpchilddc_lcp((*finalptr)+1,/*bytes*/sarray->lcpchilddc,sarray->lcp_exceptions,sarray->n_lcp_exceptions));
+    printf("%u\t",Bytecoding_lcpchilddc_lcp_next(&child_next,(*finalptr)+1,/*bytes*/sarray->lcpchilddc,sarray->child_guide,sarray->child_exceptions,
+						 sarray->child_guide_interval,sarray->lcp_exceptions,sarray->n_lcp_exceptions));
     if (genestrand == +2) {
       if (plusp) {
 	Genome_fill_buffer_convert_rev(SA_i,recount+1,Buffer);
@@ -2324,7 +2433,7 @@ fill_positions_std (int *npositions, Univcoord_T low_adj, Univcoord_T high_adj,
 
   while (ptr <= finalptr) {
     debug7a(printf("Std: Looking at value %u, relative to low %u and high %u\n",array[ptr],low_adj,high_adj));
-    if ((value = array[ptr++]) < low_adj) {
+    if ((value = CONVERT(array[ptr++])) < low_adj) {
       /* Skip */
     } else if (value > high_adj) {
       /* Skip */
@@ -2350,7 +2459,7 @@ fill_positions_std (int *npositions, Univcoord_T low_adj, Univcoord_T high_adj,
     ptr = lastptr;	/* One past the last ptr with a result */
 
     while (i < *npositions) {
-      if ((value = array[--ptr]) < low_adj) {
+      if ((value = CONVERT(array[--ptr])) < low_adj) {
 	/* Skip */
       } else if (value > high_adj) {
 	/* Skip */
@@ -2372,7 +2481,7 @@ fill_positions_std (int *npositions, Univcoord_T low_adj, Univcoord_T high_adj,
 
 #ifdef HAVE_ALLOCA
 
-#if defined(HAVE_SSSE3) && defined(HAVE_SSE2)
+#if defined(HAVE_SSSE3) && defined(HAVE_SSE2) && !defined(WORDS_BIGENDIAN)
 /* SSSE3 needed for _mm_shuffle_epi8 */
 static void
 fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord_T low, Univcoord_T high,
@@ -2461,7 +2570,7 @@ fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord
       n_prealign = this->finalptr - this->initptr + 1;
     }
     for (k = 0; k < n_prealign; k++) {
-      debug7a(printf("Looking at value %u, relative to low %u and high %u\n",array[ptr],low_adj,high_adj));
+      debug7a(printf("Looking at value %u, relative to low %u and high %u\n",CONVERT(array[ptr]),low_adj,high_adj));
       if ((value = *array_ptr++) >= low_adj && value <= high_adj) {
 	*out++ = value - this->querystart;
       }
@@ -2609,7 +2718,7 @@ fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord
 
 
 #else
-/* Missing SSSE3 or SSE2 */
+/* Bigendian or missing SSSE3 or SSE2 */
 
 static void
 fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord_T low, Univcoord_T high,
@@ -2623,7 +2732,7 @@ fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord
   Univcoord_T *array = sarray->array;
 #endif
   Univcoord_T *positions_temp;
-#ifdef HAVE_SSE2
+#if defined(HAVE_SSE2) && !defined(WORDS_BIGENDIAN)
 #ifdef HAVE_64_BIT
   UINT8 pointer;
 #else
@@ -2660,7 +2769,7 @@ fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord
 
     this->npositions_allocated = this->npositions = 0;
     ptr = this->initptr;
-#ifdef HAVE_SSE2
+#if defined(HAVE_SSE2) && !defined(WORDS_BIGENDIAN)
     if (ptr + 3 > this->finalptr) { /* ptr + 4 > (this->finalptr + 1) */
       /* Handle in normal manner */
       debug7(printf("Small batch, because %u + 3 <= %u\n",ptr,this->finalptr));
@@ -2694,8 +2803,8 @@ fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord
       /* Initial part */
       debug7(printf("Initial part:\n"));
       for (k = 0; k < n_prealign; k++) {
-	debug7a(printf("Looking at value %u, relative to low %u and high %u\n",array[ptr],low_adj,high_adj));
-	if ((value0 = array[ptr++]) < low_adj) {
+	debug7a(printf("Looking at value %u, relative to low %u and high %u\n",CONVERT(array[ptr]),low_adj,high_adj));
+	if ((value0 = CONVERT(array[ptr++])) < low_adj) {
 	  /* Skip */
 	} else if (value0 > high_adj) {
 	  /* Skip */
@@ -2730,7 +2839,7 @@ fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord
 	  ptr += 4;
 	} else {
 #ifndef USE_CSA
-	  value3 = array[ptr++];
+	  value3 = CONVERT(array[ptr++]);
 #endif
 	  if (value3 < low_adj) {
 	    /* Skip */
@@ -2744,7 +2853,7 @@ fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord
 	  }
 
 #ifndef USE_CSA
-	  value2 = array[ptr++];
+	  value2 = CONVERT(array[ptr++]);
 #endif
 	  if (value2 < low_adj) {
 	    /* Skip */
@@ -2758,7 +2867,7 @@ fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord
 	  }
 
 #ifndef USE_CSA
-	  value1 = array[ptr++];
+	  value1 = CONVERT(array[ptr++]);
 #endif
 	  if (value1 < low_adj) {
 	    /* Skip */
@@ -2772,7 +2881,7 @@ fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord
 	  }
 
 #ifndef USE_CSA
-	  value0 = array[ptr++];
+	  value0 = CONVERT(array[ptr++]);
 #endif
 	  if (value0 < low_adj) {
 	    /* Skip */
@@ -2882,7 +2991,7 @@ fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord
   Univcoord_T *array = sarray->array;
 #endif
   Univcoord_T *more_positions;
-#ifdef HAVE_SSE2
+#if defined(HAVE_SSE2) && !defined(WORDS_BIGENDIAN)
 #ifdef HAVE_64_BIT
   UINT8 pointer;
 #else
@@ -2920,7 +3029,7 @@ fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord
 
     this->npositions_allocated = this->npositions = 0;
     ptr = this->initptr;
-#ifdef HAVE_SSE2
+#if defined(HAVE_SSE2) && !defined(WORDS_BIGENDIAN)
     if (ptr + 3 > this->finalptr) { /* ptr + 4 > (this->finalptr + 1) */
       /* Handle in normal manner */
       debug7(printf("Small batch, because %u + 3 <= %u\n",ptr,this->finalptr));
@@ -2958,8 +3067,8 @@ fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord
       /* Initial part */
       debug7(printf("Initial part:\n"));
       for (k = 0; k < n_prealign; k++) {
-	debug7a(printf("Looking at value %u, relative to low %u and high %u\n",array[ptr],low_adj,high_adj));
-	if ((value0 = array[ptr++]) < low_adj) {
+	debug7a(printf("Looking at value %u, relative to low %u and high %u\n",CONVERT(array[ptr]),low_adj,high_adj));
+	if ((value0 = CONVERT(array[ptr++])) < low_adj) {
 	  /* Skip */
 	} else if (value0 > high_adj) {
 	  /* Skip */
@@ -2998,7 +3107,7 @@ fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord
 	  ptr += 4;
 	} else {
 #ifndef USE_CSA
-	  value3 = array[ptr++];
+	  value3 = CONVERT(array[ptr++]);
 #endif
 	  if (value3 < low_adj) {
 	    /* Skip */
@@ -3016,7 +3125,7 @@ fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord
 	  }
 
 #ifndef USE_CSA
-	  value2 = array[ptr++];
+	  value2 = CONVERT(array[ptr++]);
 #endif
 	  if (value2 < low_adj) {
 	    /* Skip */
@@ -3034,7 +3143,7 @@ fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord
 	  }
 
 #ifndef USE_CSA
-	  value1 = array[ptr++];
+	  value1 = CONVERT(array[ptr++]);
 #endif
 	  if (value1 < low_adj) {
 	    /* Skip */
@@ -3052,7 +3161,7 @@ fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord
 	  }
 
 #ifndef USE_CSA
-	  value0 = array[ptr++];
+	  value0 = CONVERT(array[ptr++]);
 #endif
 	  if (value0 < low_adj) {
 	    /* Skip */
@@ -3132,7 +3241,7 @@ fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord
 
       i = GUESS_ALLOCATION;	/* Start count with the number stored */
       ptr = lastptr;		/* One past the last ptr with a result */
-#ifdef HAVE_SSE2
+#if defined(HAVE_SSE2) && !defined(WORDS_BIGENDIAN)
       if (this->initptr + 4 < ptr) {
 	while (i < this->npositions) {
 	  if ((value0 = csa_lookup(sarray,--ptr)) < low_adj) {
@@ -3157,7 +3266,7 @@ fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord
 
 	/* Initial part */
 	while (i < this->npositions) {
-	  if ((value0 = array[--ptr]) < low_adj) {
+	  if ((value0 = CONVERT(array[--ptr])) < low_adj) {
 	    /* Skip */
 	  } else if (value0 > high_adj) {
 	    /* Skip */
@@ -3185,7 +3294,7 @@ fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord
 	    ptr -= 4;
 	  } else {
 #ifndef USE_CSA
-	    value0 = array[--ptr];
+	    value0 = CONVERT(array[--ptr]);
 #endif
 	    if (value0 < low_adj) {
 	      /* Skip */
@@ -3196,7 +3305,7 @@ fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord
 	    }
 
 #ifndef USE_CSA
-	    value1 = array[--ptr];
+	    value1 = CONVERT(array[--ptr]);
 #endif
 	    if (value1 < low_adj) {
 	      /* Skip */
@@ -3207,7 +3316,7 @@ fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord
 	    }
 
 #ifndef USE_CSA
-	    value2 = array[--ptr];
+	    value2 = CONVERT(array[--ptr]);
 #endif
 	    if (value2 < low_adj) {
 	      /* Skip */
@@ -3218,7 +3327,7 @@ fill_positions_filtered_first (Elt_T this, T sarray, Univcoord_T goal, Univcoord
 	    }
 
 #ifndef USE_CSA
-	    value3 = array[--ptr];
+	    value3 = CONVERT(array[--ptr]);
 #endif
 	    if (value3 < low_adj) {
 	      /* Skip */
@@ -3421,7 +3530,13 @@ Elt_fill_positions_filtered (Elt_T this, T sarray, Univcoord_T goal, Univcoord_T
       this->n_all_positions = 0;
     } else {
       this->all_positions = (Univcoord_T *) MALLOC(this->n_all_positions*sizeof(Univcoord_T));
+#ifdef WORDS_BIGENDIAN
+      for (i = 0; i < this->n_all_positions; i++) {
+	this->all_positions[i] = Bigendian_convert_uint(sarray->array[this->initptr+i]);
+      }
+#else
       memcpy(this->all_positions,&(sarray->array[this->initptr]),this->n_all_positions*sizeof(Univcoord_T));
+#endif
       qsort(this->all_positions,this->n_all_positions,sizeof(Univcoord_T),Univcoord_compare);
     }
 #endif
@@ -6078,6 +6193,8 @@ find_best_path (List_T *right_paths, Intlist_T *right_endpoints_sense, Intlist_T
   *fillin_diagonals = (List_T) NULL;
   middle_path = (List_T) NULL;
 
+#ifdef SUBDIVIDE_ENDS
+  /* Without SUBDIVIDE_ENDS, sub_diagonals is guaranteed to be NULL */
   /* A4.  Process oligoindex diagonals from right */
   if (List_length(sub_diagonals) == 0) {
     /* Skip */
@@ -6096,6 +6213,7 @@ find_best_path (List_T *right_paths, Intlist_T *right_endpoints_sense, Intlist_T
     }
 #endif
   }
+#endif
 
   if (right_indel_diagonal != NULL) {
     debug13(printf("Pushing right indel diagonal onto middle: query %d..%d, diagonal %u\n",
@@ -6302,7 +6420,7 @@ find_best_path (List_T *right_paths, Intlist_T *right_endpoints_sense, Intlist_T
   }
 
 
-    sub_diagonals = (List_T) NULL;
+  sub_diagonals = (List_T) NULL;
 
 #ifdef SUBDIVIDE_ENDS
   /* Run oligoindex here to left of common_diagonal */
@@ -6358,6 +6476,8 @@ find_best_path (List_T *right_paths, Intlist_T *right_endpoints_sense, Intlist_T
   }
 
 
+#ifdef SUBDIVIDE_ENDS
+  /* Without SUBDIVIDE_ENDS, sub_diagonals is guaranteed to be NULL */
   /* C4. Process oligoindex diagonals from left */
   if (List_length(sub_diagonals) == 0) {
     /* Skip */
@@ -6376,6 +6496,7 @@ find_best_path (List_T *right_paths, Intlist_T *right_endpoints_sense, Intlist_T
     }
 #endif
   }
+#endif
 
   debug13(printf("***Exiting find_best_path\n"));
 
@@ -7985,7 +8106,8 @@ Sarray_search_greedy (int *found_score, char *queryuc_ptr, char *queryrc, int qu
       all_left_diagonals_plus = (List_T *) MALLOC(nseeds_plus*sizeof(List_T));
       fillin_diagonals_plus = (List_T *) CALLOC(nseeds_plus,sizeof(List_T));
 
-      chrhigh = 0;
+      chrnum = 1;
+      Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,/*chrnum*/1,circular_typeint);
       for (i = 0; i < nseeds_plus; i++) {
 	left = best_plus_elt->positions[i];
 	if (left > chrhigh) {
@@ -8054,7 +8176,8 @@ Sarray_search_greedy (int *found_score, char *queryuc_ptr, char *queryrc, int qu
       all_left_diagonals_minus = (List_T *) MALLOC(nseeds_minus*sizeof(List_T));
       fillin_diagonals_minus = (List_T *) CALLOC(nseeds_minus,sizeof(List_T));
 
-      chrhigh = 0;
+      chrnum = 1;
+      Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,/*chrnum*/1,circular_typeint);
       for (i = 0; i < nseeds_minus; i++) {
 	left = best_minus_elt->positions[i];
 	if (left > chrhigh) {
@@ -8102,7 +8225,8 @@ Sarray_search_greedy (int *found_score, char *queryuc_ptr, char *queryrc, int qu
 
   /* *sarray_gmap = (List_T) NULL; */
 
-  chrhigh = 0;
+  chrnum = 1;
+  Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,/*chrnum*/1,circular_typeint);
   for (i = 0; i < nseeds_plus; i++) {
     if (1 /*|| scores_plus[i] > best_score - 20*/) {
       diagonal = middle_diagonals_plus[i];
@@ -8183,7 +8307,8 @@ Sarray_search_greedy (int *found_score, char *queryuc_ptr, char *queryrc, int qu
     }
   }
 
-  chrhigh = 0;
+  chrnum = 1;
+  Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,/*chrnum*/1,circular_typeint);
   for (i = 0; i < nseeds_minus; i++) {
     if (1 /*|| scores_minus[i] > best_score - 20*/) {
       diagonal = middle_diagonals_minus[i];
@@ -8268,7 +8393,8 @@ Sarray_search_greedy (int *found_score, char *queryuc_ptr, char *queryrc, int qu
 
 #if 0
   /* Salvage using gmap */
-  chrhigh = 0;
+  chrnum = 1;
+  Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,/*chrnum*/1,circular_typeint);
   for (i = 0; i < nseeds_plus; i++) {
     if (incomplete_result_p(middle_path_plus[i],querylength) == true) {
       left = best_plus_elt->positions[i];
@@ -8284,7 +8410,8 @@ Sarray_search_greedy (int *found_score, char *queryuc_ptr, char *queryrc, int qu
     }
   }
 
-  chrhigh = 0;
+  chrnum = 1;
+  Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,/*chrnum*/1,circular_typeint);
   for (i = 0; i < nseeds_minus; i++) {
     if (incomplete_result_p(middle_path_minus[i],querylength) == true) {
       left = best_minus_elt->positions[i];
diff --git a/src/sarray-write.c b/src/sarray-write.c
index f9e75dc..30418b9 100644
--- a/src/sarray-write.c
+++ b/src/sarray-write.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: sarray-write.c 167266 2015-06-11 00:07:57Z twu $";
+static char rcsid[] = "$Id: sarray-write.c 170326 2015-07-22 17:49:55Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -188,7 +188,11 @@ sarray_search_char (Sarrayptr_T *initptr, Sarrayptr_T *finalptr, char desired_ch
     if (low % 2 == 1 && high % 2 == 1) {
       mid += 1;
     }
+#ifdef WORDS_BIGENDIAN
+    pos = Bigendian_convert_uint(SA[mid]);
+#else
     pos = SA[mid];
+#endif
     c = Genome_get_char_lex(genomecomp,pos,n,chartable);
     if (desired_char > c) {
       low = mid + 1;
@@ -207,7 +211,11 @@ sarray_search_char (Sarrayptr_T *initptr, Sarrayptr_T *finalptr, char desired_ch
     if (low % 2 == 1 || high % 2 == 1) {
       mid += 1;
     }
+#ifdef WORDS_BIGENDIAN
+    pos = Bigendian_convert_uint(SA[mid]);
+#else
     pos = SA[mid];
+#endif
     c = Genome_get_char_lex(genomecomp,pos,n,chartable);
     if (desired_char >= c) {
       low = mid;
@@ -274,7 +282,11 @@ sarray_search_simple (Sarrayptr_T *initptr, Sarrayptr_T *finalptr, char *query,
     }
 
     nmatches = 0;
+#ifdef WORDS_BIGENDIAN
+    pos = Bigendian_convert_uint(SA[mid]);
+#else
     pos = SA[mid];
+#endif
 
     while (nmatches < querylength && (c = Genome_get_char_lex(genomecomp,pos,n,chartable)) == query[nmatches]) {
       nmatches++;
@@ -300,7 +312,11 @@ sarray_search_simple (Sarrayptr_T *initptr, Sarrayptr_T *finalptr, char *query,
     }
 
     nmatches = 0;
+#ifdef WORDS_BIGENDIAN
+    pos = Bigendian_convert_uint(SA[mid]);
+#else
     pos = SA[mid];
+#endif
 
     while (nmatches < querylength && (c = Genome_get_char_lex(genomecomp,pos,n,chartable)) == query[nmatches]) {
       nmatches++;
@@ -1032,7 +1048,11 @@ Sarray_compute_lcp (char *rankfile, char *permuted_sarray_file, char *sarrayfile
     for (b = 0, i = ii; b < RW_BATCH; b++, i++) {
       rank_i = read_buffer_1[b];
       if (rank_i > 0) {
+#ifdef WORDS_BIGENDIAN
+	write_buffer[b] = Bigendian_convert_uint(SA[rank_i - 1]);
+#else
 	write_buffer[b] = SA[rank_i - 1];
+#endif
       } else {
 	write_buffer[b] = 0;	/* Will be ignored */
       }
@@ -1042,7 +1062,11 @@ Sarray_compute_lcp (char *rankfile, char *permuted_sarray_file, char *sarrayfile
   for (i = ii; i <= n; i++) {	/* final partial batch */
     FREAD_UINT(&rank_i,fp);
     if (rank_i > 0) {
+#ifdef WORDS_BIGENDIAN
+      FWRITE_UINT(Bigendian_convert_uint(SA[rank_i - 1]),permsa_fp);
+#else
       FWRITE_UINT(SA[rank_i - 1],permsa_fp);
+#endif
     } else {
       FWRITE_UINT(zero,permsa_fp); /* Will be ignored */
     }
@@ -1288,8 +1312,13 @@ compute_plcp (UINT4 *plcp, UINT4 *SA, UINT4 n) {
     }
   }
 
+#if 0
   /* This makes lcp[0] = -1, because lcp[0] = plcp[SA[0]] = plcp[n] = -1 */
   plcp[n] = -1;
+#else
+  /* This makes lcp[0] = 0, because lcp[0] = plcp[SA[0]] = plcp[n] = 0 */
+  plcp[n] = 0;
+#endif
 
   return;
 }
@@ -1375,9 +1404,11 @@ get_all_children (bool *filledp, Sarrayptr_T *l, Sarrayptr_T *r, Sarrayptr_T i,
 void
 Sarray_write_plcp (char *plcpptrsfile, char *plcpcompfile, UINT4 *SA, UINT4 genomelength) {
   UINT4 *plcp;
-  UINT4 *ramp;
+  UINT4 *ramp, *p;
 
   UINT4 n = genomelength, i;
+  UINT4 ii;
+  FILE *fp;
 
   plcp = (UINT4 *) MALLOC((n+1)*sizeof(UINT4));
   ramp = plcp;
@@ -1398,7 +1429,23 @@ Sarray_write_plcp (char *plcpptrsfile, char *plcpcompfile, UINT4 *SA, UINT4 geno
 
   fprintf(stderr,"Writing permuted lcp file...");
   /* Provide n to write values [0..n] */
+
+#if 0
+  /* Print plcp as an array */
+  fp = fopen("plcp","wb");
+  for (ii = 0; ii + RW_BATCH <= n; ii += RW_BATCH) {
+    p = (void *) &(ramp[ii]);
+    FWRITE_UINTS(p,RW_BATCH,fp);
+  }
+  if (ii <= n) {
+    p = (void *) &(ramp[ii]);
+    FWRITE_UINTS(p,n - ii + 1,fp);
+  }
+  fclose(fp);
+#else
   Bitpack64_write_differential(plcpptrsfile,plcpcompfile,ramp,n);
+#endif
+
   fprintf(stderr,"done\n");
 
   FREE(plcp);
@@ -2244,10 +2291,18 @@ Sarray_array_uncompress (Genome_T genomecomp, char *sarrayfile, char *plcpptrsfi
   printf("i\tSA\tLCP\n");
 
   pos = start;
+#ifdef WORDS_BIGENDIAN
+  sa_i = Bigendian_convert_uint(SA[pos]);
+#else
   sa_i = SA[pos];
+#endif
   lcp_i = Bitpack64_read_one(sa_i,plcpptrs,plcpcomp) - sa_i;
 
+#ifdef WORDS_BIGENDIAN
+  sa_nexti = Bigendian_convert_uint(SA[pos+1]);
+#else
   sa_nexti = SA[pos+1];
+#endif
   lcp_nexti = Bitpack64_read_one(sa_nexti,plcpptrs,plcpcomp) - sa_nexti;
 
   if (pos == 0) {
@@ -2261,7 +2316,11 @@ Sarray_array_uncompress (Genome_T genomecomp, char *sarrayfile, char *plcpptrsfi
     sa_i = sa_nexti;
     lcp_i = lcp_nexti;
 
+#ifdef WORDS_BIGENDIAN
+    sa_nexti = Bigendian_convert_uint(SA[pos+1]);
+#else
     sa_nexti = SA[pos+1];
+#endif
     lcp_nexti = Bitpack64_read_one(sa_nexti,plcpptrs,plcpcomp) - sa_nexti;
 
     printf("%u\t%u\t%u\t",pos,sa_i,lcp_i);
@@ -2342,7 +2401,11 @@ Sarray_child_uncompress (Genome_T genomecomp, unsigned char *lcpchilddc, UINT4 *
   pos = start;
 
   for (pos = start; pos <= end; pos++) {
+#ifdef WORDS_BIGENDIAN
+    sa_i = Bigendian_convert_uint(SA[pos]);
+#else
     sa_i = SA[pos];
+#endif
     lcp_i = Bytecoding_lcpchilddc_lcp(pos,lcpchilddc,lcp_exceptions,n_lcp_exceptions); /* lcp(i,j) */
     c2 = Bytecoding_lcpchilddc_dc(&c1,pos,lcpchilddc);
 
diff --git a/src/sequence.c b/src/sequence.c
index f049b51..beb2d8f 100644
--- a/src/sequence.c
+++ b/src/sequence.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: sequence.c 166641 2015-05-29 21:13:04Z twu $";
+static char rcsid[] = "$Id: sequence.c 170023 2015-07-17 16:47:21Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -15,7 +15,7 @@ static char rcsid[] = "$Id: sequence.c 166641 2015-05-29 21:13:04Z twu $";
 #include <stdlib.h>
 #include <string.h>
 #include <strings.h>		/* For rindex */
-#include <ctype.h>		/* For iscntrl and isspace */
+#include <ctype.h>		/* For iscntrl, isspace, and toupper */
 
 #ifdef HAVE_ZLIB
 #include <zlib.h>
@@ -1741,6 +1741,44 @@ Sequence_stdout_raw (T this) {
   return;
 }
 
+/* Prints each character of the sequence to stdout as 'A', 'C', 'G', or 'T'
+   (input case is ignored) and every other character as 'X' */
+void
+Sequence_stdout_stream_chars (T this) {
+  int i, end = this->fulllength;
+
+  for (i = 0; i < end; i++) {
+    /* toupper needs an unsigned char value; a negative char is undefined */
+    switch (toupper((unsigned char) this->contents[i])) {
+    case 'A': putchar('A'); break;
+    case 'C': putchar('C'); break;
+    case 'G': putchar('G'); break;
+    case 'T': putchar('T'); break;
+    default: putchar('X');
+    }
+  }
+  return;
+}
+
+/* Prints each character of the sequence to stdout as a raw byte: 0 for A,
+   1 for C, 2 for G, 3 for T (input case is ignored), and 4 otherwise */
+void
+Sequence_stdout_stream_ints (T this) {
+  int i, end = this->fulllength;
+
+  for (i = 0; i < end; i++) {
+    /* toupper needs an unsigned char value; a negative char is undefined */
+    switch (toupper((unsigned char) this->contents[i])) {
+    case 'A': putchar(0); break;
+    case 'C': putchar(1); break;
+    case 'G': putchar(2); break;
+    case 'T': putchar(3); break;
+    default: putchar(4);
+    }
+  }
+  return;
+}
+
 
 T
 Sequence_substring (T usersegment, unsigned int left, unsigned int length, 
diff --git a/src/sequence.h b/src/sequence.h
index 9ba3fbc..bc9ec26 100644
--- a/src/sequence.h
+++ b/src/sequence.h
@@ -1,4 +1,4 @@
-/* $Id: sequence.h 157225 2015-01-22 18:47:23Z twu $ */
+/* $Id: sequence.h 170023 2015-07-17 16:47:21Z twu $ */
 #ifndef SEQUENCE_INCLUDED
 #define SEQUENCE_INCLUDED
 #ifdef HAVE_CONFIG_H
@@ -125,6 +125,10 @@ Sequence_stdout_two (T ref, T alt, bool uppercasep, int wraplength);
 
 extern void
 Sequence_stdout_raw (T this);
+extern void
+Sequence_stdout_stream_chars (T this);
+extern void
+Sequence_stdout_stream_ints (T this);
 
 extern T
 Sequence_substring (T usersegment, unsigned int left, unsigned int length, 
diff --git a/src/snpindex.c b/src/snpindex.c
index bf07662..a54704b 100644
--- a/src/snpindex.c
+++ b/src/snpindex.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: snpindex.c 161940 2015-03-25 20:36:59Z twu $";
+static char rcsid[] = "$Id: snpindex.c 168395 2015-06-26 17:13:13Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -855,22 +855,28 @@ merge_positions8 (FILE *positions_high_fp, FILE *positions_low_fp,
   UINT8 *ptr1 = start1, *ptr2 = start2;
   char *nt;
 #ifdef WORDS_BIGENDIAN
-  UINT8 position2;
+  UINT8 position1, position2;
 #endif
 
   while (ptr1 < end1 && ptr2 < end2) {
 #ifdef WORDS_BIGENDIAN
-    abort();
+    position1 = Bigendian_convert_uint8(*ptr1);
     position2 = Bigendian_convert_uint8(*ptr2);
-    if (*ptr1 < position2) {
-      FWRITE_UINT8(*ptr1,positions_fp);
+    if (position1 < position2) {
+      position8_high = position1 >> POSITIONS8_HIGH_SHIFT;
+      position8_low = position1 & POSITIONS8_LOW_MASK;
+      FWRITE_CHAR(position8_high,positions_high_fp);
+      FWRITE_UINT(position8_low,positions_low_fp);
       ptr1++;
-    } else if (position2 < *ptr1) {
-      FWRITE_UINT8(position2,positions_fp);
+    } else if (position2 < position1) {
+      position8_high = position2 >> POSITIONS8_HIGH_SHIFT;
+      position8_low = position2 & POSITIONS8_LOW_MASK;
+      FWRITE_CHAR(position8_high,positions_high_fp);
+      FWRITE_UINT(position8_low,positions_low_fp);
       ptr2++;
     } else {
       nt = shortoligo_nt(oligo,index1part);
-      fprintf(stderr,"Problem: saw duplicate positions %u in oligo %s\n",*ptr1,nt);
+      fprintf(stderr,"Problem: saw duplicate positions %u in oligo %s\n",position1,nt);
       FREE(nt);
       abort();
       /*
@@ -881,7 +887,6 @@ merge_positions8 (FILE *positions_high_fp, FILE *positions_low_fp,
     }
 
 #else
-
     if (*ptr1 < *ptr2) {
       position8_high = *ptr1 >> POSITIONS8_HIGH_SHIFT;
       position8_low = *ptr1 & POSITIONS8_LOW_MASK;
@@ -909,27 +914,32 @@ merge_positions8 (FILE *positions_high_fp, FILE *positions_low_fp,
   }
 
   while (ptr1 < end1) {
+#ifdef WORDS_BIGENDIAN
+    position1 = Bigendian_convert_uint8(*ptr1);
+    position8_high = position1 >> POSITIONS8_HIGH_SHIFT;
+    position8_low = position1 & POSITIONS8_LOW_MASK;
+#else
     position8_high = *ptr1 >> POSITIONS8_HIGH_SHIFT;
     position8_low = *ptr1 & POSITIONS8_LOW_MASK;
+#endif
     FWRITE_CHAR(position8_high,positions_high_fp);
     FWRITE_UINT(position8_low,positions_low_fp);
     ptr1++;
   }
 
-#ifdef WORDS_BIGENDIAN
   while (ptr2 < end2) {
-    FWRITE_UINT8(Bigendian_convert_uint8(*ptr2),positions_fp);
-    ptr2++;
-  }
+#ifdef WORDS_BIGENDIAN
+    position2 = Bigendian_convert_uint8(*ptr2);
+    position8_high = position2 >> POSITIONS8_HIGH_SHIFT;
+    position8_low = position2 & POSITIONS8_LOW_MASK;
 #else
-  while (ptr2 < end2) {
     position8_high = *ptr2 >> POSITIONS8_HIGH_SHIFT;
     position8_low = *ptr2 & POSITIONS8_LOW_MASK;
+#endif
     FWRITE_CHAR(position8_high,positions_high_fp);
     FWRITE_UINT(position8_low,positions_low_fp);
     ptr2++;
   }
-#endif
 
   return;
 }
@@ -941,16 +951,17 @@ merge_positions4 (FILE *positions_fp, UINT4 *start1, UINT4 *end1,
   UINT4 *ptr1 = start1, *ptr2 = start2;
   char *nt;
 #ifdef WORDS_BIGENDIAN
-  UINT4 position2;
+  UINT4 position1, position2;
 #endif
 
   while (ptr1 < end1 && ptr2 < end2) {
 #ifdef WORDS_BIGENDIAN
+    position1 = Bigendian_convert_uint(*ptr1);
     position2 = Bigendian_convert_uint(*ptr2);
-    if (*ptr1 < position2) {
-      FWRITE_UINT(*ptr1,positions_fp);
+    if (position1 < position2) {
+      FWRITE_UINT(position1,positions_fp);
       ptr1++;
-    } else if (position2 < *ptr1) {
+    } else if (position2 < position1) {
       FWRITE_UINT(position2,positions_fp);
       ptr2++;
     } else {
@@ -988,21 +999,24 @@ merge_positions4 (FILE *positions_fp, UINT4 *start1, UINT4 *end1,
   }
 
   while (ptr1 < end1) {
+#ifdef WORDS_BIGENDIAN
+    position1 = Bigendian_convert_uint(*ptr1);
+    FWRITE_UINT(position1,positions_fp);
+#else    
     FWRITE_UINT(*ptr1,positions_fp);
+#endif
     ptr1++;
   }
 
-#ifdef WORDS_BIGENDIAN
   while (ptr2 < end2) {
-    FWRITE_UINT(Bigendian_convert_uint(*ptr2),positions_fp);
-    ptr2++;
-  }
+#ifdef WORDS_BIGENDIAN
+    position2 = Bigendian_convert_uint(*ptr2);
+    FWRITE_UINT(position2,positions_fp);
 #else
-  while (ptr2 < end2) {
     FWRITE_UINT(*ptr2,positions_fp);
+#endif
     ptr2++;
   }
-#endif
 
   return;
 }
diff --git a/src/splice.c b/src/splice.c
index 3e68052..5362f29 100644
--- a/src/splice.c
+++ b/src/splice.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: splice.c 166641 2015-05-29 21:13:04Z twu $";
+static char rcsid[] = "$Id: splice.c 167583 2015-06-15 18:12:14Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -2347,11 +2347,11 @@ group_by_segmenti_aux (int *found_score, List_T winners, List_T *ambiguous,
       } else {
 	/* Multiple hits */
 	donor_hits = acceptor_hits = (List_T) NULL;
-	if (plusp == true) {
-	  for (p = accepted_hits; p != NULL; p = List_next(p)) {
-	    hit = (Stage3end_T) List_head(p);
-	    donor = Stage3end_substring_donor(hit);
-	    acceptor = Stage3end_substring_acceptor(hit);
+	for (p = accepted_hits; p != NULL; p = List_next(p)) {
+	  hit = (Stage3end_T) List_head(p);
+	  donor = Stage3end_substring_donor(hit);
+	  acceptor = Stage3end_substring_acceptor(hit);
+	  if (Stage3end_plusp(hit) == true) {
 	    if (Substring_genomicstart(donor) == segmenti_left) {
 	      donor_hits = List_push(donor_hits,(void *) hit);
 	    } else if (Substring_genomicstart(acceptor) == segmenti_left) {
@@ -2359,12 +2359,7 @@ group_by_segmenti_aux (int *found_score, List_T winners, List_T *ambiguous,
 	    } else {
 	      Stage3end_free(&hit);
 	    }
-	  }
-	} else {
-	  for (p = accepted_hits; p != NULL; p = List_next(p)) {
-	    hit = (Stage3end_T) List_head(p);
-	    donor = Stage3end_substring_donor(hit);
-	    acceptor = Stage3end_substring_acceptor(hit);
+	  } else {
 	    if (Substring_genomicend(donor) == segmenti_left) {
 	      donor_hits = List_push(donor_hits,(void *) hit);
 	    } else if (Substring_genomicend(acceptor) == segmenti_left) {
@@ -2685,11 +2680,11 @@ group_by_segmentj_aux (int *found_score, List_T winners, List_T *ambiguous,
       } else {
 	/* Multiple hits */
 	donor_hits = acceptor_hits = (List_T) NULL;
-	if (plusp == true) {
-	  for (p = accepted_hits; p != NULL; p = List_next(p)) {
-	    hit = (Stage3end_T) List_head(p);
-	    donor = Stage3end_substring_donor(hit);
-	    acceptor = Stage3end_substring_acceptor(hit);
+	for (p = accepted_hits; p != NULL; p = List_next(p)) {
+	  hit = (Stage3end_T) List_head(p);
+	  donor = Stage3end_substring_donor(hit);
+	  acceptor = Stage3end_substring_acceptor(hit);
+	  if (Stage3end_plusp(hit) == true) {
 	    if (Substring_genomicstart(donor) == segmentj_left) {
 	      donor_hits = List_push(donor_hits,(void *) hit);
 	    } else if (Substring_genomicstart(acceptor) == segmentj_left) {
@@ -2698,12 +2693,7 @@ group_by_segmentj_aux (int *found_score, List_T winners, List_T *ambiguous,
 	      abort();
 	      Stage3end_free(&hit);
 	    }
-	  }
-	} else {
-	  for (p = accepted_hits; p != NULL; p = List_next(p)) {
-	    hit = (Stage3end_T) List_head(p);
-	    donor = Stage3end_substring_donor(hit);
-	    acceptor = Stage3end_substring_acceptor(hit);
+	  } else {
 	    if (Substring_genomicend(donor) == segmentj_left) {
 	      donor_hits = List_push(donor_hits,(void *) hit);
 	    } else if (Substring_genomicend(acceptor) == segmentj_left) {
diff --git a/src/stage1hr.c b/src/stage1hr.c
index a6f5fa4..67fb4db 100644
--- a/src/stage1hr.c
+++ b/src/stage1hr.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: stage1hr.c 167163 2015-06-09 20:54:02Z twu $";
+static char rcsid[] = "$Id: stage1hr.c 170517 2015-07-23 23:15:28Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -189,7 +189,7 @@ static int index1part;
 static int index1interval;
 static int spansize;
 static int two_index1intervals;
-static int min_readlength;
+static int min_kmer_readlength;
 static Univ_IIT_T chromosome_iit;
 static int circular_typeint;
 
@@ -1068,6 +1068,10 @@ read_oligos (bool *allvalidp, T this, char *queryuc_ptr, int querylength,
   /* this->maxfloor = 1 + querylength/oligobase * 2; */
 
   if (use_only_sarray_p == true) {
+    *allvalidp = false;
+    return 1;
+  } else if (use_sarray_p == true && querylength < min_kmer_readlength) {
+    *allvalidp = false;
     return 1;
   } else {
     reader = Reader_new(queryuc_ptr,/*querystart*/0,/*queryend*/querylength);
@@ -1252,6 +1256,67 @@ read_oligos (bool *allvalidp, T this, char *queryuc_ptr, int querylength,
 	}
       }
     }
+
+  } else if (mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
+    if (genestrand == +2) {
+      while ((last_state = Oligo_next(last_state,&querypos,&forward,&revcomp,
+				      reader,/*cdnaend*/FIVE)) != DONE) {
+#ifdef LARGE_GENOMES
+	this->plus_positions_high[querypos] = (unsigned char *) NULL;
+	this->plus_positions_low[querypos] = (UINT4 *) NULL;
+	this->minus_positions_high[querypos] = (unsigned char *) NULL;
+	this->minus_positions_low[querypos] = (UINT4 *) NULL;
+#else
+	this->plus_positions[querypos] = (Univcoord_T *) NULL;
+	this->minus_positions[querypos] = (Univcoord_T *) NULL;
+#endif
+	this->plus_npositions[querypos] = 0;
+	this->minus_npositions[querypos] = 0;
+
+	if (last_state == VALID) {
+#ifdef USE_VALIDP
+	  this->validp[querypos] = true;
+#endif
+	  this->plus_retrievedp[querypos] = false;
+	  this->minus_retrievedp[querypos] = false;
+	  
+	  this->forward_oligos[querypos] = Atoi_reduce_ag(forward) & oligobase_mask;
+	  this->revcomp_oligos[querypos] = Atoi_reduce_tc(revcomp >> leftreadshift) & oligobase_mask;
+
+	  debug(printf("At querypos %d, read oligo = %06X\n",querypos,this->forward_oligos[querypos]));
+	  noligos++;
+	}
+      }
+    } else {
+      while ((last_state = Oligo_next(last_state,&querypos,&forward,&revcomp,
+				      reader,/*cdnaend*/FIVE)) != DONE) {
+#ifdef LARGE_GENOMES
+	this->plus_positions_high[querypos] = (unsigned char *) NULL;
+	this->plus_positions_low[querypos] = (UINT4 *) NULL;
+	this->minus_positions_high[querypos] = (unsigned char *) NULL;
+	this->minus_positions_low[querypos] = (UINT4 *) NULL;
+#else
+	this->plus_positions[querypos] = (Univcoord_T *) NULL;
+	this->minus_positions[querypos] = (Univcoord_T *) NULL;
+#endif
+	this->plus_npositions[querypos] = 0;
+	this->minus_npositions[querypos] = 0;
+
+	if (last_state == VALID) {
+#ifdef USE_VALIDP
+	  this->validp[querypos] = true;
+#endif
+	  this->plus_retrievedp[querypos] = false;
+	  this->minus_retrievedp[querypos] = false;
+	  
+	  this->forward_oligos[querypos] = Atoi_reduce_tc(forward) & oligobase_mask;
+	  this->revcomp_oligos[querypos] = Atoi_reduce_ag(revcomp >> leftreadshift) & oligobase_mask;
+
+	  debug(printf("At querypos %d, read oligo = %06X\n",querypos,this->forward_oligos[querypos]));
+	  noligos++;
+	}
+      }
+    }
   }
 
   if (noligos < query_lastpos + 1) {
@@ -6493,7 +6558,6 @@ find_singlesplices_plus (int *found_score, List_T hits, List_T *ambiguous, List_
   double best_prob, prob, donor_prob, acceptor_prob;
   Substring_T donor, acceptor;
 
-  int sensedir;
 #ifdef LARGE_GENOMES
   Uint8list_T ambcoords;
 #else
@@ -7220,7 +7284,6 @@ find_singlesplices_minus (int *found_score, List_T hits, List_T *ambiguous, List
   double best_prob, prob, donor_prob, acceptor_prob;
   Substring_T donor, acceptor;
 
-  int sensedir;
 #ifdef LARGE_GENOMES
   Uint8list_T ambcoords;
 #else
@@ -9395,7 +9458,6 @@ find_spliceends_distant_dna_plus (List_T **distant_startfrags, List_T **distant_
   int *floors_from_neg3, *floors_to_pos3;
 
   int splice_pos_start, splice_pos_end;
-  int i;
 
 #ifdef HAVE_ALLOCA
   int *mismatch_positions = (int *) ALLOCA((querylength+1)*sizeof(int));
@@ -9569,7 +9631,6 @@ find_spliceends_distant_dna_minus (List_T **distant_startfrags, List_T **distant
   int *floors_from_neg3, *floors_to_pos3;
 
   int splice_pos_start, splice_pos_end;
-  int i;
 
 #ifdef HAVE_ALLOCA
   int *mismatch_positions = (int *) ALLOCA((querylength+1)*sizeof(int));
@@ -14076,6 +14137,7 @@ convert_minus_segments_to_gmap_via_region (History_T gmap_history, List_T hits,
 #endif
   
   
+/* Segment chaining */
 static List_T
 convert_plus_segments_to_gmap (History_T gmap_history, List_T hits,
 			       char *accession, char *queryuc_ptr, int querylength, int query_lastpos,
@@ -14704,6 +14766,7 @@ convert_plus_segments_to_gmap (History_T gmap_history, List_T hits,
 }
 
 
+/* Segment chaining */
 static List_T
 convert_minus_segments_to_gmap (History_T gmap_history, List_T hits,
 				char *accession, char *queryuc_ptr, int querylength, int query_lastpos,
@@ -15336,11 +15399,10 @@ align_singleend_with_gmap (History_T gmap_history, List_T result, T this,
 			   Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
 			   Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
 			   int user_maxlevel, int cutoff_level, bool first_read_p) {
-  List_T new_result = NULL, gmap_hits = NULL;
+  List_T new_result = NULL;
   Stage3end_T hit, gmap;
-  List_T p, a;
+  List_T p;
   int genestrand;
-  int missing_hit, missing_gmap;
   int i;
   
   
@@ -15454,17 +15516,16 @@ align_end (int *cutoff_level, History_T gmap_history, T this,
 	   int user_maxlevel, int indel_penalty_middle, int indel_penalty_end,
 	   int localsplicing_penalty, int distantsplicing_penalty, int min_shortend,
 	   bool allow_end_indels_p, int max_end_insertions, int max_end_deletions, int min_indel_end_matches,
-	   bool allvalidp, bool keep_floors_p, int genestrand, bool first_read_p) {
-  List_T hits, greedy = NULL, subs = NULL, terminals = NULL, indels = NULL, new_indels,
-    ambiguous = NULL, singlesplicing = NULL, doublesplicing = NULL, shortendsplicing = NULL,
+	   bool keep_floors_p, int genestrand, bool first_read_p) {
+  List_T hits, greedy = NULL, subs = NULL, terminals = NULL, indels = NULL,
+    singlesplicing = NULL, doublesplicing = NULL, shortendsplicing = NULL,
     longsinglesplicing = NULL, distantsplicing = NULL, gmap_hits = NULL;
   List_T plus_anchor_segments = NULL, minus_anchor_segments = NULL;
-  List_T p, a;
+  List_T p;
   Stage3end_T hit, gmap;
   int nmisses_allowed_sarray;
-  int found_score, done_level, opt_level, fast_level, mismatch_level, nmismatches, max_mismatches_allowed;
+  int found_score, done_level, opt_level, fast_level, mismatch_level, nmismatches;
   int max_splice_mismatches, i;
-  int missing_hit, missing_gmap;
   int nhits = 0, nsplicepairs = 0;
   List_T *startfrags_plus, *endfrags_plus, *startfrags_minus, *endfrags_minus;
   List_T *donors_plus, *antidonors_plus, *acceptors_plus, *antiacceptors_plus,
@@ -15472,8 +15533,9 @@ align_end (int *cutoff_level, History_T gmap_history, T this,
   bool any_omitted_p, ambiguousp, alloc_floors_p = false, floors_computed_p = false;
   Floors_T floors;
   bool spanningsetp, completesetp, gmapp;
-  bool segments_computed_p = false, gmap_better_p, extend_left_p, extend_right_p;
+  bool segments_computed_p = false;
   Indexdb_T plus_indexdb, minus_indexdb;
+  bool allvalidp;
   
   if (genestrand == +2) {
     plus_indexdb = indexdb_rev;
@@ -15484,9 +15546,15 @@ align_end (int *cutoff_level, History_T gmap_history, T this,
   }
   
   found_score = querylength;
-  fast_level = (querylength + index1interval - 1)/spansize - NREQUIRED_FAST;
-  debug(printf("fast_level %d = (querylength %d + index1interval %d - 1)/spansize %d - nrequired_fast %d\n",
-	       fast_level,querylength,index1interval,spansize,NREQUIRED_FAST));
+  if (querylength < min_kmer_readlength) {
+    fast_level = querylength - 1 - NREQUIRED_FAST;
+    debug(printf("fast_level %d = querylength %d - 1 - nrequired_fast %d\n",
+		 fast_level,querylength,NREQUIRED_FAST));
+  } else {
+    fast_level = (querylength + index1interval - 1)/spansize - NREQUIRED_FAST;
+    debug(printf("fast_level %d = (querylength %d + index1interval %d - 1)/spansize %d - nrequired_fast %d\n",
+		 fast_level,querylength,index1interval,spansize,NREQUIRED_FAST));
+  }
   
 #if 0
   /* This prevents complete_mm procedure, needed for short reads */
@@ -15528,7 +15596,7 @@ align_end (int *cutoff_level, History_T gmap_history, T this,
   nmisses_allowed_sarray = *cutoff_level;
   
 #ifndef LARGE_GENOMES
-  if (use_only_sarray_p == true) {
+  if (use_only_sarray_p == true || (use_sarray_p == true && querylength < min_kmer_readlength)) {
     hits = Sarray_search_greedy(&(*cutoff_level),
 				queryuc_ptr,queryrc,querylength,query_compress_fwd,query_compress_rev,maxpeelback,pairpool,
 				dynprogL,dynprogM,dynprogR,oligoindices_minor,diagpool,cellpool,
@@ -15572,6 +15640,9 @@ align_end (int *cutoff_level, History_T gmap_history, T this,
   }
 #endif
   
+  if (querylength < min_kmer_readlength) {
+    spanningsetp = false;
+  }
 
   /* Search 2: Exact/subs via spanning set */
 
@@ -15640,6 +15711,10 @@ align_end (int *cutoff_level, History_T gmap_history, T this,
     completesetp = false;
   }
 
+  if (querylength < min_kmer_readlength) {
+    completesetp = false;
+  }
+
   if (completesetp == true) {
     if (this->read_oligos_p == false) {
       read_oligos(&allvalidp,this,queryuc_ptr,querylength,query_lastpos,/*genestrand*/0,
@@ -16217,1158 +16292,1143 @@ align_end (int *cutoff_level, History_T gmap_history, T this,
 		       gmap,Stage3end_nmatches_posttrim(gmap),missing_gmap,Stage3end_nmatches_posttrim(hit),missing_hit));
 	gmap_hits = List_push(gmap_hits,(void *) gmap);
 	Stage3end_set_improved_by_gmap(hit);
-       }
-     }
-   }
-   debug13(printf("Have %d GMAP hits\n",List_length(gmap_hits)));
-
-   if (alloc_floors_p == true) {
-     Floors_free(&floors);
-   }
-
-   /* Keep gmap_hits found in search 9 and 10 */
-   if (gmap_hits != NULL) {
-     hits = List_append(hits,gmap_hits);
-   }
-
-   if (gmap_improvement_p == false) {
-     debug(printf("No GMAP improvement: Before remove_overlaps at cutoff level %d: %d\n",*cutoff_level,List_length(hits)));
-     hits = Stage3end_optimal_score(hits,*cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
-				    querylength,/*keep_gmap_p*/true,/*finalp*/true);
-     hits = Stage3end_reject_trimlengths(hits);
-     hits = Stage3end_remove_overlaps(hits,/*finalp*/true);
-     hits = Stage3end_optimal_score(hits,*cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
-				    querylength,/*keep_gmap_p*/false,/*finalp*/true);
-     hits = Stage3end_resolve_multimapping(hits);
-     debug(printf("After remove_overlaps: %d\n",List_length(hits)));
-
-   } else {
-     debug(printf("GMAP improvement: Before remove_overlaps at cutoff level %d: %d\n",*cutoff_level,List_length(hits)));
-     hits = Stage3end_optimal_score(hits,*cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
-				    querylength,/*keep_gmap_p*/true,/*finalp*/false);
-     /* Don't reject based on trimlength until after GMAP improvements */
-     hits = Stage3end_remove_overlaps(hits,/*finalp*/false);
-     hits = Stage3end_optimal_score(hits,*cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
-				    querylength,/*keep_gmap_p*/false,/*finalp*/false);
-     hits = Stage3end_resolve_multimapping(hits);
-     debug(printf("After remove_overlaps: %d\n",List_length(hits)));
-
-     hits = align_singleend_with_gmap(gmap_history,hits,this,query_compress_fwd,query_compress_rev,
-				      accession,queryuc_ptr,querylength,query_lastpos,
-				      oligoindices_major,oligoindices_minor,
-				      pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,user_maxlevel,*cutoff_level,
-				      first_read_p);
-     hits = Stage3end_optimal_score(hits,*cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
-				    querylength,/*keep_gmap_p*/true,/*finalp*/true);
-     hits = Stage3end_reject_trimlengths(hits);
-     hits = Stage3end_remove_overlaps(hits,/*finalp*/true);
-     hits = Stage3end_optimal_score(hits,*cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
-				    querylength,/*keep_gmap_p*/false,/*finalp*/true);
-     hits = Stage3end_resolve_multimapping(hits);
-   }
-
-   hits = Stage3end_remove_circular_alias(hits);
-   hits = Stage3end_remove_duplicates(hits); /* Aliases can cause duplicates */
-
-   List_free(&plus_anchor_segments);
-   List_free(&minus_anchor_segments);
-
-   return hits;
- }
-
-
- static Stage3end_T *
- single_read (int *npaths, int *first_absmq, int *second_absmq,
-	      Shortread_T queryseq, Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev,
-	      int indexdb_size_threshold, Floors_T *floors_array,
-	      double user_maxlevel_float, int indel_penalty_middle, int indel_penalty_end,
-	      bool allow_end_indels_p, int max_end_insertions, int max_end_deletions, int min_indel_end_matches,
-	      int localsplicing_penalty, int distantsplicing_penalty, int min_shortend,
-	      Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
-	      Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
-	      Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
-	      bool keep_floors_p) {
-   Stage3end_T *stage3array;
-   History_T gmap_history;
-   List_T hits = NULL;
-   T this = NULL;
-   int user_maxlevel;
-   int querylength, query_lastpos, cutoff_level;
-   char *queryuc_ptr, *quality_string;
-   Compress_T query_compress_fwd = NULL, query_compress_rev = NULL;
-   bool allvalidp;
-
- #ifdef HAVE_ALLOCA
-   char *queryrc;
- #else
-   char queryrc[MAX_READLENGTH+1];
- #endif
-
-   if ((querylength = Shortread_fulllength(queryseq)) < min_readlength) {
-     fprintf(stderr,"Read %s has length %d < min_readlength %d.  Skipping.\n",
-	     Shortread_accession(queryseq),querylength,min_readlength);
-     /* fprintf(stderr,"You may want to build a genomic index with a smaller k-mer value using the -k flag to gmap_build\n"); */
-     *npaths = 0;
-     return (Stage3end_T *) NULL;
-
- #ifndef HAVE_ALLOCA
-   } else if (querylength > MAX_READLENGTH) {
-     fprintf(stderr,"Read %s has length %d > MAX_READLENGTH %d.  Either run configure and make again with a higher value of MAX_READLENGTH, or consider using GMAP instead.\n",
-	     Shortread_accession(queryseq),querylength,MAX_READLENGTH);
-     *npaths = 0;
-     return (Stage3end_T *) NULL;
- #endif
-
-   } else {
-     if (user_maxlevel_float < 0.0) {
-       user_maxlevel = -1;
-     } else if (user_maxlevel_float > 0.0 && user_maxlevel_float < 1.0) {
-       user_maxlevel = (int) rint(user_maxlevel_float * (double) querylength);
-     } else {
-       user_maxlevel = (int) user_maxlevel_float;
-     }
-
-     /* Limit search on repetitive sequences */
-     queryuc_ptr = Shortread_fullpointer_uc(queryseq);
-     quality_string = Shortread_quality_string(queryseq);
-     if (check_dinucleotides(queryuc_ptr,querylength) == false) {
-       user_maxlevel = 0;
-     }
-
-     query_compress_fwd = Compress_new_fwd(queryuc_ptr,querylength);
-     query_compress_rev = Compress_new_rev(queryuc_ptr,querylength);
- #ifdef HAVE_ALLOCA
-     queryrc = (char *) ALLOCA((querylength+1)*sizeof(int));
- #endif
-     make_complement_buffered(queryrc,queryuc_ptr,querylength);
-
-     this = Stage1_new(querylength);
-     query_lastpos = querylength - index1part;
-
-     gmap_history = History_new();
-     hits = align_end(&cutoff_level,gmap_history,this,
-		      query_compress_fwd,query_compress_rev,
-		      Shortread_accession(queryseq),queryuc_ptr,queryrc,querylength,query_lastpos,
-		      indexdb_fwd,indexdb_rev,indexdb_size_threshold,floors_array,
-		      oligoindices_major,oligoindices_minor,
-		      pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
-		      user_maxlevel,indel_penalty_middle,indel_penalty_end,
-		      localsplicing_penalty,distantsplicing_penalty,min_shortend,
-		      allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
-		      allvalidp,keep_floors_p,/*genestrand*/0,/*first_read_p*/true);
-
-     if ((*npaths = List_length(hits)) == 0) {
-       stage3array = (Stage3end_T *) NULL;
-     } else {
-       stage3array = (Stage3end_T *) List_to_array_out(hits,NULL); List_free(&hits); /* Return value */
-       stage3array = Stage3end_eval_and_sort(&(*npaths),&(*first_absmq),&(*second_absmq),
-					     stage3array,maxpaths_search,queryseq,queryuc_ptr,queryrc,
-					     query_compress_fwd,query_compress_rev,
-					     quality_string,/*displayp*/true);
-     }
-     
-     History_free(&gmap_history);
-     Compress_free(&query_compress_fwd);
-     Compress_free(&query_compress_rev);
-     Stage1_free(&this,querylength); 
-     return stage3array;
-   }
- }
-
-
- static Stage3end_T *
- single_read_tolerant_nonstranded (int *npaths, int *first_absmq, int *second_absmq,
-				   Shortread_T queryseq, Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev,
-				   int indexdb_size_threshold, Floors_T *floors_array,
-				   double user_maxlevel_float, int indel_penalty_middle, int indel_penalty_end,
-				   bool allow_end_indels_p, int max_end_insertions, int max_end_deletions, int min_indel_end_matches,
-				   int localsplicing_penalty, int distantsplicing_penalty, int min_shortend,
-				   Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor, 
-				   Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
-				   Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
-				   bool keep_floors_p) {
-   Stage3end_T *stage3array;
-   History_T gmap_history;
-   List_T hits, hits_geneplus = NULL, hits_geneminus = NULL;
-   T this_geneplus = NULL, this_geneminus = NULL;
-   int user_maxlevel;
-   int querylength, query_lastpos, cutoff_level;
-   char *queryuc_ptr, *quality_string;
-   Compress_T query_compress_fwd = NULL, query_compress_rev = NULL;
-   bool allvalidp;
-
- #ifdef HAVE_ALLOCA
-   char *queryrc;
- #else
-   char queryrc[MAX_READLENGTH+1];
- #endif
-
-
-   if ((querylength = Shortread_fulllength(queryseq)) < min_readlength) {
-     fprintf(stderr,"Read %s has length %d < min_readlength %d.  Skipping\n",
-	     Shortread_accession(queryseq),querylength,min_readlength);
-     /* fprintf(stderr,"You may want to build a genomic index with a smaller k-mer value using the -k flag to gmap_build\n"); */
-     *npaths = 0;
-     return (Stage3end_T *) NULL;
-
- #ifndef HAVE_ALLOCA
-   } else if (querylength > MAX_READLENGTH) {
-     fprintf(stderr,"Read %s has length %d > MAX_READLENGTH %d.  Either run configure and make again with a higher value of MAX_READLENGTH, or consider using GMAP instead.\n",
-	     Shortread_accession(queryseq),querylength,MAX_READLENGTH);
-     *npaths = 0;
-     return (Stage3end_T *) NULL;
- #endif
-
-   } else {
-     if (user_maxlevel_float < 0.0) {
-       user_maxlevel = -1;
-     } else if (user_maxlevel_float > 0.0 && user_maxlevel_float < 1.0) {
-       user_maxlevel = (int) rint(user_maxlevel_float * (double) querylength);
-     } else {
-       user_maxlevel = (int) user_maxlevel_float;
-     }
-
-     this_geneplus = Stage1_new(querylength);
-     this_geneminus = Stage1_new(querylength);
-
-     queryuc_ptr = Shortread_fullpointer_uc(queryseq);
-     quality_string = Shortread_quality_string(queryseq);
-     query_lastpos = querylength - index1part;
-
-     /* Limit search on repetitive sequences */
-     if (check_dinucleotides(queryuc_ptr,querylength) == false) {
-       user_maxlevel = 0;
-     }
-
-     query_compress_fwd = Compress_new_fwd(queryuc_ptr,querylength);
-     query_compress_rev = Compress_new_rev(queryuc_ptr,querylength);
-     gmap_history = History_new();
- #ifdef HAVE_ALLOCA
-     queryrc = (char *) ALLOCA((querylength+1)*sizeof(char));
- #endif
-     make_complement_buffered(queryrc,queryuc_ptr,querylength);
-
-     if (read_oligos(&allvalidp,this_geneplus,queryuc_ptr,querylength,query_lastpos,/*genestrand*/+1,
-		     /*first_read_p*/true) > 0) {
-       hits_geneplus = align_end(&cutoff_level,gmap_history,this_geneplus,
-				 query_compress_fwd,query_compress_rev,
-				 Shortread_accession(queryseq),queryuc_ptr,queryrc,querylength,query_lastpos,
-				 indexdb_fwd,indexdb_rev,indexdb_size_threshold,
-				 floors_array,oligoindices_major,oligoindices_minor,
-				 pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
-				 user_maxlevel,indel_penalty_middle,indel_penalty_end,
-				 localsplicing_penalty,distantsplicing_penalty,min_shortend,
-				 allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
-				 allvalidp,keep_floors_p,/*genestrand*/+1,/*first_read_p*/true);
-     }
-
-     if (read_oligos(&allvalidp,this_geneminus,queryuc_ptr,querylength,query_lastpos,/*genestrand*/+2,
-		     /*first_read_p*/true) > 0) {
-       hits_geneminus = align_end(&cutoff_level,gmap_history,this_geneminus,
-				  query_compress_fwd,query_compress_rev,
-				  Shortread_accession(queryseq),queryuc_ptr,queryrc,querylength,query_lastpos,
-				  indexdb_fwd,indexdb_rev,indexdb_size_threshold,
-				  floors_array,oligoindices_major,oligoindices_minor,
-				  pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
-				  user_maxlevel,indel_penalty_middle,indel_penalty_end,
-				  localsplicing_penalty,distantsplicing_penalty,min_shortend,
-				  allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
-				  allvalidp,keep_floors_p,/*genestrand*/+2,/*first_read_p*/true);
-     }
-
-     hits = List_append(hits_geneplus,hits_geneminus);
-     hits = Stage3end_optimal_score(hits,cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
-				    querylength,/*keep_gmap_p*/true,/*finalp*/true);
-     hits = Stage3end_reject_trimlengths(hits);
-     hits = Stage3end_remove_overlaps(hits,/*finalp*/true);
-     hits = Stage3end_optimal_score(hits,cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
-				    querylength,/*keep_gmap_p*/false,/*finalp*/true);
-     hits = Stage3end_resolve_multimapping(hits);
-
-     if ((*npaths = List_length(hits)) == 0) {
-       stage3array = (Stage3end_T *) NULL;
-     } else {
-       stage3array = (Stage3end_T *) List_to_array_out(hits,NULL); List_free(&hits); /* Return value */
-       stage3array = Stage3end_eval_and_sort(&(*npaths),&(*first_absmq),&(*second_absmq),
-					     stage3array,maxpaths_search,queryseq,queryuc_ptr,queryrc,
-					     query_compress_fwd,query_compress_rev,
-					     quality_string,/*displayp*/true);
-     }
-
-     History_free(&gmap_history);
-     Compress_free(&query_compress_fwd);
-     Compress_free(&query_compress_rev);
-     Stage1_free(&this_geneminus,querylength); 
-     Stage1_free(&this_geneplus,querylength); 
-     return stage3array;
-   }
- }
-
-
- Stage3end_T *
- Stage1_single_read (int *npaths, int *first_absmq, int *second_absmq,
-		     Shortread_T queryseq, Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev,
-		     int indexdb_size_threshold, Floors_T *floors_array,
-		     double user_maxlevel_float, int indel_penalty_middle, int indel_penalty_end,
-		     bool allow_end_indels_p, int max_end_insertions, int max_end_deletions, int min_indel_end_matches,
-		     int localsplicing_penalty, int distantsplicing_penalty, int min_shortend,
-		     Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
-		     Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
-		     Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
-		     bool keep_floors_p) {
-
-   if (mode == STANDARD || mode == CMET_STRANDED || mode == ATOI_STRANDED) {
-     return single_read(&(*npaths),&(*first_absmq),&(*second_absmq),
-			queryseq,indexdb_fwd,indexdb_rev,indexdb_size_threshold,
-			floors_array,user_maxlevel_float,
-			indel_penalty_middle,indel_penalty_end,
-			allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
-			localsplicing_penalty,distantsplicing_penalty,min_shortend,
-			oligoindices_major,oligoindices_minor,
-			pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,keep_floors_p);
-   } else if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED) {
-     return single_read_tolerant_nonstranded(&(*npaths),&(*first_absmq),&(*second_absmq),queryseq,
-					     indexdb_fwd,indexdb_rev,indexdb_size_threshold,
-					     floors_array,user_maxlevel_float,
-					     indel_penalty_middle,indel_penalty_end,
-					     allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
-					     localsplicing_penalty,distantsplicing_penalty,min_shortend,
-					     oligoindices_major,oligoindices_minor,
-					     pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,keep_floors_p);
-   } else {
-     fprintf(stderr,"Do not recognize mode %d\n",mode);
-     abort();
-   }
- }
-
-
-
- /* #define HITARRAY_SHORTENDSPLICING 4 */
- /* #define HITARRAY_DISTANTSPLICING 4 */
-
-
- static List_T
- align_halfmapping_with_gmap (History_T gmap_history, Stage3end_T hit5, Stage3end_T hit3, 
-			      Shortread_T queryseq5, Shortread_T queryseq3,
-			      char *queryuc_ptr, int querylength, int query_lastpos,
- #ifdef END_KNOWNSPLICING_SHORTCUT
-			      char *queryrc, bool invertedp,
- #endif
-			      Compress_T query_compress_fwd, Compress_T query_compress_rev,
-			      struct Segment_T *plus_segments, int plus_nsegments,
-			      struct Segment_T *minus_segments, int minus_nsegments,
-			      Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
-			      Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
-			      Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
-			      Chrpos_T pairmax, Chrpos_T shortsplicedist, int user_maxlevel,
-			      int genestrand, bool first_read_p) {
-   List_T hits = NULL;
-   int sensedir, sense_try;
-   int overlap;
-
-   int zero_offset = 0;
-   Univcoord_T segmentstart, segmentend;
-   Univcoord_T genomicbound, genomicbound2, mappingstart, mappingend,
-     chroffset, chrhigh, mappingpos;
- #ifdef USE_GREEDY
-   Univcoord_T close_mappingstart_greedy, close_mappingend_greedy,
-     middle_mappingstart_greedy, middle_mappingend_greedy;
- #endif
-   Univcoord_T close_mappingstart_last, close_mappingend_last,
-     middle_mappingstart_last, middle_mappingend_last;
-   Univcoord_T knownsplice_limit_low, knownsplice_limit_high;
-   Univcoord_T close_knownsplice_limit_low, close_knownsplice_limit_high;
-   Chrpos_T chrlength;
-   Chrnum_T chrnum;
-   bool close_mappingstart_p = false, close_mappingend_p = false;
-   bool middle_mappingstart_p = false, middle_mappingend_p = false;
-   bool fallback_mappingstart_p, fallback_mappingend_p;
-   bool good_start_p, good_end_p, watsonp, favor_right_p;
-
-   int starti, endi, i;
-
-   if (hit3 == NULL) {
-     /* Both events are tested by Stage3end_anomalous_splice_p */
-     if ((chrnum = Stage3end_chrnum(hit5)) == 0) {
-       /* Translocation */
-       return (List_T) NULL;
-
-     } else if (Stage3end_hittype(hit5) == SAMECHR_SPLICE) {
-       /* A genomic event that doesn't get reflected in chrnum */
-       return (List_T) NULL;
-
-     } else if ((watsonp = Stage3end_plusp(hit5)) == true) {
-       chroffset = Stage3end_chroffset(hit5);
-       chrhigh = Stage3end_chrhigh(hit5);
-       chrlength = Stage3end_chrlength(hit5);
-
-       if (Shortread_find_primers(queryseq5,queryseq3) == true) {
-	 /* Go from genomicstart */
-	 debug13(printf("Found primers\n"));
-	 genomicbound = Stage3end_genomicstart(hit5);
-
-       } else if (Stage3end_anomalous_splice_p(hit5) == true) {
-	 /* Go from genomicstart */
-	 debug13(printf("Anomalous splice\n"));
-	 genomicbound = Stage3end_genomicstart(hit5);
-
-       } else {
-	 genomicbound = Stage3end_genomicend(hit5);
-
- #if 0
-	 /* TODO: Previously called Shortread_find_overlap.  Now with Shortread_max_overlap, can optimize this code */
-	 if ((overlap = Shortread_max_overlap(queryseq5,queryseq3)) > 0 &&
-	     Stage3end_genomicbound_from_end(&genomicbound2,hit5,overlap,chroffset) == true) {
-	   debug13(printf("Found overlap of %d\n",overlap));
-	   if (genomicbound2 < genomicbound) {
-	     zero_offset = genomicbound - genomicbound2;
-	     genomicbound = genomicbound2;
-	   }
-	 }
- #endif
-       }
-
-       debug13(printf("Case 1: hit5 plus %s %u..%u (sensedir %d) => genomicbound %u\n",
-		      Stage3end_hittype_string(hit5),
-		      Stage3end_genomicstart(hit5) - chroffset,Stage3end_genomicend(hit5) - chroffset,
-		      Stage3end_sensedir(hit5),genomicbound - chroffset));
-
-       knownsplice_limit_low = mappingstart = segmentstart = genomicbound;
-       knownsplice_limit_high =  add_bounded(Stage3end_genomicend(hit5),pairmax + shortsplicedist,chrhigh);
-       segmentend = add_bounded(Stage3end_genomicend(hit5),pairmax,chrhigh);
- #ifdef LONG_ENDSPLICES
-       mappingend = add_bounded(Stage3end_genomicend(hit5),pairmax + shortsplicedist,chrhigh);
- #else
-       mappingend = add_bounded(Stage3end_genomicend(hit5),pairmax + shortsplicedist_novelend,chrhigh);
- #endif
-       debug13(printf("Original bounds E: knownsplice_limit_low %u, knownsplice_limit_high %u, mappingend %u\n",
-		      knownsplice_limit_low - chroffset,knownsplice_limit_high - chroffset,mappingend - chroffset));
-
-       close_mappingend_last = middle_mappingend_last = Stage3end_genomicend(hit5);
- #ifdef USE_GREEDY
-       close_mappingend_greedy = middle_mappingend_greedy = segmentend;
- #endif
-
-       if (plus_nsegments > 0) {
-	 /* Use segments to bound */
-	 debug13(printf("Finding segments from segmentstart %u to segmentend %u (plus_nsegments %d)\n",
-			segmentstart - chroffset,segmentend - chroffset,plus_nsegments));
-	 starti = endi = -1;
-	 i = binary_search_segments(0,plus_nsegments-1,plus_segments,segmentstart);
-	 while (i < plus_nsegments - 1 && plus_segments[i].diagonal == (Univcoord_T) -1) {
-	   i++;
-	 }
-	 starti = i;
-	 while (plus_segments[i].diagonal < segmentend) {
-	   endi = i;
-	   i++;
-	 }
-	 if (starti >= 0 && endi >= 0) {
-	   debug13(printf("starti = %d, endi = %d\n",starti,endi));
-	   assert(starti <= endi);
-	   for (i = starti; i <= endi; i++) {
-	     debug13(printf("diagonal %u (%llu), querypos %d..%d\n",
-			    (Chrpos_T) (plus_segments[i].diagonal - chroffset),(unsigned long long) plus_segments[i].diagonal,
-			    plus_segments[i].querypos5,plus_segments[i].querypos3));
-	     if (query_lastpos - plus_segments[i].querypos3 >= STAGE2_MIN_OLIGO + index1interval) {
-	       /* Case 1. Missing end of query, so there could be a middle splice */
-	       debug13b(printf("  query_lastpos %d - querypos3 %d >= %d + %d, so using this diagonal plus shortsplicedist\n",
-			       query_lastpos,plus_segments[i].querypos3,STAGE2_MIN_OLIGO,index1interval));
- #ifdef USE_GREEDY
-	       if ((mappingpos = add_bounded(plus_segments[i].diagonal,shortsplicedist_novelend,chrhigh)) < middle_mappingend_greedy &&
-		   mappingpos > genomicbound) {
-		 middle_mappingend_greedy = mappingpos;
-		 middle_mappingend_p = true;
-		 debug13(printf("  Redefining middle mappingend greedy to %u\n",middle_mappingend_greedy - chroffset));
-	       }
- #endif
-
- #ifdef LONG_ENDSPLICES
-	       if ((mappingpos = add_bounded(plus_segments[i].diagonal,shortsplicedist,chrhigh)) > middle_mappingend_last) {
-		 /* Use > for NOT_GREEDY */
-		 middle_mappingend_last = mappingpos;
-		 middle_mappingend_p = true;
-		 debug13(printf("  Redefining middle mappingend last to %u\n",middle_mappingend_last - chroffset));
-	       }
- #else
-	       if ((mappingpos = plus_segments[i].diagonal) > middle_mappingend_last) {
-		 /* Use > for NOT_GREEDY */
-		 middle_mappingend_last = mappingpos;
-		 middle_mappingend_p = true;
-		 debug13(printf("  Redefining middle mappingend last to %u\n",middle_mappingend_last - chroffset));
-	       }
- #endif
-
-	     } else {
-	       debug13b(printf("  query_lastpos %d - querypos3 %d < %d + %d, so using this diagonal\n",
-			       query_lastpos,plus_segments[i].querypos3,STAGE2_MIN_OLIGO,index1interval));
- #ifdef USE_GREEDY
-	       if ((mappingpos = plus_segments[i].diagonal) < close_mappingend_greedy &&
-		   mappingpos > genomicbound) {
-		 close_mappingend_greedy = mappingpos;
-		 close_mappingend_p = true;
-		 debug13(printf("  Redefining close mappingend greedy to %u\n",close_mappingend_greedy - chroffset));
-	       }
- #endif
-	       if ((mappingpos = plus_segments[i].diagonal) > close_mappingend_last) {
-		 /* Use > for NOT_GREEDY */
-		 close_mappingend_last = mappingpos;
-		 close_mappingend_p = true;
-		 debug13(printf("  Redefining close mappingend last to %u\n",close_mappingend_last - chroffset));
-	       }
-	     }
-	   }
+      }
+    }
+  }
+  debug13(printf("Have %d GMAP hits\n",List_length(gmap_hits)));
 
- #ifdef USE_GREEDY
-	   if (close_mappingend_p == true) {
-	     close_knownsplice_limit_high = add_bounded(close_mappingend_greedy,shortsplicedist,chrhigh);
-	   } else if (middle_mappingend_p == true) {
-	     debug13(printf("Using middle mappingend\n"));
-	     close_knownsplice_limit_high = middle_mappingend_greedy;
-	     close_mappingend_greedy = middle_mappingend_greedy;
-	     close_mappingend_p = true;
-	   }
- #else
-	   if (close_mappingend_p == true) {
-	     close_knownsplice_limit_high = add_bounded(close_mappingend_last,shortsplicedist,chrhigh);
-	   } else if (middle_mappingend_p == true) {
-	     debug13(printf("Using middle mappingend\n"));
-	     close_knownsplice_limit_high = middle_mappingend_last;
-	     close_mappingend_last = middle_mappingend_last;
-	     close_mappingend_p = true;
-	   }
- #endif
- #ifdef USE_GREEDY
-	   if (middle_mappingend_p == true && middle_mappingend_last > close_mappingend_greedy) {
-	     knownsplice_limit_high = middle_mappingend_last;
-	     mappingend = middle_mappingend_last;
-	   } else if (close_mappingend_p == true && close_mappingend_last != close_mappingend_greedy) {
-	     knownsplice_limit_high = add_bounded(close_mappingend_last,shortsplicedist,chrhigh);
-	     mappingend = close_mappingend_last;
-	   }
- #else
-	   if (middle_mappingend_p == true && middle_mappingend_last > close_mappingend_last) {
-	     knownsplice_limit_high = middle_mappingend_last;
-	     mappingend = middle_mappingend_last;
-	   }
- #endif
-
-	   if (close_mappingend_p == false) {
-	     fallback_mappingend_p = false;
- #ifdef USE_GREEDY
-	   } else if (mappingend <= close_mappingend_greedy) {
-	     fallback_mappingend_p = false;
- #endif
-	   } else {
-	     debug13(printf("Fallback mappingend = %u\n",mappingend - chroffset));
-	     fallback_mappingend_p = true;
-	   }
-	 }
-       }
-
-       favor_right_p = false;
-
-     } else {
-       chroffset = Stage3end_chroffset(hit5);
-       chrhigh = Stage3end_chrhigh(hit5);
-       chrlength = Stage3end_chrlength(hit5);
-
-       if (Shortread_find_primers(queryseq5,queryseq3) == true) {
-	 /* Go from genomicstart */
-	 debug13(printf("Found primers\n"));
-	 genomicbound = Stage3end_genomicstart(hit5);
-
-       } else if (Stage3end_anomalous_splice_p(hit5) == true) {
-	 /* Go from genomicstart */
-	 debug13(printf("Anomalous splice\n"));
-	 genomicbound = Stage3end_genomicstart(hit5);
-
-       } else {
-	 genomicbound = Stage3end_genomicend(hit5);
-
- #if 0
-	 /* TODO: Previously called Shortread_find_overlap.  Now with Shortread_max_overlap, can optimize this code */
-	 if ((overlap = Shortread_max_overlap(queryseq5,queryseq3)) > 0 &&
-	     Stage3end_genomicbound_from_end(&genomicbound2,hit5,overlap,chroffset) == true) {
-	   debug13(printf("Found overlap of %d\n",overlap));
-	   if (genomicbound2 > genomicbound) {
-	     zero_offset = genomicbound2 - genomicbound;
-	     genomicbound = genomicbound2;
-	   }
-	 }
- #endif
-       }
-
-       debug13(printf("Case 2: hit5 minus %s %u..%u (sensedir %d) => genomicbound %u\n",
-		      Stage3end_hittype_string(hit5),
-		      Stage3end_genomicstart(hit5) - chroffset,Stage3end_genomicend(hit5) - chroffset,
-		      Stage3end_sensedir(hit5),genomicbound - chroffset));
-
-       knownsplice_limit_high = mappingend = segmentend = genomicbound;
-       knownsplice_limit_low = subtract_bounded(Stage3end_genomicend(hit5),pairmax + shortsplicedist,chroffset);
-       segmentstart = subtract_bounded(Stage3end_genomicend(hit5),pairmax,chroffset);
- #ifdef LONG_ENDSPLICES
-       mappingstart = subtract_bounded(Stage3end_genomicend(hit5),pairmax + shortsplicedist,chroffset);
- #else
-       mappingstart = subtract_bounded(Stage3end_genomicend(hit5),pairmax + shortsplicedist_novelend,chroffset);
- #endif
-       debug13(printf("Original bounds F: knownsplice_limit_low %u, knownsplice_limit_high %u, mappingstart %u\n",
-		      knownsplice_limit_low - chroffset,knownsplice_limit_high - chroffset,mappingstart - chroffset));
-
-       close_mappingstart_last = middle_mappingstart_last = Stage3end_genomicend(hit5);
- #ifdef USE_GREEDY
-       close_mappingstart_greedy = middle_mappingstart_greedy = segmentstart;
- #endif
-
-       if (minus_nsegments > 0) {
-	 /* Use segments to bound */
-	 debug13(printf("Finding segments from segmentstart %u to segmentend %u (minus_nsegments %d)\n",
-			segmentstart - chroffset,segmentend - chroffset,minus_nsegments));
-	 starti = endi = -1;
-	 i = binary_search_segments(0,minus_nsegments-1,minus_segments,segmentend);
-	 while (i >= 0 && minus_segments[i].diagonal >= segmentend) {
-	   i--;
-	 }
-	 starti = i;
-	 while (i >= 0 && minus_segments[i].diagonal > segmentstart) {
-	   if (minus_segments[i].diagonal < (Univcoord_T) -1) {
-	     endi = i;
-	   }
-	   i--;
-	 }
-	 if (starti >= 0 && endi >= 0) {
-	   debug13(printf("starti = %d, endi = %d\n",starti,endi));
-	   assert(starti >= endi);
-	   for (i = starti; i >= endi; i--) {
-	     debug13(printf("diagonal %u (%llu), querypos %d..%d\n",
-			    (Chrpos_T) (minus_segments[i].diagonal - chroffset),(unsigned long long) minus_segments[i].diagonal,
-			    minus_segments[i].querypos5,minus_segments[i].querypos3));
-	     if (query_lastpos - minus_segments[i].querypos3 >= STAGE2_MIN_OLIGO + index1interval) {
-	       /* Case 2. Missing end of query, so there could be a middle splice */
-	       debug13b(printf("  query_lastpos %d - querypos3 %d >= %d + %d, so using this diagonal plus shortsplicedist\n",
-			       query_lastpos,minus_segments[i].querypos3,STAGE2_MIN_OLIGO,index1interval));
- #ifdef USE_GREEDY
-	       if ((mappingpos = subtract_bounded(minus_segments[i].diagonal,querylength + shortsplicedist_novelend,chroffset)) > middle_mappingstart_greedy &&
-		   mappingpos < genomicbound) {
-		 middle_mappingstart_greedy = mappingpos;
-		 middle_mappingstart_p = true;
-		 debug13(printf("  Redefining middle mappingstart greedy to %u\n",middle_mappingstart_greedy - chroffset));
-	       }
- #endif
- #ifdef LONG_ENDSPLICES
-	       if ((mappingpos = subtract_bounded(minus_segments[i].diagonal,querylength + shortsplicedist,chroffset)) < middle_mappingstart_last) {
-		 /* Use < for NOT_GREEDY */
-		 middle_mappingstart_last = mappingpos;
-		 middle_mappingstart_p = true;
-		 debug13(printf("  Redefining middle mappingstart last to %u\n",middle_mappingstart_last - chroffset));
-	       }
- #else
-	       if ((mappingpos = subtract_bounded(minus_segments[i].diagonal,querylength,chroffset)) < middle_mappingstart_last) {
-		 /* Use < for NOT_GREEDY */
-		 middle_mappingstart_last = mappingpos;
-		 middle_mappingstart_p = true;
-		 debug13(printf("  Redefining middle mappingstart last to %u\n",middle_mappingstart_last - chroffset));
-	       }
- #endif
-
-	     } else {
-	       debug13b(printf("  query_lastpos %d - querypos3 %d < %d + %d, so using this diagonal\n",
-			       query_lastpos,minus_segments[i].querypos3,STAGE2_MIN_OLIGO,index1interval));
- #ifdef USE_GREEDY
-	       if ((mappingpos = subtract_bounded(minus_segments[i].diagonal,querylength,chroffset)) > close_mappingstart_greedy &&
-		   mappingpos < genomicbound) {
-		 close_mappingstart_greedy = mappingpos;
-		 close_mappingstart_p = true;
-		 debug13(printf("  Redefining close mappingstart greedy to %u\n",close_mappingstart_greedy - chroffset));
-	       }
- #endif
-	       if ((mappingpos = subtract_bounded(minus_segments[i].diagonal,querylength,chroffset)) < close_mappingstart_last) {
-		 /* Use < for NOT_GREEDY */
-		 close_mappingstart_last = mappingpos;
-		 close_mappingstart_p = true;
-		 debug13(printf("  Redefining close mappingstart last to %u\n",close_mappingstart_last - chroffset));
-	       }
-	     }
-	   }
+  if (alloc_floors_p == true) {
+    Floors_free(&floors);
+  }
 
- #ifdef USE_GREEDY
-	   if (close_mappingstart_p == true) {
-	     close_knownsplice_limit_low = subtract_bounded(close_mappingstart_greedy,shortsplicedist,chroffset);
-	   } else if (middle_mappingstart_p == true) {
-	     debug13(printf("Using middle mappingstart\n"));
-	     close_knownsplice_limit_low = middle_mappingstart_greedy;
-	     close_mappingstart_greedy = middle_mappingstart_greedy;
-	     close_mappingstart_p = true;
-	   }
- #else
-	   if (close_mappingstart_p == true) {
-	     close_knownsplice_limit_low = subtract_bounded(close_mappingstart_last,shortsplicedist,chroffset);
-	   } else if (middle_mappingstart_p == true) {
-	     debug13(printf("Using middle mappingstart\n"));
-	     close_knownsplice_limit_low = middle_mappingstart_last;
-	     close_mappingstart_last = middle_mappingstart_last;
-	     close_mappingstart_p = true;
-	   }
- #endif
- #ifdef USE_GREEDY
-	   if (middle_mappingstart_p == true && middle_mappingstart_last < close_mappingstart_greedy) {
-	     knownsplice_limit_low = middle_mappingstart_last;
-	     mappingstart = middle_mappingstart_last;
-	   } else if (close_mappingstart_p == true && close_mappingstart_last != close_mappingstart_greedy) {
-	     knownsplice_limit_low = subtract_bounded(close_mappingstart_last,shortsplicedist,chroffset);
-	     mappingstart = close_mappingstart_last;
-	   }
- #else
-	   if (middle_mappingstart_p == true && middle_mappingstart_last < close_mappingstart_last) {
-	     knownsplice_limit_low = middle_mappingstart_last;
-	     mappingstart = middle_mappingstart_last;
-	   }
- #endif
-	   if (close_mappingstart_p == false) {
-	     fallback_mappingstart_p = false;
- #ifdef USE_GREEDY
-	   } else if (mappingstart >= close_mappingstart_greedy) {
-	     fallback_mappingstart_p = false;
- #endif
-	   } else {
-	     debug13(printf("Fallback mappingstart = %u\n",mappingstart - chroffset));
-	     fallback_mappingstart_p = true;
-	   }
-	 }
-       }
-
-       favor_right_p = false;
-     }
-
-     if ((sensedir = Stage3end_sensedir(hit5)) == SENSE_FORWARD) {
-       sense_try = +1;
-     } else if (sensedir == SENSE_ANTI) {
-       sense_try = -1;
-     } else {
-       sense_try = 0;
-     }
-
-   } else if (hit5 == NULL) {
-     /* Both events are tested by Stage3end_anomalous_splice_p */
-     if ((chrnum = Stage3end_chrnum(hit3)) == 0) {
-       /* Translocation */
-       return (List_T) NULL;
-
-     } else if (Stage3end_hittype(hit3) == SAMECHR_SPLICE) {
-       /* A genomic event that doesn't get reflected in chrnum */
-       return (List_T) NULL;
-
-     } else if ((watsonp = Stage3end_plusp(hit3)) == true) {
-       chroffset = Stage3end_chroffset(hit3);
-       chrhigh = Stage3end_chrhigh(hit3);
-       chrlength = Stage3end_chrlength(hit3);
-
-       if (Shortread_find_primers(queryseq5,queryseq3) == true) {
-	 /* Go from genomicend */
-	 debug13(printf("Found primers\n"));
-	 genomicbound = Stage3end_genomicend(hit3);
-
-       } else if (Stage3end_anomalous_splice_p(hit3) == true) {
-	 /* Go from genomicend */
-	 debug13(printf("Anomalous splice\n"));
-	 genomicbound = Stage3end_genomicend(hit3);
-
-       } else {
-	 genomicbound = Stage3end_genomicstart(hit3);
-
- #if 0
-	 /* TODO: Previously called Shortread_find_overlap.  Now with Shortread_max_overlap, can optimize this code */
-	 if ((overlap = Shortread_max_overlap(queryseq5,queryseq3)) > 0 &&
-	     Stage3end_genomicbound_from_start(&genomicbound2,hit3,overlap,chroffset) == true) {
-	   debug13(printf("Found overlap of %d\n",overlap));
-	   if (genomicbound2 > genomicbound) {
-	     zero_offset = genomicbound2 - genomicbound;
-	     genomicbound = genomicbound2;
-	   }
-	 }
- #endif
-       }
-
-       debug13(printf("Case 3: hit3 plus %s %u..%u (sensedir %d) => genomicbound %u\n",
-		      Stage3end_hittype_string(hit3),
-		      Stage3end_genomicstart(hit3) - chroffset,Stage3end_genomicend(hit3) - chroffset,
-		      Stage3end_sensedir(hit3),genomicbound - chroffset));
-
-       knownsplice_limit_high = mappingend = segmentend = genomicbound;
-       knownsplice_limit_low = subtract_bounded(Stage3end_genomicstart(hit3),pairmax + shortsplicedist,chroffset);
-       segmentstart = subtract_bounded(Stage3end_genomicstart(hit3),pairmax,chroffset);
- #ifdef LONG_ENDSPLICES
-       mappingstart = subtract_bounded(Stage3end_genomicstart(hit3),pairmax + shortsplicedist,chroffset);
- #else
-       mappingstart = subtract_bounded(Stage3end_genomicstart(hit3),pairmax + shortsplicedist_novelend,chroffset);
- #endif
-
-       close_mappingstart_last = middle_mappingstart_last = Stage3end_genomicstart(hit3);
- #ifdef USE_GREEDY
-       close_mappingstart_greedy = middle_mappingstart_greedy = segmentstart;
- #endif
-
-       if (plus_nsegments > 0) {
-	 /* Use segments to bound */
-	 debug13(printf("Finding segments from segmentstart %u to segmentend %u (plus_nsegments %d)\n",
-			segmentstart - chroffset,segmentend - chroffset,plus_nsegments));
-	 starti = endi = -1;
-	 i = binary_search_segments(0,plus_nsegments-1,plus_segments,segmentend);
-	 while (i >= 0 && plus_segments[i].diagonal >= segmentend) {
-	   i--;
-	 }
-	 starti = i;
-	 while (i >= 0 && plus_segments[i].diagonal > segmentstart) {
-	   if (plus_segments[i].diagonal < (Univcoord_T) -1) {
-	     endi = i;
-	   }
-	   i--;
-	 }
-	 if (starti >= 0 && endi >= 0) {
-	   debug13(printf("starti = %d, endi = %d\n",starti,endi));
-	   assert(starti >= endi);
-	   for (i = starti; i >= endi; i--) {
-	     debug13(printf("diagonal %u (%llu), querypos %d..%d\n",
-			    (Chrpos_T) (plus_segments[i].diagonal - chroffset),(unsigned long long) plus_segments[i].diagonal,
-			    plus_segments[i].querypos5,plus_segments[i].querypos3));
-	     if (plus_segments[i].querypos5 >= STAGE2_MIN_OLIGO + index1interval) {
-	       /* Case 3. Missing start of query, so there could be a middle splice */
-	       debug13b(printf("  querypos5 %d >= %d + %d, so using this diagonal plus shortsplicedist\n",
-			       plus_segments[i].querypos5,STAGE2_MIN_OLIGO,index1interval));
- #ifdef USE_GREEDY
-	       if ((mappingpos = subtract_bounded(plus_segments[i].diagonal,querylength + shortsplicedist_novelend,chroffset)) > middle_mappingstart_greedy &&
-		   mappingpos < genomicbound) {
-		 middle_mappingstart_greedy = mappingpos;
-		 middle_mappingstart_p = true;
-		 debug13(printf("  Redefining middle mappingstart greedy to %u\n",middle_mappingstart_greedy - chroffset));
-	       }
- #endif
- #ifdef LONG_ENDSPLICES
-	       if ((mappingpos = subtract_bounded(plus_segments[i].diagonal,querylength + shortsplicedist,chroffset)) < middle_mappingstart_last) {
-		 /* Use < for NOT_GREEDY */
-		 middle_mappingstart_last = mappingpos;
-		 middle_mappingstart_p = true;
-		 debug13(printf("  Redefining middle mappingstart last to %u\n",middle_mappingstart_last - chroffset));
-	       }
- #else
-	       if ((mappingpos = subtract_bounded(plus_segments[i].diagonal,querylength,chroffset)) < middle_mappingstart_last) {
-		 /* Use < for NOT_GREEDY */
-		 middle_mappingstart_last = mappingpos;
-		 middle_mappingstart_p = true;
-		 debug13(printf("  Redefining middle mappingstart last to %u\n",middle_mappingstart_last - chroffset));
-	       }
- #endif
-
-	     } else {
-	       debug13b(printf("  querypos5 %d < %d + %d, so using this diagonal\n",
-			       plus_segments[i].querypos5,STAGE2_MIN_OLIGO,index1interval));
- #ifdef USE_GREEDY
-	       if ((mappingpos = subtract_bounded(plus_segments[i].diagonal,querylength,chroffset)) > close_mappingstart_greedy &&
-		   mappingpos < genomicbound) {
-		 close_mappingstart_greedy = mappingpos;
-		 close_mappingstart_p = true;
-		 debug13(printf("  Redefining close mappingstart greedy to %u\n",close_mappingstart_greedy - chroffset));
-	       }
- #endif
-	       if ((mappingpos = subtract_bounded(plus_segments[i].diagonal,querylength,chroffset)) < close_mappingstart_last) {
-		 /* Use < for NOT_GREEDY */
-		 close_mappingstart_last = mappingpos;
-		 close_mappingstart_p = true;
-		 debug13(printf("  Redefining close mappingstart last to %u\n",close_mappingstart_last - chroffset));
-	       }
-	     }
-	   }
+  /* Keep gmap_hits found in search 9 and 10 */
+  if (gmap_hits != NULL) {
+    hits = List_append(hits,gmap_hits);
+  }
 
- #ifdef USE_GREEDY
-	   if (close_mappingstart_p == true) {
-	     close_knownsplice_limit_low = subtract_bounded(close_mappingstart_greedy,shortsplicedist,chroffset);
-	   } else if (middle_mappingstart_p == true) {
-	     debug13(printf("Using middle mappingstart\n"));
-	     close_knownsplice_limit_low = middle_mappingstart_greedy;
-	     close_mappingstart_greedy = middle_mappingstart_greedy;
-	     close_mappingstart_p = true;
-	   }
- #else
-	   if (close_mappingstart_p == true) {
-	     close_knownsplice_limit_low = subtract_bounded(close_mappingstart_last,shortsplicedist,chroffset);
-	   } else if (middle_mappingstart_p == true) {
-	     debug13(printf("Using middle mappingstart\n"));
-	     close_knownsplice_limit_low = middle_mappingstart_last;
-	     close_mappingstart_last = middle_mappingstart_last;
-	     close_mappingstart_p = true;
-	   }
- #endif
- #ifdef USE_GREEDY
-	   if (middle_mappingstart_p == true && middle_mappingstart_last < close_mappingstart_greedy) {
-	     knownsplice_limit_low = middle_mappingstart_last;
-	     mappingstart = middle_mappingstart_last;
-	   } else if (close_mappingstart_p == true && close_mappingstart_last != close_mappingstart_greedy) {
-	     knownsplice_limit_low = subtract_bounded(close_mappingstart_last,shortsplicedist,chroffset);
-	     mappingstart = close_mappingstart_last;
-	   }
- #else
-	   if (middle_mappingstart_p == true && middle_mappingstart_last < close_mappingstart_last) {
-	     knownsplice_limit_low = middle_mappingstart_last;
-	     mappingstart = middle_mappingstart_last;
-	   }
- #endif
-	   if (close_mappingstart_p == false) {
-	     fallback_mappingstart_p = false;
- #ifdef USE_GREEDY
-	   } else if (mappingstart >= close_mappingstart_greedy) {
-	     fallback_mappingstart_p = false;
- #endif
-	   } else {
-	     debug13(printf("Fallback mappingstart = %u\n",mappingstart - chroffset));
-	     fallback_mappingstart_p = true;
-	   }
-	 }
-       }
-
-       favor_right_p = true;
-
-     } else {
-       chroffset = Stage3end_chroffset(hit3);
-       chrhigh = Stage3end_chrhigh(hit3);
-       chrlength = Stage3end_chrlength(hit3);
-
-       if (Shortread_find_primers(queryseq5,queryseq3) == true) {
-	 /* Go from genomicend */
-	 debug13(printf("Found primers\n"));
-	 genomicbound = Stage3end_genomicend(hit3);
-
-       } else if (Stage3end_anomalous_splice_p(hit3) == true) {
-	 /* Go from genomicend */
-	 debug13(printf("Anomalous splice\n"));
-	 genomicbound = Stage3end_genomicend(hit3);
-
-       } else {
-	 genomicbound = Stage3end_genomicstart(hit3);
-
- #if 0
-	 /* TODO: Previously called Shortread_find_overlap.  Now with Shortread_max_overlap, can optimize this code */
-	 if ((overlap = Shortread_max_overlap(queryseq5,queryseq3)) > 0 &&
-	     Stage3end_genomicbound_from_start(&genomicbound2,hit3,overlap,chroffset) == true) {
-	   debug13(printf("Found overlap of %d\n",overlap));
-	   if (genomicbound2 < genomicbound) {
-	     zero_offset = genomicbound - genomicbound2;
-	     genomicbound = genomicbound2;
-	   }
-	 }
- #endif
-       }
-
-       debug13(printf("Case 4: hit3 minus %s %u..%u (sensedir %d) => genomicbound %u\n",
-		      Stage3end_hittype_string(hit3),
-		      Stage3end_genomicstart(hit3) - chroffset,Stage3end_genomicend(hit3) - chroffset,
-		      Stage3end_sensedir(hit3),genomicbound - chroffset));
-
-       knownsplice_limit_low = mappingstart = segmentstart = genomicbound;
-       knownsplice_limit_high = add_bounded(Stage3end_genomicstart(hit3),pairmax + shortsplicedist,chrhigh);
-       segmentend = add_bounded(Stage3end_genomicstart(hit3),pairmax,chrhigh);
- #ifdef LONG_ENDSPLICES
-       mappingend = add_bounded(Stage3end_genomicstart(hit3),pairmax + shortsplicedist,chrhigh);
- #else
-       mappingend = add_bounded(Stage3end_genomicstart(hit3),pairmax + shortsplicedist_novelend,chrhigh);
- #endif
-
-       close_mappingend_last = middle_mappingend_last = Stage3end_genomicstart(hit3);
- #ifdef USE_GREEDY
-       close_mappingend_greedy = middle_mappingend_greedy = segmentend;
- #endif
-
-       if (minus_nsegments > 0) {
-	 /* Use segments to bound */
-	 debug13(printf("Finding segments from segmentstart %u to segmentend %u (minus_nsegments %d)\n",
-			segmentstart - chroffset,segmentend - chroffset,minus_nsegments));
-	 starti = endi = -1;
-	 i = binary_search_segments(0,minus_nsegments-1,minus_segments,segmentstart);
-	 while (i < minus_nsegments - 1 && minus_segments[i].diagonal == (Univcoord_T) -1) {
-	   i++;
-	 }
-	 starti = i;
-	 while (minus_segments[i].diagonal < segmentend) {
-	   endi = i;
-	   i++;
-	 }
-	 if (starti >= 0 && endi >= 0) {
-	   debug13(printf("starti = %d, endi = %d\n",starti,endi));
-	   assert(starti <= endi);
-	   for (i = starti; i <= endi; i++) {
-	     debug13(printf("diagonal %u (%llu), querypos %d..%d\n",
-			    (Chrpos_T) (minus_segments[i].diagonal - chroffset),(unsigned long long) minus_segments[i].diagonal,
-			    minus_segments[i].querypos5,minus_segments[i].querypos3));
-	     if (minus_segments[i].querypos5 >= STAGE2_MIN_OLIGO + index1interval) {
-	       /* Case 4. Missing start of query, so there could be a middle splice */
-	       debug13b(printf("  querypos5 %d >= %d + %d, so using this diagonal plus shortsplicedist\n",
-			       minus_segments[i].querypos5,STAGE2_MIN_OLIGO,index1interval));
- #ifdef USE_GREEDY
-	       if ((mappingpos = add_bounded(minus_segments[i].diagonal,shortsplicedist_novelend,chrhigh)) < middle_mappingend_greedy &&
-		   mappingpos > genomicbound) {
-		 middle_mappingend_greedy = mappingpos;
-		 middle_mappingend_p = true;
-		 debug13(printf("  Redefining middle mappingend greedy to %u\n",middle_mappingend_greedy - chroffset));
-	       }
- #endif
- #ifdef LONG_ENDSPLICES
-	       if ((mappingpos = add_bounded(minus_segments[i].diagonal,shortsplicedist,chrhigh)) > middle_mappingend_last) {
-		 /* Use > for NOT_GREEDY */
-		 middle_mappingend_last = mappingpos;
-		 middle_mappingend_p = true;
-		 debug13(printf("  Redefining middle mappingend to %u\n",middle_mappingend_last - chroffset));
-	       }
- #else
-	       if ((mappingpos = minus_segments[i].diagonal) > middle_mappingend_last) {
-		 /* Use > for NOT_GREEDY */
-		 middle_mappingend_last = mappingpos;
-		 middle_mappingend_p = true;
-		 debug13(printf("  Redefining middle mappingend to %u\n",middle_mappingend_last - chroffset));
-	       }
- #endif
-
-	     } else {
-	       debug13b(printf("  querypos5 %d < %d + %d, so using this diagonal\n",
-			       minus_segments[i].querypos5,STAGE2_MIN_OLIGO,index1interval));
- #ifdef USE_GREEDY
-	       if ((mappingpos = minus_segments[i].diagonal) < close_mappingend_greedy &&
-		   mappingpos > genomicbound) {
-		 close_mappingend_greedy = mappingpos;
-		 close_mappingend_p = true;
-		 debug13(printf("  Redefining close mappingend greedy to %u\n",close_mappingend_greedy - chroffset));
-	       }
- #endif
-	       if ((mappingpos = minus_segments[i].diagonal) > close_mappingend_last) {
-		 /* Use > for NOT_GREEDY */
-		 close_mappingend_last = mappingpos;
-		 close_mappingend_p = true;
-		 debug13(printf("  Redefining close mappingend last to %u\n",close_mappingend_last - chroffset));
-	       }
-	     }
-	   }
+  if (gmap_improvement_p == false) {
+    debug(printf("No GMAP improvement: Before remove_overlaps at cutoff level %d: %d\n",*cutoff_level,List_length(hits)));
+    hits = Stage3end_optimal_score(hits,*cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
+				   querylength,/*keep_gmap_p*/true,/*finalp*/true);
+    hits = Stage3end_reject_trimlengths(hits);
+    hits = Stage3end_remove_overlaps(hits,/*finalp*/true);
+    hits = Stage3end_optimal_score(hits,*cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
+				   querylength,/*keep_gmap_p*/false,/*finalp*/true);
+    hits = Stage3end_resolve_multimapping(hits);
+    debug(printf("After remove_overlaps: %d\n",List_length(hits)));
 
- #ifdef USE_GREEDY
-	   if (close_mappingend_p == true) {
-	     close_knownsplice_limit_high = add_bounded(close_mappingend_greedy,shortsplicedist,chrhigh);
-	   } else if (middle_mappingend_p == true) {
-	     debug13(printf("Using middle mappingend\n"));
-	     close_knownsplice_limit_high = middle_mappingend_greedy;
-	     close_mappingend_greedy = middle_mappingend_greedy;
-	     close_mappingend_p = true;
-	   }
- #else
-	   if (close_mappingend_p == true) {
-	     close_knownsplice_limit_high = add_bounded(close_mappingend_last,shortsplicedist,chrhigh);
-	   } else if (middle_mappingend_p == true) {
-	     debug13(printf("Using middle mappingend\n"));
-	     close_knownsplice_limit_high = middle_mappingend_last;
-	     close_mappingend_last = middle_mappingend_last;
-	     close_mappingend_p = true;
-	   }
- #endif
- #ifdef USE_GREEDY
-	   if (middle_mappingend_p == true && middle_mappingend_last > close_mappingend_greedy) {
-	     knownsplice_limit_high = middle_mappingend_last;
-	     mappingend = middle_mappingend_last;
-	   } else if (close_mappingend_p == true && close_mappingend_last != close_mappingend_greedy) {
-	     knownsplice_limit_high = add_bounded(close_mappingend_last,shortsplicedist,chrhigh);
-	     mappingend = close_mappingend_last;
-	   }
- #else
-	   if (middle_mappingend_p == true && middle_mappingend_last > close_mappingend_last) {
-	     knownsplice_limit_high = middle_mappingend_last;
-	     mappingend = middle_mappingend_last;
-	   }
- #endif
-	   if (close_mappingend_p == false) {
-	     fallback_mappingend_p = false;
- #ifdef USE_GREEDY
-	   } else if (mappingend <= close_mappingend_greedy) {
-	     fallback_mappingend_p = false;
- #endif
-	   } else {
-	     debug13(printf("Fallback mappingend = %u\n",mappingend - chroffset));
-		    fallback_mappingend_p = true;
-	  }
-	}
-      }
+  } else {
+    debug(printf("GMAP improvement: Before remove_overlaps at cutoff level %d: %d\n",*cutoff_level,List_length(hits)));
+    hits = Stage3end_optimal_score(hits,*cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
+				   querylength,/*keep_gmap_p*/true,/*finalp*/false);
+    /* Don't reject based on trimlength until after GMAP improvements */
+    hits = Stage3end_remove_overlaps(hits,/*finalp*/false);
+    hits = Stage3end_optimal_score(hits,*cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
+				   querylength,/*keep_gmap_p*/false,/*finalp*/false);
+    hits = Stage3end_resolve_multimapping(hits);
+    debug(printf("After remove_overlaps: %d\n",List_length(hits)));
 
-      favor_right_p = true;
-    }
+    hits = align_singleend_with_gmap(gmap_history,hits,this,query_compress_fwd,query_compress_rev,
+				     accession,queryuc_ptr,querylength,query_lastpos,
+				     oligoindices_major,oligoindices_minor,
+				     pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,user_maxlevel,*cutoff_level,
+				     first_read_p);
+    hits = Stage3end_optimal_score(hits,*cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
+				   querylength,/*keep_gmap_p*/true,/*finalp*/true);
+    hits = Stage3end_reject_trimlengths(hits);
+    hits = Stage3end_remove_overlaps(hits,/*finalp*/true);
+    hits = Stage3end_optimal_score(hits,*cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
+				   querylength,/*keep_gmap_p*/false,/*finalp*/true);
+    hits = Stage3end_resolve_multimapping(hits);
+  }
 
-    if ((sensedir = Stage3end_sensedir(hit3)) == SENSE_FORWARD) {
-      sense_try = +1;
-    } else if (sensedir == SENSE_ANTI) {
-      sense_try = -1;
-    } else {
-      sense_try = 0;
-    }
+  hits = Stage3end_remove_circular_alias(hits);
+  hits = Stage3end_remove_duplicates(hits); /* Aliases can cause duplicates */
+
+  List_free(&plus_anchor_segments);
+  List_free(&minus_anchor_segments);
+
+  return hits;
+}
+
+
+static Stage3end_T *
+single_read (int *npaths, int *first_absmq, int *second_absmq,
+	     Shortread_T queryseq, Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev,
+	     int indexdb_size_threshold, Floors_T *floors_array,
+	     double user_maxlevel_float, int indel_penalty_middle, int indel_penalty_end,
+	     bool allow_end_indels_p, int max_end_insertions, int max_end_deletions, int min_indel_end_matches,
+	     int localsplicing_penalty, int distantsplicing_penalty, int min_shortend,
+	     Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
+	     Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
+	     Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
+	     bool keep_floors_p) {
+  Stage3end_T *stage3array;
+  History_T gmap_history;
+  List_T hits = NULL;
+  T this = NULL;
+  int user_maxlevel;
+  int querylength, query_lastpos, cutoff_level;
+  char *queryuc_ptr, *quality_string;
+  Compress_T query_compress_fwd = NULL, query_compress_rev = NULL;
+
+#ifdef HAVE_ALLOCA
+  char *queryrc;
+#else
+  char queryrc[MAX_READLENGTH+1];
+#endif
 
+  querylength = Shortread_fulllength(queryseq);
+
+#ifndef HAVE_ALLOCA
+  if (querylength > MAX_READLENGTH) {
+    fprintf(stderr,"Read %s has length %d > MAX_READLENGTH %d.  Either run configure and make again with a higher value of MAX_READLENGTH, or consider using GMAP instead.\n",
+	    Shortread_accession(queryseq),querylength,MAX_READLENGTH);
+    *npaths = 0;
+    return (Stage3end_T *) NULL;
+  }
+#endif
+
+  if (user_maxlevel_float < 0.0) {
+    user_maxlevel = -1;
+  } else if (user_maxlevel_float > 0.0 && user_maxlevel_float < 1.0) {
+    user_maxlevel = (int) rint(user_maxlevel_float * (double) querylength);
   } else {
-    abort();
+    user_maxlevel = (int) user_maxlevel_float;
   }
 
-#ifdef OLD_GENOMICBOUND
-  knownsplice_limit_low = genomicstart + querylength;
-  knownsplice_limit_high = genomicend - querylength;
+  /* Limit search on repetitive sequences */
+  queryuc_ptr = Shortread_fullpointer_uc(queryseq);
+  quality_string = Shortread_quality_string(queryseq);
+  if (check_dinucleotides(queryuc_ptr,querylength) == false) {
+    user_maxlevel = 0;
+  }
+
+  query_compress_fwd = Compress_new_fwd(queryuc_ptr,querylength);
+  query_compress_rev = Compress_new_rev(queryuc_ptr,querylength);
+#ifdef HAVE_ALLOCA
+  queryrc = (char *) ALLOCA((querylength+1)*sizeof(char)); /* queryrc is a char buffer, so size it by char, not int */
 #endif
+  make_complement_buffered(queryrc,queryuc_ptr,querylength);
 
-  if (close_mappingstart_p == true && close_mappingend_p == true) {
-    debug13(printf("Halfmapping: Running gmap with close mappingstart and close mappingend\n"));
-    hits = run_gmap_for_region(&good_start_p,&good_end_p,gmap_history,
-			       hits,Shortread_accession(queryseq5),queryuc_ptr,querylength,sense_try,favor_right_p,
-			       /*paired_favor_mode*/favor_right_p == true ? +1 : -1,zero_offset,
-			       query_compress_fwd,query_compress_rev,close_mappingstart_last,close_mappingend_last,
-			       close_knownsplice_limit_low,close_knownsplice_limit_high,
-			       watsonp,genestrand,first_read_p,chrnum,chroffset,chrhigh,chrlength,
-			       oligoindices_major,oligoindices_minor,
-			       pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,user_maxlevel);
+  this = Stage1_new(querylength);
+  query_lastpos = querylength - index1part;
 
-    if (good_start_p == true && good_end_p == true) {
-      /* Success */
-    } else if (gmap_rerun_p == false) {
-      debug13(printf("Skipping re-run of gmap\n"));
-    } else if (/* require both ends to be good */ 0 && good_start_p == true) {
-      if (fallback_mappingend_p == true) {
-	debug13(printf("Halfmapping: Re-running gmap with close mappingstart only\n"));
-	hits = run_gmap_for_region(&good_start_p,&good_end_p,gmap_history,
-				   hits,Shortread_accession(queryseq5),queryuc_ptr,querylength,sense_try,favor_right_p,
-				   /*paired_favor_mode*/favor_right_p == true ? +1 : -1,zero_offset,
-				   query_compress_fwd,query_compress_rev,close_mappingstart_last,mappingend,
-				   close_knownsplice_limit_low,knownsplice_limit_high,
-				   watsonp,genestrand,first_read_p,chrnum,chroffset,chrhigh,chrlength,
-				   oligoindices_major,oligoindices_minor,
-				   pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,user_maxlevel);
-      }
+  gmap_history = History_new();
+  hits = align_end(&cutoff_level,gmap_history,this,
+		   query_compress_fwd,query_compress_rev,
+		   Shortread_accession(queryseq),queryuc_ptr,queryrc,querylength,query_lastpos,
+		   indexdb_fwd,indexdb_rev,indexdb_size_threshold,floors_array,
+		   oligoindices_major,oligoindices_minor,
+		   pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
+		   user_maxlevel,indel_penalty_middle,indel_penalty_end,
+		   localsplicing_penalty,distantsplicing_penalty,min_shortend,
+		   allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
+		   keep_floors_p,/*genestrand*/0,/*first_read_p*/true);
+
+  if ((*npaths = List_length(hits)) == 0) {
+    stage3array = (Stage3end_T *) NULL;
+  } else {
+    stage3array = (Stage3end_T *) List_to_array_out(hits,NULL); List_free(&hits); /* Return value */
+    stage3array = Stage3end_eval_and_sort(&(*npaths),&(*first_absmq),&(*second_absmq),
+					  stage3array,maxpaths_search,queryseq,queryuc_ptr,queryrc,
+					  query_compress_fwd,query_compress_rev,
+					  quality_string,/*displayp*/true);
+  }
+     
+  History_free(&gmap_history);
+  Compress_free(&query_compress_fwd);
+  Compress_free(&query_compress_rev);
+  Stage1_free(&this,querylength); 
+  return stage3array;
+}
 
-    } else if (/* require both ends to be good */ 0 && good_end_p == true) {
-      if (fallback_mappingstart_p == true) {
-	debug13(printf("Halfmapping: Re-running gmap with close mappingend only\n"));
-	hits = run_gmap_for_region(&good_start_p,&good_end_p,gmap_history,
-				   hits,Shortread_accession(queryseq5),queryuc_ptr,querylength,sense_try,favor_right_p,
-				   /*paired_favor_mode*/favor_right_p == true ? +1 : -1,zero_offset,
-				   query_compress_fwd,query_compress_rev,mappingstart,close_mappingend_last,
-				   knownsplice_limit_low,close_knownsplice_limit_high,
-				   watsonp,genestrand,first_read_p,chrnum,chroffset,chrhigh,chrlength,
-				   oligoindices_major,oligoindices_minor,
-				   pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,user_maxlevel);
-      }
-    } else {
-      if (fallback_mappingstart_p == true && fallback_mappingend_p == true) {
-	debug13(printf("Halfmapping: Re-running gmap with far mappingstart and mappingend\n"));
-	hits = run_gmap_for_region(&good_start_p,&good_end_p,gmap_history,
-				   hits,Shortread_accession(queryseq5),queryuc_ptr,querylength,sense_try,favor_right_p,
-				   /*paired_favor_mode*/favor_right_p == true ? +1 : -1,zero_offset,
-				   query_compress_fwd,query_compress_rev,mappingstart,mappingend,
-				   knownsplice_limit_low,knownsplice_limit_high,
-				   watsonp,genestrand,first_read_p,chrnum,chroffset,chrhigh,chrlength,
-				   oligoindices_major,oligoindices_minor,
-				   pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,user_maxlevel);
-      }
-    }
 
-  } else if (close_mappingstart_p == true) {
-    debug13(printf("Halfmapping: Running gmap with close mappingstart\n"));
-    hits = run_gmap_for_region(&good_start_p,&good_end_p,gmap_history,
+static Stage3end_T *
+single_read_tolerant_nonstranded (int *npaths, int *first_absmq, int *second_absmq,
+				  Shortread_T queryseq, Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev,
+				  int indexdb_size_threshold, Floors_T *floors_array,
+				  double user_maxlevel_float, int indel_penalty_middle, int indel_penalty_end,
+				  bool allow_end_indels_p, int max_end_insertions, int max_end_deletions, int min_indel_end_matches,
+				  int localsplicing_penalty, int distantsplicing_penalty, int min_shortend,
+				  Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor, 
+				  Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
+				  Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
+				  bool keep_floors_p) {
+  Stage3end_T *stage3array;
+  History_T gmap_history;
+  List_T hits, hits_geneplus = NULL, hits_geneminus = NULL;
+  T this_geneplus = NULL, this_geneminus = NULL;
+  int user_maxlevel;
+  int querylength, query_lastpos, cutoff_level;
+  char *queryuc_ptr, *quality_string;
+  Compress_T query_compress_fwd = NULL, query_compress_rev = NULL;
+  bool allvalidp;
+
+#ifdef HAVE_ALLOCA
+  char *queryrc;
+#else
+  char queryrc[MAX_READLENGTH+1];
+#endif
+
+  querylength = Shortread_fulllength(queryseq);
+
+#ifndef HAVE_ALLOCA
+  if (querylength > MAX_READLENGTH) {
+    fprintf(stderr,"Read %s has length %d > MAX_READLENGTH %d.  Either run configure and make again with a higher value of MAX_READLENGTH, or consider using GMAP instead.\n",
+	    Shortread_accession(queryseq),querylength,MAX_READLENGTH);
+    *npaths = 0;
+    return (Stage3end_T *) NULL;
+  }
+#endif
+
+  if (user_maxlevel_float < 0.0) {
+    user_maxlevel = -1;
+  } else if (user_maxlevel_float > 0.0 && user_maxlevel_float < 1.0) {
+    user_maxlevel = (int) rint(user_maxlevel_float * (double) querylength);
+  } else {
+    user_maxlevel = (int) user_maxlevel_float;
+  }
+
+  this_geneplus = Stage1_new(querylength);
+  this_geneminus = Stage1_new(querylength);
+
+  queryuc_ptr = Shortread_fullpointer_uc(queryseq);
+  quality_string = Shortread_quality_string(queryseq);
+  query_lastpos = querylength - index1part;
+
+  /* Limit search on repetitive sequences */
+  if (check_dinucleotides(queryuc_ptr,querylength) == false) {
+    user_maxlevel = 0;
+  }
+
+  query_compress_fwd = Compress_new_fwd(queryuc_ptr,querylength);
+  query_compress_rev = Compress_new_rev(queryuc_ptr,querylength);
+  gmap_history = History_new();
+#ifdef HAVE_ALLOCA
+  queryrc = (char *) ALLOCA((querylength+1)*sizeof(char));
+#endif
+  make_complement_buffered(queryrc,queryuc_ptr,querylength);
+
+  if (read_oligos(&allvalidp,this_geneplus,queryuc_ptr,querylength,query_lastpos,/*genestrand*/+1,
+		  /*first_read_p*/true) > 0) {
+    hits_geneplus = align_end(&cutoff_level,gmap_history,this_geneplus,
+			      query_compress_fwd,query_compress_rev,
+			      Shortread_accession(queryseq),queryuc_ptr,queryrc,querylength,query_lastpos,
+			      indexdb_fwd,indexdb_rev,indexdb_size_threshold,
+			      floors_array,oligoindices_major,oligoindices_minor,
+			      pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
+			      user_maxlevel,indel_penalty_middle,indel_penalty_end,
+			      localsplicing_penalty,distantsplicing_penalty,min_shortend,
+			      allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
+			      keep_floors_p,/*genestrand*/+1,/*first_read_p*/true);
+  }
+
+  if (read_oligos(&allvalidp,this_geneminus,queryuc_ptr,querylength,query_lastpos,/*genestrand*/+2,
+		  /*first_read_p*/true) > 0) {
+    hits_geneminus = align_end(&cutoff_level,gmap_history,this_geneminus,
+			       query_compress_fwd,query_compress_rev,
+			       Shortread_accession(queryseq),queryuc_ptr,queryrc,querylength,query_lastpos,
+			       indexdb_fwd,indexdb_rev,indexdb_size_threshold,
+			       floors_array,oligoindices_major,oligoindices_minor,
+			       pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
+			       user_maxlevel,indel_penalty_middle,indel_penalty_end,
+			       localsplicing_penalty,distantsplicing_penalty,min_shortend,
+			       allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
+			       keep_floors_p,/*genestrand*/+2,/*first_read_p*/true);
+  }
+
+  hits = List_append(hits_geneplus,hits_geneminus);
+  hits = Stage3end_optimal_score(hits,cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
+				 querylength,/*keep_gmap_p*/true,/*finalp*/true);
+  hits = Stage3end_reject_trimlengths(hits);
+  hits = Stage3end_remove_overlaps(hits,/*finalp*/true);
+  hits = Stage3end_optimal_score(hits,cutoff_level,subopt_levels,query_compress_fwd,query_compress_rev,
+				 querylength,/*keep_gmap_p*/false,/*finalp*/true);
+  hits = Stage3end_resolve_multimapping(hits);
+
+  if ((*npaths = List_length(hits)) == 0) {
+    stage3array = (Stage3end_T *) NULL;
+  } else {
+    stage3array = (Stage3end_T *) List_to_array_out(hits,NULL); List_free(&hits); /* Return value */
+    stage3array = Stage3end_eval_and_sort(&(*npaths),&(*first_absmq),&(*second_absmq),
+					  stage3array,maxpaths_search,queryseq,queryuc_ptr,queryrc,
+					  query_compress_fwd,query_compress_rev,
+					  quality_string,/*displayp*/true);
+  }
+
+  History_free(&gmap_history);
+  Compress_free(&query_compress_fwd);
+  Compress_free(&query_compress_rev);
+  Stage1_free(&this_geneminus,querylength); 
+  Stage1_free(&this_geneplus,querylength); 
+  return stage3array;
+}
+
+
+/* Entry point for aligning one single-end read.  Picks the alignment
+   routine according to the file-level `mode` variable and forwards
+   every argument to it unchanged:
+
+     STANDARD, CMET_STRANDED, ATOI_STRANDED, TTOC_STRANDED
+         -> single_read
+     CMET_NONSTRANDED, ATOI_NONSTRANDED, TTOC_NONSTRANDED
+         -> single_read_tolerant_nonstranded
+
+   The value returned by the chosen routine is returned as-is; the
+   npaths, first_absmq and second_absmq pointers are passed through so
+   the chosen routine can write to the caller's variables.  For any
+   other value of `mode`, a message is printed to stderr and the
+   process is terminated with abort(). */
+Stage3end_T *
+Stage1_single_read (int *npaths, int *first_absmq, int *second_absmq,
+		    Shortread_T queryseq, Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev,
+		    int indexdb_size_threshold, Floors_T *floors_array,
+		    double user_maxlevel_float, int indel_penalty_middle, int indel_penalty_end,
+		    bool allow_end_indels_p, int max_end_insertions, int max_end_deletions, int min_indel_end_matches,
+		    int localsplicing_penalty, int distantsplicing_penalty, int min_shortend,
+		    Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
+		    Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
+		    Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
+		    bool keep_floors_p) {
+  /* Modes served by the ordinary single_read path */
+  bool use_single_read_p =
+    (mode == STANDARD || mode == CMET_STRANDED ||
+     mode == ATOI_STRANDED || mode == TTOC_STRANDED);
+  /* Modes served by the tolerant, nonstranded path */
+  bool use_nonstranded_p =
+    (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED ||
+     mode == TTOC_NONSTRANDED);
+
+  /* Guard: a mode outside both groups is a fatal configuration error */
+  if (use_single_read_p == false && use_nonstranded_p == false) {
+    fprintf(stderr,"Do not recognize mode %d\n",mode);
+    abort();
+  }
+
+  /* The single_read group is tested first, as in the if/else-if ladder */
+  if (use_single_read_p == true) {
+    return single_read(npaths,first_absmq,second_absmq,
+		       queryseq,indexdb_fwd,indexdb_rev,
+		       indexdb_size_threshold,floors_array,
+		       user_maxlevel_float,indel_penalty_middle,indel_penalty_end,
+		       allow_end_indels_p,max_end_insertions,
+		       max_end_deletions,min_indel_end_matches,
+		       localsplicing_penalty,distantsplicing_penalty,
+		       min_shortend,oligoindices_major,oligoindices_minor,
+		       pairpool,diagpool,cellpool,
+		       dynprogL,dynprogM,dynprogR,
+		       keep_floors_p);
+  }
+
+  return single_read_tolerant_nonstranded(npaths,first_absmq,second_absmq,
+					  queryseq,indexdb_fwd,indexdb_rev,
+					  indexdb_size_threshold,floors_array,
+					  user_maxlevel_float,indel_penalty_middle,indel_penalty_end,
+					  allow_end_indels_p,max_end_insertions,
+					  max_end_deletions,min_indel_end_matches,
+					  localsplicing_penalty,distantsplicing_penalty,
+					  min_shortend,oligoindices_major,oligoindices_minor,
+					  pairpool,diagpool,cellpool,
+					  dynprogL,dynprogM,dynprogR,
+					  keep_floors_p);
+}
+
+
+
+/* #define HITARRAY_SHORTENDSPLICING 4 */
+/* #define HITARRAY_DISTANTSPLICING 4 */
+
+
+static List_T
+align_halfmapping_with_gmap (History_T gmap_history, Stage3end_T hit5, Stage3end_T hit3, 
+			     Shortread_T queryseq5, Shortread_T queryseq3,
+			     char *queryuc_ptr, int querylength, int query_lastpos,
+#ifdef END_KNOWNSPLICING_SHORTCUT
+			     char *queryrc, bool invertedp,
+#endif
+			     Compress_T query_compress_fwd, Compress_T query_compress_rev,
+			     struct Segment_T *plus_segments, int plus_nsegments,
+			     struct Segment_T *minus_segments, int minus_nsegments,
+			     Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
+			     Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
+			     Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
+			     Chrpos_T pairmax, Chrpos_T shortsplicedist, int user_maxlevel,
+			     int genestrand, bool first_read_p) {
+  List_T hits = NULL;
+  int sensedir, sense_try;
+
+  int zero_offset = 0;
+  Univcoord_T segmentstart, segmentend;
+  Univcoord_T genomicbound, mappingstart, mappingend,
+    chroffset, chrhigh, mappingpos;
+#ifdef USE_GREEDY
+  Univcoord_T close_mappingstart_greedy, close_mappingend_greedy,
+    middle_mappingstart_greedy, middle_mappingend_greedy;
+#endif
+  Univcoord_T close_mappingstart_last, close_mappingend_last,
+    middle_mappingstart_last, middle_mappingend_last;
+  Univcoord_T knownsplice_limit_low, knownsplice_limit_high;
+  Univcoord_T close_knownsplice_limit_low, close_knownsplice_limit_high;
+  Chrpos_T chrlength;
+  Chrnum_T chrnum;
+  bool close_mappingstart_p = false, close_mappingend_p = false;
+  bool middle_mappingstart_p = false, middle_mappingend_p = false;
+  bool fallback_mappingstart_p, fallback_mappingend_p;
+  bool good_start_p, good_end_p, watsonp, favor_right_p;
+
+  int starti, endi, i;
+
+  if (hit3 == NULL) {
+    /* Both events are tested by Stage3end_anomalous_splice_p */
+    if ((chrnum = Stage3end_chrnum(hit5)) == 0) {
+      /* Translocation */
+      return (List_T) NULL;
+
+    } else if (Stage3end_hittype(hit5) == SAMECHR_SPLICE) {
+      /* A genomic event that doesn't get reflected in chrnum */
+      return (List_T) NULL;
+
+    } else if ((watsonp = Stage3end_plusp(hit5)) == true) {
+      chroffset = Stage3end_chroffset(hit5);
+      chrhigh = Stage3end_chrhigh(hit5);
+      chrlength = Stage3end_chrlength(hit5);
+
+      if (Shortread_find_primers(queryseq5,queryseq3) == true) {
+	/* Go from genomicstart */
+	debug13(printf("Found primers\n"));
+	genomicbound = Stage3end_genomicstart(hit5);
+
+      } else if (Stage3end_anomalous_splice_p(hit5) == true) {
+	/* Go from genomicstart */
+	debug13(printf("Anomalous splice\n"));
+	genomicbound = Stage3end_genomicstart(hit5);
+
+      } else {
+	genomicbound = Stage3end_genomicend(hit5);
+
+#if 0
+	/* TODO: Previously called Shortread_find_overlap.  Now with Shortread_max_overlap, can optimize this code */
+	if ((overlap = Shortread_max_overlap(queryseq5,queryseq3)) > 0 &&
+	    Stage3end_genomicbound_from_end(&genomicbound2,hit5,overlap,chroffset) == true) {
+	  debug13(printf("Found overlap of %d\n",overlap));
+	  if (genomicbound2 < genomicbound) {
+	    zero_offset = genomicbound - genomicbound2;
+	    genomicbound = genomicbound2;
+	  }
+	}
+#endif
+      }
+
+      debug13(printf("Case 1: hit5 plus %s %u..%u (sensedir %d) => genomicbound %u\n",
+		     Stage3end_hittype_string(hit5),
+		     Stage3end_genomicstart(hit5) - chroffset,Stage3end_genomicend(hit5) - chroffset,
+		     Stage3end_sensedir(hit5),genomicbound - chroffset));
+
+      knownsplice_limit_low = mappingstart = segmentstart = genomicbound;
+      knownsplice_limit_high =  add_bounded(Stage3end_genomicend(hit5),pairmax + shortsplicedist,chrhigh);
+      segmentend = add_bounded(Stage3end_genomicend(hit5),pairmax,chrhigh);
+#ifdef LONG_ENDSPLICES
+      mappingend = add_bounded(Stage3end_genomicend(hit5),pairmax + shortsplicedist,chrhigh);
+#else
+      mappingend = add_bounded(Stage3end_genomicend(hit5),pairmax + shortsplicedist_novelend,chrhigh);
+      debug13(printf("Original bounds E: knownsplice_limit_low %u, knownsplice_limit_high %u, mappingend %u\n",
+		     knownsplice_limit_low - chroffset,knownsplice_limit_high - chroffset,mappingend - chroffset));
+#endif
+
+      close_mappingend_last = middle_mappingend_last = Stage3end_genomicend(hit5);
+#ifdef USE_GREEDY
+      close_mappingend_greedy = middle_mappingend_greedy = segmentend;
+#endif
+
+      if (plus_nsegments > 0) {
+	/* Use segments to bound */
+	debug13(printf("Finding segments from segmentstart %u to segmentend %u (plus_nsegments %d)\n",
+		       segmentstart - chroffset,segmentend - chroffset,plus_nsegments));
+	starti = endi = -1;
+	i = binary_search_segments(0,plus_nsegments-1,plus_segments,segmentstart);
+	while (i < plus_nsegments - 1 && plus_segments[i].diagonal == (Univcoord_T) -1) {
+	  i++;
+	}
+	starti = i;
+	while (plus_segments[i].diagonal < segmentend) {
+	  endi = i;
+	  i++;
+	}
+	if (starti >= 0 && endi >= 0) {
+	  debug13(printf("starti = %d, endi = %d\n",starti,endi));
+	  assert(starti <= endi);
+	  for (i = starti; i <= endi; i++) {
+	    debug13(printf("diagonal %u (%llu), querypos %d..%d\n",
+			   (Chrpos_T) (plus_segments[i].diagonal - chroffset),(unsigned long long) plus_segments[i].diagonal,
+			   plus_segments[i].querypos5,plus_segments[i].querypos3));
+	    if (query_lastpos - plus_segments[i].querypos3 >= STAGE2_MIN_OLIGO + index1interval) {
+	      /* Case 1. Missing end of query, so there could be a middle splice */
+	      debug13b(printf("  query_lastpos %d - querypos3 %d >= %d + %d, so using this diagonal plus shortsplicedist\n",
+			      query_lastpos,plus_segments[i].querypos3,STAGE2_MIN_OLIGO,index1interval));
+#ifdef USE_GREEDY
+	      if ((mappingpos = add_bounded(plus_segments[i].diagonal,shortsplicedist_novelend,chrhigh)) < middle_mappingend_greedy &&
+		  mappingpos > genomicbound) {
+		middle_mappingend_greedy = mappingpos;
+		middle_mappingend_p = true;
+		debug13(printf("  Redefining middle mappingend greedy to %u\n",middle_mappingend_greedy - chroffset));
+	      }
+#endif
+
+#ifdef LONG_ENDSPLICES
+	      if ((mappingpos = add_bounded(plus_segments[i].diagonal,shortsplicedist,chrhigh)) > middle_mappingend_last) {
+		/* Use > for NOT_GREEDY */
+		middle_mappingend_last = mappingpos;
+		middle_mappingend_p = true;
+		debug13(printf("  Redefining middle mappingend last to %u\n",middle_mappingend_last - chroffset));
+	      }
+#else
+	      if ((mappingpos = plus_segments[i].diagonal) > middle_mappingend_last) {
+		/* Use > for NOT_GREEDY */
+		middle_mappingend_last = mappingpos;
+		middle_mappingend_p = true;
+		debug13(printf("  Redefining middle mappingend last to %u\n",middle_mappingend_last - chroffset));
+	      }
+#endif
+
+	    } else {
+	      debug13b(printf("  query_lastpos %d - querypos3 %d < %d + %d, so using this diagonal\n",
+			      query_lastpos,plus_segments[i].querypos3,STAGE2_MIN_OLIGO,index1interval));
+#ifdef USE_GREEDY
+	      if ((mappingpos = plus_segments[i].diagonal) < close_mappingend_greedy &&
+		  mappingpos > genomicbound) {
+		close_mappingend_greedy = mappingpos;
+		close_mappingend_p = true;
+		debug13(printf("  Redefining close mappingend greedy to %u\n",close_mappingend_greedy - chroffset));
+	      }
+#endif
+	      if ((mappingpos = plus_segments[i].diagonal) > close_mappingend_last) {
+		/* Use > for NOT_GREEDY */
+		close_mappingend_last = mappingpos;
+		close_mappingend_p = true;
+		debug13(printf("  Redefining close mappingend last to %u\n",close_mappingend_last - chroffset));
+	      }
+	    }
+	  }
+
+#ifdef USE_GREEDY
+	  if (close_mappingend_p == true) {
+	    close_knownsplice_limit_high = add_bounded(close_mappingend_greedy,shortsplicedist,chrhigh);
+	  } else if (middle_mappingend_p == true) {
+	    debug13(printf("Using middle mappingend\n"));
+	    close_knownsplice_limit_high = middle_mappingend_greedy;
+	    close_mappingend_greedy = middle_mappingend_greedy;
+	    close_mappingend_p = true;
+	  }
+#else
+	  if (close_mappingend_p == true) {
+	    close_knownsplice_limit_high = add_bounded(close_mappingend_last,shortsplicedist,chrhigh);
+	  } else if (middle_mappingend_p == true) {
+	    debug13(printf("Using middle mappingend\n"));
+	    close_knownsplice_limit_high = middle_mappingend_last;
+	    close_mappingend_last = middle_mappingend_last;
+	    close_mappingend_p = true;
+	  }
+#endif
+#ifdef USE_GREEDY
+	  if (middle_mappingend_p == true && middle_mappingend_last > close_mappingend_greedy) {
+	    knownsplice_limit_high = middle_mappingend_last;
+	    mappingend = middle_mappingend_last;
+	  } else if (close_mappingend_p == true && close_mappingend_last != close_mappingend_greedy) {
+	    knownsplice_limit_high = add_bounded(close_mappingend_last,shortsplicedist,chrhigh);
+	    mappingend = close_mappingend_last;
+	  }
+#else
+	  if (middle_mappingend_p == true && middle_mappingend_last > close_mappingend_last) {
+	    knownsplice_limit_high = middle_mappingend_last;
+	    mappingend = middle_mappingend_last;
+	  }
+#endif
+
+	  if (close_mappingend_p == false) {
+	    fallback_mappingend_p = false;
+#ifdef USE_GREEDY
+	  } else if (mappingend <= close_mappingend_greedy) {
+	    fallback_mappingend_p = false;
+#endif
+	  } else {
+	    debug13(printf("Fallback mappingend = %u\n",mappingend - chroffset));
+	    fallback_mappingend_p = true;
+	  }
+	}
+      }
+
+      favor_right_p = false;
+
+    } else {
+      chroffset = Stage3end_chroffset(hit5);
+      chrhigh = Stage3end_chrhigh(hit5);
+      chrlength = Stage3end_chrlength(hit5);
+
+      if (Shortread_find_primers(queryseq5,queryseq3) == true) {
+	/* Go from genomicstart */
+	debug13(printf("Found primers\n"));
+	genomicbound = Stage3end_genomicstart(hit5);
+
+      } else if (Stage3end_anomalous_splice_p(hit5) == true) {
+	/* Go from genomicstart */
+	debug13(printf("Anomalous splice\n"));
+	genomicbound = Stage3end_genomicstart(hit5);
+
+      } else {
+	genomicbound = Stage3end_genomicend(hit5);
+
+#if 0
+	/* TODO: Previously called Shortread_find_overlap.  Now with Shortread_max_overlap, can optimize this code */
+	if ((overlap = Shortread_max_overlap(queryseq5,queryseq3)) > 0 &&
+	    Stage3end_genomicbound_from_end(&genomicbound2,hit5,overlap,chroffset) == true) {
+	  debug13(printf("Found overlap of %d\n",overlap));
+	  if (genomicbound2 > genomicbound) {
+	    zero_offset = genomicbound2 - genomicbound;
+	    genomicbound = genomicbound2;
+	  }
+	}
+#endif
+      }
+
+      debug13(printf("Case 2: hit5 minus %s %u..%u (sensedir %d) => genomicbound %u\n",
+		     Stage3end_hittype_string(hit5),
+		     Stage3end_genomicstart(hit5) - chroffset,Stage3end_genomicend(hit5) - chroffset,
+		     Stage3end_sensedir(hit5),genomicbound - chroffset));
+
+      knownsplice_limit_high = mappingend = segmentend = genomicbound;
+      knownsplice_limit_low = subtract_bounded(Stage3end_genomicend(hit5),pairmax + shortsplicedist,chroffset);
+      segmentstart = subtract_bounded(Stage3end_genomicend(hit5),pairmax,chroffset);
+#ifdef LONG_ENDSPLICES
+      mappingstart = subtract_bounded(Stage3end_genomicend(hit5),pairmax + shortsplicedist,chroffset);
+#else
+      mappingstart = subtract_bounded(Stage3end_genomicend(hit5),pairmax + shortsplicedist_novelend,chroffset);
+#endif
+      debug13(printf("Original bounds F: knownsplice_limit_low %u, knownsplice_limit_high %u, mappingstart %u\n",
+		     knownsplice_limit_low - chroffset,knownsplice_limit_high - chroffset,mappingstart - chroffset));
+
+      close_mappingstart_last = middle_mappingstart_last = Stage3end_genomicend(hit5);
+#ifdef USE_GREEDY
+      close_mappingstart_greedy = middle_mappingstart_greedy = segmentstart;
+#endif
+
+      if (minus_nsegments > 0) {
+	/* Use segments to bound */
+	debug13(printf("Finding segments from segmentstart %u to segmentend %u (minus_nsegments %d)\n",
+		       segmentstart - chroffset,segmentend - chroffset,minus_nsegments));
+	starti = endi = -1;
+	i = binary_search_segments(0,minus_nsegments-1,minus_segments,segmentend);
+	while (i >= 0 && minus_segments[i].diagonal >= segmentend) {
+	  i--;
+	}
+	starti = i;
+	while (i >= 0 && minus_segments[i].diagonal > segmentstart) {
+	  if (minus_segments[i].diagonal < (Univcoord_T) -1) {
+	    endi = i;
+	  }
+	  i--;
+	}
+	if (starti >= 0 && endi >= 0) {
+	  debug13(printf("starti = %d, endi = %d\n",starti,endi));
+	  assert(starti >= endi);
+	  for (i = starti; i >= endi; i--) {
+	    debug13(printf("diagonal %u (%llu), querypos %d..%d\n",
+			   (Chrpos_T) (minus_segments[i].diagonal - chroffset),(unsigned long long) minus_segments[i].diagonal,
+			   minus_segments[i].querypos5,minus_segments[i].querypos3));
+	    if (query_lastpos - minus_segments[i].querypos3 >= STAGE2_MIN_OLIGO + index1interval) {
+	      /* Case 2. Missing end of query, so there could be a middle splice */
+	      debug13b(printf("  query_lastpos %d - querypos3 %d >= %d + %d, so using this diagonal plus shortsplicedist\n",
+			      query_lastpos,minus_segments[i].querypos3,STAGE2_MIN_OLIGO,index1interval));
+#ifdef USE_GREEDY
+	      if ((mappingpos = subtract_bounded(minus_segments[i].diagonal,querylength + shortsplicedist_novelend,chroffset)) > middle_mappingstart_greedy &&
+		  mappingpos < genomicbound) {
+		middle_mappingstart_greedy = mappingpos;
+		middle_mappingstart_p = true;
+		debug13(printf("  Redefining middle mappingstart greedy to %u\n",middle_mappingstart_greedy - chroffset));
+	      }
+#endif
+#ifdef LONG_ENDSPLICES
+	      if ((mappingpos = subtract_bounded(minus_segments[i].diagonal,querylength + shortsplicedist,chroffset)) < middle_mappingstart_last) {
+		/* Use < for NOT_GREEDY */
+		middle_mappingstart_last = mappingpos;
+		middle_mappingstart_p = true;
+		debug13(printf("  Redefining middle mappingstart last to %u\n",middle_mappingstart_last - chroffset));
+	      }
+#else
+	      if ((mappingpos = subtract_bounded(minus_segments[i].diagonal,querylength,chroffset)) < middle_mappingstart_last) {
+		/* Use < for NOT_GREEDY */
+		middle_mappingstart_last = mappingpos;
+		middle_mappingstart_p = true;
+		debug13(printf("  Redefining middle mappingstart last to %u\n",middle_mappingstart_last - chroffset));
+	      }
+#endif
+
+	    } else {
+	      debug13b(printf("  query_lastpos %d - querypos3 %d < %d + %d, so using this diagonal\n",
+			      query_lastpos,minus_segments[i].querypos3,STAGE2_MIN_OLIGO,index1interval));
+#ifdef USE_GREEDY
+	      if ((mappingpos = subtract_bounded(minus_segments[i].diagonal,querylength,chroffset)) > close_mappingstart_greedy &&
+		  mappingpos < genomicbound) {
+		close_mappingstart_greedy = mappingpos;
+		close_mappingstart_p = true;
+		debug13(printf("  Redefining close mappingstart greedy to %u\n",close_mappingstart_greedy - chroffset));
+	      }
+#endif
+	      if ((mappingpos = subtract_bounded(minus_segments[i].diagonal,querylength,chroffset)) < close_mappingstart_last) {
+		/* Use < for NOT_GREEDY */
+		close_mappingstart_last = mappingpos;
+		close_mappingstart_p = true;
+		debug13(printf("  Redefining close mappingstart last to %u\n",close_mappingstart_last - chroffset));
+	      }
+	    }
+	  }
+
+#ifdef USE_GREEDY
+	  if (close_mappingstart_p == true) {
+	    close_knownsplice_limit_low = subtract_bounded(close_mappingstart_greedy,shortsplicedist,chroffset);
+	  } else if (middle_mappingstart_p == true) {
+	    debug13(printf("Using middle mappingstart\n"));
+	    close_knownsplice_limit_low = middle_mappingstart_greedy;
+	    close_mappingstart_greedy = middle_mappingstart_greedy;
+	    close_mappingstart_p = true;
+	  }
+#else
+	  if (close_mappingstart_p == true) {
+	    close_knownsplice_limit_low = subtract_bounded(close_mappingstart_last,shortsplicedist,chroffset);
+	  } else if (middle_mappingstart_p == true) {
+	    debug13(printf("Using middle mappingstart\n"));
+	    close_knownsplice_limit_low = middle_mappingstart_last;
+	    close_mappingstart_last = middle_mappingstart_last;
+	    close_mappingstart_p = true;
+	  }
+#endif
+#ifdef USE_GREEDY
+	  if (middle_mappingstart_p == true && middle_mappingstart_last < close_mappingstart_greedy) {
+	    knownsplice_limit_low = middle_mappingstart_last;
+	    mappingstart = middle_mappingstart_last;
+	  } else if (close_mappingstart_p == true && close_mappingstart_last != close_mappingstart_greedy) {
+	    knownsplice_limit_low = subtract_bounded(close_mappingstart_last,shortsplicedist,chroffset);
+	    mappingstart = close_mappingstart_last;
+	  }
+#else
+	  if (middle_mappingstart_p == true && middle_mappingstart_last < close_mappingstart_last) {
+	    knownsplice_limit_low = middle_mappingstart_last;
+	    mappingstart = middle_mappingstart_last;
+	  }
+#endif
+	  if (close_mappingstart_p == false) {
+	    fallback_mappingstart_p = false;
+#ifdef USE_GREEDY
+	  } else if (mappingstart >= close_mappingstart_greedy) {
+	    fallback_mappingstart_p = false;
+#endif
+	  } else {
+	    debug13(printf("Fallback mappingstart = %u\n",mappingstart - chroffset));
+	    fallback_mappingstart_p = true;
+	  }
+	}
+      }
+
+      favor_right_p = false;
+    }
+
+    if ((sensedir = Stage3end_sensedir(hit5)) == SENSE_FORWARD) {
+      sense_try = +1;
+    } else if (sensedir == SENSE_ANTI) {
+      sense_try = -1;
+    } else {
+      sense_try = 0;
+    }
+
+  } else if (hit5 == NULL) {
+    /* Both events are tested by Stage3end_anomalous_splice_p */
+    if ((chrnum = Stage3end_chrnum(hit3)) == 0) {
+      /* Translocation */
+      return (List_T) NULL;
+
+    } else if (Stage3end_hittype(hit3) == SAMECHR_SPLICE) {
+      /* A genomic event that doesn't get reflected in chrnum */
+      return (List_T) NULL;
+
+    } else if ((watsonp = Stage3end_plusp(hit3)) == true) {
+      chroffset = Stage3end_chroffset(hit3);
+      chrhigh = Stage3end_chrhigh(hit3);
+      chrlength = Stage3end_chrlength(hit3);
+
+      if (Shortread_find_primers(queryseq5,queryseq3) == true) {
+	/* Go from genomicend */
+	debug13(printf("Found primers\n"));
+	genomicbound = Stage3end_genomicend(hit3);
+
+      } else if (Stage3end_anomalous_splice_p(hit3) == true) {
+	/* Go from genomicend */
+	debug13(printf("Anomalous splice\n"));
+	genomicbound = Stage3end_genomicend(hit3);
+
+      } else {
+	genomicbound = Stage3end_genomicstart(hit3);
+
+#if 0
+	/* TODO: Previously called Shortread_find_overlap.  Now with Shortread_max_overlap, can optimize this code */
+	if ((overlap = Shortread_max_overlap(queryseq5,queryseq3)) > 0 &&
+	    Stage3end_genomicbound_from_start(&genomicbound2,hit3,overlap,chroffset) == true) {
+	  debug13(printf("Found overlap of %d\n",overlap));
+	  if (genomicbound2 > genomicbound) {
+	    zero_offset = genomicbound2 - genomicbound;
+	    genomicbound = genomicbound2;
+	  }
+	}
+#endif
+      }
+
+      debug13(printf("Case 3: hit3 plus %s %u..%u (sensedir %d) => genomicbound %u\n",
+		     Stage3end_hittype_string(hit3),
+		     Stage3end_genomicstart(hit3) - chroffset,Stage3end_genomicend(hit3) - chroffset,
+		     Stage3end_sensedir(hit3),genomicbound - chroffset));
+
+      knownsplice_limit_high = mappingend = segmentend = genomicbound;
+      knownsplice_limit_low = subtract_bounded(Stage3end_genomicstart(hit3),pairmax + shortsplicedist,chroffset);
+      segmentstart = subtract_bounded(Stage3end_genomicstart(hit3),pairmax,chroffset);
+#ifdef LONG_ENDSPLICES
+      mappingstart = subtract_bounded(Stage3end_genomicstart(hit3),pairmax + shortsplicedist,chroffset);
+#else
+      mappingstart = subtract_bounded(Stage3end_genomicstart(hit3),pairmax + shortsplicedist_novelend,chroffset);
+#endif
+
+      close_mappingstart_last = middle_mappingstart_last = Stage3end_genomicstart(hit3);
+#ifdef USE_GREEDY
+      close_mappingstart_greedy = middle_mappingstart_greedy = segmentstart;
+#endif
+
+      if (plus_nsegments > 0) {
+	/* Use segments to bound */
+	debug13(printf("Finding segments from segmentstart %u to segmentend %u (plus_nsegments %d)\n",
+		       segmentstart - chroffset,segmentend - chroffset,plus_nsegments));
+	starti = endi = -1;
+	i = binary_search_segments(0,plus_nsegments-1,plus_segments,segmentend);
+	while (i >= 0 && plus_segments[i].diagonal >= segmentend) {
+	  i--;
+	}
+	starti = i;
+	while (i >= 0 && plus_segments[i].diagonal > segmentstart) {
+	  if (plus_segments[i].diagonal < (Univcoord_T) -1) {
+	    endi = i;
+	  }
+	  i--;
+	}
+	if (starti >= 0 && endi >= 0) {
+	  debug13(printf("starti = %d, endi = %d\n",starti,endi));
+	  assert(starti >= endi);
+	  for (i = starti; i >= endi; i--) {
+	    debug13(printf("diagonal %u (%llu), querypos %d..%d\n",
+			   (Chrpos_T) (plus_segments[i].diagonal - chroffset),(unsigned long long) plus_segments[i].diagonal,
+			   plus_segments[i].querypos5,plus_segments[i].querypos3));
+	    if (plus_segments[i].querypos5 >= STAGE2_MIN_OLIGO + index1interval) {
+	      /* Case 3. Missing start of query, so there could be a middle splice */
+	      debug13b(printf("  querypos5 %d >= %d + %d, so using this diagonal plus shortsplicedist\n",
+			      plus_segments[i].querypos5,STAGE2_MIN_OLIGO,index1interval));
+#ifdef USE_GREEDY
+	      if ((mappingpos = subtract_bounded(plus_segments[i].diagonal,querylength + shortsplicedist_novelend,chroffset)) > middle_mappingstart_greedy &&
+		  mappingpos < genomicbound) {
+		middle_mappingstart_greedy = mappingpos;
+		middle_mappingstart_p = true;
+		debug13(printf("  Redefining middle mappingstart greedy to %u\n",middle_mappingstart_greedy - chroffset));
+	      }
+#endif
+#ifdef LONG_ENDSPLICES
+	      if ((mappingpos = subtract_bounded(plus_segments[i].diagonal,querylength + shortsplicedist,chroffset)) < middle_mappingstart_last) {
+		/* Use < for NOT_GREEDY */
+		middle_mappingstart_last = mappingpos;
+		middle_mappingstart_p = true;
+		debug13(printf("  Redefining middle mappingstart last to %u\n",middle_mappingstart_last - chroffset));
+	      }
+#else
+	      if ((mappingpos = subtract_bounded(plus_segments[i].diagonal,querylength,chroffset)) < middle_mappingstart_last) {
+		/* Use < for NOT_GREEDY */
+		middle_mappingstart_last = mappingpos;
+		middle_mappingstart_p = true;
+		debug13(printf("  Redefining middle mappingstart last to %u\n",middle_mappingstart_last - chroffset));
+	      }
+#endif
+
+	    } else {
+	      debug13b(printf("  querypos5 %d < %d + %d, so using this diagonal\n",
+			      plus_segments[i].querypos5,STAGE2_MIN_OLIGO,index1interval));
+#ifdef USE_GREEDY
+	      if ((mappingpos = subtract_bounded(plus_segments[i].diagonal,querylength,chroffset)) > close_mappingstart_greedy &&
+		  mappingpos < genomicbound) {
+		close_mappingstart_greedy = mappingpos;
+		close_mappingstart_p = true;
+		debug13(printf("  Redefining close mappingstart greedy to %u\n",close_mappingstart_greedy - chroffset));
+	      }
+#endif
+	      if ((mappingpos = subtract_bounded(plus_segments[i].diagonal,querylength,chroffset)) < close_mappingstart_last) {
+		/* Use < for NOT_GREEDY */
+		close_mappingstart_last = mappingpos;
+		close_mappingstart_p = true;
+		debug13(printf("  Redefining close mappingstart last to %u\n",close_mappingstart_last - chroffset));
+	      }
+	    }
+	  }
+
+#ifdef USE_GREEDY
+	  if (close_mappingstart_p == true) {
+	    close_knownsplice_limit_low = subtract_bounded(close_mappingstart_greedy,shortsplicedist,chroffset);
+	  } else if (middle_mappingstart_p == true) {
+	    debug13(printf("Using middle mappingstart\n"));
+	    close_knownsplice_limit_low = middle_mappingstart_greedy;
+	    close_mappingstart_greedy = middle_mappingstart_greedy;
+	    close_mappingstart_p = true;
+	  }
+#else
+	  if (close_mappingstart_p == true) {
+	    close_knownsplice_limit_low = subtract_bounded(close_mappingstart_last,shortsplicedist,chroffset);
+	  } else if (middle_mappingstart_p == true) {
+	    debug13(printf("Using middle mappingstart\n"));
+	    close_knownsplice_limit_low = middle_mappingstart_last;
+	    close_mappingstart_last = middle_mappingstart_last;
+	    close_mappingstart_p = true;
+	  }
+#endif
+#ifdef USE_GREEDY
+	  if (middle_mappingstart_p == true && middle_mappingstart_last < close_mappingstart_greedy) {
+	    knownsplice_limit_low = middle_mappingstart_last;
+	    mappingstart = middle_mappingstart_last;
+	  } else if (close_mappingstart_p == true && close_mappingstart_last != close_mappingstart_greedy) {
+	    knownsplice_limit_low = subtract_bounded(close_mappingstart_last,shortsplicedist,chroffset);
+	    mappingstart = close_mappingstart_last;
+	  }
+#else
+	  if (middle_mappingstart_p == true && middle_mappingstart_last < close_mappingstart_last) {
+	    knownsplice_limit_low = middle_mappingstart_last;
+	    mappingstart = middle_mappingstart_last;
+	  }
+#endif
+	  if (close_mappingstart_p == false) {
+	    fallback_mappingstart_p = false;
+#ifdef USE_GREEDY
+	  } else if (mappingstart >= close_mappingstart_greedy) {
+	    fallback_mappingstart_p = false;
+#endif
+	  } else {
+	    debug13(printf("Fallback mappingstart = %u\n",mappingstart - chroffset));
+	    fallback_mappingstart_p = true;
+	  }
+	}
+      }
+
+      favor_right_p = true;
+
+    } else {
+      chroffset = Stage3end_chroffset(hit3);
+      chrhigh = Stage3end_chrhigh(hit3);
+      chrlength = Stage3end_chrlength(hit3);
+
+      if (Shortread_find_primers(queryseq5,queryseq3) == true) {
+	/* Go from genomicend */
+	debug13(printf("Found primers\n"));
+	genomicbound = Stage3end_genomicend(hit3);
+
+      } else if (Stage3end_anomalous_splice_p(hit3) == true) {
+	/* Go from genomicend */
+	debug13(printf("Anomalous splice\n"));
+	genomicbound = Stage3end_genomicend(hit3);
+
+      } else {
+	genomicbound = Stage3end_genomicstart(hit3);
+
+#if 0
+	/* TODO: Previously called Shortread_find_overlap.  Now with Shortread_max_overlap, can optimize this code */
+	if ((overlap = Shortread_max_overlap(queryseq5,queryseq3)) > 0 &&
+	    Stage3end_genomicbound_from_start(&genomicbound2,hit3,overlap,chroffset) == true) {
+	  debug13(printf("Found overlap of %d\n",overlap));
+	  if (genomicbound2 < genomicbound) {
+	    zero_offset = genomicbound - genomicbound2;
+	    genomicbound = genomicbound2;
+	  }
+	}
+#endif
+      }
+
+      debug13(printf("Case 4: hit3 minus %s %u..%u (sensedir %d) => genomicbound %u\n",
+		     Stage3end_hittype_string(hit3),
+		     Stage3end_genomicstart(hit3) - chroffset,Stage3end_genomicend(hit3) - chroffset,
+		     Stage3end_sensedir(hit3),genomicbound - chroffset));
+
+      knownsplice_limit_low = mappingstart = segmentstart = genomicbound;
+      knownsplice_limit_high = add_bounded(Stage3end_genomicstart(hit3),pairmax + shortsplicedist,chrhigh);
+      segmentend = add_bounded(Stage3end_genomicstart(hit3),pairmax,chrhigh);
+#ifdef LONG_ENDSPLICES
+      mappingend = add_bounded(Stage3end_genomicstart(hit3),pairmax + shortsplicedist,chrhigh);
+#else
+      mappingend = add_bounded(Stage3end_genomicstart(hit3),pairmax + shortsplicedist_novelend,chrhigh);
+#endif
+
+      close_mappingend_last = middle_mappingend_last = Stage3end_genomicstart(hit3);
+#ifdef USE_GREEDY
+      close_mappingend_greedy = middle_mappingend_greedy = segmentend;
+#endif
+
+      if (minus_nsegments > 0) {
+	/* Use segments to bound */
+	debug13(printf("Finding segments from segmentstart %u to segmentend %u (minus_nsegments %d)\n",
+		       segmentstart - chroffset,segmentend - chroffset,minus_nsegments));
+	starti = endi = -1;
+	i = binary_search_segments(0,minus_nsegments-1,minus_segments,segmentstart);
+	while (i < minus_nsegments - 1 && minus_segments[i].diagonal == (Univcoord_T) -1) {
+	  i++;
+	}
+	starti = i;
+	while (minus_segments[i].diagonal < segmentend) {
+	  endi = i;
+	  i++;
+	}
+	if (starti >= 0 && endi >= 0) {
+	  debug13(printf("starti = %d, endi = %d\n",starti,endi));
+	  assert(starti <= endi);
+	  for (i = starti; i <= endi; i++) {
+	    debug13(printf("diagonal %u (%llu), querypos %d..%d\n",
+			   (Chrpos_T) (minus_segments[i].diagonal - chroffset),(unsigned long long) minus_segments[i].diagonal,
+			   minus_segments[i].querypos5,minus_segments[i].querypos3));
+	    if (minus_segments[i].querypos5 >= STAGE2_MIN_OLIGO + index1interval) {
+	      /* Case 4. Missing start of query, so there could be a middle splice */
+	      debug13b(printf("  querypos5 %d >= %d + %d, so using this diagonal plus shortsplicedist\n",
+			      minus_segments[i].querypos5,STAGE2_MIN_OLIGO,index1interval));
+#ifdef USE_GREEDY
+	      if ((mappingpos = add_bounded(minus_segments[i].diagonal,shortsplicedist_novelend,chrhigh)) < middle_mappingend_greedy &&
+		  mappingpos > genomicbound) {
+		middle_mappingend_greedy = mappingpos;
+		middle_mappingend_p = true;
+		debug13(printf("  Redefining middle mappingend greedy to %u\n",middle_mappingend_greedy - chroffset));
+	      }
+#endif
+#ifdef LONG_ENDSPLICES
+	      if ((mappingpos = add_bounded(minus_segments[i].diagonal,shortsplicedist,chrhigh)) > middle_mappingend_last) {
+		/* Use > for NOT_GREEDY */
+		middle_mappingend_last = mappingpos;
+		middle_mappingend_p = true;
+		debug13(printf("  Redefining middle mappingend to %u\n",middle_mappingend_last - chroffset));
+	      }
+#else
+	      if ((mappingpos = minus_segments[i].diagonal) > middle_mappingend_last) {
+		/* Use > for NOT_GREEDY */
+		middle_mappingend_last = mappingpos;
+		middle_mappingend_p = true;
+		debug13(printf("  Redefining middle mappingend to %u\n",middle_mappingend_last - chroffset));
+	      }
+#endif
+
+	    } else {
+	      debug13b(printf("  querypos5 %d < %d + %d, so using this diagonal\n",
+			      minus_segments[i].querypos5,STAGE2_MIN_OLIGO,index1interval));
+#ifdef USE_GREEDY
+	      if ((mappingpos = minus_segments[i].diagonal) < close_mappingend_greedy &&
+		  mappingpos > genomicbound) {
+		close_mappingend_greedy = mappingpos;
+		close_mappingend_p = true;
+		debug13(printf("  Redefining close mappingend greedy to %u\n",close_mappingend_greedy - chroffset));
+	      }
+#endif
+	      if ((mappingpos = minus_segments[i].diagonal) > close_mappingend_last) {
+		/* Use > for NOT_GREEDY */
+		close_mappingend_last = mappingpos;
+		close_mappingend_p = true;
+		debug13(printf("  Redefining close mappingend last to %u\n",close_mappingend_last - chroffset));
+	      }
+	    }
+	  }
+
+#ifdef USE_GREEDY
+	  if (close_mappingend_p == true) {
+	    close_knownsplice_limit_high = add_bounded(close_mappingend_greedy,shortsplicedist,chrhigh);
+	  } else if (middle_mappingend_p == true) {
+	    debug13(printf("Using middle mappingend\n"));
+	    close_knownsplice_limit_high = middle_mappingend_greedy;
+	    close_mappingend_greedy = middle_mappingend_greedy;
+	    close_mappingend_p = true;
+	  }
+#else
+	  if (close_mappingend_p == true) {
+	    close_knownsplice_limit_high = add_bounded(close_mappingend_last,shortsplicedist,chrhigh);
+	  } else if (middle_mappingend_p == true) {
+	    debug13(printf("Using middle mappingend\n"));
+	    close_knownsplice_limit_high = middle_mappingend_last;
+	    close_mappingend_last = middle_mappingend_last;
+	    close_mappingend_p = true;
+	  }
+#endif
+#ifdef USE_GREEDY
+	  if (middle_mappingend_p == true && middle_mappingend_last > close_mappingend_greedy) {
+	    knownsplice_limit_high = middle_mappingend_last;
+	    mappingend = middle_mappingend_last;
+	  } else if (close_mappingend_p == true && close_mappingend_last != close_mappingend_greedy) {
+	    knownsplice_limit_high = add_bounded(close_mappingend_last,shortsplicedist,chrhigh);
+	    mappingend = close_mappingend_last;
+	  }
+#else
+	  if (middle_mappingend_p == true && middle_mappingend_last > close_mappingend_last) {
+	    knownsplice_limit_high = middle_mappingend_last;
+	    mappingend = middle_mappingend_last;
+	  }
+#endif
+	  if (close_mappingend_p == false) {
+	    fallback_mappingend_p = false;
+#ifdef USE_GREEDY
+	  } else if (mappingend <= close_mappingend_greedy) {
+	    fallback_mappingend_p = false;
+#endif
+	  } else {
+	    debug13(printf("Fallback mappingend = %u\n",mappingend - chroffset));
+	    fallback_mappingend_p = true;
+	  }
+	}
+      }
+
+      favor_right_p = true;
+    }
+
+    if ((sensedir = Stage3end_sensedir(hit3)) == SENSE_FORWARD) {
+      sense_try = +1;
+    } else if (sensedir == SENSE_ANTI) {
+      sense_try = -1;
+    } else {
+      sense_try = 0;
+    }
+
+  } else {
+    abort();
+  }
+
+#ifdef OLD_GENOMICBOUND
+  knownsplice_limit_low = genomicstart + querylength;
+  knownsplice_limit_high = genomicend - querylength;
+#endif
+
+  if (close_mappingstart_p == true && close_mappingend_p == true) {
+    debug13(printf("Halfmapping: Running gmap with close mappingstart and close mappingend\n"));
+    hits = run_gmap_for_region(&good_start_p,&good_end_p,gmap_history,
+			       hits,Shortread_accession(queryseq5),queryuc_ptr,querylength,sense_try,favor_right_p,
+			       /*paired_favor_mode*/favor_right_p == true ? +1 : -1,zero_offset,
+			       query_compress_fwd,query_compress_rev,close_mappingstart_last,close_mappingend_last,
+			       close_knownsplice_limit_low,close_knownsplice_limit_high,
+			       watsonp,genestrand,first_read_p,chrnum,chroffset,chrhigh,chrlength,
+			       oligoindices_major,oligoindices_minor,
+			       pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,user_maxlevel);
+
+    if (good_start_p == true && good_end_p == true) {
+      /* Success */
+    } else if (gmap_rerun_p == false) {
+      debug13(printf("Skipping re-run of gmap\n"));
+    } else if (/* require both ends to be good */ 0 && good_start_p == true) {
+      if (fallback_mappingend_p == true) {
+	debug13(printf("Halfmapping: Re-running gmap with close mappingstart only\n"));
+	hits = run_gmap_for_region(&good_start_p,&good_end_p,gmap_history,
+				   hits,Shortread_accession(queryseq5),queryuc_ptr,querylength,sense_try,favor_right_p,
+				   /*paired_favor_mode*/favor_right_p == true ? +1 : -1,zero_offset,
+				   query_compress_fwd,query_compress_rev,close_mappingstart_last,mappingend,
+				   close_knownsplice_limit_low,knownsplice_limit_high,
+				   watsonp,genestrand,first_read_p,chrnum,chroffset,chrhigh,chrlength,
+				   oligoindices_major,oligoindices_minor,
+				   pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,user_maxlevel);
+      }
+
+    } else if (/* require both ends to be good */ 0 && good_end_p == true) {
+      if (fallback_mappingstart_p == true) {
+	debug13(printf("Halfmapping: Re-running gmap with close mappingend only\n"));
+	hits = run_gmap_for_region(&good_start_p,&good_end_p,gmap_history,
+				   hits,Shortread_accession(queryseq5),queryuc_ptr,querylength,sense_try,favor_right_p,
+				   /*paired_favor_mode*/favor_right_p == true ? +1 : -1,zero_offset,
+				   query_compress_fwd,query_compress_rev,mappingstart,close_mappingend_last,
+				   knownsplice_limit_low,close_knownsplice_limit_high,
+				   watsonp,genestrand,first_read_p,chrnum,chroffset,chrhigh,chrlength,
+				   oligoindices_major,oligoindices_minor,
+				   pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,user_maxlevel);
+      }
+    } else {
+      if (fallback_mappingstart_p == true && fallback_mappingend_p == true) {
+	debug13(printf("Halfmapping: Re-running gmap with far mappingstart and mappingend\n"));
+	hits = run_gmap_for_region(&good_start_p,&good_end_p,gmap_history,
+				   hits,Shortread_accession(queryseq5),queryuc_ptr,querylength,sense_try,favor_right_p,
+				   /*paired_favor_mode*/favor_right_p == true ? +1 : -1,zero_offset,
+				   query_compress_fwd,query_compress_rev,mappingstart,mappingend,
+				   knownsplice_limit_low,knownsplice_limit_high,
+				   watsonp,genestrand,first_read_p,chrnum,chroffset,chrhigh,chrlength,
+				   oligoindices_major,oligoindices_minor,
+				   pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,user_maxlevel);
+      }
+    }
+
+  } else if (close_mappingstart_p == true) {
+    debug13(printf("Halfmapping: Running gmap with close mappingstart\n"));
+    hits = run_gmap_for_region(&good_start_p,&good_end_p,gmap_history,
 			       hits,Shortread_accession(queryseq5),queryuc_ptr,querylength,sense_try,favor_right_p,
 			       /*paired_favor_mode*/favor_right_p == true ? +1 : -1,zero_offset,
 			       query_compress_fwd,query_compress_rev,close_mappingstart_last,mappingend,
@@ -17459,7 +17519,6 @@ align_pair_with_gmap (Pairtype_T *final_pairtype, List_T result,
   Stage3end_T hit5, hit3, gmap5, gmap3;
   List_T p, a, b, rest;
   int genestrand;
-  int missing_hit, missing_gmap;
   int i;
   bool replacedp;
 
@@ -17800,8 +17859,7 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
 	    int user_maxlevel_5, int user_maxlevel_3, int indel_penalty_middle, int indel_penalty_end,
 	    int localsplicing_penalty, int distantsplicing_penalty, int min_shortend,
 	    bool allow_end_indels_p, int max_end_insertions, int max_end_deletions, int min_indel_end_matches,
-	    bool allvalidp5, bool allvalidp3, Chrpos_T pairmax,
-	    int maxpairedpaths, bool keep_floors_p, Shortread_T queryseq5, Shortread_T queryseq3,
+	    Chrpos_T pairmax, int maxpairedpaths, bool keep_floors_p, Shortread_T queryseq5, Shortread_T queryseq3,
 	    int genestrand) {
 
   List_T hitpairs = NULL, p;
@@ -17811,15 +17869,15 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
   List_T hitarray5[HITARRAY_N], hitarray3[HITARRAY_N];
   List_T plus_anchor_segments_5 = NULL, minus_anchor_segments_5 = NULL, plus_anchor_segments_3 = NULL, minus_anchor_segments_3 = NULL;
   List_T greedy5 = NULL, subs5 = NULL, terminals5 = NULL,
-    indels5 = NULL, ambiguous5 = NULL, singlesplicing5 = NULL, doublesplicing5 = NULL,
+    indels5 = NULL, singlesplicing5 = NULL, doublesplicing5 = NULL,
     distantsplicing5 = NULL, gmap5_hits = NULL;
   List_T greedy3 = NULL, subs3 = NULL, terminals3 = NULL,
-    indels3 = NULL, ambiguous3 = NULL, singlesplicing3 = NULL, doublesplicing3 = NULL,
+    indels3 = NULL, singlesplicing3 = NULL, doublesplicing3 = NULL,
     distantsplicing3 = NULL, gmap3_hits = NULL;
   List_T longsinglesplicing5 = NULL, longsinglesplicing3 = NULL;
   int nmisses_allowed_sarray_5, nmisses_allowed_sarray_3;
   int ignore_found_score, done_level_5, done_level_3, opt_level, fast_level_5, fast_level_3,
-    mismatch_level_5, mismatch_level_3, nmismatches, max_mismatches_allowed;
+    mismatch_level_5, mismatch_level_3, nmismatches;
   int max_splice_mismatches_5 = -1, max_splice_mismatches_3 = -1, i;
   int nhits5 = 0, nhits3 = 0, nsplicepairs5 = 0, nsplicepairs3 = 0;
   List_T *donors_plus_5, *antidonors_plus_5, *acceptors_plus_5, *antiacceptors_plus_5,
@@ -17832,11 +17890,13 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
   bool any_omitted_p_5, any_omitted_p_3;
   Floors_T floors5, floors3;
   bool alloc_floors_p_5 = false, alloc_floors_p_3 = false, floors5_computed_p = false, floors3_computed_p = false,
-    segments5_computed_p = false, segments3_computed_p = false, alloc5p, alloc3p;
+    segments5_computed_p = false, segments3_computed_p = false;
   int best_score_paired;
   bool found_terminals_p = false;
   int nconcordant = 0, nsamechr = 0;
   Indexdb_T plus_indexdb_5, plus_indexdb_3, minus_indexdb_5, minus_indexdb_3;
+  bool allvalidp5, allvalidp3;
+
 
   if (genestrand == +2) {
     plus_indexdb_5 = indexdb_rev;
@@ -17860,13 +17920,25 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
   *found_score = querylength5 + querylength3;
   ignore_found_score = querylength5 + querylength3;
 
-  fast_level_5 = (querylength5 + index1interval - 1)/spansize - NREQUIRED_FAST;
-  fast_level_3 = (querylength3 + index1interval - 1)/spansize - NREQUIRED_FAST;
+  if (querylength5 < min_kmer_readlength) {
+    fast_level_5 = querylength5 - 1 - NREQUIRED_FAST;
+    debug(printf("fast_level_5 %d = querylength %d - 1 - nrequired_fast %d\n",
+		 fast_level_5,querylength5,NREQUIRED_FAST));
+  } else {
+    fast_level_5 = (querylength5 + index1interval - 1)/spansize - NREQUIRED_FAST;
+    debug(printf("fast_level_5 %d = (querylength %d + index1interval %d - 1)/spansize %d - nrequired_fast %d\n",
+		 fast_level_5,querylength5,index1interval,spansize,NREQUIRED_FAST));
+  }
 
-  debug(printf("fast_level_5 %d = (querylength %d + index1interval %d - 1)/spansize %d - nrequired_fast %d\n",
-	       fast_level_5,querylength5,index1interval,spansize,NREQUIRED_FAST));
-  debug(printf("fast_level_3 %d = (querylength %d + index1interval %d - 1)/spansize %d - nrequired_fast %d\n",
-	       fast_level_3,querylength3,index1interval,spansize,NREQUIRED_FAST));
+  if (querylength3 < min_kmer_readlength) {
+    fast_level_3 = querylength3 - 1 - NREQUIRED_FAST;
+    debug(printf("fast_level_3 %d = querylength %d - 1 - nrequired_fast %d\n",
+		 fast_level_3,querylength3,NREQUIRED_FAST));
+  } else {
+    fast_level_3 = (querylength3 + index1interval - 1)/spansize - NREQUIRED_FAST;
+    debug(printf("fast_level_3 %d = (querylength %d + index1interval %d - 1)/spansize %d - nrequired_fast %d\n",
+		 fast_level_3,querylength3,index1interval,spansize,NREQUIRED_FAST));
+  }
 
 #if 0
   /* This prevents complete_mm procedure, needed for short reads */
@@ -18041,6 +18113,13 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
   }
 #endif
 
+  if (querylength5 < min_kmer_readlength) {
+    spanningset5p = false;
+  }
+  if (querylength3 < min_kmer_readlength) {
+    spanningset3p = false;
+  }
+
   /* Search 2: Exact/subs via spanning set algorithm */
   if (spanningset5p == true || spanningset3p == true) {
     /* 1A. Exact.  Requires compress if cmet or genomealt.  Creates and uses spanning set. */
@@ -18261,6 +18340,13 @@ align_pair (bool *abort_pairing_p, int *found_score, int *cutoff_level_5, int *c
     debug(printf("Test for completeset using better_free_end_exists_p: completeset5p %d, completeset3p %d\n",completeset5p,completeset3p));
   }
 
+  if (querylength5 < min_kmer_readlength) {
+    completeset5p = false;
+  }
+  if (querylength3 < min_kmer_readlength) {
+    completeset3p = false;
+  }
+
   if (completeset5p == true) {
     debug(printf("Performing complete set analysis on 5' end\n"));
     if (this5->read_oligos_p == false) {
@@ -19746,7 +19832,7 @@ realign_separately (Stage3end_T **stage3array5, int *nhits5, int *first_absmq5,
 			    user_maxlevel_5,indel_penalty_middle,indel_penalty_end,
 			    localsplicing_penalty,distantsplicing_penalty,min_shortend,
 			    allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
-			    allvalidp5,keep_floors_p,genestrand,/*first_read_p*/true);
+			    keep_floors_p,genestrand,/*first_read_p*/true);
   }
 
   if ((*nhits5 = List_length(singlehits5)) == 0) {
@@ -19774,7 +19860,7 @@ realign_separately (Stage3end_T **stage3array5, int *nhits5, int *first_absmq5,
 			    user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
 			    localsplicing_penalty,distantsplicing_penalty,min_shortend,
 			    allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
-			    allvalidp3,keep_floors_p,genestrand,/*first_read_p*/false);
+			    keep_floors_p,genestrand,/*first_read_p*/false);
   }
 
   if ((*nhits3 = List_length(singlehits3)) == 0) {
@@ -20345,8 +20431,6 @@ paired_read (int *npaths, int *first_absmq, int *second_absmq, Pairtype_T *final
   int user_maxlevel_5, user_maxlevel_3;
   int found_score, cutoff_level_5, cutoff_level_3;
   int querylength5, querylength3, query5_lastpos, query3_lastpos;
-  int noligos5, noligos3;
-  bool allvalidp5, allvalidp3;
 #if 0
   int maxpairedpaths = 10*maxpaths; /* For computation, not for printing. */
 #else
@@ -20364,258 +20448,128 @@ paired_read (int *npaths, int *first_absmq, int *second_absmq, Pairtype_T *final
   querylength5 = Shortread_fulllength(queryseq5);
   querylength3 = Shortread_fulllength(queryseq3);
 
-#ifdef HAVE_ALLOCA
-  queryrc5 = (char *) ALLOCA((querylength5+1)*sizeof(char));
-  queryrc3 = (char *) ALLOCA((querylength3+1)*sizeof(char));
-#endif
-
-  if (querylength5 < min_readlength && querylength3 < min_readlength) {
-    fprintf(stderr,"Paired-read %s has lengths %d and %d < min_readlength %d.  Skipping.\n",
-	    Shortread_accession(queryseq5),querylength5,querylength3,min_readlength);
-    /* fprintf(stderr,"You may want to build a genomic index with a smaller k-mer value using the -k flag to gmap_build\n"); */
-    *npaths = *nhits5 = *nhits3 = 0;
-    *stage3array5 = *stage3array3 = (Stage3end_T *) NULL;
-    return (Stage3pair_T *) NULL;
-
 #ifndef HAVE_ALLOCA
-  } else if (querylength5 > MAX_READLENGTH || querylength3 > MAX_READLENGTH) {
+  if (querylength5 > MAX_READLENGTH || querylength3 > MAX_READLENGTH) {
     fprintf(stderr,"Paired-read %s has lengths %d and %d > MAX_READLENGTH %d.  Either run configure and make again with a higher value of MAX_READLENGTH, or consider using GMAP instead.\n",
 	    Shortread_accession(queryseq5),querylength5,querylength3,MAX_READLENGTH);
     *npaths = *nhits5 = *nhits3 = 0;
     *stage3array5 = *stage3array3 = (Stage3end_T *) NULL;
     return (Stage3pair_T *) NULL;
+  }
+#else
+  queryrc5 = (char *) ALLOCA((querylength5+1)*sizeof(char));
+  queryrc3 = (char *) ALLOCA((querylength3+1)*sizeof(char));
 #endif
 
-  } else if (querylength5 < min_readlength) {
-    /* Solve just 3' end */
-    fprintf(stderr,"First end of paired-read %s has length %d < min_readlength %d.  Aligning second end only.\n",
-	    Shortread_accession(queryseq5),querylength5,min_readlength);
-    *nhits5 = 0;
-    *stage3array5 = (Stage3end_T *) NULL;
-
-    if (user_maxlevel_float < 0.0) {
-      user_maxlevel_3 = -1;
-    } else if (user_maxlevel_float > 0.0 && user_maxlevel_float < 1.0) {
-      user_maxlevel_3 = (int) rint(user_maxlevel_float * (double) querylength3);
-    } else {
-      user_maxlevel_3 = (int) user_maxlevel_float;
-    }
+  if (user_maxlevel_float < 0.0) {
+    user_maxlevel_5 = user_maxlevel_3 = -1;
+  } else if (user_maxlevel_float > 0.0 && user_maxlevel_float < 1.0) {
+    user_maxlevel_5 = (int) rint(user_maxlevel_float * (double) querylength5);
+    user_maxlevel_3 = (int) rint(user_maxlevel_float * (double) querylength3);
+  } else {
+    user_maxlevel_5 = user_maxlevel_3 = (int) user_maxlevel_float;
+  }
+
+  this5 = Stage1_new(querylength5);
+  this3 = Stage1_new(querylength3);
+  queryuc_ptr_5 = Shortread_fullpointer_uc(queryseq5);
+  queryuc_ptr_3 = Shortread_fullpointer_uc(queryseq3);
+  quality_string_5 = Shortread_quality_string(queryseq5);
+  quality_string_3 = Shortread_quality_string(queryseq3);
+  query5_lastpos = querylength5 - index1part;
+  query3_lastpos = querylength3 - index1part;
+
+  /* Limit search on repetitive sequences */
+  if (check_dinucleotides(queryuc_ptr_5,querylength5) == false) {
+    user_maxlevel_5 = 0;
+  }
+  if (check_dinucleotides(queryuc_ptr_3,querylength3) == false) {
+    user_maxlevel_3 = 0;
+  }
+
+  query5_compress_fwd = Compress_new_fwd(queryuc_ptr_5,querylength5);
+  query5_compress_rev = Compress_new_rev(queryuc_ptr_5,querylength5);
+  query3_compress_fwd = Compress_new_fwd(queryuc_ptr_3,querylength3);
+  query3_compress_rev = Compress_new_rev(queryuc_ptr_3,querylength3);
+  gmap_history_5 = History_new();
+  gmap_history_3 = History_new();
+  make_complement_buffered(queryrc5,queryuc_ptr_5,querylength5);
+  make_complement_buffered(queryrc3,queryuc_ptr_3,querylength3);
+
+  hitpairs = align_pair(&abort_pairing_p,&found_score,&cutoff_level_5,&cutoff_level_3,
+			&samechr,&conc_transloc,gmap_history_5,gmap_history_3,
+			&hits5,&hits3,this5,this3,query5_compress_fwd,query5_compress_rev,
+			query3_compress_fwd,query3_compress_rev,
+			queryuc_ptr_5,queryuc_ptr_3,queryrc5,queryrc3,
+			querylength5,querylength3,query5_lastpos,query3_lastpos,
+			indexdb_fwd,indexdb_rev,indexdb_size_threshold,floors_array,
 
-    this3 = Stage1_new(querylength3);
-    queryuc_ptr_3 = Shortread_fullpointer_uc(queryseq3);
-    quality_string_3 = Shortread_quality_string(queryseq3);
-    query3_lastpos = querylength3 - index1part;
+			oligoindices_major,oligoindices_minor,
+			pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
 
-    /* Limit search on repetitive sequences */
-    if (check_dinucleotides(queryuc_ptr_3,querylength3) == false) {
-      user_maxlevel_3 = 0;
-    }
+			user_maxlevel_5,user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
+			localsplicing_penalty,distantsplicing_penalty,min_shortend,
+			allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
+			pairmax,maxpairedpaths,keep_floors_p,queryseq5,queryseq3,/*genestrand*/0);
 
-    query3_compress_fwd = Compress_new_fwd(queryuc_ptr_3,querylength3);
-    query3_compress_rev = Compress_new_rev(queryuc_ptr_3,querylength3);
-    gmap_history_3 = History_new();
-    make_complement_buffered(queryrc3,queryuc_ptr_3,querylength3);
+  if (abort_pairing_p == true) {
+    debug16(printf("abort_pairing_p is true\n"));
+    paired_results_free(this5,this3,hitpairs,samechr,conc_transloc,
+			hits5,hits3,querylength5,querylength3);
 
-    hits3 = align_end(&cutoff_level_3,gmap_history_3,this3,
-		      query3_compress_fwd,query3_compress_rev,
-		      Shortread_accession(queryseq5),queryuc_ptr_3,queryrc3,querylength3,query3_lastpos,
-		      indexdb_fwd,indexdb_rev,indexdb_size_threshold,
-		      floors_array,oligoindices_major,oligoindices_minor,
-		      pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
-		      user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
-		      localsplicing_penalty,distantsplicing_penalty,min_shortend,
-		      allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
-		      allvalidp3,keep_floors_p,/*genestrand*/0,/*first_read_p*/false);
+    this5 = Stage1_new(querylength5);
+    this3 = Stage1_new(querylength3);
+    realign_separately(stage3array5,&(*nhits5),&(*first_absmq5),&(*second_absmq5),
+		       stage3array3,&(*nhits3),&(*first_absmq3),&(*second_absmq3),
+		       gmap_history_5,gmap_history_3,this5,this3,
+		       query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
+		       queryseq5,queryuc_ptr_5,queryrc5,quality_string_5,querylength5,query5_lastpos,
+		       queryseq3,queryuc_ptr_3,queryrc3,quality_string_3,querylength3,query3_lastpos,
+		       indexdb_fwd,indexdb_rev,indexdb_size_threshold,floors_array,
+		       user_maxlevel_5,user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
+		       allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
+		       localsplicing_penalty,distantsplicing_penalty,min_shortend,
+		       oligoindices_major,oligoindices_minor,
+		       pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
+		       keep_floors_p,/*genestrand*/0);
 
-    if ((*nhits3 = List_length(hits3)) == 0) {
-      *stage3array3 = (Stage3end_T *) NULL;
-    } else {
-      *stage3array3 = (Stage3end_T *) List_to_array_out(hits3,NULL); List_free(&hits3); /* Return value */
-      *stage3array3 = Stage3end_eval_and_sort(&(*nhits3),&(*first_absmq3),&(*second_absmq3),
-					      *stage3array3,maxpaths_search,queryseq3,
-					      queryuc_ptr_3,queryrc3,
-					      query3_compress_fwd,query3_compress_rev,
-					      quality_string_3,/*displayp*/true);
-    }
     *npaths = 0;
     *final_pairtype = UNPAIRED;
     History_free(&gmap_history_3);
+    History_free(&gmap_history_5);
+    Compress_free(&query5_compress_fwd);
+    Compress_free(&query5_compress_rev);
     Compress_free(&query3_compress_fwd);
     Compress_free(&query3_compress_rev);
+    Stage1_free(&this5,querylength5);
     Stage1_free(&this3,querylength3);
     return (Stage3pair_T *) NULL;
 
-  } else if (querylength3 < min_readlength) {
-    /* Solve just 5' end */
-    fprintf(stderr,"Second end of paired-read %s has length %d < min_readlength %d.  Aligning first end only.\n",
-	    Shortread_accession(queryseq5),querylength3,min_readlength);
-    *nhits3 = 0;
-    *stage3array3 = (Stage3end_T *) NULL;
-
-    if (user_maxlevel_float < 0.0) {
-      user_maxlevel_5 = -1;
-    } else if (user_maxlevel_float > 0.0 && user_maxlevel_float < 1.0) {
-      user_maxlevel_5 = (int) rint(user_maxlevel_float * (double) querylength5);
-    } else {
-      user_maxlevel_5 = (int) user_maxlevel_float;
-    }
-
-    this5 = Stage1_new(querylength5);
-    queryuc_ptr_5 = Shortread_fullpointer_uc(queryseq5);
-    quality_string_5 = Shortread_quality_string(queryseq5);
-    query5_lastpos = querylength5 - index1part;
-
-    /* Limit search on repetitive sequences */
-    if (check_dinucleotides(queryuc_ptr_5,querylength5) == false) {
-      user_maxlevel_5 = 0;
-    }
-
-    query5_compress_fwd = Compress_new_fwd(queryuc_ptr_5,querylength5);
-    query5_compress_rev = Compress_new_rev(queryuc_ptr_5,querylength5);
-    gmap_history_5 = History_new();
-    make_complement_buffered(queryrc5,queryuc_ptr_5,querylength5);
-
-    hits5 = align_end(&cutoff_level_5,gmap_history_5,this5,
-		      query5_compress_fwd,query5_compress_rev,
-		      Shortread_accession(queryseq5),queryuc_ptr_5,queryrc5,querylength5,query5_lastpos,
-		      indexdb_fwd,indexdb_rev,indexdb_size_threshold,
-		      floors_array,oligoindices_major,oligoindices_minor,
-		      pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
-		      user_maxlevel_5,indel_penalty_middle,indel_penalty_end,
-		      localsplicing_penalty,distantsplicing_penalty,min_shortend,
-		      allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
-		      allvalidp5,keep_floors_p,/*genestrand*/0,/*first_read_p*/true);
+  } else {
+    stage3pairarray =
+      consolidate_paired_results(&(*npaths),&(*first_absmq),&(*second_absmq),&(*final_pairtype),
+				 &(*stage3array5),&(*nhits5),&(*first_absmq5),&(*second_absmq5),
+				 &(*stage3array3),&(*nhits3),&(*first_absmq3),&(*second_absmq3),
+				 hitpairs,samechr,conc_transloc,hits5,hits3,gmap_history_5,gmap_history_3,
+				 query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
+				 &this5->plus_segments,&this5->plus_nsegments,&this5->minus_segments,&this5->minus_nsegments,
+				 &this3->plus_segments,&this3->plus_nsegments,&this3->minus_segments,&this3->minus_nsegments,
+				 queryseq5,queryuc_ptr_5,queryrc5,quality_string_5,querylength5,query5_lastpos,
+				 queryseq3,queryuc_ptr_3,queryrc3,quality_string_3,querylength3,query3_lastpos,
+				 cutoff_level_5,cutoff_level_3,
+				 localsplicing_penalty,
+				 oligoindices_major,oligoindices_minor,
+				 pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,pairmax,user_maxlevel_5,user_maxlevel_3);
 
-    if ((*nhits5 = List_length(hits5)) == 0) {
-      *stage3array5 = (Stage3end_T *) NULL;
-    } else {
-      *stage3array5 = (Stage3end_T *) List_to_array_out(hits5,NULL); List_free(&hits5); /* Return value */
-      *stage3array5 = Stage3end_eval_and_sort(&(*nhits5),&(*first_absmq5),&(*second_absmq5),
-					      *stage3array5,maxpaths_search,queryseq5,
-					      queryuc_ptr_5,queryrc5,
-					      query5_compress_fwd,query5_compress_rev,
-					      quality_string_5,/*displayp*/true);
-    }
-    *npaths = 0;
-    *final_pairtype = UNPAIRED;
+    History_free(&gmap_history_3);
     History_free(&gmap_history_5);
     Compress_free(&query5_compress_fwd);
     Compress_free(&query5_compress_rev);
+    Compress_free(&query3_compress_fwd);
+    Compress_free(&query3_compress_rev);
     Stage1_free(&this5,querylength5);
-    return (Stage3pair_T *) NULL;
-
-  } else {
-    if (user_maxlevel_float < 0.0) {
-      user_maxlevel_5 = user_maxlevel_3 = -1;
-    } else if (user_maxlevel_float > 0.0 && user_maxlevel_float < 1.0) {
-      user_maxlevel_5 = (int) rint(user_maxlevel_float * (double) querylength5);
-      user_maxlevel_3 = (int) rint(user_maxlevel_float * (double) querylength3);
-    } else {
-      user_maxlevel_5 = user_maxlevel_3 = (int) user_maxlevel_float;
-    }
-
-    this5 = Stage1_new(querylength5);
-    this3 = Stage1_new(querylength3);
-    queryuc_ptr_5 = Shortread_fullpointer_uc(queryseq5);
-    queryuc_ptr_3 = Shortread_fullpointer_uc(queryseq3);
-    quality_string_5 = Shortread_quality_string(queryseq5);
-    quality_string_3 = Shortread_quality_string(queryseq3);
-    query5_lastpos = querylength5 - index1part;
-    query3_lastpos = querylength3 - index1part;
-
-    /* Limit search on repetitive sequences */
-    if (check_dinucleotides(queryuc_ptr_5,querylength5) == false) {
-      user_maxlevel_5 = 0;
-    }
-    if (check_dinucleotides(queryuc_ptr_3,querylength3) == false) {
-      user_maxlevel_3 = 0;
-    }
-
-    query5_compress_fwd = Compress_new_fwd(queryuc_ptr_5,querylength5);
-    query5_compress_rev = Compress_new_rev(queryuc_ptr_5,querylength5);
-    query3_compress_fwd = Compress_new_fwd(queryuc_ptr_3,querylength3);
-    query3_compress_rev = Compress_new_rev(queryuc_ptr_3,querylength3);
-    gmap_history_5 = History_new();
-    gmap_history_3 = History_new();
-    make_complement_buffered(queryrc5,queryuc_ptr_5,querylength5);
-    make_complement_buffered(queryrc3,queryuc_ptr_3,querylength3);
-
-    hitpairs = align_pair(&abort_pairing_p,&found_score,&cutoff_level_5,&cutoff_level_3,
-			  &samechr,&conc_transloc,gmap_history_5,gmap_history_3,
-			  &hits5,&hits3,this5,this3,query5_compress_fwd,query5_compress_rev,
-			  query3_compress_fwd,query3_compress_rev,
-			  queryuc_ptr_5,queryuc_ptr_3,queryrc5,queryrc3,
-			  querylength5,querylength3,query5_lastpos,query3_lastpos,
-			  indexdb_fwd,indexdb_rev,indexdb_size_threshold,floors_array,
-
-			  oligoindices_major,oligoindices_minor,
-			  pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
-
-			  user_maxlevel_5,user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
-			  localsplicing_penalty,distantsplicing_penalty,min_shortend,
-			  allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
-			  allvalidp5,allvalidp3,pairmax,maxpairedpaths,keep_floors_p,
-			  queryseq5,queryseq3,/*genestrand*/0);
-
-    if (abort_pairing_p == true) {
-      debug16(printf("abort_pairing_p is true\n"));
-      paired_results_free(this5,this3,hitpairs,samechr,conc_transloc,
-			  hits5,hits3,querylength5,querylength3);
-
-      this5 = Stage1_new(querylength5);
-      this3 = Stage1_new(querylength3);
-      realign_separately(stage3array5,&(*nhits5),&(*first_absmq5),&(*second_absmq5),
-			 stage3array3,&(*nhits3),&(*first_absmq3),&(*second_absmq3),
-			 gmap_history_5,gmap_history_3,this5,this3,
-			 query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
-			 queryseq5,queryuc_ptr_5,queryrc5,quality_string_5,querylength5,query5_lastpos,
-			 queryseq3,queryuc_ptr_3,queryrc3,quality_string_3,querylength3,query3_lastpos,
-			 indexdb_fwd,indexdb_rev,indexdb_size_threshold,floors_array,
-			 user_maxlevel_5,user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
-			 allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
-			 localsplicing_penalty,distantsplicing_penalty,min_shortend,
-			 oligoindices_major,oligoindices_minor,
-			 pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
-			 keep_floors_p,/*genestrand*/0);
-
-      *npaths = 0;
-      *final_pairtype = UNPAIRED;
-      History_free(&gmap_history_3);
-      History_free(&gmap_history_5);
-      Compress_free(&query5_compress_fwd);
-      Compress_free(&query5_compress_rev);
-      Compress_free(&query3_compress_fwd);
-      Compress_free(&query3_compress_rev);
-      Stage1_free(&this5,querylength5);
-      Stage1_free(&this3,querylength3);
-      return (Stage3pair_T *) NULL;
-
-    } else {
-      stage3pairarray =
-	consolidate_paired_results(&(*npaths),&(*first_absmq),&(*second_absmq),&(*final_pairtype),
-				   &(*stage3array5),&(*nhits5),&(*first_absmq5),&(*second_absmq5),
-				   &(*stage3array3),&(*nhits3),&(*first_absmq3),&(*second_absmq3),
-				   hitpairs,samechr,conc_transloc,hits5,hits3,gmap_history_5,gmap_history_3,
-				   query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
-				   &this5->plus_segments,&this5->plus_nsegments,&this5->minus_segments,&this5->minus_nsegments,
-				   &this3->plus_segments,&this3->plus_nsegments,&this3->minus_segments,&this3->minus_nsegments,
-				   queryseq5,queryuc_ptr_5,queryrc5,quality_string_5,querylength5,query5_lastpos,
-				   queryseq3,queryuc_ptr_3,queryrc3,quality_string_3,querylength3,query3_lastpos,
-				   cutoff_level_5,cutoff_level_3,
-				   localsplicing_penalty,
-				   oligoindices_major,oligoindices_minor,
-				   pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,pairmax,user_maxlevel_5,user_maxlevel_3);
-
-      History_free(&gmap_history_3);
-      History_free(&gmap_history_5);
-      Compress_free(&query5_compress_fwd);
-      Compress_free(&query5_compress_rev);
-      Compress_free(&query3_compress_fwd);
-      Compress_free(&query3_compress_rev);
-      Stage1_free(&this5,querylength5);
-      Stage1_free(&this3,querylength3);
-      return stage3pairarray;
-    }
+    Stage1_free(&this3,querylength3);
+    return stage3pairarray;
   }
 }
 
@@ -20647,8 +20601,6 @@ paired_read_tolerant_nonstranded (int *npaths, int *first_absmq, int *second_abs
   int found_score_geneplus, found_score_geneminus;
   int cutoff_level_5, cutoff_level_3;
   int querylength5, querylength3, query5_lastpos, query3_lastpos;
-  int noligos5, noligos3;
-  bool allvalidp5, allvalidp3;
 #if 0
   int maxpairedpaths = 10*maxpaths; /* For computation, not for printing. */
 #else
@@ -20670,478 +20622,290 @@ paired_read_tolerant_nonstranded (int *npaths, int *first_absmq, int *second_abs
   querylength5 = Shortread_fulllength(queryseq5);
   querylength3 = Shortread_fulllength(queryseq3);
 
-#ifdef HAVE_ALLOCA
-  queryrc5 = (char *) ALLOCA((querylength5+1)*sizeof(char));
-  queryrc3 = (char *) ALLOCA((querylength3+1)*sizeof(char));
-#endif
-
-  if (querylength5 < min_readlength && querylength3 < min_readlength) {
-    fprintf(stderr,"Paired-read %s has lengths %d and %d < min_readlength %d.  Skipping.\n",
-	    Shortread_accession(queryseq5),querylength5,querylength3,min_readlength);
-    /* fprintf(stderr,"You may want to build a genomic index with a smaller k-mer value using the -k flag to gmap_build\n"); */
-    *npaths = *nhits5 = *nhits3 = 0;
-    *stage3array5 = *stage3array3 = (Stage3end_T *) NULL;
-    return (Stage3pair_T *) NULL;
-
 #ifndef HAVE_ALLOCA
-  } else if (querylength5 > MAX_READLENGTH || querylength3 > MAX_READLENGTH) {
+  if (querylength5 > MAX_READLENGTH || querylength3 > MAX_READLENGTH) {
     fprintf(stderr,"Paired-read %s has lengths %d and %d > MAX_READLENGTH %d.  Either run configure and make again with a higher value of MAX_READLENGTH, or consider using GMAP instead.\n",
 	    Shortread_accession(queryseq5),querylength5,querylength3,MAX_READLENGTH);
     *npaths = *nhits5 = *nhits3 = 0;
     *stage3array5 = *stage3array3 = (Stage3end_T *) NULL;
     return (Stage3pair_T *) NULL;
+  }
+#else
+  queryrc5 = (char *) ALLOCA((querylength5+1)*sizeof(char));
+  queryrc3 = (char *) ALLOCA((querylength3+1)*sizeof(char));
 #endif
 
-  } else if (querylength5 < min_readlength) {
-    /* Solve just 3' end */
-    fprintf(stderr,"First end of paired-read %s has length %d < min_readlength %d.  Aligning second end only.\n",
-	    Shortread_accession(queryseq5),querylength5,min_readlength);
-    *nhits5 = 0;
-    *stage3array5 = (Stage3end_T *) NULL;
-
-    if (user_maxlevel_float < 0.0) {
-      user_maxlevel_3 = -1;
-    } else if (user_maxlevel_float > 0.0 && user_maxlevel_float < 1.0) {
-      user_maxlevel_3 = (int) rint(user_maxlevel_float * (double) querylength3);
-    } else {
-      user_maxlevel_3 = (int) user_maxlevel_float;
-    }
-
-    this_geneplus_3 = Stage1_new(querylength3);
-    this_geneminus_3 = Stage1_new(querylength3);
-
-    queryuc_ptr_3 = Shortread_fullpointer_uc(queryseq3);
-    quality_string_3 = Shortread_quality_string(queryseq3);
-    query3_lastpos = querylength3 - index1part;
-
-    /* Limit search on repetitive sequences */
-    if (check_dinucleotides(queryuc_ptr_3,querylength3) == false) {
-      user_maxlevel_3 = 0;
-    }
-
-    query3_compress_fwd = Compress_new_fwd(queryuc_ptr_3,querylength3);
-    query3_compress_rev = Compress_new_rev(queryuc_ptr_3,querylength3);
-    gmap_history_3 = History_new();
-    make_complement_buffered(queryrc3,queryuc_ptr_3,querylength3);
-
-    if (read_oligos(&allvalidp3,this_geneplus_3,queryuc_ptr_3,querylength3,query3_lastpos,/*genestrand*/+1,
-		    /*first_read_p*/false) == 0) {
-      hits_geneplus_3 = (List_T) NULL;
-    } else {
-      hits_geneplus_3 = align_end(&cutoff_level_3,gmap_history_3,this_geneplus_3,
-				  query3_compress_fwd,query3_compress_rev,
-				  Shortread_accession(queryseq5),queryuc_ptr_3,queryrc3,querylength3,query3_lastpos,
-				  indexdb_fwd,indexdb_fwd,indexdb_size_threshold,
-				  floors_array,oligoindices_major,oligoindices_minor,
+  if (user_maxlevel_float < 0.0) {
+    user_maxlevel_5 = user_maxlevel_3 = -1;
+  } else if (user_maxlevel_float > 0.0 && user_maxlevel_float < 1.0) {
+    user_maxlevel_5 = (int) rint(user_maxlevel_float * (double) querylength5);
+    user_maxlevel_3 = (int) rint(user_maxlevel_float * (double) querylength3);
+  } else {
+    user_maxlevel_5 = user_maxlevel_3 = (int) user_maxlevel_float;
+  }
+
+  this_geneplus_5 = Stage1_new(querylength5);
+  this_geneplus_3 = Stage1_new(querylength3);
+  this_geneminus_5 = Stage1_new(querylength5);
+  this_geneminus_3 = Stage1_new(querylength3);
+
+  queryuc_ptr_5 = Shortread_fullpointer_uc(queryseq5);
+  queryuc_ptr_3 = Shortread_fullpointer_uc(queryseq3);
+  quality_string_5 = Shortread_quality_string(queryseq5);
+  quality_string_3 = Shortread_quality_string(queryseq3);
+  query5_lastpos = querylength5 - index1part;
+  query3_lastpos = querylength3 - index1part;
+
+  /* Limit search on repetitive sequences */
+  if (check_dinucleotides(queryuc_ptr_5,querylength5) == false) {
+    user_maxlevel_5 = 0;
+  }
+  if (check_dinucleotides(queryuc_ptr_3,querylength3) == false) {
+    user_maxlevel_3 = 0;
+  }
+
+  query5_compress_fwd = Compress_new_fwd(queryuc_ptr_5,querylength5);
+  query5_compress_rev = Compress_new_rev(queryuc_ptr_5,querylength5);
+  query3_compress_fwd = Compress_new_fwd(queryuc_ptr_3,querylength3);
+  query3_compress_rev = Compress_new_rev(queryuc_ptr_3,querylength3);
+  gmap_history_5 = History_new();
+  gmap_history_3 = History_new();
+  make_complement_buffered(queryrc5,queryuc_ptr_5,querylength5);
+  make_complement_buffered(queryrc3,queryuc_ptr_3,querylength3);
+
+  abort_pairing_p_geneplus = false;
+  hitpairs_geneplus = align_pair(&abort_pairing_p_geneplus,&found_score_geneplus,
+				 &cutoff_level_5,&cutoff_level_3,
+				 &samechr_geneplus,&conc_transloc_geneplus,
+				 gmap_history_5,gmap_history_3,
+				 &hits_geneplus_5,&hits_geneplus_3,this_geneplus_5,this_geneplus_3,
+				 query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
+				 queryuc_ptr_5,queryuc_ptr_3,queryrc5,queryrc3,
+				 querylength5,querylength3,query5_lastpos,query3_lastpos,
+				 indexdb_fwd,indexdb_rev,indexdb_size_threshold,floors_array,
+				 
+				 oligoindices_major,oligoindices_minor,
+				 pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
+				 
+				 user_maxlevel_5,user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
+				 localsplicing_penalty,distantsplicing_penalty,min_shortend,
+				 allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
+				 pairmax,maxpairedpaths,keep_floors_p,
+				 queryseq5,queryseq3,/*genestrand*/+1);
+
+  abort_pairing_p_geneminus = false;
+  hitpairs_geneminus = align_pair(&abort_pairing_p_geneminus,&found_score_geneminus,
+				  &cutoff_level_5,&cutoff_level_3,
+				  &samechr_geneminus,&conc_transloc_geneminus,
+				  gmap_history_5,gmap_history_3,
+				  &hits_geneminus_5,&hits_geneminus_3,this_geneminus_5,this_geneminus_3,
+				  query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
+				  queryuc_ptr_5,queryuc_ptr_3,queryrc5,queryrc3,
+				  querylength5,querylength3,query5_lastpos,query3_lastpos,
+				  indexdb_fwd,indexdb_rev,indexdb_size_threshold,floors_array,
+				  
+				  oligoindices_major,oligoindices_minor,
 				  pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
-				  user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
+				  
+				  user_maxlevel_5,user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
 				  localsplicing_penalty,distantsplicing_penalty,min_shortend,
 				  allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
-				  allvalidp3,keep_floors_p,/*genestrand*/+1,/*first_read_p*/false);
-    }
+				  pairmax,maxpairedpaths,keep_floors_p,queryseq5,queryseq3,/*genestrand*/+2);
 
-    if (read_oligos(&allvalidp3,this_geneminus_3,queryuc_ptr_3,querylength3,query3_lastpos,/*genestrand*/+2,
-		    /*first_read_p*/false) == 0) {
-      hits_geneminus_3 = (List_T) NULL;
-    } else {
-      hits_geneminus_3 = align_end(&cutoff_level_3,gmap_history_3,this_geneminus_3,
-				   query3_compress_fwd,query3_compress_rev,
-				   Shortread_accession(queryseq5),queryuc_ptr_3,queryrc3,querylength3,query3_lastpos,
-				   indexdb_fwd,indexdb_rev,indexdb_size_threshold,
-				   floors_array,oligoindices_major,oligoindices_minor,
-				   pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
-				   user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
-				   localsplicing_penalty,distantsplicing_penalty,min_shortend,
-				   allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
-				   allvalidp3,keep_floors_p,/*genestrand*/+2,/*first_read_p*/false);
-    }
+  if (found_score_geneplus < found_score_geneminus) {
+    paired_results_free(this_geneminus_5,this_geneminus_3,hitpairs_geneminus,samechr_geneminus,conc_transloc_geneminus,
+			hits_geneminus_5,hits_geneminus_3,querylength5,querylength3);
 
-    hits3 = List_append(hits_geneplus_3,hits_geneminus_3);
-    if ((*nhits3 = List_length(hits3)) == 0) {
-      *stage3array3 = (Stage3end_T *) NULL;
-    } else {
-      *stage3array3 = (Stage3end_T *) List_to_array_out(hits3,NULL); List_free(&hits3); /* Return value */
-      *stage3array3 = Stage3end_eval_and_sort(&(*nhits3),&(*first_absmq3),&(*second_absmq3),
-					      *stage3array3,maxpaths_search,queryseq3,
-					      queryuc_ptr_3,queryrc3,
-					      query3_compress_fwd,query3_compress_rev,
-					      quality_string_3,/*displayp*/true);
-    }
+    if (abort_pairing_p_geneplus == true) {
+    debug16(printf("abort_pairing_p_geneplus is true\n"));
+    paired_results_free(this_geneplus_5,this_geneplus_3,hitpairs_geneplus,samechr_geneplus,conc_transloc_geneplus,
+			hits_geneplus_5,hits_geneplus_3,querylength5,querylength3);
+
+    this_geneplus_5 = Stage1_new(querylength5);
+    this_geneplus_3 = Stage1_new(querylength3);
+    realign_separately(stage3array5,&(*nhits5),&(*first_absmq5),&(*second_absmq5),
+		       stage3array3,&(*nhits3),&(*first_absmq3),&(*second_absmq3),
+		       gmap_history_5,gmap_history_3,this_geneplus_5,this_geneplus_3,
+		       query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
+		       queryseq5,queryuc_ptr_5,queryrc5,quality_string_5,querylength5,query5_lastpos,
+		       queryseq3,queryuc_ptr_3,queryrc3,quality_string_3,querylength3,query3_lastpos,
+		       indexdb_fwd,indexdb_rev,indexdb_size_threshold,floors_array,
+		       user_maxlevel_5,user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
+		       allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
+		       localsplicing_penalty,distantsplicing_penalty,min_shortend,
+		       oligoindices_major,oligoindices_minor,
+		       pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
+		       keep_floors_p,/*genestrand*/+1);
 
     *npaths = 0;
     *final_pairtype = UNPAIRED;
     History_free(&gmap_history_3);
+    History_free(&gmap_history_5);
+    Compress_free(&query5_compress_fwd);
+    Compress_free(&query5_compress_rev);
     Compress_free(&query3_compress_fwd);
     Compress_free(&query3_compress_rev);
-    Stage1_free(&this_geneminus_3,querylength3);
+    Stage1_free(&this_geneplus_5,querylength5);
     Stage1_free(&this_geneplus_3,querylength3);
     return (Stage3pair_T *) NULL;
 
-  } else if (querylength3 < min_readlength) {
-    /* Solve just 5' end */
-    fprintf(stderr,"Second end of paired-read %s has length %d < min_readlength %d.  Aligning first end only.\n",
-	    Shortread_accession(queryseq5),querylength3,min_readlength);
-    *nhits3 = 0;
-    *stage3array3 = (Stage3end_T *) NULL;
-
-    if (user_maxlevel_float < 0.0) {
-      user_maxlevel_5 = -1;
-    } else if (user_maxlevel_float > 0.0 && user_maxlevel_float < 1.0) {
-      user_maxlevel_5 = (int) rint(user_maxlevel_float * (double) querylength5);
-    } else {
-      user_maxlevel_5 = (int) user_maxlevel_float;
-    }
-
-    this_geneplus_5 = Stage1_new(querylength5);
-    this_geneminus_5 = Stage1_new(querylength5);
-
-    queryuc_ptr_5 = Shortread_fullpointer_uc(queryseq5);
-    quality_string_5 = Shortread_quality_string(queryseq5);
-    query5_lastpos = querylength5 - index1part;
-
-    /* Limit search on repetitive sequences */
-    if (check_dinucleotides(queryuc_ptr_5,querylength5) == false) {
-      user_maxlevel_5 = 0;
-    }
-
-    query5_compress_fwd = Compress_new_fwd(queryuc_ptr_5,querylength5);
-    query5_compress_rev = Compress_new_rev(queryuc_ptr_5,querylength5);
-    gmap_history_5 = History_new();
-    make_complement_buffered(queryrc5,queryuc_ptr_5,querylength5);
+  } else {
+    plus_segments_genestrand_5[+1] = this_geneplus_5->plus_segments;
+    plus_nsegments_genestrand_5[+1] = this_geneplus_5->plus_nsegments;
+    minus_segments_genestrand_5[+1] = this_geneplus_5->minus_segments;
+    minus_nsegments_genestrand_5[+1] = this_geneplus_5->minus_nsegments;
+
+    plus_segments_genestrand_3[+1] = this_geneplus_3->plus_segments;
+    plus_nsegments_genestrand_3[+1] = this_geneplus_3->plus_nsegments;
+    minus_segments_genestrand_3[+1] = this_geneplus_3->minus_segments;
+    minus_nsegments_genestrand_3[+1] = this_geneplus_3->minus_nsegments;
+
+    stage3pairarray =
+      consolidate_paired_results(&(*npaths),&(*first_absmq),&(*second_absmq),&(*final_pairtype),
+				 &(*stage3array5),&(*nhits5),&(*first_absmq5),&(*second_absmq5),
+				 &(*stage3array3),&(*nhits3),&(*first_absmq3),&(*second_absmq3),
+				 hitpairs_geneplus,samechr_geneplus,conc_transloc_geneplus,
+				 hits_geneplus_5,hits_geneplus_3,gmap_history_5,gmap_history_3,
+				 query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
+				 plus_segments_genestrand_5,plus_nsegments_genestrand_5,minus_segments_genestrand_5,minus_nsegments_genestrand_5,
+				 plus_segments_genestrand_3,plus_nsegments_genestrand_3,minus_segments_genestrand_3,minus_nsegments_genestrand_3,
+				 queryseq5,queryuc_ptr_5,queryrc5,quality_string_5,querylength5,query5_lastpos,
+				 queryseq3,queryuc_ptr_3,queryrc3,quality_string_3,querylength3,query3_lastpos,
+				 cutoff_level_5,cutoff_level_3,localsplicing_penalty,oligoindices_major,oligoindices_minor,
+				 pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,pairmax,user_maxlevel_5,user_maxlevel_3);
+    History_free(&gmap_history_3);
+    History_free(&gmap_history_5);
+    Compress_free(&query5_compress_fwd);
+    Compress_free(&query5_compress_rev);
+    Compress_free(&query3_compress_fwd);
+    Compress_free(&query3_compress_rev);
+    Stage1_free(&this_geneplus_5,querylength5);
+    Stage1_free(&this_geneplus_3,querylength3);
+    return stage3pairarray;
+  }
 
-    if (read_oligos(&allvalidp5,this_geneplus_5,queryuc_ptr_5,querylength5,query5_lastpos,/*genestrand*/+1,
-		    /*first_read_p*/true) == 0) {
-      hits_geneplus_5 = (List_T) NULL;
-    } else {
-      hits_geneplus_5 = align_end(&cutoff_level_5,gmap_history_5,this_geneplus_5,
-				  query5_compress_fwd,query5_compress_rev,
-				  Shortread_accession(queryseq5),queryuc_ptr_5,queryrc5,querylength5,query5_lastpos,
-				  indexdb_fwd,indexdb_rev,indexdb_size_threshold,
-				  floors_array,oligoindices_major,oligoindices_minor,
-				  pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
-				  user_maxlevel_5,indel_penalty_middle,indel_penalty_end,
-				  localsplicing_penalty,distantsplicing_penalty,min_shortend,
-				  allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
-				  allvalidp5,keep_floors_p,/*genestrand*/+1,/*first_read_p*/true);
-    }
+  } else if (found_score_geneminus < found_score_geneplus) {
+    paired_results_free(this_geneplus_5,this_geneplus_3,hitpairs_geneplus,samechr_geneplus,conc_transloc_geneplus,
+			hits_geneplus_5,hits_geneplus_3,querylength5,querylength3);
 
-    if (read_oligos(&allvalidp5,this_geneminus_5,queryuc_ptr_5,querylength5,query5_lastpos,/*genestrand*/+2,
-		    /*first_read_p*/true) == 0) {
-      hits_geneminus_5 = (List_T) NULL;
-    } else {
-      hits_geneminus_5 = align_end(&cutoff_level_5,gmap_history_5,this_geneminus_5,
-				   query5_compress_fwd,query5_compress_rev,
-				   Shortread_accession(queryseq5),queryuc_ptr_5,queryrc5,querylength5,query5_lastpos,
-				   indexdb_fwd,indexdb_rev,indexdb_size_threshold,
-				   floors_array,oligoindices_major,oligoindices_minor,
-				   pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
-				   user_maxlevel_5,indel_penalty_middle,indel_penalty_end,
-				   localsplicing_penalty,distantsplicing_penalty,min_shortend,
-				   allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
-				   allvalidp5,keep_floors_p,/*genestrand*/+2,/*first_read_p*/true);
-    }
+    if (abort_pairing_p_geneminus == true) {
+    debug16(printf("abort_pairing_p_geneminus is true\n"));
+    paired_results_free(this_geneminus_5,this_geneminus_3,hitpairs_geneminus,samechr_geneminus,conc_transloc_geneminus,
+			hits_geneminus_5,hits_geneminus_3,querylength5,querylength3);
 
-    hits5 = List_append(hits_geneplus_5,hits_geneminus_5);
-    if ((*nhits5 = List_length(hits5)) == 0) {
-      *stage3array5 = (Stage3end_T *) NULL;
-    } else {
-      *stage3array5 = (Stage3end_T *) List_to_array_out(hits5,NULL); List_free(&hits5); /* Return value */
-      *stage3array5 = Stage3end_eval_and_sort(&(*nhits5),&(*first_absmq5),&(*second_absmq5),
-					      *stage3array5,maxpaths_search,queryseq5,
-					      queryuc_ptr_5,queryrc5,
-					      query5_compress_fwd,query5_compress_rev,
-					      quality_string_5,/*displayp*/true);
-    }
+    this_geneminus_5 = Stage1_new(querylength5);
+    this_geneminus_3 = Stage1_new(querylength3);
+    realign_separately(stage3array5,&(*nhits5),&(*first_absmq5),&(*second_absmq5),
+		       stage3array3,&(*nhits3),&(*first_absmq3),&(*second_absmq3),
+		       gmap_history_5,gmap_history_3,this_geneminus_5,this_geneminus_3,
+		       query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
+		       queryseq5,queryuc_ptr_5,queryrc5,quality_string_5,querylength5,query5_lastpos,
+		       queryseq3,queryuc_ptr_3,queryrc3,quality_string_3,querylength3,query3_lastpos,
+		       indexdb_fwd,indexdb_rev,indexdb_size_threshold,floors_array,
+		       user_maxlevel_5,user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
+		       allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
+		       localsplicing_penalty,distantsplicing_penalty,min_shortend,
+		       oligoindices_major,oligoindices_minor,
+		       pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
+		       keep_floors_p,/*genestrand*/+2);
 
     *npaths = 0;
     *final_pairtype = UNPAIRED;
+    History_free(&gmap_history_3);
     History_free(&gmap_history_5);
     Compress_free(&query5_compress_fwd);
     Compress_free(&query5_compress_rev);
+    Compress_free(&query3_compress_fwd);
+    Compress_free(&query3_compress_rev);
     Stage1_free(&this_geneminus_5,querylength5);
-    Stage1_free(&this_geneplus_5,querylength5);
+    Stage1_free(&this_geneminus_3,querylength3);
     return (Stage3pair_T *) NULL;
 
   } else {
-    if (user_maxlevel_float < 0.0) {
-      user_maxlevel_5 = user_maxlevel_3 = -1;
-    } else if (user_maxlevel_float > 0.0 && user_maxlevel_float < 1.0) {
-      user_maxlevel_5 = (int) rint(user_maxlevel_float * (double) querylength5);
-      user_maxlevel_3 = (int) rint(user_maxlevel_float * (double) querylength3);
-    } else {
-      user_maxlevel_5 = user_maxlevel_3 = (int) user_maxlevel_float;
-    }
-
-    this_geneplus_5 = Stage1_new(querylength5);
-    this_geneplus_3 = Stage1_new(querylength3);
-    this_geneminus_5 = Stage1_new(querylength5);
-    this_geneminus_3 = Stage1_new(querylength3);
-
-    queryuc_ptr_5 = Shortread_fullpointer_uc(queryseq5);
-    queryuc_ptr_3 = Shortread_fullpointer_uc(queryseq3);
-    quality_string_5 = Shortread_quality_string(queryseq5);
-    quality_string_3 = Shortread_quality_string(queryseq3);
-    query5_lastpos = querylength5 - index1part;
-    query3_lastpos = querylength3 - index1part;
-
-    /* Limit search on repetitive sequences */
-    if (check_dinucleotides(queryuc_ptr_5,querylength5) == false) {
-      user_maxlevel_5 = 0;
-    }
-    if (check_dinucleotides(queryuc_ptr_3,querylength3) == false) {
-      user_maxlevel_3 = 0;
-    }
-
-    query5_compress_fwd = Compress_new_fwd(queryuc_ptr_5,querylength5);
-    query5_compress_rev = Compress_new_rev(queryuc_ptr_5,querylength5);
-    query3_compress_fwd = Compress_new_fwd(queryuc_ptr_3,querylength3);
-    query3_compress_rev = Compress_new_rev(queryuc_ptr_3,querylength3);
-    gmap_history_5 = History_new();
-    gmap_history_3 = History_new();
-    make_complement_buffered(queryrc5,queryuc_ptr_5,querylength5);
-    make_complement_buffered(queryrc3,queryuc_ptr_3,querylength3);
-
-    abort_pairing_p_geneplus = false;
-    hitpairs_geneplus = align_pair(&abort_pairing_p_geneplus,&found_score_geneplus,
-				   &cutoff_level_5,&cutoff_level_3,
-				   &samechr_geneplus,&conc_transloc_geneplus,
-				   gmap_history_5,gmap_history_3,
-				   &hits_geneplus_5,&hits_geneplus_3,this_geneplus_5,this_geneplus_3,
-				   query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
-				   queryuc_ptr_5,queryuc_ptr_3,queryrc5,queryrc3,
-				   querylength5,querylength3,query5_lastpos,query3_lastpos,
-				   indexdb_fwd,indexdb_rev,indexdb_size_threshold,floors_array,
-
-				   oligoindices_major,oligoindices_minor,
-				   pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
-
-				   user_maxlevel_5,user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
-				   localsplicing_penalty,distantsplicing_penalty,min_shortend,
-				   allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
-				   allvalidp5,allvalidp3,pairmax,maxpairedpaths,keep_floors_p,
-				   queryseq5,queryseq3,/*genestrand*/+1);
-
-    abort_pairing_p_geneminus = false;
-    hitpairs_geneminus = align_pair(&abort_pairing_p_geneminus,&found_score_geneminus,
-				    &cutoff_level_5,&cutoff_level_3,
-				    &samechr_geneminus,&conc_transloc_geneminus,
-				    gmap_history_5,gmap_history_3,
-				    &hits_geneminus_5,&hits_geneminus_3,this_geneminus_5,this_geneminus_3,
-				    query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
-				    queryuc_ptr_5,queryuc_ptr_3,queryrc5,queryrc3,
-				    querylength5,querylength3,query5_lastpos,query3_lastpos,
-				    indexdb_fwd,indexdb_rev,indexdb_size_threshold,floors_array,
-
-				    oligoindices_major,oligoindices_minor,
-				    pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
-
-				    user_maxlevel_5,user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
-				    localsplicing_penalty,distantsplicing_penalty,min_shortend,
-				    allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
-				    allvalidp5,allvalidp3,pairmax,maxpairedpaths,keep_floors_p,
-				    queryseq5,queryseq3,/*genestrand*/+2);
-
-    if (found_score_geneplus < found_score_geneminus) {
-      paired_results_free(this_geneminus_5,this_geneminus_3,hitpairs_geneminus,samechr_geneminus,conc_transloc_geneminus,
-			  hits_geneminus_5,hits_geneminus_3,querylength5,querylength3);
-
-      if (abort_pairing_p_geneplus == true) {
-	debug16(printf("abort_pairing_p_geneplus is true\n"));
-	paired_results_free(this_geneplus_5,this_geneplus_3,hitpairs_geneplus,samechr_geneplus,conc_transloc_geneplus,
-			    hits_geneplus_5,hits_geneplus_3,querylength5,querylength3);
-
-	this_geneplus_5 = Stage1_new(querylength5);
-	this_geneplus_3 = Stage1_new(querylength3);
-	realign_separately(stage3array5,&(*nhits5),&(*first_absmq5),&(*second_absmq5),
-			   stage3array3,&(*nhits3),&(*first_absmq3),&(*second_absmq3),
-			   gmap_history_5,gmap_history_3,this_geneplus_5,this_geneplus_3,
-			   query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
-			   queryseq5,queryuc_ptr_5,queryrc5,quality_string_5,querylength5,query5_lastpos,
-			   queryseq3,queryuc_ptr_3,queryrc3,quality_string_3,querylength3,query3_lastpos,
-			   indexdb_fwd,indexdb_rev,indexdb_size_threshold,floors_array,
-			   user_maxlevel_5,user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
-			   allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
-			   localsplicing_penalty,distantsplicing_penalty,min_shortend,
-			   oligoindices_major,oligoindices_minor,
-			   pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
-			   keep_floors_p,/*genestrand*/+1);
-
-	*npaths = 0;
-	*final_pairtype = UNPAIRED;
-	History_free(&gmap_history_3);
-	History_free(&gmap_history_5);
-	Compress_free(&query5_compress_fwd);
-	Compress_free(&query5_compress_rev);
-	Compress_free(&query3_compress_fwd);
-	Compress_free(&query3_compress_rev);
-	Stage1_free(&this_geneplus_5,querylength5);
-	Stage1_free(&this_geneplus_3,querylength3);
-	return (Stage3pair_T *) NULL;
-
-      } else {
-	plus_segments_genestrand_5[+1] = this_geneplus_5->plus_segments;
-	plus_nsegments_genestrand_5[+1] = this_geneplus_5->plus_nsegments;
-	minus_segments_genestrand_5[+1] = this_geneplus_5->minus_segments;
-	minus_nsegments_genestrand_5[+1] = this_geneplus_5->minus_nsegments;
-
-	plus_segments_genestrand_3[+1] = this_geneplus_3->plus_segments;
-	plus_nsegments_genestrand_3[+1] = this_geneplus_3->plus_nsegments;
-	minus_segments_genestrand_3[+1] = this_geneplus_3->minus_segments;
-	minus_nsegments_genestrand_3[+1] = this_geneplus_3->minus_nsegments;
-
-	stage3pairarray =
-	  consolidate_paired_results(&(*npaths),&(*first_absmq),&(*second_absmq),&(*final_pairtype),
-				     &(*stage3array5),&(*nhits5),&(*first_absmq5),&(*second_absmq5),
-				     &(*stage3array3),&(*nhits3),&(*first_absmq3),&(*second_absmq3),
-				     hitpairs_geneplus,samechr_geneplus,conc_transloc_geneplus,
-				     hits_geneplus_5,hits_geneplus_3,gmap_history_5,gmap_history_3,
-				     query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
-				     plus_segments_genestrand_5,plus_nsegments_genestrand_5,minus_segments_genestrand_5,minus_nsegments_genestrand_5,
-				     plus_segments_genestrand_3,plus_nsegments_genestrand_3,minus_segments_genestrand_3,minus_nsegments_genestrand_3,
-				     queryseq5,queryuc_ptr_5,queryrc5,quality_string_5,querylength5,query5_lastpos,
-				     queryseq3,queryuc_ptr_3,queryrc3,quality_string_3,querylength3,query3_lastpos,
-				     cutoff_level_5,cutoff_level_3,
-				     localsplicing_penalty,
-				     oligoindices_major,oligoindices_minor,
-				     pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,pairmax,user_maxlevel_5,user_maxlevel_3);
-	History_free(&gmap_history_3);
-	History_free(&gmap_history_5);
-	Compress_free(&query5_compress_fwd);
-	Compress_free(&query5_compress_rev);
-	Compress_free(&query3_compress_fwd);
-	Compress_free(&query3_compress_rev);
-	Stage1_free(&this_geneplus_5,querylength5);
-	Stage1_free(&this_geneplus_3,querylength3);
-	return stage3pairarray;
-      }
-
-    } else if (found_score_geneminus < found_score_geneplus) {
-      paired_results_free(this_geneplus_5,this_geneplus_3,hitpairs_geneplus,samechr_geneplus,conc_transloc_geneplus,
-			  hits_geneplus_5,hits_geneplus_3,querylength5,querylength3);
-
-      if (abort_pairing_p_geneminus == true) {
-	debug16(printf("abort_pairing_p_geneminus is true\n"));
-	paired_results_free(this_geneminus_5,this_geneminus_3,hitpairs_geneminus,samechr_geneminus,conc_transloc_geneminus,
-			    hits_geneminus_5,hits_geneminus_3,querylength5,querylength3);
-
-	this_geneminus_5 = Stage1_new(querylength5);
-	this_geneminus_3 = Stage1_new(querylength3);
-	realign_separately(stage3array5,&(*nhits5),&(*first_absmq5),&(*second_absmq5),
-			   stage3array3,&(*nhits3),&(*first_absmq3),&(*second_absmq3),
-			   gmap_history_5,gmap_history_3,this_geneminus_5,this_geneminus_3,
-			   query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
-			   queryseq5,queryuc_ptr_5,queryrc5,quality_string_5,querylength5,query5_lastpos,
-			   queryseq3,queryuc_ptr_3,queryrc3,quality_string_3,querylength3,query3_lastpos,
-			   indexdb_fwd,indexdb_rev,indexdb_size_threshold,floors_array,
-			   user_maxlevel_5,user_maxlevel_3,indel_penalty_middle,indel_penalty_end,
-			   allow_end_indels_p,max_end_insertions,max_end_deletions,min_indel_end_matches,
-			   localsplicing_penalty,distantsplicing_penalty,min_shortend,
-			   oligoindices_major,oligoindices_minor,
-			   pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
-			   keep_floors_p,/*genestrand*/+2);
-
-	*npaths = 0;
-	*final_pairtype = UNPAIRED;
-	History_free(&gmap_history_3);
-	History_free(&gmap_history_5);
-	Compress_free(&query5_compress_fwd);
-	Compress_free(&query5_compress_rev);
-	Compress_free(&query3_compress_fwd);
-	Compress_free(&query3_compress_rev);
-	Stage1_free(&this_geneminus_5,querylength5);
-	Stage1_free(&this_geneminus_3,querylength3);
-	return (Stage3pair_T *) NULL;
+    plus_segments_genestrand_5[+2] = this_geneminus_5->plus_segments;
+    plus_nsegments_genestrand_5[+2] = this_geneminus_5->plus_nsegments;
+    minus_segments_genestrand_5[+2] = this_geneminus_5->minus_segments;
+    minus_nsegments_genestrand_5[+2] = this_geneminus_5->minus_nsegments;
+
+    plus_segments_genestrand_3[+2] = this_geneminus_3->plus_segments;
+    plus_nsegments_genestrand_3[+2] = this_geneminus_3->plus_nsegments;
+    minus_segments_genestrand_3[+2] = this_geneminus_3->minus_segments;
+    minus_nsegments_genestrand_3[+2] = this_geneminus_3->minus_nsegments;
+
+    stage3pairarray =
+      consolidate_paired_results(&(*npaths),&(*first_absmq),&(*second_absmq),&(*final_pairtype),
+				 &(*stage3array5),&(*nhits5),&(*first_absmq5),&(*second_absmq5),
+				 &(*stage3array3),&(*nhits3),&(*first_absmq3),&(*second_absmq3),
+				 hitpairs_geneminus,samechr_geneminus,conc_transloc_geneminus,
+				 hits_geneminus_5,hits_geneminus_3,gmap_history_5,gmap_history_3,
+				 query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
+				 plus_segments_genestrand_5,plus_nsegments_genestrand_5,minus_segments_genestrand_5,minus_nsegments_genestrand_5,
+				 plus_segments_genestrand_3,plus_nsegments_genestrand_3,minus_segments_genestrand_3,minus_nsegments_genestrand_3,
+				 queryseq5,queryuc_ptr_5,queryrc5,quality_string_5,querylength5,query5_lastpos,
+				 queryseq3,queryuc_ptr_3,queryrc3,quality_string_3,querylength3,query3_lastpos,
+				 cutoff_level_5,cutoff_level_3,localsplicing_penalty,oligoindices_major,oligoindices_minor,
+				 pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,pairmax,user_maxlevel_5,user_maxlevel_3);
+    History_free(&gmap_history_3);
+    History_free(&gmap_history_5);
+    Compress_free(&query5_compress_fwd);
+    Compress_free(&query5_compress_rev);
+    Compress_free(&query3_compress_fwd);
+    Compress_free(&query3_compress_rev);
+    Stage1_free(&this_geneminus_5,querylength5);
+    Stage1_free(&this_geneminus_3,querylength3);
+    return stage3pairarray;
+  }
 
-      } else {
-	plus_segments_genestrand_5[+2] = this_geneminus_5->plus_segments;
-	plus_nsegments_genestrand_5[+2] = this_geneminus_5->plus_nsegments;
-	minus_segments_genestrand_5[+2] = this_geneminus_5->minus_segments;
-	minus_nsegments_genestrand_5[+2] = this_geneminus_5->minus_nsegments;
-
-	plus_segments_genestrand_3[+2] = this_geneminus_3->plus_segments;
-	plus_nsegments_genestrand_3[+2] = this_geneminus_3->plus_nsegments;
-	minus_segments_genestrand_3[+2] = this_geneminus_3->minus_segments;
-	minus_nsegments_genestrand_3[+2] = this_geneminus_3->minus_nsegments;
-
-	stage3pairarray =
-	  consolidate_paired_results(&(*npaths),&(*first_absmq),&(*second_absmq),&(*final_pairtype),
-				     &(*stage3array5),&(*nhits5),&(*first_absmq5),&(*second_absmq5),
-				     &(*stage3array3),&(*nhits3),&(*first_absmq3),&(*second_absmq3),
-				     hitpairs_geneminus,samechr_geneminus,conc_transloc_geneminus,
-				     hits_geneminus_5,hits_geneminus_3,gmap_history_5,gmap_history_3,
-				     query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
-				     plus_segments_genestrand_5,plus_nsegments_genestrand_5,minus_segments_genestrand_5,minus_nsegments_genestrand_5,
-				     plus_segments_genestrand_3,plus_nsegments_genestrand_3,minus_segments_genestrand_3,minus_nsegments_genestrand_3,
-				     queryseq5,queryuc_ptr_5,queryrc5,quality_string_5,querylength5,query5_lastpos,
-				     queryseq3,queryuc_ptr_3,queryrc3,quality_string_3,querylength3,query3_lastpos,
-				     cutoff_level_5,cutoff_level_3,
-				     localsplicing_penalty,
-				     oligoindices_major,oligoindices_minor,
-				     pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,pairmax,user_maxlevel_5,user_maxlevel_3);
-	History_free(&gmap_history_3);
-	History_free(&gmap_history_5);
-	Compress_free(&query5_compress_fwd);
-	Compress_free(&query5_compress_rev);
-	Compress_free(&query3_compress_fwd);
-	Compress_free(&query3_compress_rev);
-	Stage1_free(&this_geneminus_5,querylength5);
-	Stage1_free(&this_geneminus_3,querylength3);
-	return stage3pairarray;
-      }
+  } else {
+    hitpairs = List_append(hitpairs_geneplus,hitpairs_geneminus);
+    samechr = List_append(samechr_geneplus,samechr_geneminus);
+    conc_transloc = List_append(conc_transloc_geneplus,conc_transloc_geneminus);
+    hits5 = List_append(hits_geneplus_5,hits_geneminus_5);
+    hits3 = List_append(hits_geneplus_3,hits_geneminus_3);
 
-    } else {
-      hitpairs = List_append(hitpairs_geneplus,hitpairs_geneminus);
-      samechr = List_append(samechr_geneplus,samechr_geneminus);
-      conc_transloc = List_append(conc_transloc_geneplus,conc_transloc_geneminus);
-      hits5 = List_append(hits_geneplus_5,hits_geneminus_5);
-      hits3 = List_append(hits_geneplus_3,hits_geneminus_3);
-
-      plus_segments_genestrand_5[+1] = this_geneplus_5->plus_segments;
-      plus_nsegments_genestrand_5[+1] = this_geneplus_5->plus_nsegments;
-      minus_segments_genestrand_5[+1] = this_geneplus_5->minus_segments;
-      minus_nsegments_genestrand_5[+1] = this_geneplus_5->minus_nsegments;
-
-      plus_segments_genestrand_3[+1] = this_geneplus_3->plus_segments;
-      plus_nsegments_genestrand_3[+1] = this_geneplus_3->plus_nsegments;
-      minus_segments_genestrand_3[+1] = this_geneplus_3->minus_segments;
-      minus_nsegments_genestrand_3[+1] = this_geneplus_3->minus_nsegments;
-
-      plus_segments_genestrand_5[+2] = this_geneminus_5->plus_segments;
-      plus_nsegments_genestrand_5[+2] = this_geneminus_5->plus_nsegments;
-      minus_segments_genestrand_5[+2] = this_geneminus_5->minus_segments;
-      minus_nsegments_genestrand_5[+2] = this_geneminus_5->minus_nsegments;
-
-      plus_segments_genestrand_3[+2] = this_geneminus_3->plus_segments;
-      plus_nsegments_genestrand_3[+2] = this_geneminus_3->plus_nsegments;
-      minus_segments_genestrand_3[+2] = this_geneminus_3->minus_segments;
-      minus_nsegments_genestrand_3[+2] = this_geneminus_3->minus_nsegments;
-
-      stage3pairarray =
-	consolidate_paired_results(&(*npaths),&(*first_absmq),&(*second_absmq),&(*final_pairtype),
-				   &(*stage3array5),&(*nhits5),&(*first_absmq5),&(*second_absmq5),
-				   &(*stage3array3),&(*nhits3),&(*first_absmq3),&(*second_absmq3),
-				   hitpairs,samechr,conc_transloc,hits5,hits3,gmap_history_5,gmap_history_3,
-				   query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
-				   plus_segments_genestrand_5,plus_nsegments_genestrand_5,minus_segments_genestrand_5,minus_nsegments_genestrand_5,
-				   plus_segments_genestrand_3,plus_nsegments_genestrand_3,minus_segments_genestrand_3,minus_nsegments_genestrand_3,
-				   queryseq5,queryuc_ptr_5,queryrc5,quality_string_5,querylength5,query5_lastpos,
-				   queryseq3,queryuc_ptr_3,queryrc3,quality_string_3,querylength3,query3_lastpos,
-				   cutoff_level_5,cutoff_level_3,
-				   localsplicing_penalty,
-				   oligoindices_major,oligoindices_minor,
-				   pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,pairmax,user_maxlevel_5,user_maxlevel_3);
-      History_free(&gmap_history_3);
-      History_free(&gmap_history_5);
-      Compress_free(&query5_compress_fwd);
-      Compress_free(&query5_compress_rev);
-      Compress_free(&query3_compress_fwd);
-      Compress_free(&query3_compress_rev);
-      Stage1_free(&this_geneminus_5,querylength5);
-      Stage1_free(&this_geneminus_3,querylength3);
-      Stage1_free(&this_geneplus_5,querylength5);
-      Stage1_free(&this_geneplus_3,querylength3);
-      return stage3pairarray;
-    }
+    plus_segments_genestrand_5[+1] = this_geneplus_5->plus_segments;
+    plus_nsegments_genestrand_5[+1] = this_geneplus_5->plus_nsegments;
+    minus_segments_genestrand_5[+1] = this_geneplus_5->minus_segments;
+    minus_nsegments_genestrand_5[+1] = this_geneplus_5->minus_nsegments;
+
+    plus_segments_genestrand_3[+1] = this_geneplus_3->plus_segments;
+    plus_nsegments_genestrand_3[+1] = this_geneplus_3->plus_nsegments;
+    minus_segments_genestrand_3[+1] = this_geneplus_3->minus_segments;
+    minus_nsegments_genestrand_3[+1] = this_geneplus_3->minus_nsegments;
+
+    plus_segments_genestrand_5[+2] = this_geneminus_5->plus_segments;
+    plus_nsegments_genestrand_5[+2] = this_geneminus_5->plus_nsegments;
+    minus_segments_genestrand_5[+2] = this_geneminus_5->minus_segments;
+    minus_nsegments_genestrand_5[+2] = this_geneminus_5->minus_nsegments;
+
+    plus_segments_genestrand_3[+2] = this_geneminus_3->plus_segments;
+    plus_nsegments_genestrand_3[+2] = this_geneminus_3->plus_nsegments;
+    minus_segments_genestrand_3[+2] = this_geneminus_3->minus_segments;
+    minus_nsegments_genestrand_3[+2] = this_geneminus_3->minus_nsegments;
+
+    stage3pairarray =
+      consolidate_paired_results(&(*npaths),&(*first_absmq),&(*second_absmq),&(*final_pairtype),
+				 &(*stage3array5),&(*nhits5),&(*first_absmq5),&(*second_absmq5),
+				 &(*stage3array3),&(*nhits3),&(*first_absmq3),&(*second_absmq3),
+				 hitpairs,samechr,conc_transloc,hits5,hits3,gmap_history_5,gmap_history_3,
+				 query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
+				 plus_segments_genestrand_5,plus_nsegments_genestrand_5,minus_segments_genestrand_5,minus_nsegments_genestrand_5,
+				 plus_segments_genestrand_3,plus_nsegments_genestrand_3,minus_segments_genestrand_3,minus_nsegments_genestrand_3,
+				 queryseq5,queryuc_ptr_5,queryrc5,quality_string_5,querylength5,query5_lastpos,
+				 queryseq3,queryuc_ptr_3,queryrc3,quality_string_3,querylength3,query3_lastpos,
+				 cutoff_level_5,cutoff_level_3,localsplicing_penalty,oligoindices_major,oligoindices_minor,
+				 pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,pairmax,user_maxlevel_5,user_maxlevel_3);
+    History_free(&gmap_history_3);
+    History_free(&gmap_history_5);
+    Compress_free(&query5_compress_fwd);
+    Compress_free(&query5_compress_rev);
+    Compress_free(&query3_compress_fwd);
+    Compress_free(&query3_compress_rev);
+    Stage1_free(&this_geneminus_5,querylength5);
+    Stage1_free(&this_geneminus_3,querylength3);
+    Stage1_free(&this_geneplus_5,querylength5);
+    Stage1_free(&this_geneplus_3,querylength3);
+    return stage3pairarray;
   }
 }
 
@@ -21161,7 +20925,7 @@ Stage1_paired_read (int *npaths, int *first_absmq, int *second_absmq, Pairtype_T
 		    Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
 		    Chrpos_T pairmax, bool keep_floors_p) {
 
-  if (mode == STANDARD || mode == CMET_STRANDED || mode == ATOI_STRANDED) {
+  if (mode == STANDARD || mode == CMET_STRANDED || mode == ATOI_STRANDED || mode == TTOC_STRANDED) {
     return paired_read(&(*npaths),&(*first_absmq),&(*second_absmq),&(*final_pairtype),
 		       &(*stage3array5),&(*nhits5),&(*first_absmq5),&(*second_absmq5),
 		       &(*stage3array3),&(*nhits3),&(*first_absmq3),&(*second_absmq3),
@@ -21172,7 +20936,7 @@ Stage1_paired_read (int *npaths, int *first_absmq, int *second_absmq, Pairtype_T
 		       oligoindices_major,oligoindices_minor,
 		       pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,pairmax,keep_floors_p);
 
-  } else if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED) {
+  } else if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
     return paired_read_tolerant_nonstranded(&(*npaths),&(*first_absmq),&(*second_absmq),&(*final_pairtype),
 					    &(*stage3array5),&(*nhits5),&(*first_absmq5),&(*second_absmq5),
 					    &(*stage3array3),&(*nhits3),&(*first_absmq3),&(*second_absmq3),
@@ -21230,7 +20994,7 @@ Stage1hr_setup (bool use_sarray_p_in, bool use_only_sarray_p_in, int index1part_
   two_index1intervals = index1interval_in + index1interval_in;
   spansize = spansize_in;
 
-  min_readlength = index1part_in + index1interval_in - 1;
+  min_kmer_readlength = index1part_in + index1interval_in - 1;
   chromosome_iit = chromosome_iit_in;
   circular_typeint = Univ_IIT_typeint(chromosome_iit,"circular");
   nchromosomes = nchromosomes_in;
diff --git a/src/stage3hr.c b/src/stage3hr.c
index b3ee14d..9dd4189 100644
--- a/src/stage3hr.c
+++ b/src/stage3hr.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: stage3hr.c 167162 2015-06-09 20:53:13Z twu $";
+static char rcsid[] = "$Id: stage3hr.c 167393 2015-06-11 22:16:20Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -4802,9 +4802,15 @@ Stage3end_new_substrings (int *found_score, Intlist_T endpoints,
       }
       nmismatches_whole += nmismatches;
       debug0(printf("nmismatches %d from sarray\n",nmismatches));
+#ifdef LARGE_GENOMES
+      if (Uint8list_next(q) == NULL && right_ambig == NULL) {
+	trim_right_p = true;
+      }
+#else
       if (Uintlist_next(q) == NULL && right_ambig == NULL) {
 	trim_right_p = true;
       }
+#endif
       if ((substring = Substring_new(/*nmismatches_whole*/nmismatches,chrnum,chroffset,chrhigh,chrlength,
 				     query_compress,/*start_endtype*/END,/*end_endtype*/END,
 				     querystart,queryend,querylength,alignstart,alignend,
@@ -4932,9 +4938,15 @@ Stage3end_new_substrings (int *found_score, Intlist_T endpoints,
       }
       nmismatches_whole += nmismatches;
       debug0(printf("nmismatches %d from sarray\n",nmismatches));
+#ifdef LARGE_GENOMES
+      if (Uint8list_next(q) == NULL && right_ambig == NULL) {
+	trim_left_p = true;
+      }
+#else
       if (Uintlist_next(q) == NULL && right_ambig == NULL) {
 	trim_left_p = true;
       }
+#endif
       if ((substring = Substring_new(/*nmismatches_whole*/nmismatches,chrnum,chroffset,chrhigh,chrlength,
 				     query_compress,/*start_endtype*/END,/*end_endtype*/END,
 				     /*querystart*/querylength - queryend,/*queryend*/querylength - querystart,querylength,
diff --git a/src/substring.c b/src/substring.c
index 4d72940..a6043df 100644
--- a/src/substring.c
+++ b/src/substring.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: substring.c 166827 2015-06-03 06:55:46Z twu $";
+static char rcsid[] = "$Id: substring.c 167592 2015-06-15 18:56:59Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -1466,6 +1466,95 @@ mark_mismatches_atoi_sam (char *gbuffer, char *query, int start, int end, int ge
 }
 
 
+static void
+mark_mismatches_ttoc_gsnap (char *gbuffer, char *query, int start, int end, int genestrand) {
+  int i;
+  
+  debug1(printf("query:  %s\n",query));
+  debug1(printf("genome: %s\n",gbuffer));
+  debug1(printf("count:  "));
+
+  if (genestrand == +2) {
+    for (i = start; i < end; i++) {
+      if (gbuffer[i] == 'A' && query[i] == 'G') {
+	debug1(printf("."));
+	gbuffer[i] = '.';
+      } else if (query[i] != gbuffer[i]) {
+	debug1(printf("x"));
+	assert(gbuffer[i] != OUTOFBOUNDS);
+	gbuffer[i] = (char) tolower(gbuffer[i]);
+      } else {
+	debug1(printf("*"));
+      }
+    }
+
+  } else {
+    for (i = start; i < end; i++) {
+      if (gbuffer[i] == 'T' && query[i] == 'C') {
+	debug1(printf("."));
+	gbuffer[i] = '.';
+      } else if (query[i] != gbuffer[i]) {
+	debug1(printf("x"));
+	assert(gbuffer[i] != OUTOFBOUNDS);
+	gbuffer[i] = (char) tolower(gbuffer[i]);
+      } else {
+	debug1(printf("*"));
+      }
+    }
+  }
+  
+  return;
+}
+
+
+
+static void
+mark_mismatches_ttoc_sam (char *gbuffer, char *query, int start, int end, int genestrand) {
+  int i;
+  
+  debug1(printf("query:  %s\n",query));
+  debug1(printf("genome: %s\n",gbuffer));
+  debug1(printf("count:  "));
+
+  if (genestrand == +2) {
+    for (i = start; i < end; i++) {
+      if (gbuffer[i] == 'A' && query[i] == 'G') {
+	debug1(printf("."));
+#if 0
+	/* Want to show mismatches */
+	gbuffer[i] = 'G';		/* Avoids showing mismatches in MD and NM strings */
+#endif
+      } else if (query[i] != gbuffer[i]) {
+	debug1(printf("x"));
+	assert(gbuffer[i] != OUTOFBOUNDS);
+	gbuffer[i] = (char) tolower(gbuffer[i]);
+      } else {
+	debug1(printf("*"));
+      }
+    }
+
+  } else {
+    for (i = start; i < end; i++) {
+      if (gbuffer[i] == 'T' && query[i] == 'C') {
+	debug1(printf("."));
+#if 0
+	/* Want to show mismatches */
+	gbuffer[i] = 'C';		/* Avoids showing mismatches in MD and NM strings */
+#endif
+      } else if (query[i] != gbuffer[i]) {
+	debug1(printf("x"));
+	assert(gbuffer[i] != OUTOFBOUNDS);
+	gbuffer[i] = (char) tolower(gbuffer[i]);
+      } else {
+	debug1(printf("*"));
+      }
+    }
+  }
+
+  return;
+}
+
+
 
 void
 Substring_setup (bool print_nsnpdiffs_p_in, bool print_snplabels_p_in,
@@ -1539,6 +1628,8 @@ embellish_genomic (char *genomic_diff, char *query, int querystart, int queryend
     mark_mismatches_cmet_gsnap(result,query,querystart,queryend,genestrand);
   } else if (mode == ATOI_STRANDED || mode == ATOI_NONSTRANDED) {
     mark_mismatches_atoi_gsnap(result,query,querystart,queryend,genestrand);
+  } else if (mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
+    mark_mismatches_ttoc_gsnap(result,query,querystart,queryend,genestrand);
   } else {
     abort();
   }
@@ -1583,6 +1674,8 @@ embellish_genomic_sam (char *genomic_diff, char *query, int querystart, int quer
     mark_mismatches_cmet_sam(result,query,querystart,queryend,genestrand);
   } else if (mode == ATOI_STRANDED || mode == ATOI_NONSTRANDED) {
     mark_mismatches_atoi_sam(result,query,querystart,queryend,genestrand);
+  } else if (mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
+    mark_mismatches_ttoc_sam(result,query,querystart,queryend,genestrand);
   } else {
     abort();
   }
diff --git a/src/types.h b/src/types.h
index cb02ba0..c7df5f2 100644
--- a/src/types.h
+++ b/src/types.h
@@ -1,4 +1,4 @@
-/* $Id: types.h 157223 2015-01-22 18:43:01Z twu $ */
+/* $Id: types.h 168395 2015-06-26 17:13:13Z twu $ */
 #ifndef TYPES_INCLUDED
 #define TYPES_INCLUDED
 #ifdef HAVE_CONFIG_H
@@ -116,6 +116,13 @@ typedef Uintlist_T Univcoordlist_T;
 
 #endif
 
+/* For univintervals and Univ_IIT (chromosome_iit) files.  Use the largest word size allowable on the machine.  */
+#ifdef HAVE_64_BIT
+typedef UINT8 Univ_IIT_coord_T;
+#else
+typedef UINT4 Univ_IIT_coord_T;
+#endif
+
 /* For splicetrie */
 typedef UINT4 Trieoffset_T;
 typedef UINT4 Triecontent_T;
diff --git a/src/uniqscan.c b/src/uniqscan.c
index 79b0470..c2359ad 100644
--- a/src/uniqscan.c
+++ b/src/uniqscan.c
@@ -1,4 +1,4 @@
-static char rcsid[] = "$Id: uniqscan.c 166641 2015-05-29 21:13:04Z twu $";
+static char rcsid[] = "$Id: uniqscan.c 167592 2015-06-15 18:56:59Z twu $";
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
@@ -775,8 +775,12 @@ main (int argc, char *argv[]) {
 	  mode = ATOI_STRANDED;
 	} else if (!strcmp(optarg,"atoi-nonstranded")) {
 	  mode = ATOI_NONSTRANDED;
+	} else if (!strcmp(optarg,"ttoc-stranded")) {
+	  mode = TTOC_STRANDED;
+	} else if (!strcmp(optarg,"ttoc-nonstranded")) {
+	  mode = TTOC_NONSTRANDED;
 	} else {
-	  fprintf(stderr,"--mode must be standard, cmet-stranded, cmet-nonstranded, atoi-stranded, or atoi\n");
+	  fprintf(stderr,"--mode must be standard, cmet-stranded, cmet-nonstranded, atoi-stranded, atoi-nonstranded, ttoc-stranded, or ttoc-nonstranded\n");
 	  exit(9);
 	}
 
@@ -1063,6 +1067,30 @@ main (int argc, char *argv[]) {
 	exit(9);
       }
 
+    } else if (mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
+      if (user_atoidir == NULL) {
+	modedir = genomesubdir;
+      } else {
+	modedir = user_atoidir;
+      }
+
+      if ((indexdb = Indexdb_new_genome(&index1part,&index1interval,
+					modedir,fileroot,/*idx_filesuffix*/"a2itc",/*snps_root*/NULL,
+					required_index1part,required_interval,
+					expand_offsets_p,offsetsstrm_access,positions_access,
+					/*sharedp*/false)) == NULL) {
+	fprintf(stderr,"Cannot find a2itc index file.  Need to run atoiindex first\n");
+	exit(9);
+      }
+
+      if ((indexdb2 = Indexdb_new_genome(&index1part,&index1interval,
+					 modedir,fileroot,/*idx_filesuffix*/"a2iag",/*snps_root*/NULL,
+					 required_index1part,required_interval,
+					 expand_offsets_p,offsetsstrm_access,positions_access,
+					 /*sharedp*/false)) == NULL) {
+	fprintf(stderr,"Cannot find a2iag index file.  Need to run atoiindex first\n");
+	exit(9);
+      }
 
     } else {
       /* Standard behavior */
@@ -1140,6 +1168,30 @@ main (int argc, char *argv[]) {
 	exit(9);
       }
 
+    } else if (mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
+      if (user_atoidir == NULL) {
+	modedir = snpsdir;
+      } else {
+	modedir = user_atoidir;
+      }
+
+      if ((indexdb = Indexdb_new_genome(&index1part,&index1interval,
+					modedir,fileroot,/*idx_filesuffix*/"a2itc",snps_root,
+					required_index1part,required_interval,
+					expand_offsets_p,offsetsstrm_access,positions_access,
+					/*sharedp*/false)) == NULL) {
+	fprintf(stderr,"Cannot find a2itc index file.  Need to run atoiindex first\n");
+	exit(9);
+      }
+      if ((indexdb2 = Indexdb_new_genome(&index1part,&index1interval,
+					 modedir,fileroot,/*idx_filesuffix*/"a2iag",snps_root,
+					 required_index1part,required_interval,
+					 expand_offsets_p,offsetsstrm_access,positions_access,
+					 /*sharedp*/false)) == NULL) {
+	fprintf(stderr,"Cannot find a2iag index file.  Need to run atoiindex first\n");
+	exit(9);
+      }
+
     } else {
       indexdb = Indexdb_new_genome(&index1part,&index1interval,
 				   snpsdir,fileroot,/*idx_filesuffix*/"ref",snps_root,
diff --git a/src/univinterval.h b/src/univinterval.h
index a0c6877..406e989 100644
--- a/src/univinterval.h
+++ b/src/univinterval.h
@@ -1,4 +1,4 @@
-/* $Id: univinterval.h 157221 2015-01-22 18:38:57Z twu $ */
+/* $Id: univinterval.h 168395 2015-06-26 17:13:13Z twu $ */
 #ifndef UNIVINTERVAL_INCLUDED
 #define UNIVINTERVAL_INCLUDED
 
@@ -9,8 +9,8 @@
 #define T Univinterval_T
 typedef struct T *T;
 struct T {
-  Univcoord_T low;		/* low <= high */
-  Univcoord_T high;
+  Univ_IIT_coord_T low;		/* low <= high */
+  Univ_IIT_coord_T high;
   int sign;
   int type;
 };
diff --git a/tests/Makefile.in b/tests/Makefile.in
index 23a693e..5cb4615 100644
--- a/tests/Makefile.in
+++ b/tests/Makefile.in
@@ -49,6 +49,7 @@ am__aclocal_m4_deps = $(top_srcdir)/config/libtool.m4 \
 	$(top_srcdir)/config/mmap-flags.m4 \
 	$(top_srcdir)/config/acx_mmap_fixed.m4 \
 	$(top_srcdir)/config/acx_mmap_variable.m4 \
+	$(top_srcdir)/config/shm-flags.m4 \
 	$(top_srcdir)/config/ax_mpi.m4 \
 	$(top_srcdir)/config/acx_pthread.m4 \
 	$(top_srcdir)/config/builtin-popcount.m4 \
diff --git a/util/Makefile.in b/util/Makefile.in
index 2a662a2..0f440f3 100644
--- a/util/Makefile.in
+++ b/util/Makefile.in
@@ -58,6 +58,7 @@ am__aclocal_m4_deps = $(top_srcdir)/config/libtool.m4 \
 	$(top_srcdir)/config/mmap-flags.m4 \
 	$(top_srcdir)/config/acx_mmap_fixed.m4 \
 	$(top_srcdir)/config/acx_mmap_variable.m4 \
+	$(top_srcdir)/config/shm-flags.m4 \
 	$(top_srcdir)/config/ax_mpi.m4 \
 	$(top_srcdir)/config/acx_pthread.m4 \
 	$(top_srcdir)/config/builtin-popcount.m4 \

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/gmap.git



More information about the debian-med-commit mailing list